xref: /linux/net/core/dev.c (revision eecb20720f1b29019725515051e41bc7c079f91f)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 #include <linux/cpu_rmap.h>
136 #include <linux/if_tunnel.h>
137 #include <linux/if_pppox.h>
138 #include <linux/ppp_defs.h>
139 #include <linux/net_tstamp.h>
140 
141 #include "net-sysfs.h"
142 
143 /* Instead of increasing this, you should create a hash table. */
144 #define MAX_GRO_SKBS 8
145 
146 /* This should be increased if a protocol with a bigger head is added. */
147 #define GRO_MAX_HEAD (MAX_HEADER + 128)
148 
149 /*
150  *	The list of packet types we will receive (as opposed to discard)
151  *	and the routines to invoke.
152  *
153  *	Why 16? Because with 16 the only overlap we get on a hash of the
154  *	low nibble of the protocol value is RARP/SNAP/X.25.
155  *
156  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
157  *             sure which should go first, but I bet it won't make much
158  *             difference if we are running VLANs.  The good news is that
159  *             this protocol won't be in the list unless compiled in, so
160  *             the average user (w/out VLANs) will not be adversely affected.
161  *             --BLG
162  *
163  *		0800	IP
164  *		8100    802.1Q VLAN
165  *		0001	802.3
166  *		0002	AX.25
167  *		0004	802.2
168  *		8035	RARP
169  *		0005	SNAP
170  *		0805	X.25
171  *		0806	ARP
172  *		8137	IPX
173  *		0009	Localtalk
174  *		86DD	IPv6
175  */
176 
177 #define PTYPE_HASH_SIZE	(16)
178 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
179 
180 static DEFINE_SPINLOCK(ptype_lock);
181 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
182 static struct list_head ptype_all __read_mostly;	/* Taps */
183 
184 /*
185  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
186  * semaphore.
187  *
188  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
189  *
190  * Writers must hold the rtnl semaphore while they loop through the
191  * dev_base_head list, and hold dev_base_lock for writing when they do the
192  * actual updates.  This allows pure readers to access the list even
193  * while a writer is preparing to update it.
194  *
195  * To put it another way, dev_base_lock is held for writing only to
196  * protect against pure readers; the rtnl semaphore provides the
197  * protection against other writers.
198  *
199  * See, for example usages, register_netdevice() and
200  * unregister_netdevice(), which must be called with the rtnl
201  * semaphore held.
202  */
203 DEFINE_RWLOCK(dev_base_lock);
204 EXPORT_SYMBOL(dev_base_lock);
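
/*
 * A minimal reader sketch (hypothetical snippet, not part of this file):
 * pure readers can also walk the device list under RCU instead of
 * taking dev_base_lock:
 *
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	for_each_netdev_rcu(net, dev)
 *		pr_info("%s\n", dev->name);
 *	rcu_read_unlock();
 */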
205 
206 static inline void dev_base_seq_inc(struct net *net)
207 {
208 	while (++net->dev_base_seq == 0);
209 }
210 
211 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
212 {
213 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
214 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
215 }
216 
217 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
218 {
219 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
220 }
221 
222 static inline void rps_lock(struct softnet_data *sd)
223 {
224 #ifdef CONFIG_RPS
225 	spin_lock(&sd->input_pkt_queue.lock);
226 #endif
227 }
228 
229 static inline void rps_unlock(struct softnet_data *sd)
230 {
231 #ifdef CONFIG_RPS
232 	spin_unlock(&sd->input_pkt_queue.lock);
233 #endif
234 }
235 
236 /* Device list insertion */
237 static int list_netdevice(struct net_device *dev)
238 {
239 	struct net *net = dev_net(dev);
240 
241 	ASSERT_RTNL();
242 
243 	write_lock_bh(&dev_base_lock);
244 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
245 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
246 	hlist_add_head_rcu(&dev->index_hlist,
247 			   dev_index_hash(net, dev->ifindex));
248 	write_unlock_bh(&dev_base_lock);
249 
250 	dev_base_seq_inc(net);
251 
252 	return 0;
253 }
254 
255 /* Device list removal
256  * caller must respect an RCU grace period before freeing/reusing dev
257  */
258 static void unlist_netdevice(struct net_device *dev)
259 {
260 	ASSERT_RTNL();
261 
262 	/* Unlink dev from the device chain */
263 	write_lock_bh(&dev_base_lock);
264 	list_del_rcu(&dev->dev_list);
265 	hlist_del_rcu(&dev->name_hlist);
266 	hlist_del_rcu(&dev->index_hlist);
267 	write_unlock_bh(&dev_base_lock);
268 
269 	dev_base_seq_inc(dev_net(dev));
270 }
271 
272 /*
273  *	Our notifier list
274  */
275 
276 static RAW_NOTIFIER_HEAD(netdev_chain);
277 
278 /*
279  *	Device drivers call our routines to queue packets here. We empty the
280  *	queue in the local softnet handler.
281  */
282 
283 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
284 EXPORT_PER_CPU_SYMBOL(softnet_data);
285 
286 #ifdef CONFIG_LOCKDEP
287 /*
288  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
289  * according to dev->type
290  */
291 static const unsigned short netdev_lock_type[] =
292 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
293 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
294 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
295 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
296 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
297 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
298 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
299 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
300 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
301 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
302 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
303 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
304 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
305 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
306 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
307 	 ARPHRD_VOID, ARPHRD_NONE};
308 
309 static const char *const netdev_lock_name[] =
310 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
311 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
312 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
313 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
314 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
315 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
316 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
317 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
318 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
319 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
320 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
321 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
322 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
323 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
324 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
325 	 "_xmit_VOID", "_xmit_NONE"};
326 
327 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
328 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
329 
330 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
331 {
332 	int i;
333 
334 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
335 		if (netdev_lock_type[i] == dev_type)
336 			return i;
337 	/* the last key is used by default */
338 	return ARRAY_SIZE(netdev_lock_type) - 1;
339 }
340 
341 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
342 						 unsigned short dev_type)
343 {
344 	int i;
345 
346 	i = netdev_lock_pos(dev_type);
347 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
348 				   netdev_lock_name[i]);
349 }
350 
351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352 {
353 	int i;
354 
355 	i = netdev_lock_pos(dev->type);
356 	lockdep_set_class_and_name(&dev->addr_list_lock,
357 				   &netdev_addr_lock_key[i],
358 				   netdev_lock_name[i]);
359 }
360 #else
361 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
362 						 unsigned short dev_type)
363 {
364 }
365 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
366 {
367 }
368 #endif
369 
370 /*******************************************************************************
371 
372 		Protocol management and registration routines
373 
374 *******************************************************************************/
375 
376 /*
377  *	Add a protocol ID to the list. Now that the input handler is
378  *	smarter we can dispense with all the messy stuff that used to be
379  *	here.
380  *
381  *	BEWARE!!! Protocol handlers, mangling input packets,
382  *	MUST BE last in hash buckets and checking protocol handlers
383  *	MUST start from promiscuous ptype_all chain in net_bh.
384  *	It is true now, do not change it.
385  *	Explanation follows: if a protocol handler that mangles packets
386  *	were first on the list, it could not sense that the packet is
387  *	cloned and should be copied-on-write, so it would change it and
388  *	subsequent readers would get a broken packet.
389  *							--ANK (980803)
390  */
391 
392 static inline struct list_head *ptype_head(const struct packet_type *pt)
393 {
394 	if (pt->type == htons(ETH_P_ALL))
395 		return &ptype_all;
396 	else
397 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
398 }
399 
400 /**
401  *	dev_add_pack - add packet handler
402  *	@pt: packet type declaration
403  *
404  *	Add a protocol handler to the networking stack. The passed &packet_type
405  *	is linked into kernel lists and may not be freed until it has been
406  *	removed from the kernel lists.
407  *
408  *	This call does not sleep, therefore it cannot
409  *	guarantee that all CPUs that are in the middle of receiving packets
410  *	will see the new packet type (until the next received packet).
411  */
412 
413 void dev_add_pack(struct packet_type *pt)
414 {
415 	struct list_head *head = ptype_head(pt);
416 
417 	spin_lock(&ptype_lock);
418 	list_add_rcu(&pt->list, head);
419 	spin_unlock(&ptype_lock);
420 }
421 EXPORT_SYMBOL(dev_add_pack);
422 
423 /**
424  *	__dev_remove_pack	 - remove packet handler
425  *	@pt: packet type declaration
426  *
427  *	Remove a protocol handler that was previously added to the kernel
428  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
429  *	from the kernel lists and can be freed or reused once this function
430  *	returns.
431  *
432  *      The packet type might still be in use by receivers
433  *	and must not be freed until after all the CPUs have gone
434  *	through a quiescent state.
435  */
436 void __dev_remove_pack(struct packet_type *pt)
437 {
438 	struct list_head *head = ptype_head(pt);
439 	struct packet_type *pt1;
440 
441 	spin_lock(&ptype_lock);
442 
443 	list_for_each_entry(pt1, head, list) {
444 		if (pt == pt1) {
445 			list_del_rcu(&pt->list);
446 			goto out;
447 		}
448 	}
449 
450 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
451 out:
452 	spin_unlock(&ptype_lock);
453 }
454 EXPORT_SYMBOL(__dev_remove_pack);
455 
456 /**
457  *	dev_remove_pack	 - remove packet handler
458  *	@pt: packet type declaration
459  *
460  *	Remove a protocol handler that was previously added to the kernel
461  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
462  *	from the kernel lists and can be freed or reused once this function
463  *	returns.
464  *
465  *	This call sleeps to guarantee that no CPU is looking at the packet
466  *	type after return.
467  */
468 void dev_remove_pack(struct packet_type *pt)
469 {
470 	__dev_remove_pack(pt);
471 
472 	synchronize_net();
473 }
474 EXPORT_SYMBOL(dev_remove_pack);
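
/*
 * A minimal sketch of a protocol tap (hypothetical names, not part of
 * this file): register a handler for every packet, then tear it down.
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		// just consume the clone
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_pt __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_ALL),
 *		.func	= my_rcv,
 *	};
 *
 *	dev_add_pack(&my_pt);
 *	...
 *	dev_remove_pack(&my_pt);	// sleeps; my_pt may be freed after
 */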
475 
476 /*******************************************************************************
477 
478 		      Device Boot-time Settings Routines
479 
480 *******************************************************************************/
481 
482 /* Boot time configuration table */
483 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
484 
485 /**
486  *	netdev_boot_setup_add	- add new setup entry
487  *	@name: name of the device
488  *	@map: configured settings for the device
489  *
490  *	Adds new setup entry to the dev_boot_setup list.  The function
491  *	returns 0 on error and 1 on success.  This is a generic routine for
492  *	all netdevices.
493  */
494 static int netdev_boot_setup_add(char *name, struct ifmap *map)
495 {
496 	struct netdev_boot_setup *s;
497 	int i;
498 
499 	s = dev_boot_setup;
500 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
501 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
502 			memset(s[i].name, 0, sizeof(s[i].name));
503 			strlcpy(s[i].name, name, IFNAMSIZ);
504 			memcpy(&s[i].map, map, sizeof(s[i].map));
505 			break;
506 		}
507 	}
508 
509 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
510 }
511 
512 /**
513  *	netdev_boot_setup_check	- check boot time settings
514  *	@dev: the netdevice
515  *
516  * 	Check boot time settings for the device.
517  *	Any settings found are applied to the device, to be used
518  *	later during device probing.
519  *	Returns 0 if no settings are found, 1 if they are.
520  */
521 int netdev_boot_setup_check(struct net_device *dev)
522 {
523 	struct netdev_boot_setup *s = dev_boot_setup;
524 	int i;
525 
526 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
527 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
528 		    !strcmp(dev->name, s[i].name)) {
529 			dev->irq 	= s[i].map.irq;
530 			dev->base_addr 	= s[i].map.base_addr;
531 			dev->mem_start 	= s[i].map.mem_start;
532 			dev->mem_end 	= s[i].map.mem_end;
533 			return 1;
534 		}
535 	}
536 	return 0;
537 }
538 EXPORT_SYMBOL(netdev_boot_setup_check);
539 
540 
541 /**
542  *	netdev_boot_base	- get address from boot time settings
543  *	@prefix: prefix for network device
544  *	@unit: id for network device
545  *
546  * 	Check boot time settings for the base address of the device.
547  *	Any setting found is returned for use later during device
548  *	probing.
549  *	Returns 0 if no settings are found.
550  */
551 unsigned long netdev_boot_base(const char *prefix, int unit)
552 {
553 	const struct netdev_boot_setup *s = dev_boot_setup;
554 	char name[IFNAMSIZ];
555 	int i;
556 
557 	sprintf(name, "%s%d", prefix, unit);
558 
559 	/*
560 	 * If device already registered then return base of 1
561 	 * to indicate not to probe for this interface
562 	 */
563 	if (__dev_get_by_name(&init_net, name))
564 		return 1;
565 
566 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
567 		if (!strcmp(name, s[i].name))
568 			return s[i].map.base_addr;
569 	return 0;
570 }
571 
572 /*
573  * Saves the settings configured at boot time for any netdevice.
574  */
575 int __init netdev_boot_setup(char *str)
576 {
577 	int ints[5];
578 	struct ifmap map;
579 
580 	str = get_options(str, ARRAY_SIZE(ints), ints);
581 	if (!str || !*str)
582 		return 0;
583 
584 	/* Save settings */
585 	memset(&map, 0, sizeof(map));
586 	if (ints[0] > 0)
587 		map.irq = ints[1];
588 	if (ints[0] > 1)
589 		map.base_addr = ints[2];
590 	if (ints[0] > 2)
591 		map.mem_start = ints[3];
592 	if (ints[0] > 3)
593 		map.mem_end = ints[4];
594 
595 	/* Add new entry to the list */
596 	return netdev_boot_setup_add(str, &map);
597 }
598 
599 __setup("netdev=", netdev_boot_setup);
600 
601 /*******************************************************************************
602 
603 			    Device Interface Subroutines
604 
605 *******************************************************************************/
606 
607 /**
608  *	__dev_get_by_name	- find a device by its name
609  *	@net: the applicable net namespace
610  *	@name: name to find
611  *
612  *	Find an interface by name. Must be called under the RTNL semaphore
613  *	or @dev_base_lock. If the name is found a pointer to the device
614  *	is returned. If the name is not found then %NULL is returned. The
615  *	reference counters are not incremented so the caller must be
616  *	careful with locks.
617  */
618 
619 struct net_device *__dev_get_by_name(struct net *net, const char *name)
620 {
621 	struct hlist_node *p;
622 	struct net_device *dev;
623 	struct hlist_head *head = dev_name_hash(net, name);
624 
625 	hlist_for_each_entry(dev, p, head, name_hlist)
626 		if (!strncmp(dev->name, name, IFNAMSIZ))
627 			return dev;
628 
629 	return NULL;
630 }
631 EXPORT_SYMBOL(__dev_get_by_name);
632 
633 /**
634  *	dev_get_by_name_rcu	- find a device by its name
635  *	@net: the applicable net namespace
636  *	@name: name to find
637  *
638  *	Find an interface by name.
639  *	If the name is found a pointer to the device is returned.
640  * 	If the name is not found then %NULL is returned.
641  *	The reference counters are not incremented so the caller must be
642  *	careful with locks. The caller must hold RCU lock.
643  */
644 
645 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
646 {
647 	struct hlist_node *p;
648 	struct net_device *dev;
649 	struct hlist_head *head = dev_name_hash(net, name);
650 
651 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
652 		if (!strncmp(dev->name, name, IFNAMSIZ))
653 			return dev;
654 
655 	return NULL;
656 }
657 EXPORT_SYMBOL(dev_get_by_name_rcu);
658 
659 /**
660  *	dev_get_by_name		- find a device by its name
661  *	@net: the applicable net namespace
662  *	@name: name to find
663  *
664  *	Find an interface by name. This can be called from any
665  *	context and does its own locking. The returned handle has
666  *	the usage count incremented and the caller must use dev_put() to
667  *	release it when it is no longer needed. %NULL is returned if no
668  *	matching device is found.
669  */
670 
671 struct net_device *dev_get_by_name(struct net *net, const char *name)
672 {
673 	struct net_device *dev;
674 
675 	rcu_read_lock();
676 	dev = dev_get_by_name_rcu(net, name);
677 	if (dev)
678 		dev_hold(dev);
679 	rcu_read_unlock();
680 	return dev;
681 }
682 EXPORT_SYMBOL(dev_get_by_name);
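
/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * the reference taken here must be dropped with dev_put().
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		pr_info("%s: ifindex %d\n", dev->name, dev->ifindex);
 *		dev_put(dev);
 *	}
 */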
683 
684 /**
685  *	__dev_get_by_index - find a device by its ifindex
686  *	@net: the applicable net namespace
687  *	@ifindex: index of device
688  *
689  *	Search for an interface by index. Returns a pointer to the device,
690  *	or %NULL if it is not found. The device has not
691  *	had its reference counter increased so the caller must be careful
692  *	about locking. The caller must hold either the RTNL semaphore
693  *	or @dev_base_lock.
694  */
695 
696 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
697 {
698 	struct hlist_node *p;
699 	struct net_device *dev;
700 	struct hlist_head *head = dev_index_hash(net, ifindex);
701 
702 	hlist_for_each_entry(dev, p, head, index_hlist)
703 		if (dev->ifindex == ifindex)
704 			return dev;
705 
706 	return NULL;
707 }
708 EXPORT_SYMBOL(__dev_get_by_index);
709 
710 /**
711  *	dev_get_by_index_rcu - find a device by its ifindex
712  *	@net: the applicable net namespace
713  *	@ifindex: index of device
714  *
715  *	Search for an interface by index. Returns a pointer to the device,
716  *	or %NULL if it is not found. The device has not
717  *	had its reference counter increased so the caller must be careful
718  *	about locking. The caller must hold RCU lock.
719  */
720 
721 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
722 {
723 	struct hlist_node *p;
724 	struct net_device *dev;
725 	struct hlist_head *head = dev_index_hash(net, ifindex);
726 
727 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
728 		if (dev->ifindex == ifindex)
729 			return dev;
730 
731 	return NULL;
732 }
733 EXPORT_SYMBOL(dev_get_by_index_rcu);
734 
735 
736 /**
737  *	dev_get_by_index - find a device by its ifindex
738  *	@net: the applicable net namespace
739  *	@ifindex: index of device
740  *
741  *	Search for an interface by index. Returns a pointer to the device,
742  *	or NULL if it is not found. The device returned has
743  *	had a reference added and the pointer is safe until the user calls
744  *	dev_put to indicate they have finished with it.
745  */
746 
747 struct net_device *dev_get_by_index(struct net *net, int ifindex)
748 {
749 	struct net_device *dev;
750 
751 	rcu_read_lock();
752 	dev = dev_get_by_index_rcu(net, ifindex);
753 	if (dev)
754 		dev_hold(dev);
755 	rcu_read_unlock();
756 	return dev;
757 }
758 EXPORT_SYMBOL(dev_get_by_index);
759 
760 /**
761  *	dev_getbyhwaddr_rcu - find a device by its hardware address
762  *	@net: the applicable net namespace
763  *	@type: media type of device
764  *	@ha: hardware address
765  *
766  *	Search for an interface by MAC address. Returns a pointer to the
767  *	device, or NULL if it is not found.
768  *	The caller must hold RCU or RTNL.
769  *	The returned device has not had its ref count increased
770  *	and the caller must therefore be careful about locking.
771  *
772  */
773 
774 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
775 				       const char *ha)
776 {
777 	struct net_device *dev;
778 
779 	for_each_netdev_rcu(net, dev)
780 		if (dev->type == type &&
781 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
782 			return dev;
783 
784 	return NULL;
785 }
786 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
787 
788 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
789 {
790 	struct net_device *dev;
791 
792 	ASSERT_RTNL();
793 	for_each_netdev(net, dev)
794 		if (dev->type == type)
795 			return dev;
796 
797 	return NULL;
798 }
799 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
800 
801 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
802 {
803 	struct net_device *dev, *ret = NULL;
804 
805 	rcu_read_lock();
806 	for_each_netdev_rcu(net, dev)
807 		if (dev->type == type) {
808 			dev_hold(dev);
809 			ret = dev;
810 			break;
811 		}
812 	rcu_read_unlock();
813 	return ret;
814 }
815 EXPORT_SYMBOL(dev_getfirstbyhwtype);
816 
817 /**
818  *	dev_get_by_flags_rcu - find any device with given flags
819  *	@net: the applicable net namespace
820  *	@if_flags: IFF_* values
821  *	@mask: bitmask of bits in if_flags to check
822  *
823  *	Search for any interface with the given flags. Returns a pointer to
824  *	the device, or NULL if no device is found. Must be called inside
825  *	rcu_read_lock(), and result refcount is unchanged.
826  */
827 
828 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
829 				    unsigned short mask)
830 {
831 	struct net_device *dev, *ret;
832 
833 	ret = NULL;
834 	for_each_netdev_rcu(net, dev) {
835 		if (((dev->flags ^ if_flags) & mask) == 0) {
836 			ret = dev;
837 			break;
838 		}
839 	}
840 	return ret;
841 }
842 EXPORT_SYMBOL(dev_get_by_flags_rcu);
843 
844 /**
845  *	dev_valid_name - check if name is okay for network device
846  *	@name: name string
847  *
848  *	Network device names need to be valid file names
849  *	to allow sysfs to work.  We also disallow any kind of
850  *	whitespace.
851  */
852 int dev_valid_name(const char *name)
853 {
854 	if (*name == '\0')
855 		return 0;
856 	if (strlen(name) >= IFNAMSIZ)
857 		return 0;
858 	if (!strcmp(name, ".") || !strcmp(name, ".."))
859 		return 0;
860 
861 	while (*name) {
862 		if (*name == '/' || isspace(*name))
863 			return 0;
864 		name++;
865 	}
866 	return 1;
867 }
868 EXPORT_SYMBOL(dev_valid_name);
869 
870 /**
871  *	__dev_alloc_name - allocate a name for a device
872  *	@net: network namespace to allocate the device name in
873  *	@name: name format string
874  *	@buf:  scratch buffer and result name string
875  *
876  *	Passed a format string, e.g. "lt%d", it will try to find a suitable
877  *	id. It scans the list of devices to build up a free map, then chooses
878  *	the first empty slot. The caller must hold the dev_base or rtnl lock
879  *	while allocating the name and adding the device in order to avoid
880  *	duplicates.
881  *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
882  *	Returns the number of the unit assigned or a negative errno code.
883  */
884 
885 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
886 {
887 	int i = 0;
888 	const char *p;
889 	const int max_netdevices = 8*PAGE_SIZE;
890 	unsigned long *inuse;
891 	struct net_device *d;
892 
893 	p = strnchr(name, IFNAMSIZ-1, '%');
894 	if (p) {
895 		/*
896 		 * Verify the string as this thing may have come from
897 		 * the user.  There must be exactly one "%d" and no other "%"
898 		 * characters.
899 		 */
900 		if (p[1] != 'd' || strchr(p + 2, '%'))
901 			return -EINVAL;
902 
903 		/* Use one page as a bit array of possible slots */
904 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
905 		if (!inuse)
906 			return -ENOMEM;
907 
908 		for_each_netdev(net, d) {
909 			if (!sscanf(d->name, name, &i))
910 				continue;
911 			if (i < 0 || i >= max_netdevices)
912 				continue;
913 
914 			/*  avoid cases where sscanf is not exact inverse of printf */
915 			snprintf(buf, IFNAMSIZ, name, i);
916 			if (!strncmp(buf, d->name, IFNAMSIZ))
917 				set_bit(i, inuse);
918 		}
919 
920 		i = find_first_zero_bit(inuse, max_netdevices);
921 		free_page((unsigned long) inuse);
922 	}
923 
924 	if (buf != name)
925 		snprintf(buf, IFNAMSIZ, name, i);
926 	if (!__dev_get_by_name(net, buf))
927 		return i;
928 
929 	/* It is possible to run out of possible slots
930 	 * when the name is long and there isn't enough space left
931 	 * for the digits, or if all bits are used.
932 	 */
933 	return -ENFILE;
934 }
935 
936 /**
937  *	dev_alloc_name - allocate a name for a device
938  *	@dev: device
939  *	@name: name format string
940  *
941  *	Passed a format string, e.g. "lt%d", it will try to find a suitable
942  *	id. It scans the list of devices to build up a free map, then chooses
943  *	the first empty slot. The caller must hold the dev_base or rtnl lock
944  *	while allocating the name and adding the device in order to avoid
945  *	duplicates.
946  *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
947  *	Returns the number of the unit assigned or a negative errno code.
948  */
949 
950 int dev_alloc_name(struct net_device *dev, const char *name)
951 {
952 	char buf[IFNAMSIZ];
953 	struct net *net;
954 	int ret;
955 
956 	BUG_ON(!dev_net(dev));
957 	net = dev_net(dev);
958 	ret = __dev_alloc_name(net, name, buf);
959 	if (ret >= 0)
960 		strlcpy(dev->name, buf, IFNAMSIZ);
961 	return ret;
962 }
963 EXPORT_SYMBOL(dev_alloc_name);
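
/*
 * For example (sketch, error handling elided), a driver that wants the
 * next free ethN slot:
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	// on success, err is the assigned unit and dev->name is e.g. "eth2"
 */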
964 
965 static int dev_get_valid_name(struct net_device *dev, const char *name)
966 {
967 	struct net *net;
968 
969 	BUG_ON(!dev_net(dev));
970 	net = dev_net(dev);
971 
972 	if (!dev_valid_name(name))
973 		return -EINVAL;
974 
975 	if (strchr(name, '%'))
976 		return dev_alloc_name(dev, name);
977 	else if (__dev_get_by_name(net, name))
978 		return -EEXIST;
979 	else if (dev->name != name)
980 		strlcpy(dev->name, name, IFNAMSIZ);
981 
982 	return 0;
983 }
984 
985 /**
986  *	dev_change_name - change name of a device
987  *	@dev: device
988  *	@newname: name (or format string) must be at least IFNAMSIZ
989  *
990  *	Change the name of a device; format strings such as "eth%d"
991  *	may be passed for wildcarding.
992  */
993 int dev_change_name(struct net_device *dev, const char *newname)
994 {
995 	char oldname[IFNAMSIZ];
996 	int err = 0;
997 	int ret;
998 	struct net *net;
999 
1000 	ASSERT_RTNL();
1001 	BUG_ON(!dev_net(dev));
1002 
1003 	net = dev_net(dev);
1004 	if (dev->flags & IFF_UP)
1005 		return -EBUSY;
1006 
1007 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1008 		return 0;
1009 
1010 	memcpy(oldname, dev->name, IFNAMSIZ);
1011 
1012 	err = dev_get_valid_name(dev, newname);
1013 	if (err < 0)
1014 		return err;
1015 
1016 rollback:
1017 	ret = device_rename(&dev->dev, dev->name);
1018 	if (ret) {
1019 		memcpy(dev->name, oldname, IFNAMSIZ);
1020 		return ret;
1021 	}
1022 
1023 	write_lock_bh(&dev_base_lock);
1024 	hlist_del_rcu(&dev->name_hlist);
1025 	write_unlock_bh(&dev_base_lock);
1026 
1027 	synchronize_rcu();
1028 
1029 	write_lock_bh(&dev_base_lock);
1030 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1031 	write_unlock_bh(&dev_base_lock);
1032 
1033 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1034 	ret = notifier_to_errno(ret);
1035 
1036 	if (ret) {
1037 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1038 		if (err >= 0) {
1039 			err = ret;
1040 			memcpy(dev->name, oldname, IFNAMSIZ);
1041 			goto rollback;
1042 		} else {
1043 			printk(KERN_ERR
1044 			       "%s: name change rollback failed: %d.\n",
1045 			       dev->name, ret);
1046 		}
1047 	}
1048 
1049 	return err;
1050 }
1051 
1052 /**
1053  *	dev_set_alias - change ifalias of a device
1054  *	@dev: device
1055  *	@alias: name up to IFALIASZ
1056  *	@len: limit of bytes to copy from info
1057  *
1058  *	Set ifalias for a device.
1059  */
1060 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1061 {
1062 	ASSERT_RTNL();
1063 
1064 	if (len >= IFALIASZ)
1065 		return -EINVAL;
1066 
1067 	if (!len) {
1068 		if (dev->ifalias) {
1069 			kfree(dev->ifalias);
1070 			dev->ifalias = NULL;
1071 		}
1072 		return 0;
1073 	}
1074 
1075 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1076 	if (!dev->ifalias)
1077 		return -ENOMEM;
1078 
1079 	strlcpy(dev->ifalias, alias, len+1);
1080 	return len;
1081 }
1082 
1083 
1084 /**
1085  *	netdev_features_change - device changes features
1086  *	@dev: device to cause notification
1087  *
1088  *	Called to indicate a device has changed features.
1089  */
1090 void netdev_features_change(struct net_device *dev)
1091 {
1092 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1093 }
1094 EXPORT_SYMBOL(netdev_features_change);
1095 
1096 /**
1097  *	netdev_state_change - device changes state
1098  *	@dev: device to cause notification
1099  *
1100  *	Called to indicate a device has changed state. This function calls
1101  *	the netdev_chain notifier chain and sends a NEWLINK message
1102  *	to the routing socket.
1103  */
1104 void netdev_state_change(struct net_device *dev)
1105 {
1106 	if (dev->flags & IFF_UP) {
1107 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1108 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1109 	}
1110 }
1111 EXPORT_SYMBOL(netdev_state_change);
1112 
1113 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1114 {
1115 	return call_netdevice_notifiers(event, dev);
1116 }
1117 EXPORT_SYMBOL(netdev_bonding_change);
1118 
1119 /**
1120  *	dev_load 	- load a network module
1121  *	@net: the applicable net namespace
1122  *	@name: name of interface
1123  *
1124  *	If a network interface is not present and the process has suitable
1125  *	privileges, this function loads the module. If module loading is not
1126  *	available in this kernel then it becomes a nop.
1127  */
1128 
1129 void dev_load(struct net *net, const char *name)
1130 {
1131 	struct net_device *dev;
1132 	int no_module;
1133 
1134 	rcu_read_lock();
1135 	dev = dev_get_by_name_rcu(net, name);
1136 	rcu_read_unlock();
1137 
1138 	no_module = !dev;
1139 	if (no_module && capable(CAP_NET_ADMIN))
1140 		no_module = request_module("netdev-%s", name);
1141 	if (no_module && capable(CAP_SYS_MODULE)) {
1142 		if (!request_module("%s", name))
1143 			pr_err("Loading kernel module for a network device "
1144 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1145 "instead\n", name);
1146 	}
1147 }
1148 EXPORT_SYMBOL(dev_load);
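
/*
 * A driver opts in to the CAP_NET_ADMIN path with a module alias, for
 * example (hypothetical module):
 *
 *	MODULE_ALIAS("netdev-mydev0");
 *
 * so dev_load(net, "mydev0") resolves via request_module("netdev-mydev0").
 */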
1149 
1150 static int __dev_open(struct net_device *dev)
1151 {
1152 	const struct net_device_ops *ops = dev->netdev_ops;
1153 	int ret;
1154 
1155 	ASSERT_RTNL();
1156 
1157 	if (!netif_device_present(dev))
1158 		return -ENODEV;
1159 
1160 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1161 	ret = notifier_to_errno(ret);
1162 	if (ret)
1163 		return ret;
1164 
1165 	set_bit(__LINK_STATE_START, &dev->state);
1166 
1167 	if (ops->ndo_validate_addr)
1168 		ret = ops->ndo_validate_addr(dev);
1169 
1170 	if (!ret && ops->ndo_open)
1171 		ret = ops->ndo_open(dev);
1172 
1173 	if (ret)
1174 		clear_bit(__LINK_STATE_START, &dev->state);
1175 	else {
1176 		dev->flags |= IFF_UP;
1177 		net_dmaengine_get();
1178 		dev_set_rx_mode(dev);
1179 		dev_activate(dev);
1180 	}
1181 
1182 	return ret;
1183 }
1184 
1185 /**
1186  *	dev_open	- prepare an interface for use.
1187  *	@dev:	device to open
1188  *
1189  *	Takes a device from down to up state. The device's private open
1190  *	function is invoked and then the multicast lists are loaded. Finally
1191  *	the device is moved into the up state and a %NETDEV_UP message is
1192  *	sent to the netdev notifier chain.
1193  *
1194  *	Calling this function on an active interface is a nop. On a failure
1195  *	a negative errno code is returned.
1196  */
1197 int dev_open(struct net_device *dev)
1198 {
1199 	int ret;
1200 
1201 	if (dev->flags & IFF_UP)
1202 		return 0;
1203 
1204 	ret = __dev_open(dev);
1205 	if (ret < 0)
1206 		return ret;
1207 
1208 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1209 	call_netdevice_notifiers(NETDEV_UP, dev);
1210 
1211 	return ret;
1212 }
1213 EXPORT_SYMBOL(dev_open);
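
/*
 * A minimal usage sketch (hypothetical caller): dev_open() must run
 * under the RTNL lock, as asserted in __dev_open():
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */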
1214 
1215 static int __dev_close_many(struct list_head *head)
1216 {
1217 	struct net_device *dev;
1218 
1219 	ASSERT_RTNL();
1220 	might_sleep();
1221 
1222 	list_for_each_entry(dev, head, unreg_list) {
1223 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1224 
1225 		clear_bit(__LINK_STATE_START, &dev->state);
1226 
1227 		/* Synchronize to scheduled poll. We cannot touch the poll list, it
1228 		 * can even be on a different cpu. So just clear netif_running().
1229 		 *
1230 		 * dev->stop() will invoke napi_disable() on all of its
1231 		 * napi_struct instances on this device.
1232 		 */
1233 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1234 	}
1235 
1236 	dev_deactivate_many(head);
1237 
1238 	list_for_each_entry(dev, head, unreg_list) {
1239 		const struct net_device_ops *ops = dev->netdev_ops;
1240 
1241 		/*
1242 		 *	Call the device specific close. This cannot fail
1243 		 *	and is only done if the device is UP.
1244 		 *
1245 		 *	We allow it to be called even after a DETACH hot-plug
1246 		 *	event.
1247 		 */
1248 		if (ops->ndo_stop)
1249 			ops->ndo_stop(dev);
1250 
1251 		dev->flags &= ~IFF_UP;
1252 		net_dmaengine_put();
1253 	}
1254 
1255 	return 0;
1256 }
1257 
1258 static int __dev_close(struct net_device *dev)
1259 {
1260 	int retval;
1261 	LIST_HEAD(single);
1262 
1263 	list_add(&dev->unreg_list, &single);
1264 	retval = __dev_close_many(&single);
1265 	list_del(&single);
1266 	return retval;
1267 }
1268 
1269 static int dev_close_many(struct list_head *head)
1270 {
1271 	struct net_device *dev, *tmp;
1272 	LIST_HEAD(tmp_list);
1273 
1274 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1275 		if (!(dev->flags & IFF_UP))
1276 			list_move(&dev->unreg_list, &tmp_list);
1277 
1278 	__dev_close_many(head);
1279 
1280 	list_for_each_entry(dev, head, unreg_list) {
1281 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1282 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1283 	}
1284 
1285 	/* rollback_registered_many needs the complete original list */
1286 	list_splice(&tmp_list, head);
1287 	return 0;
1288 }
1289 
1290 /**
1291  *	dev_close - shutdown an interface.
1292  *	@dev: device to shutdown
1293  *
1294  *	This function moves an active device into down state. A
1295  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1296  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1297  *	chain.
1298  */
1299 int dev_close(struct net_device *dev)
1300 {
1301 	if (dev->flags & IFF_UP) {
1302 		LIST_HEAD(single);
1303 
1304 		list_add(&dev->unreg_list, &single);
1305 		dev_close_many(&single);
1306 		list_del(&single);
1307 	}
1308 	return 0;
1309 }
1310 EXPORT_SYMBOL(dev_close);
1311 
1312 
1313 /**
1314  *	dev_disable_lro - disable Large Receive Offload on a device
1315  *	@dev: device
1316  *
1317  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1318  *	called under RTNL.  This is needed if received packets may be
1319  *	forwarded to another interface.
1320  */
1321 void dev_disable_lro(struct net_device *dev)
1322 {
1323 	u32 flags;
1324 
1325 	/*
1326 	 * If we're trying to disable lro on a vlan device
1327 	 * use the underlying physical device instead
1328 	 */
1329 	if (is_vlan_dev(dev))
1330 		dev = vlan_dev_real_dev(dev);
1331 
1332 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1333 		flags = dev->ethtool_ops->get_flags(dev);
1334 	else
1335 		flags = ethtool_op_get_flags(dev);
1336 
1337 	if (!(flags & ETH_FLAG_LRO))
1338 		return;
1339 
1340 	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1341 	if (unlikely(dev->features & NETIF_F_LRO))
1342 		netdev_WARN(dev, "failed to disable LRO!\n");
1343 }
1344 EXPORT_SYMBOL(dev_disable_lro);
1345 
1346 
1347 static int dev_boot_phase = 1;
1348 
1349 /**
1350  *	register_netdevice_notifier - register a network notifier block
1351  *	@nb: notifier
1352  *
1353  *	Register a notifier to be called when network device events occur.
1354  *	The notifier passed is linked into the kernel structures and must
1355  *	not be reused until it has been unregistered. A negative errno code
1356  *	is returned on a failure.
1357  *
1358  * 	When registered, all registration and up events are replayed
1359  *	to the new notifier to allow the caller to have a race-free
1360  *	view of the network device list.
1361  */
1362 
1363 int register_netdevice_notifier(struct notifier_block *nb)
1364 {
1365 	struct net_device *dev;
1366 	struct net_device *last;
1367 	struct net *net;
1368 	int err;
1369 
1370 	rtnl_lock();
1371 	err = raw_notifier_chain_register(&netdev_chain, nb);
1372 	if (err)
1373 		goto unlock;
1374 	if (dev_boot_phase)
1375 		goto unlock;
1376 	for_each_net(net) {
1377 		for_each_netdev(net, dev) {
1378 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1379 			err = notifier_to_errno(err);
1380 			if (err)
1381 				goto rollback;
1382 
1383 			if (!(dev->flags & IFF_UP))
1384 				continue;
1385 
1386 			nb->notifier_call(nb, NETDEV_UP, dev);
1387 		}
1388 	}
1389 
1390 unlock:
1391 	rtnl_unlock();
1392 	return err;
1393 
1394 rollback:
1395 	last = dev;
1396 	for_each_net(net) {
1397 		for_each_netdev(net, dev) {
1398 			if (dev == last)
1399 				goto outroll;
1400 
1401 			if (dev->flags & IFF_UP) {
1402 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1403 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1404 			}
1405 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1406 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1407 		}
1408 	}
1409 
1410 outroll:
1411 	raw_notifier_chain_unregister(&netdev_chain, nb);
1412 	goto unlock;
1413 }
1414 EXPORT_SYMBOL(register_netdevice_notifier);
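
/*
 * A minimal notifier sketch (hypothetical names, not part of this
 * file); in this kernel the void *ptr argument is the net_device itself:
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 */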
1415 
1416 /**
1417  *	unregister_netdevice_notifier - unregister a network notifier block
1418  *	@nb: notifier
1419  *
1420  *	Unregister a notifier previously registered by
1421  *	register_netdevice_notifier(). The notifier is unlinked from the
1422  *	kernel structures and may then be reused. A negative errno code
1423  *	is returned on a failure.
1424  */
1425 
1426 int unregister_netdevice_notifier(struct notifier_block *nb)
1427 {
1428 	int err;
1429 
1430 	rtnl_lock();
1431 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1432 	rtnl_unlock();
1433 	return err;
1434 }
1435 EXPORT_SYMBOL(unregister_netdevice_notifier);
1436 
1437 /**
1438  *	call_netdevice_notifiers - call all network notifier blocks
1439  *      @val: value passed unmodified to notifier function
1440  *      @dev: net_device pointer passed unmodified to notifier function
1441  *
1442  *	Call all network notifier blocks.  Parameters and return value
1443  *	are as for raw_notifier_call_chain().
1444  */
1445 
1446 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1447 {
1448 	ASSERT_RTNL();
1449 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1450 }
1451 EXPORT_SYMBOL(call_netdevice_notifiers);
1452 
1453 /* When > 0 there are consumers of rx skb time stamps */
1454 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1455 
1456 void net_enable_timestamp(void)
1457 {
1458 	atomic_inc(&netstamp_needed);
1459 }
1460 EXPORT_SYMBOL(net_enable_timestamp);
1461 
1462 void net_disable_timestamp(void)
1463 {
1464 	atomic_dec(&netstamp_needed);
1465 }
1466 EXPORT_SYMBOL(net_disable_timestamp);
1467 
1468 static inline void net_timestamp_set(struct sk_buff *skb)
1469 {
1470 	if (atomic_read(&netstamp_needed))
1471 		__net_timestamp(skb);
1472 	else
1473 		skb->tstamp.tv64 = 0;
1474 }
1475 
1476 static inline void net_timestamp_check(struct sk_buff *skb)
1477 {
1478 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1479 		__net_timestamp(skb);
1480 }
1481 
1482 static int net_hwtstamp_validate(struct ifreq *ifr)
1483 {
1484 	struct hwtstamp_config cfg;
1485 	enum hwtstamp_tx_types tx_type;
1486 	enum hwtstamp_rx_filters rx_filter;
1487 	int tx_type_valid = 0;
1488 	int rx_filter_valid = 0;
1489 
1490 	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1491 		return -EFAULT;
1492 
1493 	if (cfg.flags) /* reserved for future extensions */
1494 		return -EINVAL;
1495 
1496 	tx_type = cfg.tx_type;
1497 	rx_filter = cfg.rx_filter;
1498 
1499 	switch (tx_type) {
1500 	case HWTSTAMP_TX_OFF:
1501 	case HWTSTAMP_TX_ON:
1502 	case HWTSTAMP_TX_ONESTEP_SYNC:
1503 		tx_type_valid = 1;
1504 		break;
1505 	}
1506 
1507 	switch (rx_filter) {
1508 	case HWTSTAMP_FILTER_NONE:
1509 	case HWTSTAMP_FILTER_ALL:
1510 	case HWTSTAMP_FILTER_SOME:
1511 	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1512 	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1513 	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1514 	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1515 	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1516 	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1517 	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1518 	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1519 	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1520 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
1521 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
1522 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1523 		rx_filter_valid = 1;
1524 		break;
1525 	}
1526 
1527 	if (!tx_type_valid || !rx_filter_valid)
1528 		return -ERANGE;
1529 
1530 	return 0;
1531 }
1532 
1533 static inline bool is_skb_forwardable(struct net_device *dev,
1534 				      struct sk_buff *skb)
1535 {
1536 	unsigned int len;
1537 
1538 	if (!(dev->flags & IFF_UP))
1539 		return false;
1540 
1541 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1542 	if (skb->len <= len)
1543 		return true;
1544 
1545 	/* if the skb is GSO, we don't care about the length, as the packet
1546 	 * will be segmented to fit before it is actually transmitted
1547 	 */
1548 	if (skb_is_gso(skb))
1549 		return true;
1550 
1551 	return false;
1552 }
1553 
1554 /**
1555  * dev_forward_skb - loopback an skb to another netif
1556  *
1557  * @dev: destination network device
1558  * @skb: buffer to forward
1559  *
1560  * return values:
1561  *	NET_RX_SUCCESS	(no congestion)
1562  *	NET_RX_DROP     (packet was dropped, but freed)
1563  *
1564  * dev_forward_skb can be used for injecting an skb from the
1565  * start_xmit function of one device into the receive queue
1566  * of another device.
1567  *
1568  * The receiving device may be in another namespace, so
1569  * we have to clear all information in the skb that could
1570  * impact namespace isolation.
1571  */
1572 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1573 {
1574 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1575 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1576 			atomic_long_inc(&dev->rx_dropped);
1577 			kfree_skb(skb);
1578 			return NET_RX_DROP;
1579 		}
1580 	}
1581 
1582 	skb_orphan(skb);
1583 	nf_reset(skb);
1584 
1585 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1586 		atomic_long_inc(&dev->rx_dropped);
1587 		kfree_skb(skb);
1588 		return NET_RX_DROP;
1589 	}
1590 	skb_set_dev(skb, dev);
1591 	skb->tstamp.tv64 = 0;
1592 	skb->pkt_type = PACKET_HOST;
1593 	skb->protocol = eth_type_trans(skb, dev);
1594 	return netif_rx(skb);
1595 }
1596 EXPORT_SYMBOL_GPL(dev_forward_skb);
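
/*
 * A minimal sketch of the pair-device pattern (hypothetical driver,
 * my_get_peer() is an assumed helper; cf. drivers/net/veth.c):
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);
 *
 *		dev_forward_skb(peer, skb);	// consumes skb even on drop
 *		return NETDEV_TX_OK;
 *	}
 */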
1597 
1598 static inline int deliver_skb(struct sk_buff *skb,
1599 			      struct packet_type *pt_prev,
1600 			      struct net_device *orig_dev)
1601 {
1602 	atomic_inc(&skb->users);
1603 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1604 }
1605 
1606 /*
1607  *	Support routine. Sends outgoing frames to any network
1608  *	taps currently in use.
1609  */
1610 
1611 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1612 {
1613 	struct packet_type *ptype;
1614 	struct sk_buff *skb2 = NULL;
1615 	struct packet_type *pt_prev = NULL;
1616 
1617 	rcu_read_lock();
1618 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1619 		/* Never send packets back to the socket
1620 		 * they originated from - MvS (miquels@drinkel.ow.org)
1621 		 */
1622 		if ((ptype->dev == dev || !ptype->dev) &&
1623 		    (ptype->af_packet_priv == NULL ||
1624 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1625 			if (pt_prev) {
1626 				deliver_skb(skb2, pt_prev, skb->dev);
1627 				pt_prev = ptype;
1628 				continue;
1629 			}
1630 
1631 			skb2 = skb_clone(skb, GFP_ATOMIC);
1632 			if (!skb2)
1633 				break;
1634 
1635 			net_timestamp_set(skb2);
1636 
1637 			/* The network header should be correctly
1638 			   set by the sender, so the check below is
1639 			   just protection against buggy protocols.
1640 			 */
1641 			skb_reset_mac_header(skb2);
1642 
1643 			if (skb_network_header(skb2) < skb2->data ||
1644 			    skb2->network_header > skb2->tail) {
1645 				if (net_ratelimit())
1646 					printk(KERN_CRIT "protocol %04x is "
1647 					       "buggy, dev %s\n",
1648 					       ntohs(skb2->protocol),
1649 					       dev->name);
1650 				skb_reset_network_header(skb2);
1651 			}
1652 
1653 			skb2->transport_header = skb2->network_header;
1654 			skb2->pkt_type = PACKET_OUTGOING;
1655 			pt_prev = ptype;
1656 		}
1657 	}
1658 	if (pt_prev)
1659 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1660 	rcu_read_unlock();
1661 }
1662 
1663 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1664  * @dev: Network device
1665  * @txq: number of queues available
1666  *
1667  * If real_num_tx_queues is changed the tc mappings may no longer be
1668  * valid. To resolve this verify that each tc mapping remains valid,
1669  * and if not, zero the mapping; with no priorities mapping to an
1670  * offset/count pair it will no longer be used. In the worst case, when
1671  * TC0 is invalid, nothing can be done, so priority mappings are
1672  * disabled entirely. It is expected that drivers will fix this mapping
1673  * if they can before calling netif_set_real_num_tx_queues.
1674  */
1675 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1676 {
1677 	int i;
1678 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1679 
1680 	/* If TC0 is invalidated disable TC mapping */
1681 	if (tc->offset + tc->count > txq) {
1682 		pr_warning("Number of in use tx queues changed "
1683 			   "invalidating tc mappings. Priority "
1684 			   "traffic classification disabled!\n");
1685 		dev->num_tc = 0;
1686 		return;
1687 	}
1688 
1689 	/* Invalidated prio to tc mappings set to TC0 */
1690 	for (i = 1; i < TC_BITMASK + 1; i++) {
1691 		int q = netdev_get_prio_tc_map(dev, i);
1692 
1693 		tc = &dev->tc_to_txq[q];
1694 		if (tc->offset + tc->count > txq) {
1695 			pr_warning("Number of in use tx queues "
1696 				   "changed. Priority %i to tc "
1697 				   "mapping %i is no longer valid "
1698 				   "setting map to 0\n",
1699 				   i, q);
1700 			netdev_set_prio_tc_map(dev, i, 0);
1701 		}
1702 	}
1703 }
1704 
1705 /*
1706  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1707  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1708  */
1709 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1710 {
1711 	int rc;
1712 
1713 	if (txq < 1 || txq > dev->num_tx_queues)
1714 		return -EINVAL;
1715 
1716 	if (dev->reg_state == NETREG_REGISTERED ||
1717 	    dev->reg_state == NETREG_UNREGISTERING) {
1718 		ASSERT_RTNL();
1719 
1720 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1721 						  txq);
1722 		if (rc)
1723 			return rc;
1724 
1725 		if (dev->num_tc)
1726 			netif_setup_tc(dev, txq);
1727 
1728 		if (txq < dev->real_num_tx_queues)
1729 			qdisc_reset_all_tx_gt(dev, txq);
1730 	}
1731 
1732 	dev->real_num_tx_queues = txq;
1733 	return 0;
1734 }
1735 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
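
/*
 * A typical driver sketch (hypothetical sizes): allocate the maximum
 * number of queues up front, then trim to what the hardware enabled:
 *
 *	dev = alloc_etherdev_mq(sizeof(struct my_priv), MY_MAX_TXQS);
 *	...
 *	err = netif_set_real_num_tx_queues(dev, hw_txqs);
 */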
1736 
1737 #ifdef CONFIG_RPS
1738 /**
1739  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1740  *	@dev: Network device
1741  *	@rxq: Actual number of RX queues
1742  *
1743  *	This must be called either with the rtnl_lock held or before
1744  *	registration of the net device.  Returns 0 on success, or a
1745  *	negative error code.  If called before registration, it always
1746  *	succeeds.
1747  */
1748 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1749 {
1750 	int rc;
1751 
1752 	if (rxq < 1 || rxq > dev->num_rx_queues)
1753 		return -EINVAL;
1754 
1755 	if (dev->reg_state == NETREG_REGISTERED) {
1756 		ASSERT_RTNL();
1757 
1758 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1759 						  rxq);
1760 		if (rc)
1761 			return rc;
1762 	}
1763 
1764 	dev->real_num_rx_queues = rxq;
1765 	return 0;
1766 }
1767 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1768 #endif
1769 
1770 static inline void __netif_reschedule(struct Qdisc *q)
1771 {
1772 	struct softnet_data *sd;
1773 	unsigned long flags;
1774 
1775 	local_irq_save(flags);
1776 	sd = &__get_cpu_var(softnet_data);
1777 	q->next_sched = NULL;
1778 	*sd->output_queue_tailp = q;
1779 	sd->output_queue_tailp = &q->next_sched;
1780 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1781 	local_irq_restore(flags);
1782 }
1783 
1784 void __netif_schedule(struct Qdisc *q)
1785 {
1786 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1787 		__netif_reschedule(q);
1788 }
1789 EXPORT_SYMBOL(__netif_schedule);
1790 
1791 void dev_kfree_skb_irq(struct sk_buff *skb)
1792 {
1793 	if (atomic_dec_and_test(&skb->users)) {
1794 		struct softnet_data *sd;
1795 		unsigned long flags;
1796 
1797 		local_irq_save(flags);
1798 		sd = &__get_cpu_var(softnet_data);
1799 		skb->next = sd->completion_queue;
1800 		sd->completion_queue = skb;
1801 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1802 		local_irq_restore(flags);
1803 	}
1804 }
1805 EXPORT_SYMBOL(dev_kfree_skb_irq);
1806 
1807 void dev_kfree_skb_any(struct sk_buff *skb)
1808 {
1809 	if (in_irq() || irqs_disabled())
1810 		dev_kfree_skb_irq(skb);
1811 	else
1812 		dev_kfree_skb(skb);
1813 }
1814 EXPORT_SYMBOL(dev_kfree_skb_any);
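
/*
 * For example (sketch), a TX-completion path shared between a driver's
 * interrupt handler and a process-context timeout can free buffers
 * unconditionally with:
 *
 *	dev_kfree_skb_any(skb);	// picks dev_kfree_skb_irq() when needed
 */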
1815 
1816 
1817 /**
1818  * netif_device_detach - mark device as removed
1819  * @dev: network device
1820  *
1821  * Mark device as removed from system and therefore no longer available.
1822  */
1823 void netif_device_detach(struct net_device *dev)
1824 {
1825 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1826 	    netif_running(dev)) {
1827 		netif_tx_stop_all_queues(dev);
1828 	}
1829 }
1830 EXPORT_SYMBOL(netif_device_detach);
1831 
1832 /**
1833  * netif_device_attach - mark device as attached
1834  * @dev: network device
1835  *
1836  * Mark device as attached to the system and restart it if needed.
1837  */
1838 void netif_device_attach(struct net_device *dev)
1839 {
1840 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1841 	    netif_running(dev)) {
1842 		netif_tx_wake_all_queues(dev);
1843 		__netdev_watchdog_up(dev);
1844 	}
1845 }
1846 EXPORT_SYMBOL(netif_device_attach);
1847 
1848 /**
1849  * skb_set_dev - assign a new device to a buffer
1850  * @skb: buffer for the new device
1851  * @dev: network device
1852  *
1853  * If an skb is already owned by a device, we have to reset
1854  * all data private to the namespace that device belongs to
1855  * before assigning it a new device.
1856  */
1857 #ifdef CONFIG_NET_NS
1858 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1859 {
1860 	skb_dst_drop(skb);
1861 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1862 		secpath_reset(skb);
1863 		nf_reset(skb);
1864 		skb_init_secmark(skb);
1865 		skb->mark = 0;
1866 		skb->priority = 0;
1867 		skb->nf_trace = 0;
1868 		skb->ipvs_property = 0;
1869 #ifdef CONFIG_NET_SCHED
1870 		skb->tc_index = 0;
1871 #endif
1872 	}
1873 	skb->dev = dev;
1874 }
1875 EXPORT_SYMBOL(skb_set_dev);
1876 #endif /* CONFIG_NET_NS */
1877 
1878 /*
1879  * Invalidate hardware checksum when packet is to be mangled, and
1880  * complete checksum manually on outgoing path.
1881  */
1882 int skb_checksum_help(struct sk_buff *skb)
1883 {
1884 	__wsum csum;
1885 	int ret = 0, offset;
1886 
1887 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1888 		goto out_set_summed;
1889 
1890 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1891 		/* Let GSO fix up the checksum. */
1892 		goto out_set_summed;
1893 	}
1894 
1895 	offset = skb_checksum_start_offset(skb);
1896 	BUG_ON(offset >= skb_headlen(skb));
1897 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1898 
1899 	offset += skb->csum_offset;
1900 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1901 
1902 	if (skb_cloned(skb) &&
1903 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1904 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1905 		if (ret)
1906 			goto out;
1907 	}
1908 
1909 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1910 out_set_summed:
1911 	skb->ip_summed = CHECKSUM_NONE;
1912 out:
1913 	return ret;
1914 }
1915 EXPORT_SYMBOL(skb_checksum_help);
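
/*
 * Example (editor's sketch): a driver whose hardware cannot checksum a
 * particular packet can fall back to skb_checksum_help() before handing
 * the frame to the NIC.  my_hw_can_csum() and my_queue_for_dma() are
 * hypothetical.
 */
#if 0	/* illustrative only */
static netdev_tx_t my_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !my_hw_can_csum(skb)) {
		if (skb_checksum_help(skb))
			goto drop;
	}
	return my_queue_for_dma(skb, dev);	/* hypothetical */
drop:
	dev_kfree_skb_any(skb);
	return NETDEV_TX_OK;
}
#endif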
1916 
1917 /**
1918  *	skb_gso_segment - Perform segmentation on skb.
1919  *	@skb: buffer to segment
1920  *	@features: features for the output path (see dev->features)
1921  *
1922  *	This function segments the given skb and returns a list of segments.
1923  *
1924  *	It may return NULL if the skb requires no segmentation.  This is
1925  *	only possible when GSO is used for verifying header integrity.
1926  */
1927 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1928 {
1929 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1930 	struct packet_type *ptype;
1931 	__be16 type = skb->protocol;
1932 	int vlan_depth = ETH_HLEN;
1933 	int err;
1934 
1935 	while (type == htons(ETH_P_8021Q)) {
1936 		struct vlan_hdr *vh;
1937 
1938 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1939 			return ERR_PTR(-EINVAL);
1940 
1941 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1942 		type = vh->h_vlan_encapsulated_proto;
1943 		vlan_depth += VLAN_HLEN;
1944 	}
1945 
1946 	skb_reset_mac_header(skb);
1947 	skb->mac_len = skb->network_header - skb->mac_header;
1948 	__skb_pull(skb, skb->mac_len);
1949 
1950 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1951 		struct net_device *dev = skb->dev;
1952 		struct ethtool_drvinfo info = {};
1953 
1954 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1955 			dev->ethtool_ops->get_drvinfo(dev, &info);
1956 
1957 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1958 		     info.driver, dev ? dev->features : 0L,
1959 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1960 		     skb->len, skb->data_len, skb->ip_summed);
1961 
1962 		if (skb_header_cloned(skb) &&
1963 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1964 			return ERR_PTR(err);
1965 	}
1966 
1967 	rcu_read_lock();
1968 	list_for_each_entry_rcu(ptype,
1969 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1970 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1971 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1972 				err = ptype->gso_send_check(skb);
1973 				segs = ERR_PTR(err);
1974 				if (err || skb_gso_ok(skb, features))
1975 					break;
1976 				__skb_push(skb, (skb->data -
1977 						 skb_network_header(skb)));
1978 			}
1979 			segs = ptype->gso_segment(skb, features);
1980 			break;
1981 		}
1982 	}
1983 	rcu_read_unlock();
1984 
1985 	__skb_push(skb, skb->data - skb_mac_header(skb));
1986 
1987 	return segs;
1988 }
1989 EXPORT_SYMBOL(skb_gso_segment);
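
/*
 * Example (editor's sketch): a caller segmenting by hand walks the list
 * returned in segs and owns each segment; compare dev_gso_segment()
 * below, which keeps the original skb as a carrier instead.
 * my_xmit_one() is hypothetical.
 */
#if 0	/* illustrative only */
static int my_xmit_gso(struct sk_buff *skb, u32 features)
{
	struct sk_buff *segs, *nskb;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)			/* header verification only */
		return my_xmit_one(skb);

	consume_skb(skb);		/* original no longer needed */
	while (segs) {
		nskb = segs;
		segs = segs->next;
		nskb->next = NULL;
		my_xmit_one(nskb);	/* hypothetical */
	}
	return 0;
}
#endif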
1990 
1991 /* Take action when hardware reception checksum errors are detected. */
1992 #ifdef CONFIG_BUG
1993 void netdev_rx_csum_fault(struct net_device *dev)
1994 {
1995 	if (net_ratelimit()) {
1996 		printk(KERN_ERR "%s: hw csum failure.\n",
1997 			dev ? dev->name : "<unknown>");
1998 		dump_stack();
1999 	}
2000 }
2001 EXPORT_SYMBOL(netdev_rx_csum_fault);
2002 #endif
2003 
2004 /* Actually, we should eliminate this check as soon as we know that:
2005  * 1. an IOMMU is present and allows mapping all the memory.
2006  * 2. No high memory really exists on this machine.
2007  */
2008 
2009 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2010 {
2011 #ifdef CONFIG_HIGHMEM
2012 	int i;
2013 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2014 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2015 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2016 			if (PageHighMem(skb_frag_page(frag)))
2017 				return 1;
2018 		}
2019 	}
2020 
2021 	if (PCI_DMA_BUS_IS_PHYS) {
2022 		struct device *pdev = dev->dev.parent;
2023 
2024 		if (!pdev)
2025 			return 0;
2026 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2027 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2028 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2029 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2030 				return 1;
2031 		}
2032 	}
2033 #endif
2034 	return 0;
2035 }
2036 
2037 struct dev_gso_cb {
2038 	void (*destructor)(struct sk_buff *skb);
2039 };
2040 
2041 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2042 
2043 static void dev_gso_skb_destructor(struct sk_buff *skb)
2044 {
2045 	struct dev_gso_cb *cb;
2046 
2047 	do {
2048 		struct sk_buff *nskb = skb->next;
2049 
2050 		skb->next = nskb->next;
2051 		nskb->next = NULL;
2052 		kfree_skb(nskb);
2053 	} while (skb->next);
2054 
2055 	cb = DEV_GSO_CB(skb);
2056 	if (cb->destructor)
2057 		cb->destructor(skb);
2058 }
2059 
2060 /**
2061  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2062  *	@skb: buffer to segment
2063  *	@features: device features as applicable to this skb
2064  *
2065  *	This function segments the given skb and stores the list of segments
2066  *	in skb->next.
2067  */
2068 static int dev_gso_segment(struct sk_buff *skb, int features)
2069 {
2070 	struct sk_buff *segs;
2071 
2072 	segs = skb_gso_segment(skb, features);
2073 
2074 	/* Verifying header integrity only. */
2075 	if (!segs)
2076 		return 0;
2077 
2078 	if (IS_ERR(segs))
2079 		return PTR_ERR(segs);
2080 
2081 	skb->next = segs;
2082 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2083 	skb->destructor = dev_gso_skb_destructor;
2084 
2085 	return 0;
2086 }
2087 
2088 /*
2089  * Try to orphan skb early, right before transmission by the device.
2090  * We cannot orphan the skb if a tx timestamp is requested or the sk
2091  * reference is needed at driver level for other reasons, e.g. see net/can/raw.c
2092  */
2093 static inline void skb_orphan_try(struct sk_buff *skb)
2094 {
2095 	struct sock *sk = skb->sk;
2096 
2097 	if (sk && !skb_shinfo(skb)->tx_flags) {
2098 		/* skb_tx_hash() won't be able to get sk,
2099 		 * so we copy sk_hash into skb->rxhash.
2100 		 */
2101 		if (!skb->rxhash)
2102 			skb->rxhash = sk->sk_hash;
2103 		skb_orphan(skb);
2104 	}
2105 }
2106 
2107 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2108 {
2109 	return ((features & NETIF_F_GEN_CSUM) ||
2110 		((features & NETIF_F_V4_CSUM) &&
2111 		 protocol == htons(ETH_P_IP)) ||
2112 		((features & NETIF_F_V6_CSUM) &&
2113 		 protocol == htons(ETH_P_IPV6)) ||
2114 		((features & NETIF_F_FCOE_CRC) &&
2115 		 protocol == htons(ETH_P_FCOE)));
2116 }
2117 
2118 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2119 {
2120 	if (!can_checksum_protocol(features, protocol)) {
2121 		features &= ~NETIF_F_ALL_CSUM;
2122 		features &= ~NETIF_F_SG;
2123 	} else if (illegal_highdma(skb->dev, skb)) {
2124 		features &= ~NETIF_F_SG;
2125 	}
2126 
2127 	return features;
2128 }
2129 
2130 u32 netif_skb_features(struct sk_buff *skb)
2131 {
2132 	__be16 protocol = skb->protocol;
2133 	u32 features = skb->dev->features;
2134 
2135 	if (protocol == htons(ETH_P_8021Q)) {
2136 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2137 		protocol = veh->h_vlan_encapsulated_proto;
2138 	} else if (!vlan_tx_tag_present(skb)) {
2139 		return harmonize_features(skb, protocol, features);
2140 	}
2141 
2142 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2143 
2144 	if (protocol != htons(ETH_P_8021Q)) {
2145 		return harmonize_features(skb, protocol, features);
2146 	} else {
2147 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2148 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2149 		return harmonize_features(skb, protocol, features);
2150 	}
2151 }
2152 EXPORT_SYMBOL(netif_skb_features);
2153 
2154 /*
2155  * Returns true if either:
2156  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2157  *	2. skb is fragmented and the device does not support SG, or if
2158  *	   at least one of the fragments is in highmem and the device
2159  *	   does not support DMA from it.
2160  */
2161 static inline int skb_needs_linearize(struct sk_buff *skb,
2162 				      int features)
2163 {
2164 	return skb_is_nonlinear(skb) &&
2165 			((skb_has_frag_list(skb) &&
2166 				!(features & NETIF_F_FRAGLIST)) ||
2167 			(skb_shinfo(skb)->nr_frags &&
2168 				!(features & NETIF_F_SG)));
2169 }
2170 
2171 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2172 			struct netdev_queue *txq)
2173 {
2174 	const struct net_device_ops *ops = dev->netdev_ops;
2175 	int rc = NETDEV_TX_OK;
2176 	unsigned int skb_len;
2177 
2178 	if (likely(!skb->next)) {
2179 		u32 features;
2180 
2181 		/*
2182 		 * If the device doesn't need skb->dst, release it right now
2183 		 * while it's hot in this CPU's cache.
2184 		 */
2185 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2186 			skb_dst_drop(skb);
2187 
2188 		if (!list_empty(&ptype_all))
2189 			dev_queue_xmit_nit(skb, dev);
2190 
2191 		skb_orphan_try(skb);
2192 
2193 		features = netif_skb_features(skb);
2194 
2195 		if (vlan_tx_tag_present(skb) &&
2196 		    !(features & NETIF_F_HW_VLAN_TX)) {
2197 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2198 			if (unlikely(!skb))
2199 				goto out;
2200 
2201 			skb->vlan_tci = 0;
2202 		}
2203 
2204 		if (netif_needs_gso(skb, features)) {
2205 			if (unlikely(dev_gso_segment(skb, features)))
2206 				goto out_kfree_skb;
2207 			if (skb->next)
2208 				goto gso;
2209 		} else {
2210 			if (skb_needs_linearize(skb, features) &&
2211 			    __skb_linearize(skb))
2212 				goto out_kfree_skb;
2213 
2214 			/* If packet is not checksummed and device does not
2215 			 * support checksumming for this protocol, complete
2216 			 * checksumming here.
2217 			 */
2218 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2219 				skb_set_transport_header(skb,
2220 					skb_checksum_start_offset(skb));
2221 				if (!(features & NETIF_F_ALL_CSUM) &&
2222 				     skb_checksum_help(skb))
2223 					goto out_kfree_skb;
2224 			}
2225 		}
2226 
2227 		skb_len = skb->len;
2228 		rc = ops->ndo_start_xmit(skb, dev);
2229 		trace_net_dev_xmit(skb, rc, dev, skb_len);
2230 		if (rc == NETDEV_TX_OK)
2231 			txq_trans_update(txq);
2232 		return rc;
2233 	}
2234 
2235 gso:
2236 	do {
2237 		struct sk_buff *nskb = skb->next;
2238 
2239 		skb->next = nskb->next;
2240 		nskb->next = NULL;
2241 
2242 		/*
2243 		 * If the device doesn't need nskb->dst, release it right now
2244 		 * while it's hot in this CPU's cache.
2245 		 */
2246 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2247 			skb_dst_drop(nskb);
2248 
2249 		skb_len = nskb->len;
2250 		rc = ops->ndo_start_xmit(nskb, dev);
2251 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2252 		if (unlikely(rc != NETDEV_TX_OK)) {
2253 			if (rc & ~NETDEV_TX_MASK)
2254 				goto out_kfree_gso_skb;
2255 			nskb->next = skb->next;
2256 			skb->next = nskb;
2257 			return rc;
2258 		}
2259 		txq_trans_update(txq);
2260 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2261 			return NETDEV_TX_BUSY;
2262 	} while (skb->next);
2263 
2264 out_kfree_gso_skb:
2265 	if (likely(skb->next == NULL))
2266 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2267 out_kfree_skb:
2268 	kfree_skb(skb);
2269 out:
2270 	return rc;
2271 }
2272 
2273 static u32 hashrnd __read_mostly;
2274 
2275 /*
2276  * Returns a Tx hash based on the given packet descriptor and the number
2277  * of Tx queues to be used as a distribution range.
2278  */
2279 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2280 		  unsigned int num_tx_queues)
2281 {
2282 	u32 hash;
2283 	u16 qoffset = 0;
2284 	u16 qcount = num_tx_queues;
2285 
2286 	if (skb_rx_queue_recorded(skb)) {
2287 		hash = skb_get_rx_queue(skb);
2288 		while (unlikely(hash >= num_tx_queues))
2289 			hash -= num_tx_queues;
2290 		return hash;
2291 	}
2292 
2293 	if (dev->num_tc) {
2294 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2295 		qoffset = dev->tc_to_txq[tc].offset;
2296 		qcount = dev->tc_to_txq[tc].count;
2297 	}
2298 
2299 	if (skb->sk && skb->sk->sk_hash)
2300 		hash = skb->sk->sk_hash;
2301 	else
2302 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2303 	hash = jhash_1word(hash, hashrnd);
2304 
2305 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2306 }
2307 EXPORT_SYMBOL(__skb_tx_hash);
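
/*
 * Example (editor's sketch): a driver that reserves queue 0 for control
 * traffic might spread the rest with skb_tx_hash(), which distributes
 * over dev->real_num_tx_queues via __skb_tx_hash().  my_is_control()
 * is a hypothetical classifier.
 */
#if 0	/* illustrative only */
static u16 my_select_queue(struct net_device *dev, struct sk_buff *skb)
{
	if (my_is_control(skb))		/* hypothetical */
		return 0;
	return skb_tx_hash(dev, skb);
}
#endif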
2308 
2309 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2310 {
2311 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2312 		if (net_ratelimit()) {
2313 			pr_warning("%s selects TX queue %d, but "
2314 				"real number of TX queues is %d\n",
2315 				dev->name, queue_index, dev->real_num_tx_queues);
2316 		}
2317 		return 0;
2318 	}
2319 	return queue_index;
2320 }
2321 
2322 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2323 {
2324 #ifdef CONFIG_XPS
2325 	struct xps_dev_maps *dev_maps;
2326 	struct xps_map *map;
2327 	int queue_index = -1;
2328 
2329 	rcu_read_lock();
2330 	dev_maps = rcu_dereference(dev->xps_maps);
2331 	if (dev_maps) {
2332 		map = rcu_dereference(
2333 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2334 		if (map) {
2335 			if (map->len == 1)
2336 				queue_index = map->queues[0];
2337 			else {
2338 				u32 hash;
2339 				if (skb->sk && skb->sk->sk_hash)
2340 					hash = skb->sk->sk_hash;
2341 				else
2342 					hash = (__force u16) skb->protocol ^
2343 					    skb->rxhash;
2344 				hash = jhash_1word(hash, hashrnd);
2345 				queue_index = map->queues[
2346 				    ((u64)hash * map->len) >> 32];
2347 			}
2348 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2349 				queue_index = -1;
2350 		}
2351 	}
2352 	rcu_read_unlock();
2353 
2354 	return queue_index;
2355 #else
2356 	return -1;
2357 #endif
2358 }
2359 
2360 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2361 					struct sk_buff *skb)
2362 {
2363 	int queue_index;
2364 	const struct net_device_ops *ops = dev->netdev_ops;
2365 
2366 	if (dev->real_num_tx_queues == 1)
2367 		queue_index = 0;
2368 	else if (ops->ndo_select_queue) {
2369 		queue_index = ops->ndo_select_queue(dev, skb);
2370 		queue_index = dev_cap_txqueue(dev, queue_index);
2371 	} else {
2372 		struct sock *sk = skb->sk;
2373 		queue_index = sk_tx_queue_get(sk);
2374 
2375 		if (queue_index < 0 || skb->ooo_okay ||
2376 		    queue_index >= dev->real_num_tx_queues) {
2377 			int old_index = queue_index;
2378 
2379 			queue_index = get_xps_queue(dev, skb);
2380 			if (queue_index < 0)
2381 				queue_index = skb_tx_hash(dev, skb);
2382 
2383 			if (queue_index != old_index && sk) {
2384 				struct dst_entry *dst =
2385 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2386 
2387 				if (dst && skb_dst(skb) == dst)
2388 					sk_tx_queue_set(sk, queue_index);
2389 			}
2390 		}
2391 	}
2392 
2393 	skb_set_queue_mapping(skb, queue_index);
2394 	return netdev_get_tx_queue(dev, queue_index);
2395 }
2396 
2397 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2398 				 struct net_device *dev,
2399 				 struct netdev_queue *txq)
2400 {
2401 	spinlock_t *root_lock = qdisc_lock(q);
2402 	bool contended;
2403 	int rc;
2404 
2405 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2406 	qdisc_calculate_pkt_len(skb, q);
2407 	/*
2408 	 * Heuristic to force contended enqueues to serialize on a
2409 	 * separate lock before trying to get qdisc main lock.
2410 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2411 	 * and dequeue packets faster.
2412 	 */
2413 	contended = qdisc_is_running(q);
2414 	if (unlikely(contended))
2415 		spin_lock(&q->busylock);
2416 
2417 	spin_lock(root_lock);
2418 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2419 		kfree_skb(skb);
2420 		rc = NET_XMIT_DROP;
2421 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2422 		   qdisc_run_begin(q)) {
2423 		/*
2424 		 * This is a work-conserving queue; there are no old skbs
2425 		 * waiting to be sent out; and the qdisc is not running -
2426 		 * xmit the skb directly.
2427 		 */
2428 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2429 			skb_dst_force(skb);
2430 
2431 		qdisc_bstats_update(q, skb);
2432 
2433 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2434 			if (unlikely(contended)) {
2435 				spin_unlock(&q->busylock);
2436 				contended = false;
2437 			}
2438 			__qdisc_run(q);
2439 		} else
2440 			qdisc_run_end(q);
2441 
2442 		rc = NET_XMIT_SUCCESS;
2443 	} else {
2444 		skb_dst_force(skb);
2445 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2446 		if (qdisc_run_begin(q)) {
2447 			if (unlikely(contended)) {
2448 				spin_unlock(&q->busylock);
2449 				contended = false;
2450 			}
2451 			__qdisc_run(q);
2452 		}
2453 	}
2454 	spin_unlock(root_lock);
2455 	if (unlikely(contended))
2456 		spin_unlock(&q->busylock);
2457 	return rc;
2458 }
2459 
2460 static DEFINE_PER_CPU(int, xmit_recursion);
2461 #define RECURSION_LIMIT 10
2462 
2463 /**
2464  *	dev_queue_xmit - transmit a buffer
2465  *	@skb: buffer to transmit
2466  *
2467  *	Queue a buffer for transmission to a network device. The caller must
2468  *	have set the device and priority and built the buffer before calling
2469  *	this function. The function can be called from an interrupt.
2470  *
2471  *	A negative errno code is returned on a failure. A success does not
2472  *	guarantee the frame will be transmitted as it may be dropped due
2473  *	to congestion or traffic shaping.
2474  *
2475  * -----------------------------------------------------------------------------------
2476  *      I notice this method can also return errors from the queue disciplines,
2477  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2478  *      be positive.
2479  *
2480  *      Regardless of the return value, the skb is consumed, so it is currently
2481  *      difficult to retry a send to this method.  (You can bump the ref count
2482  *      before sending to hold a reference for retry if you are careful.)
2483  *
2484  *      When calling this method, interrupts MUST be enabled.  This is because
2485  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2486  *          --BLG
2487  */
2488 int dev_queue_xmit(struct sk_buff *skb)
2489 {
2490 	struct net_device *dev = skb->dev;
2491 	struct netdev_queue *txq;
2492 	struct Qdisc *q;
2493 	int rc = -ENOMEM;
2494 
2495 	/* Disable soft irqs for various locks below. Also
2496 	 * stops preemption for RCU.
2497 	 */
2498 	rcu_read_lock_bh();
2499 
2500 	txq = dev_pick_tx(dev, skb);
2501 	q = rcu_dereference_bh(txq->qdisc);
2502 
2503 #ifdef CONFIG_NET_CLS_ACT
2504 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2505 #endif
2506 	trace_net_dev_queue(skb);
2507 	if (q->enqueue) {
2508 		rc = __dev_xmit_skb(skb, q, dev, txq);
2509 		goto out;
2510 	}
2511 
2512 	/* The device has no queue. Common case for software devices:
2513 	   loopback, all sorts of tunnels...
2514 
2515 	   Really, it is unlikely that netif_tx_lock protection is necessary
2516 	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2517 	   counters.)
2518 	   However, it is possible that they rely on the protection
2519 	   we provide here.
2520 
2521 	   Check this and shoot the lock. It is not prone to deadlocks.
2522 	   Or shoot the noqueue qdisc instead; it is even simpler 8)
2523 	 */
2524 	if (dev->flags & IFF_UP) {
2525 		int cpu = smp_processor_id(); /* ok because BHs are off */
2526 
2527 		if (txq->xmit_lock_owner != cpu) {
2528 
2529 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2530 				goto recursion_alert;
2531 
2532 			HARD_TX_LOCK(dev, txq, cpu);
2533 
2534 			if (!netif_tx_queue_stopped(txq)) {
2535 				__this_cpu_inc(xmit_recursion);
2536 				rc = dev_hard_start_xmit(skb, dev, txq);
2537 				__this_cpu_dec(xmit_recursion);
2538 				if (dev_xmit_complete(rc)) {
2539 					HARD_TX_UNLOCK(dev, txq);
2540 					goto out;
2541 				}
2542 			}
2543 			HARD_TX_UNLOCK(dev, txq);
2544 			if (net_ratelimit())
2545 				printk(KERN_CRIT "Virtual device %s asks to "
2546 				       "queue packet!\n", dev->name);
2547 		} else {
2548 			/* Recursion is detected! It is possible,
2549 			 * unfortunately.
2550 			 */
2551 recursion_alert:
2552 			if (net_ratelimit())
2553 				printk(KERN_CRIT "Dead loop on virtual device "
2554 				       "%s, fix it urgently!\n", dev->name);
2555 		}
2556 	}
2557 
2558 	rc = -ENETDOWN;
2559 	rcu_read_unlock_bh();
2560 
2561 	kfree_skb(skb);
2562 	return rc;
2563 out:
2564 	rcu_read_unlock_bh();
2565 	return rc;
2566 }
2567 EXPORT_SYMBOL(dev_queue_xmit);
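
/*
 * Example (editor's sketch): an in-kernel sender sets the device and
 * priority, builds the link-layer header, and hands the skb over; the
 * skb is consumed whatever dev_queue_xmit() returns.  ETH_P_802_EX1 is
 * only a placeholder ethertype here.
 */
#if 0	/* illustrative only */
static int my_send_frame(struct net_device *dev, const void *payload, int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;
	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), payload, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_802_EX1);
	skb->priority = 0;
	if (dev_hard_header(skb, dev, ETH_P_802_EX1, dev->broadcast,
			    NULL, len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}
	/* May also return positive NET_XMIT_* codes from the qdisc. */
	return dev_queue_xmit(skb);
}
#endif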
2568 
2569 
2570 /*=======================================================================
2571 			Receiver routines
2572   =======================================================================*/
2573 
2574 int netdev_max_backlog __read_mostly = 1000;
2575 int netdev_tstamp_prequeue __read_mostly = 1;
2576 int netdev_budget __read_mostly = 300;
2577 int weight_p __read_mostly = 64;            /* old backlog weight */
2578 
2579 /* Called with irq disabled */
2580 static inline void ____napi_schedule(struct softnet_data *sd,
2581 				     struct napi_struct *napi)
2582 {
2583 	list_add_tail(&napi->poll_list, &sd->poll_list);
2584 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2585 }
2586 
2587 /*
2588  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2589  * and src/dst port numbers.  Sets rxhash in skb to a non-zero hash value
2590  * on success; zero indicates no valid hash.  Also sets l4_rxhash in skb
2591  * if the hash is a canonical 4-tuple hash over transport ports.
2592  */
2593 void __skb_get_rxhash(struct sk_buff *skb)
2594 {
2595 	int nhoff, hash = 0, poff;
2596 	const struct ipv6hdr *ip6;
2597 	const struct iphdr *ip;
2598 	const struct vlan_hdr *vlan;
2599 	u8 ip_proto;
2600 	u32 addr1, addr2;
2601 	u16 proto;
2602 	union {
2603 		u32 v32;
2604 		u16 v16[2];
2605 	} ports;
2606 
2607 	nhoff = skb_network_offset(skb);
2608 	proto = skb->protocol;
2609 
2610 again:
2611 	switch (proto) {
2612 	case __constant_htons(ETH_P_IP):
2613 ip:
2614 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2615 			goto done;
2616 
2617 		ip = (const struct iphdr *) (skb->data + nhoff);
2618 		if (ip_is_fragment(ip))
2619 			ip_proto = 0;
2620 		else
2621 			ip_proto = ip->protocol;
2622 		addr1 = (__force u32) ip->saddr;
2623 		addr2 = (__force u32) ip->daddr;
2624 		nhoff += ip->ihl * 4;
2625 		break;
2626 	case __constant_htons(ETH_P_IPV6):
2627 ipv6:
2628 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2629 			goto done;
2630 
2631 		ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2632 		ip_proto = ip6->nexthdr;
2633 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2634 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2635 		nhoff += 40;
2636 		break;
2637 	case __constant_htons(ETH_P_8021Q):
2638 		if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
2639 			goto done;
2640 		vlan = (const struct vlan_hdr *) (skb->data + nhoff);
2641 		proto = vlan->h_vlan_encapsulated_proto;
2642 		nhoff += sizeof(*vlan);
2643 		goto again;
2644 	case __constant_htons(ETH_P_PPP_SES):
2645 		if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
2646 			goto done;
2647 		proto = *((__be16 *) (skb->data + nhoff +
2648 				      sizeof(struct pppoe_hdr)));
2649 		nhoff += PPPOE_SES_HLEN;
2650 		switch (proto) {
2651 		case __constant_htons(PPP_IP):
2652 			goto ip;
2653 		case __constant_htons(PPP_IPV6):
2654 			goto ipv6;
2655 		default:
2656 			goto done;
2657 		}
2658 	default:
2659 		goto done;
2660 	}
2661 
2662 	switch (ip_proto) {
2663 	case IPPROTO_GRE:
2664 		if (pskb_may_pull(skb, nhoff + 16)) {
2665 			u8 *h = skb->data + nhoff;
2666 			__be16 flags = *(__be16 *)h;
2667 
2668 			/*
2669 			 * Only look inside GRE if the version is zero
2670 			 * and there is no routing.
2671 			 */
2672 			if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
2673 				proto = *(__be16 *)(h + 2);
2674 				nhoff += 4;
2675 				if (flags & GRE_CSUM)
2676 					nhoff += 4;
2677 				if (flags & GRE_KEY)
2678 					nhoff += 4;
2679 				if (flags & GRE_SEQ)
2680 					nhoff += 4;
2681 				goto again;
2682 			}
2683 		}
2684 		break;
2685 	case IPPROTO_IPIP:
2686 		goto again;
2687 	default:
2688 		break;
2689 	}
2690 
2691 	ports.v32 = 0;
2692 	poff = proto_ports_offset(ip_proto);
2693 	if (poff >= 0) {
2694 		nhoff += poff;
2695 		if (pskb_may_pull(skb, nhoff + 4)) {
2696 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2697 			if (ports.v16[1] < ports.v16[0])
2698 				swap(ports.v16[0], ports.v16[1]);
2699 			skb->l4_rxhash = 1;
2700 		}
2701 	}
2702 
2703 	/* get a consistent hash (same value on both flow directions) */
2704 	if (addr2 < addr1)
2705 		swap(addr1, addr2);
2706 
2707 	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2708 	if (!hash)
2709 		hash = 1;
2710 
2711 done:
2712 	skb->rxhash = hash;
2713 }
2714 EXPORT_SYMBOL(__skb_get_rxhash);
2715 
2716 #ifdef CONFIG_RPS
2717 
2718 /* One global table that all flow-based protocols share. */
2719 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2720 EXPORT_SYMBOL(rps_sock_flow_table);
2721 
2722 static struct rps_dev_flow *
2723 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2724 	    struct rps_dev_flow *rflow, u16 next_cpu)
2725 {
2726 	if (next_cpu != RPS_NO_CPU) {
2727 #ifdef CONFIG_RFS_ACCEL
2728 		struct netdev_rx_queue *rxqueue;
2729 		struct rps_dev_flow_table *flow_table;
2730 		struct rps_dev_flow *old_rflow;
2731 		u32 flow_id;
2732 		u16 rxq_index;
2733 		int rc;
2734 
2735 		/* Should we steer this flow to a different hardware queue? */
2736 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2737 		    !(dev->features & NETIF_F_NTUPLE))
2738 			goto out;
2739 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2740 		if (rxq_index == skb_get_rx_queue(skb))
2741 			goto out;
2742 
2743 		rxqueue = dev->_rx + rxq_index;
2744 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2745 		if (!flow_table)
2746 			goto out;
2747 		flow_id = skb->rxhash & flow_table->mask;
2748 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2749 							rxq_index, flow_id);
2750 		if (rc < 0)
2751 			goto out;
2752 		old_rflow = rflow;
2753 		rflow = &flow_table->flows[flow_id];
2754 		rflow->filter = rc;
2755 		if (old_rflow->filter == rflow->filter)
2756 			old_rflow->filter = RPS_NO_FILTER;
2757 	out:
2758 #endif
2759 		rflow->last_qtail =
2760 			per_cpu(softnet_data, next_cpu).input_queue_head;
2761 	}
2762 
2763 	rflow->cpu = next_cpu;
2764 	return rflow;
2765 }
2766 
2767 /*
2768  * get_rps_cpu is called from netif_receive_skb and returns the target
2769  * CPU from the RPS map of the receiving queue for a given skb.
2770  * rcu_read_lock must be held on entry.
2771  */
2772 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2773 		       struct rps_dev_flow **rflowp)
2774 {
2775 	struct netdev_rx_queue *rxqueue;
2776 	struct rps_map *map;
2777 	struct rps_dev_flow_table *flow_table;
2778 	struct rps_sock_flow_table *sock_flow_table;
2779 	int cpu = -1;
2780 	u16 tcpu;
2781 
2782 	if (skb_rx_queue_recorded(skb)) {
2783 		u16 index = skb_get_rx_queue(skb);
2784 		if (unlikely(index >= dev->real_num_rx_queues)) {
2785 			WARN_ONCE(dev->real_num_rx_queues > 1,
2786 				  "%s received packet on queue %u, but number "
2787 				  "of RX queues is %u\n",
2788 				  dev->name, index, dev->real_num_rx_queues);
2789 			goto done;
2790 		}
2791 		rxqueue = dev->_rx + index;
2792 	} else
2793 		rxqueue = dev->_rx;
2794 
2795 	map = rcu_dereference(rxqueue->rps_map);
2796 	if (map) {
2797 		if (map->len == 1 &&
2798 		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2799 			tcpu = map->cpus[0];
2800 			if (cpu_online(tcpu))
2801 				cpu = tcpu;
2802 			goto done;
2803 		}
2804 	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2805 		goto done;
2806 	}
2807 
2808 	skb_reset_network_header(skb);
2809 	if (!skb_get_rxhash(skb))
2810 		goto done;
2811 
2812 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2813 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2814 	if (flow_table && sock_flow_table) {
2815 		u16 next_cpu;
2816 		struct rps_dev_flow *rflow;
2817 
2818 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2819 		tcpu = rflow->cpu;
2820 
2821 		next_cpu = sock_flow_table->ents[skb->rxhash &
2822 		    sock_flow_table->mask];
2823 
2824 		/*
2825 		 * If the desired CPU (where last recvmsg was done) is
2826 		 * different from current CPU (one in the rx-queue flow
2827 		 * table entry), switch if one of the following holds:
2828 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2829 		 *   - Current CPU is offline.
2830 		 *   - The current CPU's queue tail has advanced beyond the
2831 		 *     last packet that was enqueued using this table entry.
2832 		 *     This guarantees that all previous packets for the flow
2833 		 *     have been dequeued, thus preserving in order delivery.
2834 		 */
2835 		if (unlikely(tcpu != next_cpu) &&
2836 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2837 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2838 		      rflow->last_qtail)) >= 0))
2839 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2840 
2841 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2842 			*rflowp = rflow;
2843 			cpu = tcpu;
2844 			goto done;
2845 		}
2846 	}
2847 
2848 	if (map) {
2849 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2850 
2851 		if (cpu_online(tcpu)) {
2852 			cpu = tcpu;
2853 			goto done;
2854 		}
2855 	}
2856 
2857 done:
2858 	return cpu;
2859 }
2860 
2861 #ifdef CONFIG_RFS_ACCEL
2862 
2863 /**
2864  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2865  * @dev: Device on which the filter was set
2866  * @rxq_index: RX queue index
2867  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2868  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2869  *
2870  * Drivers that implement ndo_rx_flow_steer() should periodically call
2871  * this function for each installed filter and remove the filters for
2872  * which it returns %true.
2873  */
2874 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2875 			 u32 flow_id, u16 filter_id)
2876 {
2877 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2878 	struct rps_dev_flow_table *flow_table;
2879 	struct rps_dev_flow *rflow;
2880 	bool expire = true;
2881 	int cpu;
2882 
2883 	rcu_read_lock();
2884 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2885 	if (flow_table && flow_id <= flow_table->mask) {
2886 		rflow = &flow_table->flows[flow_id];
2887 		cpu = ACCESS_ONCE(rflow->cpu);
2888 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2889 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2890 			   rflow->last_qtail) <
2891 		     (int)(10 * flow_table->mask)))
2892 			expire = false;
2893 	}
2894 	rcu_read_unlock();
2895 	return expire;
2896 }
2897 EXPORT_SYMBOL(rps_may_expire_flow);
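
/*
 * Example (editor's sketch): a driver implementing ndo_rx_flow_steer()
 * might scan its filter table from a periodic work item and remove the
 * entries this function says have gone stale.  my_priv, my_filter and
 * my_remove_filter() are hypothetical.
 */
#if 0	/* illustrative only */
static void my_expire_rfs_filters(struct my_priv *priv)
{
	int i;

	for (i = 0; i < priv->num_filters; i++) {
		struct my_filter *f = &priv->filters[i];

		if (!f->in_use)
			continue;
		if (rps_may_expire_flow(priv->netdev, f->rxq_index,
					f->flow_id, i))
			my_remove_filter(priv, f);	/* hypothetical */
	}
}
#endif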
2898 
2899 #endif /* CONFIG_RFS_ACCEL */
2900 
2901 /* Called from hardirq (IPI) context */
2902 static void rps_trigger_softirq(void *data)
2903 {
2904 	struct softnet_data *sd = data;
2905 
2906 	____napi_schedule(sd, &sd->backlog);
2907 	sd->received_rps++;
2908 }
2909 
2910 #endif /* CONFIG_RPS */
2911 
2912 /*
2913  * Check if this softnet_data structure belongs to another CPU.
2914  * If yes, queue it to our IPI list and return 1.
2915  * If no, return 0.
2916  */
2917 static int rps_ipi_queued(struct softnet_data *sd)
2918 {
2919 #ifdef CONFIG_RPS
2920 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2921 
2922 	if (sd != mysd) {
2923 		sd->rps_ipi_next = mysd->rps_ipi_list;
2924 		mysd->rps_ipi_list = sd;
2925 
2926 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2927 		return 1;
2928 	}
2929 #endif /* CONFIG_RPS */
2930 	return 0;
2931 }
2932 
2933 /*
2934  * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
2935  * queue (may be a remote CPU queue).
2936  */
2937 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2938 			      unsigned int *qtail)
2939 {
2940 	struct softnet_data *sd;
2941 	unsigned long flags;
2942 
2943 	sd = &per_cpu(softnet_data, cpu);
2944 
2945 	local_irq_save(flags);
2946 
2947 	rps_lock(sd);
2948 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2949 		if (skb_queue_len(&sd->input_pkt_queue)) {
2950 enqueue:
2951 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2952 			input_queue_tail_incr_save(sd, qtail);
2953 			rps_unlock(sd);
2954 			local_irq_restore(flags);
2955 			return NET_RX_SUCCESS;
2956 		}
2957 
2958 		/* Schedule NAPI for the backlog device.
2959 		 * We can use a non-atomic operation since we own the queue lock.
2960 		 */
2961 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2962 			if (!rps_ipi_queued(sd))
2963 				____napi_schedule(sd, &sd->backlog);
2964 		}
2965 		goto enqueue;
2966 	}
2967 
2968 	sd->dropped++;
2969 	rps_unlock(sd);
2970 
2971 	local_irq_restore(flags);
2972 
2973 	atomic_long_inc(&skb->dev->rx_dropped);
2974 	kfree_skb(skb);
2975 	return NET_RX_DROP;
2976 }
2977 
2978 /**
2979  *	netif_rx	-	post buffer to the network code
2980  *	@skb: buffer to post
2981  *
2982  *	This function receives a packet from a device driver and queues it for
2983  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2984  *	may be dropped during processing for congestion control or by the
2985  *	protocol layers.
2986  *
2987  *	return values:
2988  *	NET_RX_SUCCESS	(no congestion)
2989  *	NET_RX_DROP     (packet was dropped)
2990  *
2991  */
2992 
2993 int netif_rx(struct sk_buff *skb)
2994 {
2995 	int ret;
2996 
2997 	/* if netpoll wants it, pretend we never saw it */
2998 	if (netpoll_rx(skb))
2999 		return NET_RX_DROP;
3000 
3001 	if (netdev_tstamp_prequeue)
3002 		net_timestamp_check(skb);
3003 
3004 	trace_netif_rx(skb);
3005 #ifdef CONFIG_RPS
3006 	{
3007 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3008 		int cpu;
3009 
3010 		preempt_disable();
3011 		rcu_read_lock();
3012 
3013 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3014 		if (cpu < 0)
3015 			cpu = smp_processor_id();
3016 
3017 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3018 
3019 		rcu_read_unlock();
3020 		preempt_enable();
3021 	}
3022 #else
3023 	{
3024 		unsigned int qtail;
3025 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3026 		put_cpu();
3027 	}
3028 #endif
3029 	return ret;
3030 }
3031 EXPORT_SYMBOL(netif_rx);
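
/*
 * Example (editor's sketch): a legacy (non-NAPI) driver calls netif_rx()
 * from its interrupt handler after pulling a frame off the hardware.
 * my_rx_frame() is hypothetical.
 */
#if 0	/* illustrative only */
static irqreturn_t my_interrupt(int irq, void *dev_id)
{
	struct net_device *dev = dev_id;
	struct sk_buff *skb;

	while ((skb = my_rx_frame(dev)) != NULL) {	/* hypothetical */
		skb->protocol = eth_type_trans(skb, dev);
		netif_rx(skb);		/* queues to the per-CPU backlog */
	}
	return IRQ_HANDLED;
}
#endif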
3032 
3033 int netif_rx_ni(struct sk_buff *skb)
3034 {
3035 	int err;
3036 
3037 	preempt_disable();
3038 	err = netif_rx(skb);
3039 	if (local_softirq_pending())
3040 		do_softirq();
3041 	preempt_enable();
3042 
3043 	return err;
3044 }
3045 EXPORT_SYMBOL(netif_rx_ni);
3046 
3047 static void net_tx_action(struct softirq_action *h)
3048 {
3049 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3050 
3051 	if (sd->completion_queue) {
3052 		struct sk_buff *clist;
3053 
3054 		local_irq_disable();
3055 		clist = sd->completion_queue;
3056 		sd->completion_queue = NULL;
3057 		local_irq_enable();
3058 
3059 		while (clist) {
3060 			struct sk_buff *skb = clist;
3061 			clist = clist->next;
3062 
3063 			WARN_ON(atomic_read(&skb->users));
3064 			trace_kfree_skb(skb, net_tx_action);
3065 			__kfree_skb(skb);
3066 		}
3067 	}
3068 
3069 	if (sd->output_queue) {
3070 		struct Qdisc *head;
3071 
3072 		local_irq_disable();
3073 		head = sd->output_queue;
3074 		sd->output_queue = NULL;
3075 		sd->output_queue_tailp = &sd->output_queue;
3076 		local_irq_enable();
3077 
3078 		while (head) {
3079 			struct Qdisc *q = head;
3080 			spinlock_t *root_lock;
3081 
3082 			head = head->next_sched;
3083 
3084 			root_lock = qdisc_lock(q);
3085 			if (spin_trylock(root_lock)) {
3086 				smp_mb__before_clear_bit();
3087 				clear_bit(__QDISC_STATE_SCHED,
3088 					  &q->state);
3089 				qdisc_run(q);
3090 				spin_unlock(root_lock);
3091 			} else {
3092 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3093 					      &q->state)) {
3094 					__netif_reschedule(q);
3095 				} else {
3096 					smp_mb__before_clear_bit();
3097 					clear_bit(__QDISC_STATE_SCHED,
3098 						  &q->state);
3099 				}
3100 			}
3101 		}
3102 	}
3103 }
3104 
3105 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3106     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3107 /* This hook is defined here for ATM LANE */
3108 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3109 			     unsigned char *addr) __read_mostly;
3110 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3111 #endif
3112 
3113 #ifdef CONFIG_NET_CLS_ACT
3114 /* TODO: Maybe we should just force sch_ingress to be compiled in
3115  * when CONFIG_NET_CLS_ACT is?  Otherwise we execute some useless
3116  * instructions (a compare and two extra stores) right now when we
3117  * don't have it on but do have CONFIG_NET_CLS_ACT.
3118  * NOTE: This doesn't stop any functionality; if you don't have
3119  * the ingress scheduler, you just can't add policies on ingress.
3120  *
3121  */
3122 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3123 {
3124 	struct net_device *dev = skb->dev;
3125 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3126 	int result = TC_ACT_OK;
3127 	struct Qdisc *q;
3128 
3129 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3130 		if (net_ratelimit())
3131 			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
3132 			       skb->skb_iif, dev->ifindex);
3133 		return TC_ACT_SHOT;
3134 	}
3135 
3136 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3137 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3138 
3139 	q = rxq->qdisc;
3140 	if (q != &noop_qdisc) {
3141 		spin_lock(qdisc_lock(q));
3142 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3143 			result = qdisc_enqueue_root(skb, q);
3144 		spin_unlock(qdisc_lock(q));
3145 	}
3146 
3147 	return result;
3148 }
3149 
3150 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3151 					 struct packet_type **pt_prev,
3152 					 int *ret, struct net_device *orig_dev)
3153 {
3154 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3155 
3156 	if (!rxq || rxq->qdisc == &noop_qdisc)
3157 		goto out;
3158 
3159 	if (*pt_prev) {
3160 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3161 		*pt_prev = NULL;
3162 	}
3163 
3164 	switch (ing_filter(skb, rxq)) {
3165 	case TC_ACT_SHOT:
3166 	case TC_ACT_STOLEN:
3167 		kfree_skb(skb);
3168 		return NULL;
3169 	}
3170 
3171 out:
3172 	skb->tc_verd = 0;
3173 	return skb;
3174 }
3175 #endif
3176 
3177 /**
3178  *	netdev_rx_handler_register - register receive handler
3179  *	@dev: device to register a handler for
3180  *	@rx_handler: receive handler to register
3181  *	@rx_handler_data: data pointer that is used by rx handler
3182  *
3183  *	Register a receive handler for a device. This handler will then be
3184  *	called from __netif_receive_skb. A negative errno code is returned
3185  *	on a failure.
3186  *
3187  *	The caller must hold the rtnl_mutex.
3188  *
3189  *	For a general description of rx_handler, see enum rx_handler_result.
3190  */
3191 int netdev_rx_handler_register(struct net_device *dev,
3192 			       rx_handler_func_t *rx_handler,
3193 			       void *rx_handler_data)
3194 {
3195 	ASSERT_RTNL();
3196 
3197 	if (dev->rx_handler)
3198 		return -EBUSY;
3199 
3200 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3201 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3202 
3203 	return 0;
3204 }
3205 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
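
/*
 * Example (editor's sketch): a bridge-like module claims a port device
 * under rtnl_lock, stashing its per-port state as rx_handler_data; the
 * handler then picks one of the RX_HANDLER_* results described in
 * enum rx_handler_result.  Names prefixed my_ are hypothetical.
 */
#if 0	/* illustrative only */
static rx_handler_result_t my_handle_frame(struct sk_buff **pskb)
{
	struct my_port *port = rcu_dereference((*pskb)->dev->rx_handler_data);

	if (my_steal_frame(port, *pskb))	/* hypothetical */
		return RX_HANDLER_CONSUMED;
	return RX_HANDLER_PASS;
}

static int my_add_port(struct net_device *port_dev, struct my_port *port)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(port_dev, my_handle_frame, port);
	rtnl_unlock();
	return err;
}
#endif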
3206 
3207 /**
3208  *	netdev_rx_handler_unregister - unregister receive handler
3209  *	@dev: device to unregister a handler from
3210  *
3211  *	Unregister a receive handler from a device.
3212  *
3213  *	The caller must hold the rtnl_mutex.
3214  */
3215 void netdev_rx_handler_unregister(struct net_device *dev)
3216 {
3217 
3218 	ASSERT_RTNL();
3219 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3220 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3221 }
3222 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3223 
3224 static int __netif_receive_skb(struct sk_buff *skb)
3225 {
3226 	struct packet_type *ptype, *pt_prev;
3227 	rx_handler_func_t *rx_handler;
3228 	struct net_device *orig_dev;
3229 	struct net_device *null_or_dev;
3230 	bool deliver_exact = false;
3231 	int ret = NET_RX_DROP;
3232 	__be16 type;
3233 
3234 	if (!netdev_tstamp_prequeue)
3235 		net_timestamp_check(skb);
3236 
3237 	trace_netif_receive_skb(skb);
3238 
3239 	/* if we've gotten here through NAPI, check netpoll */
3240 	if (netpoll_receive_skb(skb))
3241 		return NET_RX_DROP;
3242 
3243 	if (!skb->skb_iif)
3244 		skb->skb_iif = skb->dev->ifindex;
3245 	orig_dev = skb->dev;
3246 
3247 	skb_reset_network_header(skb);
3248 	skb_reset_transport_header(skb);
3249 	skb_reset_mac_len(skb);
3250 
3251 	pt_prev = NULL;
3252 
3253 	rcu_read_lock();
3254 
3255 another_round:
3256 
3257 	__this_cpu_inc(softnet_data.processed);
3258 
3259 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3260 		skb = vlan_untag(skb);
3261 		if (unlikely(!skb))
3262 			goto out;
3263 	}
3264 
3265 #ifdef CONFIG_NET_CLS_ACT
3266 	if (skb->tc_verd & TC_NCLS) {
3267 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3268 		goto ncls;
3269 	}
3270 #endif
3271 
3272 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3273 		if (!ptype->dev || ptype->dev == skb->dev) {
3274 			if (pt_prev)
3275 				ret = deliver_skb(skb, pt_prev, orig_dev);
3276 			pt_prev = ptype;
3277 		}
3278 	}
3279 
3280 #ifdef CONFIG_NET_CLS_ACT
3281 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3282 	if (!skb)
3283 		goto out;
3284 ncls:
3285 #endif
3286 
3287 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3288 	if (vlan_tx_tag_present(skb)) {
3289 		if (pt_prev) {
3290 			ret = deliver_skb(skb, pt_prev, orig_dev);
3291 			pt_prev = NULL;
3292 		}
3293 		if (vlan_do_receive(&skb, !rx_handler))
3294 			goto another_round;
3295 		else if (unlikely(!skb))
3296 			goto out;
3297 	}
3298 
3299 	if (rx_handler) {
3300 		if (pt_prev) {
3301 			ret = deliver_skb(skb, pt_prev, orig_dev);
3302 			pt_prev = NULL;
3303 		}
3304 		switch (rx_handler(&skb)) {
3305 		case RX_HANDLER_CONSUMED:
3306 			goto out;
3307 		case RX_HANDLER_ANOTHER:
3308 			goto another_round;
3309 		case RX_HANDLER_EXACT:
3310 			deliver_exact = true;	/* fall through */
3311 		case RX_HANDLER_PASS:
3312 			break;
3313 		default:
3314 			BUG();
3315 		}
3316 	}
3317 
3318 	/* deliver only exact match when indicated */
3319 	null_or_dev = deliver_exact ? skb->dev : NULL;
3320 
3321 	type = skb->protocol;
3322 	list_for_each_entry_rcu(ptype,
3323 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3324 		if (ptype->type == type &&
3325 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3326 		     ptype->dev == orig_dev)) {
3327 			if (pt_prev)
3328 				ret = deliver_skb(skb, pt_prev, orig_dev);
3329 			pt_prev = ptype;
3330 		}
3331 	}
3332 
3333 	if (pt_prev) {
3334 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3335 	} else {
3336 		atomic_long_inc(&skb->dev->rx_dropped);
3337 		kfree_skb(skb);
3338 		/* Jamal, now you will not be able to escape explaining
3339 		 * to me how you were going to use this. :-)
3340 		 */
3341 		ret = NET_RX_DROP;
3342 	}
3343 
3344 out:
3345 	rcu_read_unlock();
3346 	return ret;
3347 }
3348 
3349 /**
3350  *	netif_receive_skb - process receive buffer from network
3351  *	@skb: buffer to process
3352  *
3353  *	netif_receive_skb() is the main receive data processing function.
3354  *	It always succeeds. The buffer may be dropped during processing
3355  *	for congestion control or by the protocol layers.
3356  *
3357  *	This function may only be called from softirq context and interrupts
3358  *	should be enabled.
3359  *
3360  *	Return values (usually ignored):
3361  *	NET_RX_SUCCESS: no congestion
3362  *	NET_RX_DROP: packet was dropped
3363  */
3364 int netif_receive_skb(struct sk_buff *skb)
3365 {
3366 	if (netdev_tstamp_prequeue)
3367 		net_timestamp_check(skb);
3368 
3369 	if (skb_defer_rx_timestamp(skb))
3370 		return NET_RX_SUCCESS;
3371 
3372 #ifdef CONFIG_RPS
3373 	{
3374 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3375 		int cpu, ret;
3376 
3377 		rcu_read_lock();
3378 
3379 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3380 
3381 		if (cpu >= 0) {
3382 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3383 			rcu_read_unlock();
3384 		} else {
3385 			rcu_read_unlock();
3386 			ret = __netif_receive_skb(skb);
3387 		}
3388 
3389 		return ret;
3390 	}
3391 #else
3392 	return __netif_receive_skb(skb);
3393 #endif
3394 }
3395 EXPORT_SYMBOL(netif_receive_skb);
3396 
3397 /* Network device is going away; flush any packets still pending.
3398  * Called with irqs disabled.
3399  */
3400 static void flush_backlog(void *arg)
3401 {
3402 	struct net_device *dev = arg;
3403 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3404 	struct sk_buff *skb, *tmp;
3405 
3406 	rps_lock(sd);
3407 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3408 		if (skb->dev == dev) {
3409 			__skb_unlink(skb, &sd->input_pkt_queue);
3410 			kfree_skb(skb);
3411 			input_queue_head_incr(sd);
3412 		}
3413 	}
3414 	rps_unlock(sd);
3415 
3416 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3417 		if (skb->dev == dev) {
3418 			__skb_unlink(skb, &sd->process_queue);
3419 			kfree_skb(skb);
3420 			input_queue_head_incr(sd);
3421 		}
3422 	}
3423 }
3424 
3425 static int napi_gro_complete(struct sk_buff *skb)
3426 {
3427 	struct packet_type *ptype;
3428 	__be16 type = skb->protocol;
3429 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3430 	int err = -ENOENT;
3431 
3432 	if (NAPI_GRO_CB(skb)->count == 1) {
3433 		skb_shinfo(skb)->gso_size = 0;
3434 		goto out;
3435 	}
3436 
3437 	rcu_read_lock();
3438 	list_for_each_entry_rcu(ptype, head, list) {
3439 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3440 			continue;
3441 
3442 		err = ptype->gro_complete(skb);
3443 		break;
3444 	}
3445 	rcu_read_unlock();
3446 
3447 	if (err) {
3448 		WARN_ON(&ptype->list == head);
3449 		kfree_skb(skb);
3450 		return NET_RX_SUCCESS;
3451 	}
3452 
3453 out:
3454 	return netif_receive_skb(skb);
3455 }
3456 
3457 inline void napi_gro_flush(struct napi_struct *napi)
3458 {
3459 	struct sk_buff *skb, *next;
3460 
3461 	for (skb = napi->gro_list; skb; skb = next) {
3462 		next = skb->next;
3463 		skb->next = NULL;
3464 		napi_gro_complete(skb);
3465 	}
3466 
3467 	napi->gro_count = 0;
3468 	napi->gro_list = NULL;
3469 }
3470 EXPORT_SYMBOL(napi_gro_flush);
3471 
3472 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3473 {
3474 	struct sk_buff **pp = NULL;
3475 	struct packet_type *ptype;
3476 	__be16 type = skb->protocol;
3477 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3478 	int same_flow;
3479 	int mac_len;
3480 	enum gro_result ret;
3481 
3482 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3483 		goto normal;
3484 
3485 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3486 		goto normal;
3487 
3488 	rcu_read_lock();
3489 	list_for_each_entry_rcu(ptype, head, list) {
3490 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3491 			continue;
3492 
3493 		skb_set_network_header(skb, skb_gro_offset(skb));
3494 		mac_len = skb->network_header - skb->mac_header;
3495 		skb->mac_len = mac_len;
3496 		NAPI_GRO_CB(skb)->same_flow = 0;
3497 		NAPI_GRO_CB(skb)->flush = 0;
3498 		NAPI_GRO_CB(skb)->free = 0;
3499 
3500 		pp = ptype->gro_receive(&napi->gro_list, skb);
3501 		break;
3502 	}
3503 	rcu_read_unlock();
3504 
3505 	if (&ptype->list == head)
3506 		goto normal;
3507 
3508 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3509 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3510 
3511 	if (pp) {
3512 		struct sk_buff *nskb = *pp;
3513 
3514 		*pp = nskb->next;
3515 		nskb->next = NULL;
3516 		napi_gro_complete(nskb);
3517 		napi->gro_count--;
3518 	}
3519 
3520 	if (same_flow)
3521 		goto ok;
3522 
3523 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3524 		goto normal;
3525 
3526 	napi->gro_count++;
3527 	NAPI_GRO_CB(skb)->count = 1;
3528 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3529 	skb->next = napi->gro_list;
3530 	napi->gro_list = skb;
3531 	ret = GRO_HELD;
3532 
3533 pull:
3534 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3535 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3536 
3537 		BUG_ON(skb->end - skb->tail < grow);
3538 
3539 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3540 
3541 		skb->tail += grow;
3542 		skb->data_len -= grow;
3543 
3544 		skb_shinfo(skb)->frags[0].page_offset += grow;
3545 		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3546 
3547 		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3548 			skb_frag_unref(skb, 0);
3549 			memmove(skb_shinfo(skb)->frags,
3550 				skb_shinfo(skb)->frags + 1,
3551 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3552 		}
3553 	}
3554 
3555 ok:
3556 	return ret;
3557 
3558 normal:
3559 	ret = GRO_NORMAL;
3560 	goto pull;
3561 }
3562 EXPORT_SYMBOL(dev_gro_receive);
3563 
3564 static inline gro_result_t
3565 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3566 {
3567 	struct sk_buff *p;
3568 
3569 	for (p = napi->gro_list; p; p = p->next) {
3570 		unsigned long diffs;
3571 
3572 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3573 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3574 		diffs |= compare_ether_header(skb_mac_header(p),
3575 					      skb_gro_mac_header(skb));
3576 		NAPI_GRO_CB(p)->same_flow = !diffs;
3577 		NAPI_GRO_CB(p)->flush = 0;
3578 	}
3579 
3580 	return dev_gro_receive(napi, skb);
3581 }
3582 
3583 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3584 {
3585 	switch (ret) {
3586 	case GRO_NORMAL:
3587 		if (netif_receive_skb(skb))
3588 			ret = GRO_DROP;
3589 		break;
3590 
3591 	case GRO_DROP:
3592 	case GRO_MERGED_FREE:
3593 		kfree_skb(skb);
3594 		break;
3595 
3596 	case GRO_HELD:
3597 	case GRO_MERGED:
3598 		break;
3599 	}
3600 
3601 	return ret;
3602 }
3603 EXPORT_SYMBOL(napi_skb_finish);
3604 
3605 void skb_gro_reset_offset(struct sk_buff *skb)
3606 {
3607 	NAPI_GRO_CB(skb)->data_offset = 0;
3608 	NAPI_GRO_CB(skb)->frag0 = NULL;
3609 	NAPI_GRO_CB(skb)->frag0_len = 0;
3610 
3611 	if (skb->mac_header == skb->tail &&
3612 	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3613 		NAPI_GRO_CB(skb)->frag0 =
3614 			skb_frag_address(&skb_shinfo(skb)->frags[0]);
3615 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3616 	}
3617 }
3618 EXPORT_SYMBOL(skb_gro_reset_offset);
3619 
3620 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3621 {
3622 	skb_gro_reset_offset(skb);
3623 
3624 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3625 }
3626 EXPORT_SYMBOL(napi_gro_receive);
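
/*
 * Example (editor's sketch): NAPI drivers normally feed received frames
 * through napi_gro_receive() rather than netif_receive_skb() directly,
 * letting GRO coalesce them first.  my_rx_ring and my_next_rx_skb()
 * are hypothetical.
 */
#if 0	/* illustrative only */
static int my_rx(struct my_rx_ring *ring, int budget)
{
	struct sk_buff *skb;
	int done = 0;

	while (done < budget && (skb = my_next_rx_skb(ring)) != NULL) {
		skb->protocol = eth_type_trans(skb, ring->netdev);
		napi_gro_receive(&ring->napi, skb);
		done++;
	}
	return done;
}
#endif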
3627 
3628 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3629 {
3630 	__skb_pull(skb, skb_headlen(skb));
3631 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3632 	skb->vlan_tci = 0;
3633 	skb->dev = napi->dev;
3634 	skb->skb_iif = 0;
3635 
3636 	napi->skb = skb;
3637 }
3638 
3639 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3640 {
3641 	struct sk_buff *skb = napi->skb;
3642 
3643 	if (!skb) {
3644 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3645 		if (skb)
3646 			napi->skb = skb;
3647 	}
3648 	return skb;
3649 }
3650 EXPORT_SYMBOL(napi_get_frags);
3651 
3652 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3653 			       gro_result_t ret)
3654 {
3655 	switch (ret) {
3656 	case GRO_NORMAL:
3657 	case GRO_HELD:
3658 		skb->protocol = eth_type_trans(skb, skb->dev);
3659 
3660 		if (ret == GRO_HELD)
3661 			skb_gro_pull(skb, -ETH_HLEN);
3662 		else if (netif_receive_skb(skb))
3663 			ret = GRO_DROP;
3664 		break;
3665 
3666 	case GRO_DROP:
3667 	case GRO_MERGED_FREE:
3668 		napi_reuse_skb(napi, skb);
3669 		break;
3670 
3671 	case GRO_MERGED:
3672 		break;
3673 	}
3674 
3675 	return ret;
3676 }
3677 EXPORT_SYMBOL(napi_frags_finish);
3678 
3679 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3680 {
3681 	struct sk_buff *skb = napi->skb;
3682 	struct ethhdr *eth;
3683 	unsigned int hlen;
3684 	unsigned int off;
3685 
3686 	napi->skb = NULL;
3687 
3688 	skb_reset_mac_header(skb);
3689 	skb_gro_reset_offset(skb);
3690 
3691 	off = skb_gro_offset(skb);
3692 	hlen = off + sizeof(*eth);
3693 	eth = skb_gro_header_fast(skb, off);
3694 	if (skb_gro_header_hard(skb, hlen)) {
3695 		eth = skb_gro_header_slow(skb, hlen, off);
3696 		if (unlikely(!eth)) {
3697 			napi_reuse_skb(napi, skb);
3698 			skb = NULL;
3699 			goto out;
3700 		}
3701 	}
3702 
3703 	skb_gro_pull(skb, sizeof(*eth));
3704 
3705 	/*
3706 	 * This works because the only protocols we care about don't require
3707 	 * special handling.  We'll fix it up properly at the end.
3708 	 */
3709 	skb->protocol = eth->h_proto;
3710 
3711 out:
3712 	return skb;
3713 }
3714 EXPORT_SYMBOL(napi_frags_skb);
3715 
3716 gro_result_t napi_gro_frags(struct napi_struct *napi)
3717 {
3718 	struct sk_buff *skb = napi_frags_skb(napi);
3719 
3720 	if (!skb)
3721 		return GRO_DROP;
3722 
3723 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3724 }
3725 EXPORT_SYMBOL(napi_gro_frags);
3726 
3727 /*
3728  * net_rps_action sends any pending IPIs for RPS.
3729  * Note: called with local irq disabled, but exits with local irq enabled.
3730  */
3731 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3732 {
3733 #ifdef CONFIG_RPS
3734 	struct softnet_data *remsd = sd->rps_ipi_list;
3735 
3736 	if (remsd) {
3737 		sd->rps_ipi_list = NULL;
3738 
3739 		local_irq_enable();
3740 
3741 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3742 		while (remsd) {
3743 			struct softnet_data *next = remsd->rps_ipi_next;
3744 
3745 			if (cpu_online(remsd->cpu))
3746 				__smp_call_function_single(remsd->cpu,
3747 							   &remsd->csd, 0);
3748 			remsd = next;
3749 		}
3750 	} else
3751 #endif
3752 		local_irq_enable();
3753 }
3754 
3755 static int process_backlog(struct napi_struct *napi, int quota)
3756 {
3757 	int work = 0;
3758 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3759 
3760 #ifdef CONFIG_RPS
3761 	/* Check if we have pending IPIs; it's better to send them now
3762 	 * than to wait for net_rx_action() to end.
3763 	 */
3764 	if (sd->rps_ipi_list) {
3765 		local_irq_disable();
3766 		net_rps_action_and_irq_enable(sd);
3767 	}
3768 #endif
3769 	napi->weight = weight_p;
3770 	local_irq_disable();
3771 	while (work < quota) {
3772 		struct sk_buff *skb;
3773 		unsigned int qlen;
3774 
3775 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3776 			local_irq_enable();
3777 			__netif_receive_skb(skb);
3778 			local_irq_disable();
3779 			input_queue_head_incr(sd);
3780 			if (++work >= quota) {
3781 				local_irq_enable();
3782 				return work;
3783 			}
3784 		}
3785 
3786 		rps_lock(sd);
3787 		qlen = skb_queue_len(&sd->input_pkt_queue);
3788 		if (qlen)
3789 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3790 						   &sd->process_queue);
3791 
3792 		if (qlen < quota - work) {
3793 			/*
3794 			 * Inline a custom version of __napi_complete().
3795 			 * Only the current CPU owns and manipulates this napi,
3796 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3797 			 * so we can use a plain write instead of clear_bit(),
3798 			 * and we don't need an smp_mb() memory barrier.
3799 			 */
3800 			list_del(&napi->poll_list);
3801 			napi->state = 0;
3802 
3803 			quota = work + qlen;
3804 		}
3805 		rps_unlock(sd);
3806 	}
3807 	local_irq_enable();
3808 
3809 	return work;
3810 }
3811 
3812 /**
3813  * __napi_schedule - schedule for receive
3814  * @n: entry to schedule
3815  *
3816  * The entry's receive function will be scheduled to run
3817  */
3818 void __napi_schedule(struct napi_struct *n)
3819 {
3820 	unsigned long flags;
3821 
3822 	local_irq_save(flags);
3823 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3824 	local_irq_restore(flags);
3825 }
3826 EXPORT_SYMBOL(__napi_schedule);
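
/*
 * Example (hypothetical driver code): an interrupt handler normally
 * pairs napi_schedule_prep() with __napi_schedule() so that ->poll()
 * is scheduled at most once; "priv" and the IRQ-masking helper are
 * assumptions, not part of this file.
 *
 *	static irqreturn_t example_isr(int irq, void *data)
 *	{
 *		struct example_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			example_mask_rx_irq(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */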
3827 
3828 void __napi_complete(struct napi_struct *n)
3829 {
3830 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3831 	BUG_ON(n->gro_list);
3832 
3833 	list_del(&n->poll_list);
3834 	smp_mb__before_clear_bit();
3835 	clear_bit(NAPI_STATE_SCHED, &n->state);
3836 }
3837 EXPORT_SYMBOL(__napi_complete);
3838 
3839 void napi_complete(struct napi_struct *n)
3840 {
3841 	unsigned long flags;
3842 
3843 	/*
3844 	 * Don't let NAPI dequeue from the CPU poll list,
3845 	 * just in case it is running on a different CPU.
3846 	 */
3847 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3848 		return;
3849 
3850 	napi_gro_flush(n);
3851 	local_irq_save(flags);
3852 	__napi_complete(n);
3853 	local_irq_restore(flags);
3854 }
3855 EXPORT_SYMBOL(napi_complete);
3856 
3857 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3858 		    int (*poll)(struct napi_struct *, int), int weight)
3859 {
3860 	INIT_LIST_HEAD(&napi->poll_list);
3861 	napi->gro_count = 0;
3862 	napi->gro_list = NULL;
3863 	napi->skb = NULL;
3864 	napi->poll = poll;
3865 	napi->weight = weight;
3866 	list_add(&napi->dev_list, &dev->napi_list);
3867 	napi->dev = dev;
3868 #ifdef CONFIG_NETPOLL
3869 	spin_lock_init(&napi->poll_lock);
3870 	napi->poll_owner = -1;
3871 #endif
3872 	set_bit(NAPI_STATE_SCHED, &napi->state);
3873 }
3874 EXPORT_SYMBOL(netif_napi_add);
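
/*
 * Example (a sketch under assumed names): a driver registers its poll
 * handler at probe time and completes NAPI from ->poll() once it has
 * used less than its budget; example_clean_rx() and the IRQ helper
 * are hypothetical.
 *
 *	static int example_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct example_priv *priv =
 *			container_of(napi, struct example_priv, napi);
 *		int work = example_clean_rx(priv, budget);
 *
 *		if (work < budget) {
 *			napi_complete(napi);
 *			example_unmask_rx_irq(priv);
 *		}
 *		return work;
 *	}
 *
 * and at probe time, with the canonical weight of 64:
 *
 *	netif_napi_add(netdev, &priv->napi, example_poll, 64);
 */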
3875 
3876 void netif_napi_del(struct napi_struct *napi)
3877 {
3878 	struct sk_buff *skb, *next;
3879 
3880 	list_del_init(&napi->dev_list);
3881 	napi_free_frags(napi);
3882 
3883 	for (skb = napi->gro_list; skb; skb = next) {
3884 		next = skb->next;
3885 		skb->next = NULL;
3886 		kfree_skb(skb);
3887 	}
3888 
3889 	napi->gro_list = NULL;
3890 	napi->gro_count = 0;
3891 }
3892 EXPORT_SYMBOL(netif_napi_del);
3893 
3894 static void net_rx_action(struct softirq_action *h)
3895 {
3896 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3897 	unsigned long time_limit = jiffies + 2;
3898 	int budget = netdev_budget;
3899 	void *have;
3900 
3901 	local_irq_disable();
3902 
3903 	while (!list_empty(&sd->poll_list)) {
3904 		struct napi_struct *n;
3905 		int work, weight;
3906 
3907 		/* If the softirq window is exhausted then punt.
3908 		 * Allow this to run for 2 jiffies, which gives
3909 		 * an average latency of 1.5/HZ.
3910 		 */
3911 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3912 			goto softnet_break;
3913 
3914 		local_irq_enable();
3915 
3916 		/* Even though interrupts have been re-enabled, this
3917 		 * access is safe because interrupts can only add new
3918 		 * entries to the tail of this list, and only ->poll()
3919 		 * calls can remove this head entry from the list.
3920 		 */
3921 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3922 
3923 		have = netpoll_poll_lock(n);
3924 
3925 		weight = n->weight;
3926 
3927 		/* This NAPI_STATE_SCHED test is for avoiding a race
3928 		 * with netpoll's poll_napi().  Only the entity which
3929 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3930 		 * actually make the ->poll() call.  Therefore we avoid
3931 		 * accidentally calling ->poll() when NAPI is not scheduled.
3932 		 */
3933 		work = 0;
3934 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3935 			work = n->poll(n, weight);
3936 			trace_napi_poll(n);
3937 		}
3938 
3939 		WARN_ON_ONCE(work > weight);
3940 
3941 		budget -= work;
3942 
3943 		local_irq_disable();
3944 
3945 		/* Drivers must not modify the NAPI state if they
3946 		 * consume the entire weight.  In such cases this code
3947 		 * still "owns" the NAPI instance and therefore can
3948 		 * move the instance around on the list at-will.
3949 		 */
3950 		if (unlikely(work == weight)) {
3951 			if (unlikely(napi_disable_pending(n))) {
3952 				local_irq_enable();
3953 				napi_complete(n);
3954 				local_irq_disable();
3955 			} else
3956 				list_move_tail(&n->poll_list, &sd->poll_list);
3957 		}
3958 
3959 		netpoll_poll_unlock(have);
3960 	}
3961 out:
3962 	net_rps_action_and_irq_enable(sd);
3963 
3964 #ifdef CONFIG_NET_DMA
3965 	/*
3966 	 * There may not be any more sk_buffs coming right now, so push
3967 	 * any pending DMA copies to hardware
3968 	 */
3969 	dma_issue_pending_all();
3970 #endif
3971 
3972 	return;
3973 
3974 softnet_break:
3975 	sd->time_squeeze++;
3976 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3977 	goto out;
3978 }
3979 
3980 static gifconf_func_t *gifconf_list[NPROTO];
3981 
3982 /**
3983  *	register_gifconf	-	register a SIOCGIF handler
3984  *	@family: Address family
3985  *	@gifconf: Function handler
3986  *
3987  *	Register protocol dependent address dumping routines. The handler
3988  *	that is passed must not be freed or reused until it has been replaced
3989  *	by another handler.
3990  */
3991 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3992 {
3993 	if (family >= NPROTO)
3994 		return -EINVAL;
3995 	gifconf_list[family] = gifconf;
3996 	return 0;
3997 }
3998 EXPORT_SYMBOL(register_gifconf);
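
/*
 * Example: an address family registers its SIOCGIFCONF helper once at
 * init time, much as IPv4 registers its own dumper; the handler name
 * here is only illustrative.
 *
 *	static int example_gifconf(struct net_device *dev,
 *				   char __user *buf, int len)
 *	{
 *		(write one struct ifreq per address into buf and return
 *		 the number of bytes written, or the bytes needed when
 *		 buf is NULL)
 *	}
 *
 *	register_gifconf(PF_INET, example_gifconf);
 */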
3999 
4000 
4001 /*
4002  *	Map an interface index to its name (SIOCGIFNAME)
4003  */
4004 
4005 /*
4006  *	We need this ioctl for efficient implementation of the
4007  *	if_indextoname() function required by the IPv6 API.  Without
4008  *	it, we would have to search all the interfaces to find a
4009  *	match.  --pb
4010  */
4011 
4012 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4013 {
4014 	struct net_device *dev;
4015 	struct ifreq ifr;
4016 
4017 	/*
4018 	 *	Fetch the caller's info block.
4019 	 */
4020 
4021 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4022 		return -EFAULT;
4023 
4024 	rcu_read_lock();
4025 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4026 	if (!dev) {
4027 		rcu_read_unlock();
4028 		return -ENODEV;
4029 	}
4030 
4031 	strcpy(ifr.ifr_name, dev->name);
4032 	rcu_read_unlock();
4033 
4034 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4035 		return -EFAULT;
4036 	return 0;
4037 }
4038 
4039 /*
4040  *	Perform a SIOCGIFCONF call. This structure will change
4041  *	size eventually, and there is nothing I can do about it.
4042  *	Thus we will need a 'compatibility mode'.
4043  */
4044 
4045 static int dev_ifconf(struct net *net, char __user *arg)
4046 {
4047 	struct ifconf ifc;
4048 	struct net_device *dev;
4049 	char __user *pos;
4050 	int len;
4051 	int total;
4052 	int i;
4053 
4054 	/*
4055 	 *	Fetch the caller's info block.
4056 	 */
4057 
4058 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4059 		return -EFAULT;
4060 
4061 	pos = ifc.ifc_buf;
4062 	len = ifc.ifc_len;
4063 
4064 	/*
4065 	 *	Loop over the interfaces, and write an info block for each.
4066 	 */
4067 
4068 	total = 0;
4069 	for_each_netdev(net, dev) {
4070 		for (i = 0; i < NPROTO; i++) {
4071 			if (gifconf_list[i]) {
4072 				int done;
4073 				if (!pos)
4074 					done = gifconf_list[i](dev, NULL, 0);
4075 				else
4076 					done = gifconf_list[i](dev, pos + total,
4077 							       len - total);
4078 				if (done < 0)
4079 					return -EFAULT;
4080 				total += done;
4081 			}
4082 		}
4083 	}
4084 
4085 	/*
4086 	 *	All done.  Write the updated control block back to the caller.
4087 	 */
4088 	ifc.ifc_len = total;
4089 
4090 	/*
4091 	 * 	Both BSD and Solaris return 0 here, so we do too.
4092 	 */
4093 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4094 }
4095 
4096 #ifdef CONFIG_PROC_FS
4097 
4098 #define BUCKET_SPACE (32 - NETDEV_HASHBITS)
4099 
4100 struct dev_iter_state {
4101 	struct seq_net_private p;
4102 	unsigned int pos; /* (bucket << BUCKET_SPACE) + offset */
4103 };
4104 
4105 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4106 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4107 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4108 
4109 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
4110 {
4111 	struct dev_iter_state *state = seq->private;
4112 	struct net *net = seq_file_net(seq);
4113 	struct net_device *dev;
4114 	struct hlist_node *p;
4115 	struct hlist_head *h;
4116 	unsigned int count, bucket, offset;
4117 
4118 	bucket = get_bucket(state->pos);
4119 	offset = get_offset(state->pos);
4120 	h = &net->dev_name_head[bucket];
4121 	count = 0;
4122 	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4123 		if (count++ == offset) {
4124 			state->pos = set_bucket_offset(bucket, count);
4125 			return dev;
4126 		}
4127 	}
4128 
4129 	return NULL;
4130 }
4131 
4132 static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
4133 {
4134 	struct dev_iter_state *state = seq->private;
4135 	struct net_device *dev;
4136 	unsigned int bucket;
4137 
4138 	bucket = get_bucket(state->pos);
4139 	do {
4140 		dev = dev_from_same_bucket(seq);
4141 		if (dev)
4142 			return dev;
4143 
4144 		bucket++;
4145 		state->pos = set_bucket_offset(bucket, 0);
4146 	} while (bucket < NETDEV_HASHENTRIES);
4147 
4148 	return NULL;
4149 }
4150 
4151 /*
4152  *	This is invoked by the /proc filesystem handler to display a device
4153  *	in detail.
4154  */
4155 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4156 	__acquires(RCU)
4157 {
4158 	struct dev_iter_state *state = seq->private;
4159 
4160 	rcu_read_lock();
4161 	if (!*pos)
4162 		return SEQ_START_TOKEN;
4163 
4164 	/* check for end of the hash */
4165 	if (state->pos == 0 && *pos > 1)
4166 		return NULL;
4167 
4168 	return dev_from_new_bucket(seq);
4169 }
4170 
4171 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4172 {
4173 	struct net_device *dev;
4174 
4175 	++*pos;
4176 
4177 	if (v == SEQ_START_TOKEN)
4178 		return dev_from_new_bucket(seq);
4179 
4180 	dev = dev_from_same_bucket(seq);
4181 	if (dev)
4182 		return dev;
4183 
4184 	return dev_from_new_bucket(seq);
4185 }
4186 
4187 void dev_seq_stop(struct seq_file *seq, void *v)
4188 	__releases(RCU)
4189 {
4190 	rcu_read_unlock();
4191 }
4192 
4193 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4194 {
4195 	struct rtnl_link_stats64 temp;
4196 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4197 
4198 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4199 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4200 		   dev->name, stats->rx_bytes, stats->rx_packets,
4201 		   stats->rx_errors,
4202 		   stats->rx_dropped + stats->rx_missed_errors,
4203 		   stats->rx_fifo_errors,
4204 		   stats->rx_length_errors + stats->rx_over_errors +
4205 		    stats->rx_crc_errors + stats->rx_frame_errors,
4206 		   stats->rx_compressed, stats->multicast,
4207 		   stats->tx_bytes, stats->tx_packets,
4208 		   stats->tx_errors, stats->tx_dropped,
4209 		   stats->tx_fifo_errors, stats->collisions,
4210 		   stats->tx_carrier_errors +
4211 		    stats->tx_aborted_errors +
4212 		    stats->tx_window_errors +
4213 		    stats->tx_heartbeat_errors,
4214 		   stats->tx_compressed);
4215 }
4216 
4217 /*
4218  *	Called from the PROCfs module. This now uses the new arbitrary sized
4219  *	/proc/net interface to create /proc/net/dev.
4220  */
4221 static int dev_seq_show(struct seq_file *seq, void *v)
4222 {
4223 	if (v == SEQ_START_TOKEN)
4224 		seq_puts(seq, "Inter-|   Receive                            "
4225 			      "                    |  Transmit\n"
4226 			      " face |bytes    packets errs drop fifo frame "
4227 			      "compressed multicast|bytes    packets errs "
4228 			      "drop fifo colls carrier compressed\n");
4229 	else
4230 		dev_seq_printf_stats(seq, v);
4231 	return 0;
4232 }
4233 
4234 static struct softnet_data *softnet_get_online(loff_t *pos)
4235 {
4236 	struct softnet_data *sd = NULL;
4237 
4238 	while (*pos < nr_cpu_ids)
4239 		if (cpu_online(*pos)) {
4240 			sd = &per_cpu(softnet_data, *pos);
4241 			break;
4242 		} else
4243 			++*pos;
4244 	return sd;
4245 }
4246 
4247 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4248 {
4249 	return softnet_get_online(pos);
4250 }
4251 
4252 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4253 {
4254 	++*pos;
4255 	return softnet_get_online(pos);
4256 }
4257 
4258 static void softnet_seq_stop(struct seq_file *seq, void *v)
4259 {
4260 }
4261 
4262 static int softnet_seq_show(struct seq_file *seq, void *v)
4263 {
4264 	struct softnet_data *sd = v;
4265 
4266 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4267 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4268 		   0, 0, 0, 0, /* was fastroute */
4269 		   sd->cpu_collision, sd->received_rps);
4270 	return 0;
4271 }
4272 
4273 static const struct seq_operations dev_seq_ops = {
4274 	.start = dev_seq_start,
4275 	.next  = dev_seq_next,
4276 	.stop  = dev_seq_stop,
4277 	.show  = dev_seq_show,
4278 };
4279 
4280 static int dev_seq_open(struct inode *inode, struct file *file)
4281 {
4282 	return seq_open_net(inode, file, &dev_seq_ops,
4283 			    sizeof(struct dev_iter_state));
4284 }
4285 
4286 int dev_seq_open_ops(struct inode *inode, struct file *file,
4287 		     const struct seq_operations *ops)
4288 {
4289 	return seq_open_net(inode, file, ops, sizeof(struct dev_iter_state));
4290 }
4291 
4292 static const struct file_operations dev_seq_fops = {
4293 	.owner	 = THIS_MODULE,
4294 	.open    = dev_seq_open,
4295 	.read    = seq_read,
4296 	.llseek  = seq_lseek,
4297 	.release = seq_release_net,
4298 };
4299 
4300 static const struct seq_operations softnet_seq_ops = {
4301 	.start = softnet_seq_start,
4302 	.next  = softnet_seq_next,
4303 	.stop  = softnet_seq_stop,
4304 	.show  = softnet_seq_show,
4305 };
4306 
4307 static int softnet_seq_open(struct inode *inode, struct file *file)
4308 {
4309 	return seq_open(file, &softnet_seq_ops);
4310 }
4311 
4312 static const struct file_operations softnet_seq_fops = {
4313 	.owner	 = THIS_MODULE,
4314 	.open    = softnet_seq_open,
4315 	.read    = seq_read,
4316 	.llseek  = seq_lseek,
4317 	.release = seq_release,
4318 };
4319 
4320 static void *ptype_get_idx(loff_t pos)
4321 {
4322 	struct packet_type *pt = NULL;
4323 	loff_t i = 0;
4324 	int t;
4325 
4326 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4327 		if (i == pos)
4328 			return pt;
4329 		++i;
4330 	}
4331 
4332 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4333 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4334 			if (i == pos)
4335 				return pt;
4336 			++i;
4337 		}
4338 	}
4339 	return NULL;
4340 }
4341 
4342 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4343 	__acquires(RCU)
4344 {
4345 	rcu_read_lock();
4346 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4347 }
4348 
4349 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4350 {
4351 	struct packet_type *pt;
4352 	struct list_head *nxt;
4353 	int hash;
4354 
4355 	++*pos;
4356 	if (v == SEQ_START_TOKEN)
4357 		return ptype_get_idx(0);
4358 
4359 	pt = v;
4360 	nxt = pt->list.next;
4361 	if (pt->type == htons(ETH_P_ALL)) {
4362 		if (nxt != &ptype_all)
4363 			goto found;
4364 		hash = 0;
4365 		nxt = ptype_base[0].next;
4366 	} else
4367 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4368 
4369 	while (nxt == &ptype_base[hash]) {
4370 		if (++hash >= PTYPE_HASH_SIZE)
4371 			return NULL;
4372 		nxt = ptype_base[hash].next;
4373 	}
4374 found:
4375 	return list_entry(nxt, struct packet_type, list);
4376 }
4377 
4378 static void ptype_seq_stop(struct seq_file *seq, void *v)
4379 	__releases(RCU)
4380 {
4381 	rcu_read_unlock();
4382 }
4383 
4384 static int ptype_seq_show(struct seq_file *seq, void *v)
4385 {
4386 	struct packet_type *pt = v;
4387 
4388 	if (v == SEQ_START_TOKEN)
4389 		seq_puts(seq, "Type Device      Function\n");
4390 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4391 		if (pt->type == htons(ETH_P_ALL))
4392 			seq_puts(seq, "ALL ");
4393 		else
4394 			seq_printf(seq, "%04x", ntohs(pt->type));
4395 
4396 		seq_printf(seq, " %-8s %pF\n",
4397 			   pt->dev ? pt->dev->name : "", pt->func);
4398 	}
4399 
4400 	return 0;
4401 }
4402 
4403 static const struct seq_operations ptype_seq_ops = {
4404 	.start = ptype_seq_start,
4405 	.next  = ptype_seq_next,
4406 	.stop  = ptype_seq_stop,
4407 	.show  = ptype_seq_show,
4408 };
4409 
4410 static int ptype_seq_open(struct inode *inode, struct file *file)
4411 {
4412 	return seq_open_net(inode, file, &ptype_seq_ops,
4413 			sizeof(struct seq_net_private));
4414 }
4415 
4416 static const struct file_operations ptype_seq_fops = {
4417 	.owner	 = THIS_MODULE,
4418 	.open    = ptype_seq_open,
4419 	.read    = seq_read,
4420 	.llseek  = seq_lseek,
4421 	.release = seq_release_net,
4422 };
4423 
4424 
4425 static int __net_init dev_proc_net_init(struct net *net)
4426 {
4427 	int rc = -ENOMEM;
4428 
4429 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4430 		goto out;
4431 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4432 		goto out_dev;
4433 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4434 		goto out_softnet;
4435 
4436 	if (wext_proc_init(net))
4437 		goto out_ptype;
4438 	rc = 0;
4439 out:
4440 	return rc;
4441 out_ptype:
4442 	proc_net_remove(net, "ptype");
4443 out_softnet:
4444 	proc_net_remove(net, "softnet_stat");
4445 out_dev:
4446 	proc_net_remove(net, "dev");
4447 	goto out;
4448 }
4449 
4450 static void __net_exit dev_proc_net_exit(struct net *net)
4451 {
4452 	wext_proc_exit(net);
4453 
4454 	proc_net_remove(net, "ptype");
4455 	proc_net_remove(net, "softnet_stat");
4456 	proc_net_remove(net, "dev");
4457 }
4458 
4459 static struct pernet_operations __net_initdata dev_proc_ops = {
4460 	.init = dev_proc_net_init,
4461 	.exit = dev_proc_net_exit,
4462 };
4463 
4464 static int __init dev_proc_init(void)
4465 {
4466 	return register_pernet_subsys(&dev_proc_ops);
4467 }
4468 #else
4469 #define dev_proc_init() 0
4470 #endif	/* CONFIG_PROC_FS */
4471 
4472 
4473 /**
4474  *	netdev_set_master	-	set up master pointer
4475  *	@slave: slave device
4476  *	@master: new master device
4477  *
4478  *	Changes the master device of the slave. Pass %NULL to break the
4479  *	bonding. The caller must hold the RTNL semaphore. On a failure
4480  *	a negative errno code is returned. On success the reference counts
4481  *	are adjusted and the function returns zero.
4482  */
4483 int netdev_set_master(struct net_device *slave, struct net_device *master)
4484 {
4485 	struct net_device *old = slave->master;
4486 
4487 	ASSERT_RTNL();
4488 
4489 	if (master) {
4490 		if (old)
4491 			return -EBUSY;
4492 		dev_hold(master);
4493 	}
4494 
4495 	slave->master = master;
4496 
4497 	if (old)
4498 		dev_put(old);
4499 	return 0;
4500 }
4501 EXPORT_SYMBOL(netdev_set_master);
4502 
4503 /**
4504  *	netdev_set_bond_master	-	set up bonding master/slave pair
4505  *	@slave: slave device
4506  *	@master: new master device
4507  *
4508  *	Changes the master device of the slave. Pass %NULL to break the
4509  *	bonding. The caller must hold the RTNL semaphore. On a failure
4510  *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4511  *	to the routing socket and the function returns zero.
4512  */
4513 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4514 {
4515 	int err;
4516 
4517 	ASSERT_RTNL();
4518 
4519 	err = netdev_set_master(slave, master);
4520 	if (err)
4521 		return err;
4522 	if (master)
4523 		slave->flags |= IFF_SLAVE;
4524 	else
4525 		slave->flags &= ~IFF_SLAVE;
4526 
4527 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4528 	return 0;
4529 }
4530 EXPORT_SYMBOL(netdev_set_bond_master);
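
/*
 * Example (hypothetical bonding-style caller, under rtnl_lock()):
 *
 *	ASSERT_RTNL();
 *	err = netdev_set_bond_master(slave_dev, bond_dev);
 *	if (err)
 *		return err;
 *	(... use the slave ...)
 *	netdev_set_bond_master(slave_dev, NULL);	(break the bond)
 */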
4531 
4532 static void dev_change_rx_flags(struct net_device *dev, int flags)
4533 {
4534 	const struct net_device_ops *ops = dev->netdev_ops;
4535 
4536 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4537 		ops->ndo_change_rx_flags(dev, flags);
4538 }
4539 
4540 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4541 {
4542 	unsigned short old_flags = dev->flags;
4543 	uid_t uid;
4544 	gid_t gid;
4545 
4546 	ASSERT_RTNL();
4547 
4548 	dev->flags |= IFF_PROMISC;
4549 	dev->promiscuity += inc;
4550 	if (dev->promiscuity == 0) {
4551 		/*
4552 		 * Avoid overflow.
4553 		 * If inc causes an overflow, leave promiscuity untouched and return an error.
4554 		 */
4555 		if (inc < 0)
4556 			dev->flags &= ~IFF_PROMISC;
4557 		else {
4558 			dev->promiscuity -= inc;
4559 			printk(KERN_WARNING "%s: promiscuity touches roof, "
4560 				"set promiscuity failed, promiscuity feature "
4561 				"of device might be broken.\n", dev->name);
4562 			return -EOVERFLOW;
4563 		}
4564 	}
4565 	if (dev->flags != old_flags) {
4566 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4567 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4568 							       "left");
4569 		if (audit_enabled) {
4570 			current_uid_gid(&uid, &gid);
4571 			audit_log(current->audit_context, GFP_ATOMIC,
4572 				AUDIT_ANOM_PROMISCUOUS,
4573 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4574 				dev->name, (dev->flags & IFF_PROMISC),
4575 				(old_flags & IFF_PROMISC),
4576 				audit_get_loginuid(current),
4577 				uid, gid,
4578 				audit_get_sessionid(current));
4579 		}
4580 
4581 		dev_change_rx_flags(dev, IFF_PROMISC);
4582 	}
4583 	return 0;
4584 }
4585 
4586 /**
4587  *	dev_set_promiscuity	- update promiscuity count on a device
4588  *	@dev: device
4589  *	@inc: modifier
4590  *
4591  *	Add or remove promiscuity from a device. While the count in the device
4592  *	remains above zero the interface remains promiscuous. Once it hits zero
4593  *	the device reverts to normal filtering operation. A negative inc
4594  *	value is used to drop promiscuity on the device.
4595  *	Return 0 if successful or a negative errno code on error.
4596  */
4597 int dev_set_promiscuity(struct net_device *dev, int inc)
4598 {
4599 	unsigned short old_flags = dev->flags;
4600 	int err;
4601 
4602 	err = __dev_set_promiscuity(dev, inc);
4603 	if (err < 0)
4604 		return err;
4605 	if (dev->flags != old_flags)
4606 		dev_set_rx_mode(dev);
4607 	return err;
4608 }
4609 EXPORT_SYMBOL(dev_set_promiscuity);
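
/*
 * Example (a sketch, not from this file): a packet-capture style user
 * takes one promiscuity reference while capturing and drops it when
 * done, always under rtnl_lock() since __dev_set_promiscuity()
 * asserts the RTNL.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);
 *	rtnl_unlock();
 *	(... capture traffic ...)
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);
 *	rtnl_unlock();
 */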
4610 
4611 /**
4612  *	dev_set_allmulti	- update allmulti count on a device
4613  *	@dev: device
4614  *	@inc: modifier
4615  *
4616  *	Add or remove reception of all multicast frames to a device. While the
4617  *	count in the device remains above zero the interface remains listening
4618  *	to all multicast frames. Once it hits zero the device reverts to normal
4619  *	filtering operation. A negative @inc value is used to drop the counter
4620  *	when releasing a resource needing all multicasts.
4621  *	Return 0 if successful or a negative errno code on error.
4622  */
4623 
4624 int dev_set_allmulti(struct net_device *dev, int inc)
4625 {
4626 	unsigned short old_flags = dev->flags;
4627 
4628 	ASSERT_RTNL();
4629 
4630 	dev->flags |= IFF_ALLMULTI;
4631 	dev->allmulti += inc;
4632 	if (dev->allmulti == 0) {
4633 		/*
4634 		 * Avoid overflow.
4635 		 * If inc causes an overflow, leave allmulti untouched and return an error.
4636 		 */
4637 		if (inc < 0)
4638 			dev->flags &= ~IFF_ALLMULTI;
4639 		else {
4640 			dev->allmulti -= inc;
4641 			printk(KERN_WARNING "%s: allmulti touches roof, "
4642 				"set allmulti failed, allmulti feature of "
4643 				"device might be broken.\n", dev->name);
4644 			return -EOVERFLOW;
4645 		}
4646 	}
4647 	if (dev->flags ^ old_flags) {
4648 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4649 		dev_set_rx_mode(dev);
4650 	}
4651 	return 0;
4652 }
4653 EXPORT_SYMBOL(dev_set_allmulti);
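
/*
 * Example (hypothetical, under rtnl_lock()): a protocol that needs
 * every multicast frame pairs its +1 with a -1 on teardown.
 *
 *	err = dev_set_allmulti(dev, 1);
 *	if (err < 0)
 *		goto fail;
 *	(... run ...)
 *	dev_set_allmulti(dev, -1);
 */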
4654 
4655 /*
4656  *	Upload unicast and multicast address lists to device and
4657  *	configure RX filtering. When the device doesn't support unicast
4658  *	filtering it is put in promiscuous mode while unicast addresses
4659  *	are present.
4660  */
4661 void __dev_set_rx_mode(struct net_device *dev)
4662 {
4663 	const struct net_device_ops *ops = dev->netdev_ops;
4664 
4665 	/* dev_open will call this function so the list will stay sane. */
4666 	if (!(dev->flags&IFF_UP))
4667 		return;
4668 
4669 	if (!netif_device_present(dev))
4670 		return;
4671 
4672 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4673 		/* Unicast address changes may only happen under the rtnl,
4674 		 * therefore calling __dev_set_promiscuity here is safe.
4675 		 */
4676 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4677 			__dev_set_promiscuity(dev, 1);
4678 			dev->uc_promisc = true;
4679 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4680 			__dev_set_promiscuity(dev, -1);
4681 			dev->uc_promisc = false;
4682 		}
4683 	}
4684 
4685 	if (ops->ndo_set_rx_mode)
4686 		ops->ndo_set_rx_mode(dev);
4687 }
4688 
4689 void dev_set_rx_mode(struct net_device *dev)
4690 {
4691 	netif_addr_lock_bh(dev);
4692 	__dev_set_rx_mode(dev);
4693 	netif_addr_unlock_bh(dev);
4694 }
4695 
4696 /**
4697  *	dev_get_flags - get flags reported to userspace
4698  *	@dev: device
4699  *
4700  *	Get the combination of flag bits exported through APIs to userspace.
4701  */
4702 unsigned dev_get_flags(const struct net_device *dev)
4703 {
4704 	unsigned flags;
4705 
4706 	flags = (dev->flags & ~(IFF_PROMISC |
4707 				IFF_ALLMULTI |
4708 				IFF_RUNNING |
4709 				IFF_LOWER_UP |
4710 				IFF_DORMANT)) |
4711 		(dev->gflags & (IFF_PROMISC |
4712 				IFF_ALLMULTI));
4713 
4714 	if (netif_running(dev)) {
4715 		if (netif_oper_up(dev))
4716 			flags |= IFF_RUNNING;
4717 		if (netif_carrier_ok(dev))
4718 			flags |= IFF_LOWER_UP;
4719 		if (netif_dormant(dev))
4720 			flags |= IFF_DORMANT;
4721 	}
4722 
4723 	return flags;
4724 }
4725 EXPORT_SYMBOL(dev_get_flags);
4726 
4727 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4728 {
4729 	int old_flags = dev->flags;
4730 	int ret;
4731 
4732 	ASSERT_RTNL();
4733 
4734 	/*
4735 	 *	Set the flags on our device.
4736 	 */
4737 
4738 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4739 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4740 			       IFF_AUTOMEDIA)) |
4741 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4742 				    IFF_ALLMULTI));
4743 
4744 	/*
4745 	 *	Load in the correct multicast list now the flags have changed.
4746 	 */
4747 
4748 	if ((old_flags ^ flags) & IFF_MULTICAST)
4749 		dev_change_rx_flags(dev, IFF_MULTICAST);
4750 
4751 	dev_set_rx_mode(dev);
4752 
4753 	/*
4754 	 *	Have we downed the interface? We handle IFF_UP ourselves
4755 	 *	according to user attempts to set it, rather than blindly
4756 	 *	setting it.
4757 	 */
4758 
4759 	ret = 0;
4760 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4761 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4762 
4763 		if (!ret)
4764 			dev_set_rx_mode(dev);
4765 	}
4766 
4767 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4768 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4769 
4770 		dev->gflags ^= IFF_PROMISC;
4771 		dev_set_promiscuity(dev, inc);
4772 	}
4773 
4774 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4775 	   is important. Some (broken) drivers set IFF_PROMISC when
4776 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4777 	 */
4778 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4779 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4780 
4781 		dev->gflags ^= IFF_ALLMULTI;
4782 		dev_set_allmulti(dev, inc);
4783 	}
4784 
4785 	return ret;
4786 }
4787 
4788 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4789 {
4790 	unsigned int changes = dev->flags ^ old_flags;
4791 
4792 	if (changes & IFF_UP) {
4793 		if (dev->flags & IFF_UP)
4794 			call_netdevice_notifiers(NETDEV_UP, dev);
4795 		else
4796 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4797 	}
4798 
4799 	if (dev->flags & IFF_UP &&
4800 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4801 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4802 }
4803 
4804 /**
4805  *	dev_change_flags - change device settings
4806  *	@dev: device
4807  *	@flags: device state flags
4808  *
4809  *	Change settings on device based state flags. The flags are
4810  *	in the userspace exported format.
4811  */
4812 int dev_change_flags(struct net_device *dev, unsigned flags)
4813 {
4814 	int ret, changes;
4815 	int old_flags = dev->flags;
4816 
4817 	ret = __dev_change_flags(dev, flags);
4818 	if (ret < 0)
4819 		return ret;
4820 
4821 	changes = old_flags ^ dev->flags;
4822 	if (changes)
4823 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4824 
4825 	__dev_notify_flags(dev, old_flags);
4826 	return ret;
4827 }
4828 EXPORT_SYMBOL(dev_change_flags);
4829 
4830 /**
4831  *	dev_set_mtu - Change maximum transfer unit
4832  *	@dev: device
4833  *	@new_mtu: new transfer unit
4834  *
4835  *	Change the maximum transfer size of the network device.
4836  */
4837 int dev_set_mtu(struct net_device *dev, int new_mtu)
4838 {
4839 	const struct net_device_ops *ops = dev->netdev_ops;
4840 	int err;
4841 
4842 	if (new_mtu == dev->mtu)
4843 		return 0;
4844 
4845 	/*	MTU must not be negative.	 */
4846 	if (new_mtu < 0)
4847 		return -EINVAL;
4848 
4849 	if (!netif_device_present(dev))
4850 		return -ENODEV;
4851 
4852 	err = 0;
4853 	if (ops->ndo_change_mtu)
4854 		err = ops->ndo_change_mtu(dev, new_mtu);
4855 	else
4856 		dev->mtu = new_mtu;
4857 
4858 	if (!err && dev->flags & IFF_UP)
4859 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4860 	return err;
4861 }
4862 EXPORT_SYMBOL(dev_set_mtu);
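
/*
 * Example (a sketch under rtnl_lock(), names assumed): a stacked
 * device propagating an MTU change to its lower device.
 *
 *	err = dev_set_mtu(lower_dev, new_mtu);
 *	if (err)
 *		netdev_warn(dev, "failed to set MTU %d on %s\n",
 *			    new_mtu, lower_dev->name);
 */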
4863 
4864 /**
4865  *	dev_set_group - Change group this device belongs to
4866  *	@dev: device
4867  *	@new_group: group this device should belong to
4868  */
4869 void dev_set_group(struct net_device *dev, int new_group)
4870 {
4871 	dev->group = new_group;
4872 }
4873 EXPORT_SYMBOL(dev_set_group);
4874 
4875 /**
4876  *	dev_set_mac_address - Change Media Access Control Address
4877  *	@dev: device
4878  *	@sa: new address
4879  *
4880  *	Change the hardware (MAC) address of the device
4881  */
4882 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4883 {
4884 	const struct net_device_ops *ops = dev->netdev_ops;
4885 	int err;
4886 
4887 	if (!ops->ndo_set_mac_address)
4888 		return -EOPNOTSUPP;
4889 	if (sa->sa_family != dev->type)
4890 		return -EINVAL;
4891 	if (!netif_device_present(dev))
4892 		return -ENODEV;
4893 	err = ops->ndo_set_mac_address(dev, sa);
4894 	if (!err)
4895 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4896 	return err;
4897 }
4898 EXPORT_SYMBOL(dev_set_mac_address);
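
/*
 * Example (hypothetical caller, under rtnl_lock()): the sockaddr
 * family must match dev->type, e.g. ARPHRD_ETHER for Ethernet.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *	err = dev_set_mac_address(dev, &sa);
 */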
4899 
4900 /*
4901  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4902  */
4903 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4904 {
4905 	int err;
4906 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4907 
4908 	if (!dev)
4909 		return -ENODEV;
4910 
4911 	switch (cmd) {
4912 	case SIOCGIFFLAGS:	/* Get interface flags */
4913 		ifr->ifr_flags = (short) dev_get_flags(dev);
4914 		return 0;
4915 
4916 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4917 				   (currently unused) */
4918 		ifr->ifr_metric = 0;
4919 		return 0;
4920 
4921 	case SIOCGIFMTU:	/* Get the MTU of a device */
4922 		ifr->ifr_mtu = dev->mtu;
4923 		return 0;
4924 
4925 	case SIOCGIFHWADDR:
4926 		if (!dev->addr_len)
4927 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4928 		else
4929 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4930 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4931 		ifr->ifr_hwaddr.sa_family = dev->type;
4932 		return 0;
4933 
4934 	case SIOCGIFSLAVE:
4935 		err = -EINVAL;
4936 		break;
4937 
4938 	case SIOCGIFMAP:
4939 		ifr->ifr_map.mem_start = dev->mem_start;
4940 		ifr->ifr_map.mem_end   = dev->mem_end;
4941 		ifr->ifr_map.base_addr = dev->base_addr;
4942 		ifr->ifr_map.irq       = dev->irq;
4943 		ifr->ifr_map.dma       = dev->dma;
4944 		ifr->ifr_map.port      = dev->if_port;
4945 		return 0;
4946 
4947 	case SIOCGIFINDEX:
4948 		ifr->ifr_ifindex = dev->ifindex;
4949 		return 0;
4950 
4951 	case SIOCGIFTXQLEN:
4952 		ifr->ifr_qlen = dev->tx_queue_len;
4953 		return 0;
4954 
4955 	default:
4956 		/* dev_ioctl() should ensure this case
4957 		 * is never reached
4958 		 */
4959 		WARN_ON(1);
4960 		err = -ENOTTY;
4961 		break;
4962 
4963 	}
4964 	return err;
4965 }
4966 
4967 /*
4968  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4969  */
4970 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4971 {
4972 	int err;
4973 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4974 	const struct net_device_ops *ops;
4975 
4976 	if (!dev)
4977 		return -ENODEV;
4978 
4979 	ops = dev->netdev_ops;
4980 
4981 	switch (cmd) {
4982 	case SIOCSIFFLAGS:	/* Set interface flags */
4983 		return dev_change_flags(dev, ifr->ifr_flags);
4984 
4985 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4986 				   (currently unused) */
4987 		return -EOPNOTSUPP;
4988 
4989 	case SIOCSIFMTU:	/* Set the MTU of a device */
4990 		return dev_set_mtu(dev, ifr->ifr_mtu);
4991 
4992 	case SIOCSIFHWADDR:
4993 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4994 
4995 	case SIOCSIFHWBROADCAST:
4996 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4997 			return -EINVAL;
4998 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4999 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5000 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5001 		return 0;
5002 
5003 	case SIOCSIFMAP:
5004 		if (ops->ndo_set_config) {
5005 			if (!netif_device_present(dev))
5006 				return -ENODEV;
5007 			return ops->ndo_set_config(dev, &ifr->ifr_map);
5008 		}
5009 		return -EOPNOTSUPP;
5010 
5011 	case SIOCADDMULTI:
5012 		if (!ops->ndo_set_rx_mode ||
5013 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5014 			return -EINVAL;
5015 		if (!netif_device_present(dev))
5016 			return -ENODEV;
5017 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5018 
5019 	case SIOCDELMULTI:
5020 		if (!ops->ndo_set_rx_mode ||
5021 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5022 			return -EINVAL;
5023 		if (!netif_device_present(dev))
5024 			return -ENODEV;
5025 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5026 
5027 	case SIOCSIFTXQLEN:
5028 		if (ifr->ifr_qlen < 0)
5029 			return -EINVAL;
5030 		dev->tx_queue_len = ifr->ifr_qlen;
5031 		return 0;
5032 
5033 	case SIOCSIFNAME:
5034 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5035 		return dev_change_name(dev, ifr->ifr_newname);
5036 
5037 	case SIOCSHWTSTAMP:
5038 		err = net_hwtstamp_validate(ifr);
5039 		if (err)
5040 			return err;
5041 		/* fall through */
5042 
5043 	/*
5044 	 *	Unknown or private ioctl
5045 	 */
5046 	default:
5047 		if ((cmd >= SIOCDEVPRIVATE &&
5048 		    cmd <= SIOCDEVPRIVATE + 15) ||
5049 		    cmd == SIOCBONDENSLAVE ||
5050 		    cmd == SIOCBONDRELEASE ||
5051 		    cmd == SIOCBONDSETHWADDR ||
5052 		    cmd == SIOCBONDSLAVEINFOQUERY ||
5053 		    cmd == SIOCBONDINFOQUERY ||
5054 		    cmd == SIOCBONDCHANGEACTIVE ||
5055 		    cmd == SIOCGMIIPHY ||
5056 		    cmd == SIOCGMIIREG ||
5057 		    cmd == SIOCSMIIREG ||
5058 		    cmd == SIOCBRADDIF ||
5059 		    cmd == SIOCBRDELIF ||
5060 		    cmd == SIOCSHWTSTAMP ||
5061 		    cmd == SIOCWANDEV) {
5062 			err = -EOPNOTSUPP;
5063 			if (ops->ndo_do_ioctl) {
5064 				if (netif_device_present(dev))
5065 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
5066 				else
5067 					err = -ENODEV;
5068 			}
5069 		} else
5070 			err = -EINVAL;
5071 
5072 	}
5073 	return err;
5074 }
5075 
5076 /*
5077  *	This function handles all "interface"-type I/O control requests. The actual
5078  *	'doing' part of this is dev_ifsioc above.
5079  */
5080 
5081 /**
5082  *	dev_ioctl	-	network device ioctl
5083  *	@net: the applicable net namespace
5084  *	@cmd: command to issue
5085  *	@arg: pointer to a struct ifreq in user space
5086  *
5087  *	Issue ioctl functions to devices. This is normally called by the
5088  *	user space syscall interfaces but can sometimes be useful for
5089  *	other purposes. The return value is the return from the syscall if
5090  *	positive or a negative errno code on error.
5091  */
5092 
5093 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5094 {
5095 	struct ifreq ifr;
5096 	int ret;
5097 	char *colon;
5098 
5099 	/* One special case: SIOCGIFCONF takes an ifconf argument
5100 	   and requires a shared lock, because it sleeps while writing
5101 	   to user space.
5102 	 */
5103 
5104 	if (cmd == SIOCGIFCONF) {
5105 		rtnl_lock();
5106 		ret = dev_ifconf(net, (char __user *) arg);
5107 		rtnl_unlock();
5108 		return ret;
5109 	}
5110 	if (cmd == SIOCGIFNAME)
5111 		return dev_ifname(net, (struct ifreq __user *)arg);
5112 
5113 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5114 		return -EFAULT;
5115 
5116 	ifr.ifr_name[IFNAMSIZ-1] = 0;
5117 
5118 	colon = strchr(ifr.ifr_name, ':');
5119 	if (colon)
5120 		*colon = 0;
5121 
5122 	/*
5123 	 *	See which interface the caller is talking about.
5124 	 */
5125 
5126 	switch (cmd) {
5127 	/*
5128 	 *	These ioctl calls:
5129 	 *	- can be done by all.
5130 	 *	- are atomic and do not require locking.
5131 	 *	- return a value
5132 	 */
5133 	case SIOCGIFFLAGS:
5134 	case SIOCGIFMETRIC:
5135 	case SIOCGIFMTU:
5136 	case SIOCGIFHWADDR:
5137 	case SIOCGIFSLAVE:
5138 	case SIOCGIFMAP:
5139 	case SIOCGIFINDEX:
5140 	case SIOCGIFTXQLEN:
5141 		dev_load(net, ifr.ifr_name);
5142 		rcu_read_lock();
5143 		ret = dev_ifsioc_locked(net, &ifr, cmd);
5144 		rcu_read_unlock();
5145 		if (!ret) {
5146 			if (colon)
5147 				*colon = ':';
5148 			if (copy_to_user(arg, &ifr,
5149 					 sizeof(struct ifreq)))
5150 				ret = -EFAULT;
5151 		}
5152 		return ret;
5153 
5154 	case SIOCETHTOOL:
5155 		dev_load(net, ifr.ifr_name);
5156 		rtnl_lock();
5157 		ret = dev_ethtool(net, &ifr);
5158 		rtnl_unlock();
5159 		if (!ret) {
5160 			if (colon)
5161 				*colon = ':';
5162 			if (copy_to_user(arg, &ifr,
5163 					 sizeof(struct ifreq)))
5164 				ret = -EFAULT;
5165 		}
5166 		return ret;
5167 
5168 	/*
5169 	 *	These ioctl calls:
5170 	 *	- require superuser power.
5171 	 *	- require strict serialization.
5172 	 *	- return a value
5173 	 */
5174 	case SIOCGMIIPHY:
5175 	case SIOCGMIIREG:
5176 	case SIOCSIFNAME:
5177 		if (!capable(CAP_NET_ADMIN))
5178 			return -EPERM;
5179 		dev_load(net, ifr.ifr_name);
5180 		rtnl_lock();
5181 		ret = dev_ifsioc(net, &ifr, cmd);
5182 		rtnl_unlock();
5183 		if (!ret) {
5184 			if (colon)
5185 				*colon = ':';
5186 			if (copy_to_user(arg, &ifr,
5187 					 sizeof(struct ifreq)))
5188 				ret = -EFAULT;
5189 		}
5190 		return ret;
5191 
5192 	/*
5193 	 *	These ioctl calls:
5194 	 *	- require superuser power.
5195 	 *	- require strict serialization.
5196 	 *	- do not return a value
5197 	 */
5198 	case SIOCSIFFLAGS:
5199 	case SIOCSIFMETRIC:
5200 	case SIOCSIFMTU:
5201 	case SIOCSIFMAP:
5202 	case SIOCSIFHWADDR:
5203 	case SIOCSIFSLAVE:
5204 	case SIOCADDMULTI:
5205 	case SIOCDELMULTI:
5206 	case SIOCSIFHWBROADCAST:
5207 	case SIOCSIFTXQLEN:
5208 	case SIOCSMIIREG:
5209 	case SIOCBONDENSLAVE:
5210 	case SIOCBONDRELEASE:
5211 	case SIOCBONDSETHWADDR:
5212 	case SIOCBONDCHANGEACTIVE:
5213 	case SIOCBRADDIF:
5214 	case SIOCBRDELIF:
5215 	case SIOCSHWTSTAMP:
5216 		if (!capable(CAP_NET_ADMIN))
5217 			return -EPERM;
5218 		/* fall through */
5219 	case SIOCBONDSLAVEINFOQUERY:
5220 	case SIOCBONDINFOQUERY:
5221 		dev_load(net, ifr.ifr_name);
5222 		rtnl_lock();
5223 		ret = dev_ifsioc(net, &ifr, cmd);
5224 		rtnl_unlock();
5225 		return ret;
5226 
5227 	case SIOCGIFMEM:
5228 		/* Get the per device memory space. We can add this but
5229 		 * currently do not support it */
5230 	case SIOCSIFMEM:
5231 		/* Set the per device memory buffer space.
5232 		 * Not applicable in our case */
5233 	case SIOCSIFLINK:
5234 		return -ENOTTY;
5235 
5236 	/*
5237 	 *	Unknown or private ioctl.
5238 	 */
5239 	default:
5240 		if (cmd == SIOCWANDEV ||
5241 		    (cmd >= SIOCDEVPRIVATE &&
5242 		     cmd <= SIOCDEVPRIVATE + 15)) {
5243 			dev_load(net, ifr.ifr_name);
5244 			rtnl_lock();
5245 			ret = dev_ifsioc(net, &ifr, cmd);
5246 			rtnl_unlock();
5247 			if (!ret && copy_to_user(arg, &ifr,
5248 						 sizeof(struct ifreq)))
5249 				ret = -EFAULT;
5250 			return ret;
5251 		}
5252 		/* Take care of Wireless Extensions */
5253 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5254 			return wext_handle_ioctl(net, &ifr, cmd, arg);
5255 		return -ENOTTY;
5256 	}
5257 }
5258 
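/*
 * Example (user-space counterpart, for illustration only): the
 * requests handled above arrive through the ioctl() of any socket,
 * e.g. an AF_INET datagram socket.
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("MTU of %s is %d\n", ifr.ifr_name, ifr.ifr_mtu);
 */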
5259 
5260 /**
5261  *	dev_new_index	-	allocate an ifindex
5262  *	@net: the applicable net namespace
5263  *
5264  *	Returns a suitable unique value for a new device interface
5265  *	number.  The caller must hold the rtnl semaphore or the
5266  *	dev_base_lock to be sure it remains unique.
5267  */
5268 static int dev_new_index(struct net *net)
5269 {
5270 	static int ifindex;
5271 	for (;;) {
5272 		if (++ifindex <= 0)
5273 			ifindex = 1;
5274 		if (!__dev_get_by_index(net, ifindex))
5275 			return ifindex;
5276 	}
5277 }
5278 
5279 /* Delayed registration/unregistration */
5280 static LIST_HEAD(net_todo_list);
5281 
5282 static void net_set_todo(struct net_device *dev)
5283 {
5284 	list_add_tail(&dev->todo_list, &net_todo_list);
5285 }
5286 
5287 static void rollback_registered_many(struct list_head *head)
5288 {
5289 	struct net_device *dev, *tmp;
5290 
5291 	BUG_ON(dev_boot_phase);
5292 	ASSERT_RTNL();
5293 
5294 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5295 		/* Some devices call this without ever having been
5296 		 * registered, to unwind a failed initialization. Remove
5297 		 * those devices and proceed with the remaining ones.
5298 		 */
5299 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5300 			pr_debug("unregister_netdevice: device %s/%p never "
5301 				 "was registered\n", dev->name, dev);
5302 
5303 			WARN_ON(1);
5304 			list_del(&dev->unreg_list);
5305 			continue;
5306 		}
5307 		dev->dismantle = true;
5308 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5309 	}
5310 
5311 	/* If device is running, close it first. */
5312 	dev_close_many(head);
5313 
5314 	list_for_each_entry(dev, head, unreg_list) {
5315 		/* And unlink it from device chain. */
5316 		unlist_netdevice(dev);
5317 
5318 		dev->reg_state = NETREG_UNREGISTERING;
5319 	}
5320 
5321 	synchronize_net();
5322 
5323 	list_for_each_entry(dev, head, unreg_list) {
5324 		/* Shutdown queueing discipline. */
5325 		dev_shutdown(dev);
5326 
5327 
5328 		/* Notify protocols that we are about to destroy
5329 		   this device; they should clean up all of their state.
5330 		*/
5331 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5332 
5333 		if (!dev->rtnl_link_ops ||
5334 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5335 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5336 
5337 		/*
5338 		 *	Flush the unicast and multicast chains
5339 		 */
5340 		dev_uc_flush(dev);
5341 		dev_mc_flush(dev);
5342 
5343 		if (dev->netdev_ops->ndo_uninit)
5344 			dev->netdev_ops->ndo_uninit(dev);
5345 
5346 		/* Notifier chain MUST detach us from master device. */
5347 		WARN_ON(dev->master);
5348 
5349 		/* Remove entries from kobject tree */
5350 		netdev_unregister_kobject(dev);
5351 	}
5352 
5353 	/* Process any work delayed until the end of the batch */
5354 	dev = list_first_entry(head, struct net_device, unreg_list);
5355 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5356 
5357 	synchronize_net();
5358 
5359 	list_for_each_entry(dev, head, unreg_list)
5360 		dev_put(dev);
5361 }
5362 
5363 static void rollback_registered(struct net_device *dev)
5364 {
5365 	LIST_HEAD(single);
5366 
5367 	list_add(&dev->unreg_list, &single);
5368 	rollback_registered_many(&single);
5369 	list_del(&single);
5370 }
5371 
5372 static u32 netdev_fix_features(struct net_device *dev, u32 features)
5373 {
5374 	/* Fix illegal checksum combinations */
5375 	if ((features & NETIF_F_HW_CSUM) &&
5376 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5377 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5378 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5379 	}
5380 
5381 	if ((features & NETIF_F_NO_CSUM) &&
5382 	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5383 		netdev_warn(dev, "mixed no checksumming and other settings.\n");
5384 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5385 	}
5386 
5387 	/* Fix illegal SG+CSUM combinations. */
5388 	if ((features & NETIF_F_SG) &&
5389 	    !(features & NETIF_F_ALL_CSUM)) {
5390 		netdev_dbg(dev,
5391 			"Dropping NETIF_F_SG since no checksum feature.\n");
5392 		features &= ~NETIF_F_SG;
5393 	}
5394 
5395 	/* TSO requires that SG is present as well. */
5396 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5397 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5398 		features &= ~NETIF_F_ALL_TSO;
5399 	}
5400 
5401 	/* TSO ECN requires that TSO is present as well. */
5402 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5403 		features &= ~NETIF_F_TSO_ECN;
5404 
5405 	/* Software GSO depends on SG. */
5406 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5407 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5408 		features &= ~NETIF_F_GSO;
5409 	}
5410 
5411 	/* UFO needs SG and checksumming */
5412 	if (features & NETIF_F_UFO) {
5413 		/* maybe split UFO into V4 and V6? */
5414 		if (!((features & NETIF_F_GEN_CSUM) ||
5415 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5416 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5417 			netdev_dbg(dev,
5418 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5419 			features &= ~NETIF_F_UFO;
5420 		}
5421 
5422 		if (!(features & NETIF_F_SG)) {
5423 			netdev_dbg(dev,
5424 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5425 			features &= ~NETIF_F_UFO;
5426 		}
5427 	}
5428 
5429 	return features;
5430 }
5431 
5432 int __netdev_update_features(struct net_device *dev)
5433 {
5434 	u32 features;
5435 	int err = 0;
5436 
5437 	ASSERT_RTNL();
5438 
5439 	features = netdev_get_wanted_features(dev);
5440 
5441 	if (dev->netdev_ops->ndo_fix_features)
5442 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5443 
5444 	/* driver might be less strict about feature dependencies */
5445 	features = netdev_fix_features(dev, features);
5446 
5447 	if (dev->features == features)
5448 		return 0;
5449 
5450 	netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5451 		dev->features, features);
5452 
5453 	if (dev->netdev_ops->ndo_set_features)
5454 		err = dev->netdev_ops->ndo_set_features(dev, features);
5455 
5456 	if (unlikely(err < 0)) {
5457 		netdev_err(dev,
5458 			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5459 			err, features, dev->features);
5460 		return -1;
5461 	}
5462 
5463 	if (!err)
5464 		dev->features = features;
5465 
5466 	return 1;
5467 }
5468 
5469 /**
5470  *	netdev_update_features - recalculate device features
5471  *	@dev: the device to check
5472  *
5473  *	Recalculate dev->features set and send notifications if it
5474  *	has changed. Should be called after driver or hardware dependent
5475  *	conditions might have changed that influence the features.
5476  */
5477 void netdev_update_features(struct net_device *dev)
5478 {
5479 	if (__netdev_update_features(dev))
5480 		netdev_features_change(dev);
5481 }
5482 EXPORT_SYMBOL(netdev_update_features);
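
/*
 * Example (hypothetical driver code, under rtnl_lock()): after a
 * runtime reconfiguration changes what the hardware can offload,
 * re-evaluate the feature set; example_fw_supports_tso() is an
 * assumed helper.
 *
 *	if (example_fw_supports_tso(priv))
 *		dev->hw_features |= NETIF_F_TSO;
 *	else
 *		dev->hw_features &= ~NETIF_F_TSO;
 *	netdev_update_features(dev);
 */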
5483 
5484 /**
5485  *	netdev_change_features - recalculate device features
5486  *	@dev: the device to check
5487  *
5488  *	Recalculate dev->features set and send notifications even
5489  *	if they have not changed. Should be called instead of
5490  *	netdev_update_features() if also dev->vlan_features might
5491  *	have changed to allow the changes to be propagated to stacked
5492  *	VLAN devices.
5493  */
5494 void netdev_change_features(struct net_device *dev)
5495 {
5496 	__netdev_update_features(dev);
5497 	netdev_features_change(dev);
5498 }
5499 EXPORT_SYMBOL(netdev_change_features);
5500 
5501 /**
5502  *	netif_stacked_transfer_operstate -	transfer operstate
5503  *	@rootdev: the root or lower level device to transfer state from
5504  *	@dev: the device to transfer operstate to
5505  *
5506  *	Transfer operational state from root to device. This is normally
5507  *	called when a stacking relationship exists between the root
5508  *	device and the device (a leaf device).
5509  */
5510 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5511 					struct net_device *dev)
5512 {
5513 	if (rootdev->operstate == IF_OPER_DORMANT)
5514 		netif_dormant_on(dev);
5515 	else
5516 		netif_dormant_off(dev);
5517 
5518 	if (netif_carrier_ok(rootdev)) {
5519 		if (!netif_carrier_ok(dev))
5520 			netif_carrier_on(dev);
5521 	} else {
5522 		if (netif_carrier_ok(dev))
5523 			netif_carrier_off(dev);
5524 	}
5525 }
5526 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5527 
5528 #ifdef CONFIG_RPS
5529 static int netif_alloc_rx_queues(struct net_device *dev)
5530 {
5531 	unsigned int i, count = dev->num_rx_queues;
5532 	struct netdev_rx_queue *rx;
5533 
5534 	BUG_ON(count < 1);
5535 
5536 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5537 	if (!rx) {
5538 		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5539 		return -ENOMEM;
5540 	}
5541 	dev->_rx = rx;
5542 
5543 	for (i = 0; i < count; i++)
5544 		rx[i].dev = dev;
5545 	return 0;
5546 }
5547 #endif
5548 
5549 static void netdev_init_one_queue(struct net_device *dev,
5550 				  struct netdev_queue *queue, void *_unused)
5551 {
5552 	/* Initialize queue lock */
5553 	spin_lock_init(&queue->_xmit_lock);
5554 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5555 	queue->xmit_lock_owner = -1;
5556 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5557 	queue->dev = dev;
5558 }
5559 
5560 static int netif_alloc_netdev_queues(struct net_device *dev)
5561 {
5562 	unsigned int count = dev->num_tx_queues;
5563 	struct netdev_queue *tx;
5564 
5565 	BUG_ON(count < 1);
5566 
5567 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5568 	if (!tx) {
5569 		pr_err("netdev: Unable to allocate %u tx queues.\n",
5570 		       count);
5571 		return -ENOMEM;
5572 	}
5573 	dev->_tx = tx;
5574 
5575 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5576 	spin_lock_init(&dev->tx_global_lock);
5577 
5578 	return 0;
5579 }
5580 
5581 /**
5582  *	register_netdevice	- register a network device
5583  *	@dev: device to register
5584  *
5585  *	Take a completed network device structure and add it to the kernel
5586  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5587  *	chain. 0 is returned on success. A negative errno code is returned
5588  *	on a failure to set up the device, or if the name is a duplicate.
5589  *
5590  *	Callers must hold the rtnl semaphore. You may want
5591  *	register_netdev() instead of this.
5592  *
5593  *	BUGS:
5594  *	The locking appears insufficient to guarantee two parallel registers
5595  *	will not get the same name.
5596  */
5597 
5598 int register_netdevice(struct net_device *dev)
5599 {
5600 	int ret;
5601 	struct net *net = dev_net(dev);
5602 
5603 	BUG_ON(dev_boot_phase);
5604 	ASSERT_RTNL();
5605 
5606 	might_sleep();
5607 
5608 	/* When net_devices are persistent, this will be fatal. */
5609 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5610 	BUG_ON(!net);
5611 
5612 	spin_lock_init(&dev->addr_list_lock);
5613 	netdev_set_addr_lockdep_class(dev);
5614 
5615 	dev->iflink = -1;
5616 
5617 	ret = dev_get_valid_name(dev, dev->name);
5618 	if (ret < 0)
5619 		goto out;
5620 
5621 	/* Init, if this function is available */
5622 	if (dev->netdev_ops->ndo_init) {
5623 		ret = dev->netdev_ops->ndo_init(dev);
5624 		if (ret) {
5625 			if (ret > 0)
5626 				ret = -EIO;
5627 			goto out;
5628 		}
5629 	}
5630 
5631 	dev->ifindex = dev_new_index(net);
5632 	if (dev->iflink == -1)
5633 		dev->iflink = dev->ifindex;
5634 
5635 	/* Transfer changeable features to wanted_features and enable
5636 	 * software offloads (GSO and GRO).
5637 	 */
5638 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5639 	dev->features |= NETIF_F_SOFT_FEATURES;
5640 	dev->wanted_features = dev->features & dev->hw_features;
5641 
5642 	/* Turn on no cache copy if HW is doing checksum */
5643 	dev->hw_features |= NETIF_F_NOCACHE_COPY;
5644 	if ((dev->features & NETIF_F_ALL_CSUM) &&
5645 	    !(dev->features & NETIF_F_NO_CSUM)) {
5646 		dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5647 		dev->features |= NETIF_F_NOCACHE_COPY;
5648 	}
5649 
5650 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5651 	 */
5652 	dev->vlan_features |= NETIF_F_HIGHDMA;
5653 
5654 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5655 	ret = notifier_to_errno(ret);
5656 	if (ret)
5657 		goto err_uninit;
5658 
5659 	ret = netdev_register_kobject(dev);
5660 	if (ret)
5661 		goto err_uninit;
5662 	dev->reg_state = NETREG_REGISTERED;
5663 
5664 	__netdev_update_features(dev);
5665 
5666 	/*
5667 	 *	Default initial state at registry is that the
5668 	 *	device is present.
5669 	 */
5670 
5671 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5672 
5673 	dev_init_scheduler(dev);
5674 	dev_hold(dev);
5675 	list_netdevice(dev);
5676 
5677 	/* Notify protocols that a new device has appeared. */
5678 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5679 	ret = notifier_to_errno(ret);
5680 	if (ret) {
5681 		rollback_registered(dev);
5682 		dev->reg_state = NETREG_UNREGISTERED;
5683 	}
5684 	/*
5685 	 *	Prevent userspace races by waiting until the network
5686 	 *	device is fully setup before sending notifications.
5687 	 */
5688 	if (!dev->rtnl_link_ops ||
5689 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5690 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5691 
5692 out:
5693 	return ret;
5694 
5695 err_uninit:
5696 	if (dev->netdev_ops->ndo_uninit)
5697 		dev->netdev_ops->ndo_uninit(dev);
5698 	goto out;
5699 }
5700 EXPORT_SYMBOL(register_netdevice);
5701 
5702 /**
5703  *	init_dummy_netdev	- init a dummy network device for NAPI
5704  *	@dev: device to init
5705  *
5706  *	This takes a network device structure and initializes the minimum
5707  *	number of fields so it can be used to schedule NAPI polls without
5708  *	registering a full blown interface. This is to be used by drivers
5709  *	that need to tie several hardware interfaces to a single NAPI
5710  *	poll scheduler due to HW limitations.
5711  */
5712 int init_dummy_netdev(struct net_device *dev)
5713 {
5714 	/* Clear everything. Note we don't initialize spinlocks
5715 	 * as they aren't supposed to be taken by any of the
5716 	 * NAPI code, and this dummy netdev is supposed to be
5717 	 * used only for NAPI polls.
5718 	 */
5719 	memset(dev, 0, sizeof(struct net_device));
5720 
5721 	/* make sure we BUG if trying to hit standard
5722 	 * register/unregister code path
5723 	 */
5724 	dev->reg_state = NETREG_DUMMY;
5725 
5726 	/* NAPI wants this */
5727 	INIT_LIST_HEAD(&dev->napi_list);
5728 
5729 	/* a dummy interface is started by default */
5730 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5731 	set_bit(__LINK_STATE_START, &dev->state);
5732 
5733 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
5734 	 * because users of this 'device' don't need to change
5735 	 * its refcount.
5736 	 */
5737 
5738 	return 0;
5739 }
5740 EXPORT_SYMBOL_GPL(init_dummy_netdev);
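
/*
 * Example (a sketch): a driver whose hardware raises one interrupt
 * for several interfaces can hang a shared NAPI context off a dummy
 * netdev embedded in its private structure.
 *
 *	init_dummy_netdev(&priv->dummy_dev);
 *	netif_napi_add(&priv->dummy_dev, &priv->napi, example_poll, 64);
 *	napi_enable(&priv->napi);
 */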
5741 
5742 
5743 /**
5744  *	register_netdev	- register a network device
5745  *	@dev: device to register
5746  *
5747  *	Take a completed network device structure and add it to the kernel
5748  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5749  *	chain. 0 is returned on success. A negative errno code is returned
5750  *	on a failure to set up the device, or if the name is a duplicate.
5751  *
5752  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5753  *	and expands the device name if you passed a format string to
5754  *	alloc_netdev.
5755  */
5756 int register_netdev(struct net_device *dev)
5757 {
5758 	int err;
5759 
5760 	rtnl_lock();
5761 	err = register_netdevice(dev);
5762 	rtnl_unlock();
5763 	return err;
5764 }
5765 EXPORT_SYMBOL(register_netdev);
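
/*
 * Example (typical probe-path usage, names assumed): allocate, fill
 * in, then register; on failure the caller must free the device.
 *
 *	netdev = alloc_etherdev(sizeof(struct example_priv));
 *	if (!netdev)
 *		return -ENOMEM;
 *	netdev->netdev_ops = &example_netdev_ops;
 *	err = register_netdev(netdev);
 *	if (err) {
 *		free_netdev(netdev);
 *		return err;
 *	}
 */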
5766 
5767 int netdev_refcnt_read(const struct net_device *dev)
5768 {
5769 	int i, refcnt = 0;
5770 
5771 	for_each_possible_cpu(i)
5772 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5773 	return refcnt;
5774 }
5775 EXPORT_SYMBOL(netdev_refcnt_read);
5776 
5777 /*
5778  * netdev_wait_allrefs - wait until all references are gone.
5779  *
5780  * This is called when unregistering network devices.
5781  *
5782  * Any protocol or device that holds a reference should register
5783  * for netdevice notification, and cleanup and put back the
5784  * reference if they receive an UNREGISTER event.
5785  * We can get stuck here if buggy protocols don't correctly
5786  * call dev_put.
5787  */
5788 static void netdev_wait_allrefs(struct net_device *dev)
5789 {
5790 	unsigned long rebroadcast_time, warning_time;
5791 	int refcnt;
5792 
5793 	linkwatch_forget_dev(dev);
5794 
5795 	rebroadcast_time = warning_time = jiffies;
5796 	refcnt = netdev_refcnt_read(dev);
5797 
5798 	while (refcnt != 0) {
5799 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5800 			rtnl_lock();
5801 
5802 			/* Rebroadcast unregister notification */
5803 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5804 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5805 			 * should have already handled it the first time */
5806 
5807 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5808 				     &dev->state)) {
5809 				/* We must not have linkwatch events
5810 				 * pending on unregister. If this
5811 				 * happens, we simply run the queue
5812 				 * unscheduled, resulting in a noop
5813 				 * for this device.
5814 				 */
5815 				linkwatch_run_queue();
5816 			}
5817 
5818 			__rtnl_unlock();
5819 
5820 			rebroadcast_time = jiffies;
5821 		}
5822 
5823 		msleep(250);
5824 
5825 		refcnt = netdev_refcnt_read(dev);
5826 
5827 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5828 			printk(KERN_EMERG "unregister_netdevice: "
5829 			       "waiting for %s to become free. Usage "
5830 			       "count = %d\n",
5831 			       dev->name, refcnt);
5832 			warning_time = jiffies;
5833 		}
5834 	}
5835 }
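/* Example (illustrative sketch, not part of this file): a subsystem that
 * holds a long-lived device reference should release it from its
 * netdevice notifier, otherwise the loop above spins until the reference
 * is dropped.  my_dev is a hypothetical cached pointer taken with
 * dev_hold().
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UNREGISTER && dev == my_dev) {
 *			dev_put(my_dev);
 *			my_dev = NULL;
 *		}
 *		return NOTIFY_DONE;
 *	}
 */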
5836 
5837 /* The sequence is:
5838  *
5839  *	rtnl_lock();
5840  *	...
5841  *	register_netdevice(x1);
5842  *	register_netdevice(x2);
5843  *	...
5844  *	unregister_netdevice(y1);
5845  *	unregister_netdevice(y2);
5846  *      ...
5847  *	rtnl_unlock();
5848  *	free_netdev(y1);
5849  *	free_netdev(y2);
5850  *
5851  * We are invoked by rtnl_unlock().
5852  * This allows us to deal with problems:
5853  * 1) We can delete sysfs objects which invoke hotplug
5854  *    without deadlocking with linkwatch via keventd.
5855  * 2) Since we run with the RTNL semaphore not held, we can sleep
5856  *    safely in order to wait for the netdev refcnt to drop to zero.
5857  *
5858  * We must not return until all unregister events added during
5859  * the interval the lock was held have been completed.
5860  */
5861 void netdev_run_todo(void)
5862 {
5863 	struct list_head list;
5864 
5865 	/* Snapshot list, allow later requests */
5866 	list_replace_init(&net_todo_list, &list);
5867 
5868 	__rtnl_unlock();
5869 
5870 	/* Wait for rcu callbacks to finish before attempting to drain
5871 	 * the device list.  This usually avoids a 250ms wait.
5872 	 */
5873 	if (!list_empty(&list))
5874 		rcu_barrier();
5875 
5876 	while (!list_empty(&list)) {
5877 		struct net_device *dev
5878 			= list_first_entry(&list, struct net_device, todo_list);
5879 		list_del(&dev->todo_list);
5880 
5881 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5882 			printk(KERN_ERR "network todo '%s' but state %d\n",
5883 			       dev->name, dev->reg_state);
5884 			dump_stack();
5885 			continue;
5886 		}
5887 
5888 		dev->reg_state = NETREG_UNREGISTERED;
5889 
5890 		on_each_cpu(flush_backlog, dev, 1);
5891 
5892 		netdev_wait_allrefs(dev);
5893 
5894 		/* paranoia */
5895 		BUG_ON(netdev_refcnt_read(dev));
5896 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
5897 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5898 		WARN_ON(dev->dn_ptr);
5899 
5900 		if (dev->destructor)
5901 			dev->destructor(dev);
5902 
5903 		/* Free network device */
5904 		kobject_put(&dev->dev.kobj);
5905 	}
5906 }
5907 
5908 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5909  * fields in the same order, with only the type differing.
5910  */
5911 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5912 				    const struct net_device_stats *netdev_stats)
5913 {
5914 #if BITS_PER_LONG == 64
5915 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5916 	memcpy(stats64, netdev_stats, sizeof(*stats64));
5917 #else
5918 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5919 	const unsigned long *src = (const unsigned long *)netdev_stats;
5920 	u64 *dst = (u64 *)stats64;
5921 
5922 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5923 		     sizeof(*stats64) / sizeof(u64));
5924 	for (i = 0; i < n; i++)
5925 		dst[i] = src[i];
5926 #endif
5927 }
5928 
5929 /**
5930  *	dev_get_stats	- get network device statistics
5931  *	@dev: device to get statistics from
5932  *	@storage: place to store stats
5933  *
5934  *	Get network statistics from device. Return @storage.
5935  *	The device driver may provide its own method by setting
5936  *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5937  *	otherwise the internal statistics structure is used.
5938  */
5939 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5940 					struct rtnl_link_stats64 *storage)
5941 {
5942 	const struct net_device_ops *ops = dev->netdev_ops;
5943 
5944 	if (ops->ndo_get_stats64) {
5945 		memset(storage, 0, sizeof(*storage));
5946 		ops->ndo_get_stats64(dev, storage);
5947 	} else if (ops->ndo_get_stats) {
5948 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5949 	} else {
5950 		netdev_stats_to_stats64(storage, &dev->stats);
5951 	}
5952 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5953 	return storage;
5954 }
5955 EXPORT_SYMBOL(dev_get_stats);
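/* Example (illustrative sketch, not part of this file): a driver that
 * keeps 64-bit counters hooks ndo_get_stats64; dev_get_stats() zeroes
 * @storage beforehand, so only maintained fields need to be filled in.
 * struct my_priv and its counters are hypothetical.
 *
 *	static struct rtnl_link_stats64 *
 *	my_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s)
 *	{
 *		struct my_priv *p = netdev_priv(dev);
 *
 *		s->rx_packets = p->rx_packets;
 *		s->rx_bytes   = p->rx_bytes;
 *		s->tx_packets = p->tx_packets;
 *		s->tx_bytes   = p->tx_bytes;
 *		return s;
 *	}
 */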
5956 
5957 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5958 {
5959 	struct netdev_queue *queue = dev_ingress_queue(dev);
5960 
5961 #ifdef CONFIG_NET_CLS_ACT
5962 	if (queue)
5963 		return queue;
5964 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5965 	if (!queue)
5966 		return NULL;
5967 	netdev_init_one_queue(dev, queue, NULL);
5968 	queue->qdisc = &noop_qdisc;
5969 	queue->qdisc_sleeping = &noop_qdisc;
5970 	rcu_assign_pointer(dev->ingress_queue, queue);
5971 #endif
5972 	return queue;
5973 }
5974 
5975 /**
5976  *	alloc_netdev_mqs - allocate network device
5977  *	@sizeof_priv:	size of private data to allocate space for
5978  *	@name:		device name format string
5979  *	@setup:		callback to initialize device
5980  *	@txqs:		the number of TX subqueues to allocate
5981  *	@rxqs:		the number of RX subqueues to allocate
5982  *
5983  *	Allocates a struct net_device with private data area for driver use
5984  *	and performs basic initialization.  Also allocates subqueue structs
5985  *	for each queue on the device.
5986  */
5987 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5988 		void (*setup)(struct net_device *),
5989 		unsigned int txqs, unsigned int rxqs)
5990 {
5991 	struct net_device *dev;
5992 	size_t alloc_size;
5993 	struct net_device *p;
5994 
5995 	BUG_ON(strlen(name) >= sizeof(dev->name));
5996 
5997 	if (txqs < 1) {
5998 		pr_err("alloc_netdev: Unable to allocate device "
5999 		       "with zero TX queues.\n");
6000 		return NULL;
6001 	}
6002 
6003 #ifdef CONFIG_RPS
6004 	if (rxqs < 1) {
6005 		pr_err("alloc_netdev: Unable to allocate device "
6006 		       "with zero RX queues.\n");
6007 		return NULL;
6008 	}
6009 #endif
6010 
6011 	alloc_size = sizeof(struct net_device);
6012 	if (sizeof_priv) {
6013 		/* ensure 32-byte alignment of private area */
6014 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6015 		alloc_size += sizeof_priv;
6016 	}
6017 	/* ensure 32-byte alignment of whole construct */
6018 	alloc_size += NETDEV_ALIGN - 1;
6019 
6020 	p = kzalloc(alloc_size, GFP_KERNEL);
6021 	if (!p) {
6022 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
6023 		return NULL;
6024 	}
6025 
6026 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6027 	dev->padded = (char *)dev - (char *)p;
6028 
6029 	dev->pcpu_refcnt = alloc_percpu(int);
6030 	if (!dev->pcpu_refcnt)
6031 		goto free_p;
6032 
6033 	if (dev_addr_init(dev))
6034 		goto free_pcpu;
6035 
6036 	dev_mc_init(dev);
6037 	dev_uc_init(dev);
6038 
6039 	dev_net_set(dev, &init_net);
6040 
6041 	dev->gso_max_size = GSO_MAX_SIZE;
6042 
6043 	INIT_LIST_HEAD(&dev->napi_list);
6044 	INIT_LIST_HEAD(&dev->unreg_list);
6045 	INIT_LIST_HEAD(&dev->link_watch_list);
6046 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
6047 	setup(dev);
6048 
6049 	dev->num_tx_queues = txqs;
6050 	dev->real_num_tx_queues = txqs;
6051 	if (netif_alloc_netdev_queues(dev))
6052 		goto free_all;
6053 
6054 #ifdef CONFIG_RPS
6055 	dev->num_rx_queues = rxqs;
6056 	dev->real_num_rx_queues = rxqs;
6057 	if (netif_alloc_rx_queues(dev))
6058 		goto free_all;
6059 #endif
6060 
6061 	strcpy(dev->name, name);
6062 	dev->group = INIT_NETDEV_GROUP;
6063 	return dev;
6064 
6065 free_all:
6066 	free_netdev(dev);
6067 	return NULL;
6068 
6069 free_pcpu:
6070 	free_percpu(dev->pcpu_refcnt);
6071 	kfree(dev->_tx);
6072 #ifdef CONFIG_RPS
6073 	kfree(dev->_rx);
6074 #endif
6075 
6076 free_p:
6077 	kfree(p);
6078 	return NULL;
6079 }
6080 EXPORT_SYMBOL(alloc_netdev_mqs);
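/* Example (illustrative sketch, not part of this file): a multiqueue
 * Ethernet driver usually reaches this through a wrapper that supplies
 * ether_setup() as the @setup callback; struct my_priv is hypothetical.
 *
 *	dev = alloc_netdev_mqs(sizeof(struct my_priv), "eth%d",
 *			       ether_setup, 8, 8);
 *
 * The "eth%d" format string is expanded to a free name when the device
 * is registered; the eight TX and RX queue structs are allocated here.
 */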
6081 
6082 /**
6083  *	free_netdev - free network device
6084  *	@dev: device
6085  *
6086  *	This function does the last stage of destroying an allocated device
6087  * 	interface. The reference to the device object is released.
6088  *	If this is the last reference then it will be freed.
6089  */
6090 void free_netdev(struct net_device *dev)
6091 {
6092 	struct napi_struct *p, *n;
6093 
6094 	release_net(dev_net(dev));
6095 
6096 	kfree(dev->_tx);
6097 #ifdef CONFIG_RPS
6098 	kfree(dev->_rx);
6099 #endif
6100 
6101 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6102 
6103 	/* Flush device addresses */
6104 	dev_addr_flush(dev);
6105 
6106 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6107 		netif_napi_del(p);
6108 
6109 	free_percpu(dev->pcpu_refcnt);
6110 	dev->pcpu_refcnt = NULL;
6111 
6112 	/*  Compatibility with error handling in drivers */
6113 	if (dev->reg_state == NETREG_UNINITIALIZED) {
6114 		kfree((char *)dev - dev->padded);
6115 		return;
6116 	}
6117 
6118 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6119 	dev->reg_state = NETREG_RELEASED;
6120 
6121 	/* will free via device release */
6122 	put_device(&dev->dev);
6123 }
6124 EXPORT_SYMBOL(free_netdev);
6125 
6126 /**
6127  *	synchronize_net -  Synchronize with packet receive processing
6128  *
6129  *	Wait for packets currently being received to be done.
6130  *	Does not block later packets from starting.
6131  */
6132 void synchronize_net(void)
6133 {
6134 	might_sleep();
6135 	if (rtnl_is_locked())
6136 		synchronize_rcu_expedited();
6137 	else
6138 		synchronize_rcu();
6139 }
6140 EXPORT_SYMBOL(synchronize_net);
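/* Example (illustrative sketch, not part of this file): the usual pattern
 * is unpublish-then-wait before freeing state visible to the packet path.
 * entry is a hypothetical RCU-protected object.
 *
 *	list_del_rcu(&entry->list);	(new readers can no longer find it)
 *	synchronize_net();		(existing packet-path readers finish)
 *	kfree(entry);
 */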
6141 
6142 /**
6143  *	unregister_netdevice_queue - remove device from the kernel
6144  *	@dev: device
6145  *	@head: list
6146  *
6147  *	This function shuts down a device interface and removes it
6148  *	from the kernel tables.
6149  *	If @head is not NULL, the device is queued to be unregistered later.
6150  *
6151  *	Callers must hold the rtnl semaphore.  You may want
6152  *	unregister_netdev() instead of this.
6153  */
6154 
6155 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6156 {
6157 	ASSERT_RTNL();
6158 
6159 	if (head) {
6160 		list_move_tail(&dev->unreg_list, head);
6161 	} else {
6162 		rollback_registered(dev);
6163 		/* Finish processing unregister after unlock */
6164 		net_set_todo(dev);
6165 	}
6166 }
6167 EXPORT_SYMBOL(unregister_netdevice_queue);
6168 
6169 /**
6170  *	unregister_netdevice_many - unregister many devices
6171  *	@head: list of devices
6172  */
6173 void unregister_netdevice_many(struct list_head *head)
6174 {
6175 	struct net_device *dev;
6176 
6177 	if (!list_empty(head)) {
6178 		rollback_registered_many(head);
6179 		list_for_each_entry(dev, head, unreg_list)
6180 			net_set_todo(dev);
6181 	}
6182 }
6183 EXPORT_SYMBOL(unregister_netdevice_many);
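/* Example (illustrative sketch, not part of this file): queueing devices
 * and unregistering them in one batch amortizes the RCU grace periods and
 * notifier traffic over all of them.  should_go() is hypothetical.
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	for_each_netdev(net, dev)
 *		if (should_go(dev))
 *			unregister_netdevice_queue(dev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */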
6184 
6185 /**
6186  *	unregister_netdev - remove device from the kernel
6187  *	@dev: device
6188  *
6189  *	This function shuts down a device interface and removes it
6190  *	from the kernel tables.
6191  *
6192  *	This is just a wrapper for unregister_netdevice that takes
6193  *	the rtnl semaphore.  In general you want to use this and not
6194  *	unregister_netdevice.
6195  */
6196 void unregister_netdev(struct net_device *dev)
6197 {
6198 	rtnl_lock();
6199 	unregister_netdevice(dev);
6200 	rtnl_unlock();
6201 }
6202 EXPORT_SYMBOL(unregister_netdev);
6203 
6204 /**
6205  *	dev_change_net_namespace - move device to a different network namespace
6206  *	@dev: device
6207  *	@net: network namespace
6208  *	@pat: If not NULL name pattern to try if the current device name
6209  *	      is already taken in the destination network namespace.
6210  *
6211  *	This function shuts down a device interface and moves it
6212  *	to a new network namespace. On success 0 is returned, on
6213  *	a failure a negative errno code is returned.
6214  *
6215  *	Callers must hold the rtnl semaphore.
6216  */
6217 
6218 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6219 {
6220 	int err;
6221 
6222 	ASSERT_RTNL();
6223 
6224 	/* Don't allow namespace local devices to be moved. */
6225 	err = -EINVAL;
6226 	if (dev->features & NETIF_F_NETNS_LOCAL)
6227 		goto out;
6228 
6229 	/* Ensure the device has been registered */
6230 	err = -EINVAL;
6231 	if (dev->reg_state != NETREG_REGISTERED)
6232 		goto out;
6233 
6234 	/* Get out if there is nothing to do */
6235 	err = 0;
6236 	if (net_eq(dev_net(dev), net))
6237 		goto out;
6238 
6239 	/* Pick the destination device name, and ensure
6240 	 * we can use it in the destination network namespace.
6241 	 */
6242 	err = -EEXIST;
6243 	if (__dev_get_by_name(net, dev->name)) {
6244 		/* We get here if we can't use the current device name */
6245 		if (!pat)
6246 			goto out;
6247 		if (dev_get_valid_name(dev, pat) < 0)
6248 			goto out;
6249 	}
6250 
6251 	/*
6252 	 * And now a mini version of register_netdevice and unregister_netdevice.
6253 	 */
6254 
6255 	/* If device is running close it first. */
6256 	dev_close(dev);
6257 
6258 	/* And unlink it from device chain */
6259 	err = -ENODEV;
6260 	unlist_netdevice(dev);
6261 
6262 	synchronize_net();
6263 
6264 	/* Shutdown queueing discipline. */
6265 	dev_shutdown(dev);
6266 
6267 	/* Notify protocols that we are about to destroy
6268 	   this device. They should clean up all their state.
6269 
6270 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6271 	   This is wanted because this way 8021q and macvlan know
6272 	   the device is just moving and can keep their slaves up.
6273 	*/
6274 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6275 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6276 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6277 
6278 	/*
6279 	 *	Flush the unicast and multicast chains
6280 	 */
6281 	dev_uc_flush(dev);
6282 	dev_mc_flush(dev);
6283 
6284 	/* Actually switch the network namespace */
6285 	dev_net_set(dev, net);
6286 
6287 	/* If there is an ifindex conflict assign a new one */
6288 	if (__dev_get_by_index(net, dev->ifindex)) {
6289 		int iflink = (dev->iflink == dev->ifindex);
6290 		dev->ifindex = dev_new_index(net);
6291 		if (iflink)
6292 			dev->iflink = dev->ifindex;
6293 	}
6294 
6295 	/* Fixup kobjects */
6296 	err = device_rename(&dev->dev, dev->name);
6297 	WARN_ON(err);
6298 
6299 	/* Add the device back in the hashes */
6300 	list_netdevice(dev);
6301 
6302 	/* Notify protocols that a new device appeared. */
6303 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6304 
6305 	/*
6306 	 *	Prevent userspace races by waiting until the network
6307 	 *	device is fully setup before sending notifications.
6308 	 */
6309 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6310 
6311 	synchronize_net();
6312 	err = 0;
6313 out:
6314 	return err;
6315 }
6316 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
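/* Example (illustrative sketch, not part of this file): callers hold the
 * rtnl lock and may pass a fallback name pattern for collisions:
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, newnet, "eth%d");
 *	rtnl_unlock();
 *
 * With a NULL pattern the move fails with -EEXIST when the current name
 * is already taken in the destination namespace.
 */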
6317 
6318 static int dev_cpu_callback(struct notifier_block *nfb,
6319 			    unsigned long action,
6320 			    void *ocpu)
6321 {
6322 	struct sk_buff **list_skb;
6323 	struct sk_buff *skb;
6324 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6325 	struct softnet_data *sd, *oldsd;
6326 
6327 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6328 		return NOTIFY_OK;
6329 
6330 	local_irq_disable();
6331 	cpu = smp_processor_id();
6332 	sd = &per_cpu(softnet_data, cpu);
6333 	oldsd = &per_cpu(softnet_data, oldcpu);
6334 
6335 	/* Find end of our completion_queue. */
6336 	list_skb = &sd->completion_queue;
6337 	while (*list_skb)
6338 		list_skb = &(*list_skb)->next;
6339 	/* Append completion queue from offline CPU. */
6340 	*list_skb = oldsd->completion_queue;
6341 	oldsd->completion_queue = NULL;
6342 
6343 	/* Append output queue from offline CPU. */
6344 	if (oldsd->output_queue) {
6345 		*sd->output_queue_tailp = oldsd->output_queue;
6346 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6347 		oldsd->output_queue = NULL;
6348 		oldsd->output_queue_tailp = &oldsd->output_queue;
6349 	}
6350 	/* Append NAPI poll list from offline CPU. */
6351 	if (!list_empty(&oldsd->poll_list)) {
6352 		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6353 		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6354 	}
6355 
6356 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6357 	local_irq_enable();
6358 
6359 	/* Process offline CPU's input_pkt_queue */
6360 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6361 		netif_rx(skb);
6362 		input_queue_head_incr(oldsd);
6363 	}
6364 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6365 		netif_rx(skb);
6366 		input_queue_head_incr(oldsd);
6367 	}
6368 
6369 	return NOTIFY_OK;
6370 }
6371 
6372 
6373 /**
6374  *	netdev_increment_features - increment feature set by one
6375  *	@all: current feature set
6376  *	@one: new feature set
6377  *	@mask: mask feature set
6378  *
6379  *	Computes a new feature set after adding a device with feature set
6380  *	@one to the master device with current feature set @all.  Will not
6381  *	enable anything that is off in @mask. Returns the new feature set.
6382  */
6383 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6384 {
6385 	if (mask & NETIF_F_GEN_CSUM)
6386 		mask |= NETIF_F_ALL_CSUM;
6387 	mask |= NETIF_F_VLAN_CHALLENGED;
6388 
6389 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6390 	all &= one | ~NETIF_F_ALL_FOR_ALL;
6391 
6392 	/* If device needs checksumming, downgrade to it. */
6393 	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6394 		all &= ~NETIF_F_NO_CSUM;
6395 
6396 	/* If one device supports hw checksumming, set for all. */
6397 	if (all & NETIF_F_GEN_CSUM)
6398 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6399 
6400 	return all;
6401 }
6402 EXPORT_SYMBOL(netdev_increment_features);
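/* Example (illustrative sketch, not part of this file): a master device
 * such as a bond can fold each slave into its feature set one at a time;
 * the starting set, slave list and mask below are hypothetical.
 *
 *	u32 features = NETIF_F_ONE_FOR_ALL;
 *
 *	list_for_each_entry(slave, &master->slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     MASTER_FEATURE_MASK);
 */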
6403 
6404 static struct hlist_head *netdev_create_hash(void)
6405 {
6406 	int i;
6407 	struct hlist_head *hash;
6408 
6409 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6410 	if (hash != NULL)
6411 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6412 			INIT_HLIST_HEAD(&hash[i]);
6413 
6414 	return hash;
6415 }
6416 
6417 /* Initialize per network namespace state */
6418 static int __net_init netdev_init(struct net *net)
6419 {
6420 	INIT_LIST_HEAD(&net->dev_base_head);
6421 
6422 	net->dev_name_head = netdev_create_hash();
6423 	if (net->dev_name_head == NULL)
6424 		goto err_name;
6425 
6426 	net->dev_index_head = netdev_create_hash();
6427 	if (net->dev_index_head == NULL)
6428 		goto err_idx;
6429 
6430 	return 0;
6431 
6432 err_idx:
6433 	kfree(net->dev_name_head);
6434 err_name:
6435 	return -ENOMEM;
6436 }
6437 
6438 /**
6439  *	netdev_drivername - network driver for the device
6440  *	@dev: network device
6441  *
6442  *	Determine network driver for device.
6443  */
6444 const char *netdev_drivername(const struct net_device *dev)
6445 {
6446 	const struct device_driver *driver;
6447 	const struct device *parent;
6448 	const char *empty = "";
6449 
6450 	parent = dev->dev.parent;
6451 	if (!parent)
6452 		return empty;
6453 
6454 	driver = parent->driver;
6455 	if (driver && driver->name)
6456 		return driver->name;
6457 	return empty;
6458 }
6459 
6460 int __netdev_printk(const char *level, const struct net_device *dev,
6461 			   struct va_format *vaf)
6462 {
6463 	int r;
6464 
6465 	if (dev && dev->dev.parent)
6466 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6467 			       netdev_name(dev), vaf);
6468 	else if (dev)
6469 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6470 	else
6471 		r = printk("%s(NULL net_device): %pV", level, vaf);
6472 
6473 	return r;
6474 }
6475 EXPORT_SYMBOL(__netdev_printk);
6476 
6477 int netdev_printk(const char *level, const struct net_device *dev,
6478 		  const char *format, ...)
6479 {
6480 	struct va_format vaf;
6481 	va_list args;
6482 	int r;
6483 
6484 	va_start(args, format);
6485 
6486 	vaf.fmt = format;
6487 	vaf.va = &args;
6488 
6489 	r = __netdev_printk(level, dev, &vaf);
6490 	va_end(args);
6491 
6492 	return r;
6493 }
6494 EXPORT_SYMBOL(netdev_printk);
6495 
6496 #define define_netdev_printk_level(func, level)			\
6497 int func(const struct net_device *dev, const char *fmt, ...)	\
6498 {								\
6499 	int r;							\
6500 	struct va_format vaf;					\
6501 	va_list args;						\
6502 								\
6503 	va_start(args, fmt);					\
6504 								\
6505 	vaf.fmt = fmt;						\
6506 	vaf.va = &args;						\
6507 								\
6508 	r = __netdev_printk(level, dev, &vaf);			\
6509 	va_end(args);						\
6510 								\
6511 	return r;						\
6512 }								\
6513 EXPORT_SYMBOL(func);
6514 
6515 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6516 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6517 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6518 define_netdev_printk_level(netdev_err, KERN_ERR);
6519 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6520 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6521 define_netdev_printk_level(netdev_info, KERN_INFO);
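/* Example (illustrative sketch, not part of this file): drivers use the
 * helpers generated above instead of raw printk so every message carries
 * the driver and device name as a prefix:
 *
 *	netdev_err(dev, "TX timeout on queue %d\n", txq);
 *	netdev_info(dev, "link up, %u Mbps\n", speed);
 */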
6522 
6523 static void __net_exit netdev_exit(struct net *net)
6524 {
6525 	kfree(net->dev_name_head);
6526 	kfree(net->dev_index_head);
6527 }
6528 
6529 static struct pernet_operations __net_initdata netdev_net_ops = {
6530 	.init = netdev_init,
6531 	.exit = netdev_exit,
6532 };
6533 
6534 static void __net_exit default_device_exit(struct net *net)
6535 {
6536 	struct net_device *dev, *aux;
6537 	/*
6538 	 * Push all migratable network devices back to the
6539 	 * initial network namespace
6540 	 */
6541 	rtnl_lock();
6542 	for_each_netdev_safe(net, dev, aux) {
6543 		int err;
6544 		char fb_name[IFNAMSIZ];
6545 
6546 		/* Ignore unmovable devices (e.g. the loopback device) */
6547 		if (dev->features & NETIF_F_NETNS_LOCAL)
6548 			continue;
6549 
6550 		/* Leave virtual devices for the generic cleanup */
6551 		if (dev->rtnl_link_ops)
6552 			continue;
6553 
6554 		/* Push remaining network devices to init_net */
6555 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6556 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6557 		if (err) {
6558 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6559 				__func__, dev->name, err);
6560 			BUG();
6561 		}
6562 	}
6563 	rtnl_unlock();
6564 }
6565 
6566 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6567 {
6568 	/* At exit all network devices must be removed from a network
6569 	 * namespace.  Do this in the reverse order of registration.
6570 	 * Do this across as many network namespaces as possible to
6571 	 * improve batching efficiency.
6572 	 */
6573 	struct net_device *dev;
6574 	struct net *net;
6575 	LIST_HEAD(dev_kill_list);
6576 
6577 	rtnl_lock();
6578 	list_for_each_entry(net, net_list, exit_list) {
6579 		for_each_netdev_reverse(net, dev) {
6580 			if (dev->rtnl_link_ops)
6581 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6582 			else
6583 				unregister_netdevice_queue(dev, &dev_kill_list);
6584 		}
6585 	}
6586 	unregister_netdevice_many(&dev_kill_list);
6587 	list_del(&dev_kill_list);
6588 	rtnl_unlock();
6589 }
6590 
6591 static struct pernet_operations __net_initdata default_device_ops = {
6592 	.exit = default_device_exit,
6593 	.exit_batch = default_device_exit_batch,
6594 };
6595 
6596 /*
6597  *	Initialize the DEV module. At boot time this walks the device list and
6598  *	unhooks any devices that fail to initialise (normally hardware not
6599  *	present) and leaves us with a valid list of present and active devices.
6600  *
6601  */
6602 
6603 /*
6604  *       This is called single-threaded during boot, so no need
6605  *       to take the rtnl semaphore.
6606  */
6607 static int __init net_dev_init(void)
6608 {
6609 	int i, rc = -ENOMEM;
6610 
6611 	BUG_ON(!dev_boot_phase);
6612 
6613 	if (dev_proc_init())
6614 		goto out;
6615 
6616 	if (netdev_kobject_init())
6617 		goto out;
6618 
6619 	INIT_LIST_HEAD(&ptype_all);
6620 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6621 		INIT_LIST_HEAD(&ptype_base[i]);
6622 
6623 	if (register_pernet_subsys(&netdev_net_ops))
6624 		goto out;
6625 
6626 	/*
6627 	 *	Initialise the packet receive queues.
6628 	 */
6629 
6630 	for_each_possible_cpu(i) {
6631 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6632 
6633 		memset(sd, 0, sizeof(*sd));
6634 		skb_queue_head_init(&sd->input_pkt_queue);
6635 		skb_queue_head_init(&sd->process_queue);
6636 		sd->completion_queue = NULL;
6637 		INIT_LIST_HEAD(&sd->poll_list);
6638 		sd->output_queue = NULL;
6639 		sd->output_queue_tailp = &sd->output_queue;
6640 #ifdef CONFIG_RPS
6641 		sd->csd.func = rps_trigger_softirq;
6642 		sd->csd.info = sd;
6643 		sd->csd.flags = 0;
6644 		sd->cpu = i;
6645 #endif
6646 
6647 		sd->backlog.poll = process_backlog;
6648 		sd->backlog.weight = weight_p;
6649 		sd->backlog.gro_list = NULL;
6650 		sd->backlog.gro_count = 0;
6651 	}
6652 
6653 	dev_boot_phase = 0;
6654 
6655 	/* The loopback device is special: if any other network device
6656 	 * is present in a network namespace, the loopback device must
6657 	 * be present too. Since we now dynamically allocate and free
6658 	 * the loopback device, ensure this invariant is maintained by
6659 	 * keeping the loopback device as the first device on the
6660 	 * list of network devices, so that the loopback device is
6661 	 * the first device that appears and the last network device
6662 	 * that disappears.
6663 	 */
6664 	if (register_pernet_device(&loopback_net_ops))
6665 		goto out;
6666 
6667 	if (register_pernet_device(&default_device_ops))
6668 		goto out;
6669 
6670 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6671 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6672 
6673 	hotcpu_notifier(dev_cpu_callback, 0);
6674 	dst_init();
6675 	dev_mcast_init();
6676 	rc = 0;
6677 out:
6678 	return rc;
6679 }
6680 
6681 subsys_initcall(net_dev_init);
6682 
6683 static int __init initialize_hashrnd(void)
6684 {
6685 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6686 	return 0;
6687 }
6688 
6689 late_initcall_sync(initialize_hashrnd);
6690 
6691