xref: /linux/net/core/dev.c (revision 8a5617bdc111aa7ba49c81fa694fde63d3474f94)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 #include <linux/cpu_rmap.h>
136 
137 #include "net-sysfs.h"
138 
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
141 
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
144 
145 /*
146  *	The list of packet types we will receive (as opposed to discard)
147  *	and the routines to invoke.
148  *
149  *	Why 16. Because with 16 the only overlap we get on a hash of the
150  *	low nibble of the protocol value is RARP/SNAP/X.25.
151  *
152  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
153  *             sure which should go first, but I bet it won't make much
154  *             difference if we are running VLANs.  The good news is that
155  *             this protocol won't be in the list unless compiled in, so
156  *             the average user (w/out VLANs) will not be adversely affected.
157  *             --BLG
158  *
159  *		0800	IP
160  *		8100    802.1Q VLAN
161  *		0001	802.3
162  *		0002	AX.25
163  *		0004	802.2
164  *		8035	RARP
165  *		0005	SNAP
166  *		0805	X.25
167  *		0806	ARP
168  *		8137	IPX
169  *		0009	Localtalk
170  *		86DD	IPv6
171  */
172 
173 #define PTYPE_HASH_SIZE	(16)
174 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
175 
176 static DEFINE_SPINLOCK(ptype_lock);
177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178 static struct list_head ptype_all __read_mostly;	/* Taps */
179 
180 /*
181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
182  * semaphore.
183  *
184  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
185  *
186  * Writers must hold the rtnl semaphore while they loop through the
187  * dev_base_head list, and hold dev_base_lock for writing when they do the
188  * actual updates.  This allows pure readers to access the list even
189  * while a writer is preparing to update it.
190  *
191  * To put it another way, dev_base_lock is held for writing only to
192  * protect against pure readers; the rtnl semaphore provides the
193  * protection against other writers.
194  *
195  * See, for example usages, register_netdevice() and
196  * unregister_netdevice(), which must be called with the rtnl
197  * semaphore held.
198  */
199 DEFINE_RWLOCK(dev_base_lock);
200 EXPORT_SYMBOL(dev_base_lock);
201 
202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
203 {
204 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
205 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
206 }
207 
208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
209 {
210 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
211 }
212 
213 static inline void rps_lock(struct softnet_data *sd)
214 {
215 #ifdef CONFIG_RPS
216 	spin_lock(&sd->input_pkt_queue.lock);
217 #endif
218 }
219 
220 static inline void rps_unlock(struct softnet_data *sd)
221 {
222 #ifdef CONFIG_RPS
223 	spin_unlock(&sd->input_pkt_queue.lock);
224 #endif
225 }
226 
227 /* Device list insertion */
228 static int list_netdevice(struct net_device *dev)
229 {
230 	struct net *net = dev_net(dev);
231 
232 	ASSERT_RTNL();
233 
234 	write_lock_bh(&dev_base_lock);
235 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
236 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
237 	hlist_add_head_rcu(&dev->index_hlist,
238 			   dev_index_hash(net, dev->ifindex));
239 	write_unlock_bh(&dev_base_lock);
240 	return 0;
241 }
242 
243 /* Device list removal
244  * caller must respect a RCU grace period before freeing/reusing dev
245  */
246 static void unlist_netdevice(struct net_device *dev)
247 {
248 	ASSERT_RTNL();
249 
250 	/* Unlink dev from the device chain */
251 	write_lock_bh(&dev_base_lock);
252 	list_del_rcu(&dev->dev_list);
253 	hlist_del_rcu(&dev->name_hlist);
254 	hlist_del_rcu(&dev->index_hlist);
255 	write_unlock_bh(&dev_base_lock);
256 }
257 
258 /*
259  *	Our notifier list
260  */
261 
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263 
264 /*
265  *	Device drivers call our routines to queue packets here. We empty the
266  *	queue in the local softnet handler.
267  */
268 
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271 
272 #ifdef CONFIG_LOCKDEP
273 /*
274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275  * according to dev->type
276  */
277 static const unsigned short netdev_lock_type[] =
278 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
291 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
292 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
293 	 ARPHRD_VOID, ARPHRD_NONE};
294 
295 static const char *const netdev_lock_name[] =
296 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
309 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
310 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
311 	 "_xmit_VOID", "_xmit_NONE"};
312 
313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
315 
316 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
317 {
318 	int i;
319 
320 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
321 		if (netdev_lock_type[i] == dev_type)
322 			return i;
323 	/* the last key is used by default */
324 	return ARRAY_SIZE(netdev_lock_type) - 1;
325 }
326 
327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328 						 unsigned short dev_type)
329 {
330 	int i;
331 
332 	i = netdev_lock_pos(dev_type);
333 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
334 				   netdev_lock_name[i]);
335 }
336 
337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
338 {
339 	int i;
340 
341 	i = netdev_lock_pos(dev->type);
342 	lockdep_set_class_and_name(&dev->addr_list_lock,
343 				   &netdev_addr_lock_key[i],
344 				   netdev_lock_name[i]);
345 }
346 #else
347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
348 						 unsigned short dev_type)
349 {
350 }
351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352 {
353 }
354 #endif
355 
356 /*******************************************************************************
357 
358 		Protocol management and registration routines
359 
360 *******************************************************************************/
361 
362 /*
363  *	Add a protocol ID to the list. Now that the input handler is
364  *	smarter we can dispense with all the messy stuff that used to be
365  *	here.
366  *
367  *	BEWARE!!! Protocol handlers, mangling input packets,
368  *	MUST BE last in hash buckets and checking protocol handlers
369  *	MUST start from promiscuous ptype_all chain in net_bh.
370  *	It is true now, do not change it.
371  *	Explanation follows: if protocol handler, mangling packet, will
372  *	be the first on list, it is not able to sense, that packet
373  *	is cloned and should be copied-on-write, so that it will
374  *	change it and subsequent readers will get broken packet.
375  *							--ANK (980803)
376  */
377 
378 static inline struct list_head *ptype_head(const struct packet_type *pt)
379 {
380 	if (pt->type == htons(ETH_P_ALL))
381 		return &ptype_all;
382 	else
383 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384 }
385 
386 /**
387  *	dev_add_pack - add packet handler
388  *	@pt: packet type declaration
389  *
390  *	Add a protocol handler to the networking stack. The passed &packet_type
391  *	is linked into kernel lists and may not be freed until it has been
392  *	removed from the kernel lists.
393  *
394  *	This call does not sleep therefore it can not
395  *	guarantee all CPU's that are in middle of receiving packets
396  *	will see the new packet type (until the next received packet).
397  */
398 
399 void dev_add_pack(struct packet_type *pt)
400 {
401 	struct list_head *head = ptype_head(pt);
402 
403 	spin_lock(&ptype_lock);
404 	list_add_rcu(&pt->list, head);
405 	spin_unlock(&ptype_lock);
406 }
407 EXPORT_SYMBOL(dev_add_pack);
408 
409 /**
410  *	__dev_remove_pack	 - remove packet handler
411  *	@pt: packet type declaration
412  *
413  *	Remove a protocol handler that was previously added to the kernel
414  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
415  *	from the kernel lists and can be freed or reused once this function
416  *	returns.
417  *
418  *      The packet type might still be in use by receivers
419  *	and must not be freed until after all the CPU's have gone
420  *	through a quiescent state.
421  */
422 void __dev_remove_pack(struct packet_type *pt)
423 {
424 	struct list_head *head = ptype_head(pt);
425 	struct packet_type *pt1;
426 
427 	spin_lock(&ptype_lock);
428 
429 	list_for_each_entry(pt1, head, list) {
430 		if (pt == pt1) {
431 			list_del_rcu(&pt->list);
432 			goto out;
433 		}
434 	}
435 
436 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
437 out:
438 	spin_unlock(&ptype_lock);
439 }
440 EXPORT_SYMBOL(__dev_remove_pack);
441 
442 /**
443  *	dev_remove_pack	 - remove packet handler
444  *	@pt: packet type declaration
445  *
446  *	Remove a protocol handler that was previously added to the kernel
447  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
448  *	from the kernel lists and can be freed or reused once this function
449  *	returns.
450  *
451  *	This call sleeps to guarantee that no CPU is looking at the packet
452  *	type after return.
453  */
454 void dev_remove_pack(struct packet_type *pt)
455 {
456 	__dev_remove_pack(pt);
457 
458 	synchronize_net();
459 }
460 EXPORT_SYMBOL(dev_remove_pack);
461 
462 /******************************************************************************
463 
464 		      Device Boot-time Settings Routines
465 
466 *******************************************************************************/
467 
468 /* Boot time configuration table */
469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
470 
471 /**
472  *	netdev_boot_setup_add	- add new setup entry
473  *	@name: name of the device
474  *	@map: configured settings for the device
475  *
476  *	Adds new setup entry to the dev_boot_setup list.  The function
477  *	returns 0 on error and 1 on success.  This is a generic routine to
478  *	all netdevices.
479  */
480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
481 {
482 	struct netdev_boot_setup *s;
483 	int i;
484 
485 	s = dev_boot_setup;
486 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
487 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
488 			memset(s[i].name, 0, sizeof(s[i].name));
489 			strlcpy(s[i].name, name, IFNAMSIZ);
490 			memcpy(&s[i].map, map, sizeof(s[i].map));
491 			break;
492 		}
493 	}
494 
495 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
496 }
497 
498 /**
499  *	netdev_boot_setup_check	- check boot time settings
500  *	@dev: the netdevice
501  *
502  * 	Check boot time settings for the device.
503  *	The found settings are set for the device to be used
504  *	later in the device probing.
505  *	Returns 0 if no settings found, 1 if they are.
506  */
507 int netdev_boot_setup_check(struct net_device *dev)
508 {
509 	struct netdev_boot_setup *s = dev_boot_setup;
510 	int i;
511 
512 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
513 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
514 		    !strcmp(dev->name, s[i].name)) {
515 			dev->irq 	= s[i].map.irq;
516 			dev->base_addr 	= s[i].map.base_addr;
517 			dev->mem_start 	= s[i].map.mem_start;
518 			dev->mem_end 	= s[i].map.mem_end;
519 			return 1;
520 		}
521 	}
522 	return 0;
523 }
524 EXPORT_SYMBOL(netdev_boot_setup_check);
525 
526 
527 /**
528  *	netdev_boot_base	- get address from boot time settings
529  *	@prefix: prefix for network device
530  *	@unit: id for network device
531  *
532  * 	Check boot time settings for the base address of device.
533  *	The found settings are set for the device to be used
534  *	later in the device probing.
535  *	Returns 0 if no settings found.
536  */
537 unsigned long netdev_boot_base(const char *prefix, int unit)
538 {
539 	const struct netdev_boot_setup *s = dev_boot_setup;
540 	char name[IFNAMSIZ];
541 	int i;
542 
543 	sprintf(name, "%s%d", prefix, unit);
544 
545 	/*
546 	 * If device already registered then return base of 1
547 	 * to indicate not to probe for this interface
548 	 */
549 	if (__dev_get_by_name(&init_net, name))
550 		return 1;
551 
552 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
553 		if (!strcmp(name, s[i].name))
554 			return s[i].map.base_addr;
555 	return 0;
556 }
557 
558 /*
559  * Saves at boot time configured settings for any netdevice.
560  */
561 int __init netdev_boot_setup(char *str)
562 {
563 	int ints[5];
564 	struct ifmap map;
565 
566 	str = get_options(str, ARRAY_SIZE(ints), ints);
567 	if (!str || !*str)
568 		return 0;
569 
570 	/* Save settings */
571 	memset(&map, 0, sizeof(map));
572 	if (ints[0] > 0)
573 		map.irq = ints[1];
574 	if (ints[0] > 1)
575 		map.base_addr = ints[2];
576 	if (ints[0] > 2)
577 		map.mem_start = ints[3];
578 	if (ints[0] > 3)
579 		map.mem_end = ints[4];
580 
581 	/* Add new entry to the list */
582 	return netdev_boot_setup_add(str, &map);
583 }
584 
585 __setup("netdev=", netdev_boot_setup);
586 
587 /*******************************************************************************
588 
589 			    Device Interface Subroutines
590 
591 *******************************************************************************/
592 
593 /**
594  *	__dev_get_by_name	- find a device by its name
595  *	@net: the applicable net namespace
596  *	@name: name to find
597  *
598  *	Find an interface by name. Must be called under RTNL semaphore
599  *	or @dev_base_lock. If the name is found a pointer to the device
600  *	is returned. If the name is not found then %NULL is returned. The
601  *	reference counters are not incremented so the caller must be
602  *	careful with locks.
603  */
604 
605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
606 {
607 	struct hlist_node *p;
608 	struct net_device *dev;
609 	struct hlist_head *head = dev_name_hash(net, name);
610 
611 	hlist_for_each_entry(dev, p, head, name_hlist)
612 		if (!strncmp(dev->name, name, IFNAMSIZ))
613 			return dev;
614 
615 	return NULL;
616 }
617 EXPORT_SYMBOL(__dev_get_by_name);
618 
619 /**
620  *	dev_get_by_name_rcu	- find a device by its name
621  *	@net: the applicable net namespace
622  *	@name: name to find
623  *
624  *	Find an interface by name.
625  *	If the name is found a pointer to the device is returned.
626  * 	If the name is not found then %NULL is returned.
627  *	The reference counters are not incremented so the caller must be
628  *	careful with locks. The caller must hold RCU lock.
629  */
630 
631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
632 {
633 	struct hlist_node *p;
634 	struct net_device *dev;
635 	struct hlist_head *head = dev_name_hash(net, name);
636 
637 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
638 		if (!strncmp(dev->name, name, IFNAMSIZ))
639 			return dev;
640 
641 	return NULL;
642 }
643 EXPORT_SYMBOL(dev_get_by_name_rcu);
644 
645 /**
646  *	dev_get_by_name		- find a device by its name
647  *	@net: the applicable net namespace
648  *	@name: name to find
649  *
650  *	Find an interface by name. This can be called from any
651  *	context and does its own locking. The returned handle has
652  *	the usage count incremented and the caller must use dev_put() to
653  *	release it when it is no longer needed. %NULL is returned if no
654  *	matching device is found.
655  */
656 
657 struct net_device *dev_get_by_name(struct net *net, const char *name)
658 {
659 	struct net_device *dev;
660 
661 	rcu_read_lock();
662 	dev = dev_get_by_name_rcu(net, name);
663 	if (dev)
664 		dev_hold(dev);
665 	rcu_read_unlock();
666 	return dev;
667 }
668 EXPORT_SYMBOL(dev_get_by_name);
669 
670 /**
671  *	__dev_get_by_index - find a device by its ifindex
672  *	@net: the applicable net namespace
673  *	@ifindex: index of device
674  *
675  *	Search for an interface by index. Returns %NULL if the device
676  *	is not found or a pointer to the device. The device has not
677  *	had its reference counter increased so the caller must be careful
678  *	about locking. The caller must hold either the RTNL semaphore
679  *	or @dev_base_lock.
680  */
681 
682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
683 {
684 	struct hlist_node *p;
685 	struct net_device *dev;
686 	struct hlist_head *head = dev_index_hash(net, ifindex);
687 
688 	hlist_for_each_entry(dev, p, head, index_hlist)
689 		if (dev->ifindex == ifindex)
690 			return dev;
691 
692 	return NULL;
693 }
694 EXPORT_SYMBOL(__dev_get_by_index);
695 
696 /**
697  *	dev_get_by_index_rcu - find a device by its ifindex
698  *	@net: the applicable net namespace
699  *	@ifindex: index of device
700  *
701  *	Search for an interface by index. Returns %NULL if the device
702  *	is not found or a pointer to the device. The device has not
703  *	had its reference counter increased so the caller must be careful
704  *	about locking. The caller must hold RCU lock.
705  */
706 
707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
708 {
709 	struct hlist_node *p;
710 	struct net_device *dev;
711 	struct hlist_head *head = dev_index_hash(net, ifindex);
712 
713 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
714 		if (dev->ifindex == ifindex)
715 			return dev;
716 
717 	return NULL;
718 }
719 EXPORT_SYMBOL(dev_get_by_index_rcu);
720 
721 
722 /**
723  *	dev_get_by_index - find a device by its ifindex
724  *	@net: the applicable net namespace
725  *	@ifindex: index of device
726  *
727  *	Search for an interface by index. Returns NULL if the device
728  *	is not found or a pointer to the device. The device returned has
729  *	had a reference added and the pointer is safe until the user calls
730  *	dev_put to indicate they have finished with it.
731  */
732 
733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
734 {
735 	struct net_device *dev;
736 
737 	rcu_read_lock();
738 	dev = dev_get_by_index_rcu(net, ifindex);
739 	if (dev)
740 		dev_hold(dev);
741 	rcu_read_unlock();
742 	return dev;
743 }
744 EXPORT_SYMBOL(dev_get_by_index);
745 
746 /**
747  *	dev_getbyhwaddr_rcu - find a device by its hardware address
748  *	@net: the applicable net namespace
749  *	@type: media type of device
750  *	@ha: hardware address
751  *
752  *	Search for an interface by MAC address. Returns NULL if the device
753  *	is not found or a pointer to the device.
754  *	The caller must hold RCU or RTNL.
755  *	The returned device has not had its ref count increased
756  *	and the caller must therefore be careful about locking
757  *
758  */
759 
760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
761 				       const char *ha)
762 {
763 	struct net_device *dev;
764 
765 	for_each_netdev_rcu(net, dev)
766 		if (dev->type == type &&
767 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
768 			return dev;
769 
770 	return NULL;
771 }
772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
773 
774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
775 {
776 	struct net_device *dev;
777 
778 	ASSERT_RTNL();
779 	for_each_netdev(net, dev)
780 		if (dev->type == type)
781 			return dev;
782 
783 	return NULL;
784 }
785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
786 
787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
788 {
789 	struct net_device *dev, *ret = NULL;
790 
791 	rcu_read_lock();
792 	for_each_netdev_rcu(net, dev)
793 		if (dev->type == type) {
794 			dev_hold(dev);
795 			ret = dev;
796 			break;
797 		}
798 	rcu_read_unlock();
799 	return ret;
800 }
801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
802 
803 /**
804  *	dev_get_by_flags_rcu - find any device with given flags
805  *	@net: the applicable net namespace
806  *	@if_flags: IFF_* values
807  *	@mask: bitmask of bits in if_flags to check
808  *
809  *	Search for any interface with the given flags. Returns NULL if a device
810  *	is not found or a pointer to the device. Must be called inside
811  *	rcu_read_lock(), and result refcount is unchanged.
812  */
813 
814 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
815 				    unsigned short mask)
816 {
817 	struct net_device *dev, *ret;
818 
819 	ret = NULL;
820 	for_each_netdev_rcu(net, dev) {
821 		if (((dev->flags ^ if_flags) & mask) == 0) {
822 			ret = dev;
823 			break;
824 		}
825 	}
826 	return ret;
827 }
828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
829 
830 /**
831  *	dev_valid_name - check if name is okay for network device
832  *	@name: name string
833  *
834  *	Network device names need to be valid file names to
835  *	to allow sysfs to work.  We also disallow any kind of
836  *	whitespace.
837  */
838 int dev_valid_name(const char *name)
839 {
840 	if (*name == '\0')
841 		return 0;
842 	if (strlen(name) >= IFNAMSIZ)
843 		return 0;
844 	if (!strcmp(name, ".") || !strcmp(name, ".."))
845 		return 0;
846 
847 	while (*name) {
848 		if (*name == '/' || isspace(*name))
849 			return 0;
850 		name++;
851 	}
852 	return 1;
853 }
854 EXPORT_SYMBOL(dev_valid_name);
855 
856 /**
857  *	__dev_alloc_name - allocate a name for a device
858  *	@net: network namespace to allocate the device name in
859  *	@name: name format string
860  *	@buf:  scratch buffer and result name string
861  *
862  *	Passed a format string - eg "lt%d" it will try and find a suitable
863  *	id. It scans list of devices to build up a free map, then chooses
864  *	the first empty slot. The caller must hold the dev_base or rtnl lock
865  *	while allocating the name and adding the device in order to avoid
866  *	duplicates.
867  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
868  *	Returns the number of the unit assigned or a negative errno code.
869  */
870 
871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
872 {
873 	int i = 0;
874 	const char *p;
875 	const int max_netdevices = 8*PAGE_SIZE;
876 	unsigned long *inuse;
877 	struct net_device *d;
878 
879 	p = strnchr(name, IFNAMSIZ-1, '%');
880 	if (p) {
881 		/*
882 		 * Verify the string as this thing may have come from
883 		 * the user.  There must be either one "%d" and no other "%"
884 		 * characters.
885 		 */
886 		if (p[1] != 'd' || strchr(p + 2, '%'))
887 			return -EINVAL;
888 
889 		/* Use one page as a bit array of possible slots */
890 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
891 		if (!inuse)
892 			return -ENOMEM;
893 
894 		for_each_netdev(net, d) {
895 			if (!sscanf(d->name, name, &i))
896 				continue;
897 			if (i < 0 || i >= max_netdevices)
898 				continue;
899 
900 			/*  avoid cases where sscanf is not exact inverse of printf */
901 			snprintf(buf, IFNAMSIZ, name, i);
902 			if (!strncmp(buf, d->name, IFNAMSIZ))
903 				set_bit(i, inuse);
904 		}
905 
906 		i = find_first_zero_bit(inuse, max_netdevices);
907 		free_page((unsigned long) inuse);
908 	}
909 
910 	if (buf != name)
911 		snprintf(buf, IFNAMSIZ, name, i);
912 	if (!__dev_get_by_name(net, buf))
913 		return i;
914 
915 	/* It is possible to run out of possible slots
916 	 * when the name is long and there isn't enough space left
917 	 * for the digits, or if all bits are used.
918 	 */
919 	return -ENFILE;
920 }
921 
922 /**
923  *	dev_alloc_name - allocate a name for a device
924  *	@dev: device
925  *	@name: name format string
926  *
927  *	Passed a format string - eg "lt%d" it will try and find a suitable
928  *	id. It scans list of devices to build up a free map, then chooses
929  *	the first empty slot. The caller must hold the dev_base or rtnl lock
930  *	while allocating the name and adding the device in order to avoid
931  *	duplicates.
932  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
933  *	Returns the number of the unit assigned or a negative errno code.
934  */
935 
936 int dev_alloc_name(struct net_device *dev, const char *name)
937 {
938 	char buf[IFNAMSIZ];
939 	struct net *net;
940 	int ret;
941 
942 	BUG_ON(!dev_net(dev));
943 	net = dev_net(dev);
944 	ret = __dev_alloc_name(net, name, buf);
945 	if (ret >= 0)
946 		strlcpy(dev->name, buf, IFNAMSIZ);
947 	return ret;
948 }
949 EXPORT_SYMBOL(dev_alloc_name);
950 
951 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
952 {
953 	struct net *net;
954 
955 	BUG_ON(!dev_net(dev));
956 	net = dev_net(dev);
957 
958 	if (!dev_valid_name(name))
959 		return -EINVAL;
960 
961 	if (fmt && strchr(name, '%'))
962 		return dev_alloc_name(dev, name);
963 	else if (__dev_get_by_name(net, name))
964 		return -EEXIST;
965 	else if (dev->name != name)
966 		strlcpy(dev->name, name, IFNAMSIZ);
967 
968 	return 0;
969 }
970 
971 /**
972  *	dev_change_name - change name of a device
973  *	@dev: device
974  *	@newname: name (or format string) must be at least IFNAMSIZ
975  *
976  *	Change name of a device, can pass format strings "eth%d".
977  *	for wildcarding.
978  */
979 int dev_change_name(struct net_device *dev, const char *newname)
980 {
981 	char oldname[IFNAMSIZ];
982 	int err = 0;
983 	int ret;
984 	struct net *net;
985 
986 	ASSERT_RTNL();
987 	BUG_ON(!dev_net(dev));
988 
989 	net = dev_net(dev);
990 	if (dev->flags & IFF_UP)
991 		return -EBUSY;
992 
993 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
994 		return 0;
995 
996 	memcpy(oldname, dev->name, IFNAMSIZ);
997 
998 	err = dev_get_valid_name(dev, newname, 1);
999 	if (err < 0)
1000 		return err;
1001 
1002 rollback:
1003 	ret = device_rename(&dev->dev, dev->name);
1004 	if (ret) {
1005 		memcpy(dev->name, oldname, IFNAMSIZ);
1006 		return ret;
1007 	}
1008 
1009 	write_lock_bh(&dev_base_lock);
1010 	hlist_del(&dev->name_hlist);
1011 	write_unlock_bh(&dev_base_lock);
1012 
1013 	synchronize_rcu();
1014 
1015 	write_lock_bh(&dev_base_lock);
1016 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017 	write_unlock_bh(&dev_base_lock);
1018 
1019 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020 	ret = notifier_to_errno(ret);
1021 
1022 	if (ret) {
1023 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1024 		if (err >= 0) {
1025 			err = ret;
1026 			memcpy(dev->name, oldname, IFNAMSIZ);
1027 			goto rollback;
1028 		} else {
1029 			printk(KERN_ERR
1030 			       "%s: name change rollback failed: %d.\n",
1031 			       dev->name, ret);
1032 		}
1033 	}
1034 
1035 	return err;
1036 }
1037 
1038 /**
1039  *	dev_set_alias - change ifalias of a device
1040  *	@dev: device
1041  *	@alias: name up to IFALIASZ
1042  *	@len: limit of bytes to copy from info
1043  *
1044  *	Set ifalias for a device,
1045  */
1046 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047 {
1048 	ASSERT_RTNL();
1049 
1050 	if (len >= IFALIASZ)
1051 		return -EINVAL;
1052 
1053 	if (!len) {
1054 		if (dev->ifalias) {
1055 			kfree(dev->ifalias);
1056 			dev->ifalias = NULL;
1057 		}
1058 		return 0;
1059 	}
1060 
1061 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062 	if (!dev->ifalias)
1063 		return -ENOMEM;
1064 
1065 	strlcpy(dev->ifalias, alias, len+1);
1066 	return len;
1067 }
1068 
1069 
1070 /**
1071  *	netdev_features_change - device changes features
1072  *	@dev: device to cause notification
1073  *
1074  *	Called to indicate a device has changed features.
1075  */
1076 void netdev_features_change(struct net_device *dev)
1077 {
1078 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079 }
1080 EXPORT_SYMBOL(netdev_features_change);
1081 
1082 /**
1083  *	netdev_state_change - device changes state
1084  *	@dev: device to cause notification
1085  *
1086  *	Called to indicate a device has changed state. This function calls
1087  *	the notifier chains for netdev_chain and sends a NEWLINK message
1088  *	to the routing socket.
1089  */
1090 void netdev_state_change(struct net_device *dev)
1091 {
1092 	if (dev->flags & IFF_UP) {
1093 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095 	}
1096 }
1097 EXPORT_SYMBOL(netdev_state_change);
1098 
1099 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1100 {
1101 	return call_netdevice_notifiers(event, dev);
1102 }
1103 EXPORT_SYMBOL(netdev_bonding_change);
1104 
1105 /**
1106  *	dev_load 	- load a network module
1107  *	@net: the applicable net namespace
1108  *	@name: name of interface
1109  *
1110  *	If a network interface is not present and the process has suitable
1111  *	privileges this function loads the module. If module loading is not
1112  *	available in this kernel then it becomes a nop.
1113  */
1114 
1115 void dev_load(struct net *net, const char *name)
1116 {
1117 	struct net_device *dev;
1118 	int no_module;
1119 
1120 	rcu_read_lock();
1121 	dev = dev_get_by_name_rcu(net, name);
1122 	rcu_read_unlock();
1123 
1124 	no_module = !dev;
1125 	if (no_module && capable(CAP_NET_ADMIN))
1126 		no_module = request_module("netdev-%s", name);
1127 	if (no_module && capable(CAP_SYS_MODULE)) {
1128 		if (!request_module("%s", name))
1129 			pr_err("Loading kernel module for a network device "
1130 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1131 "instead\n", name);
1132 	}
1133 }
1134 EXPORT_SYMBOL(dev_load);
1135 
1136 static int __dev_open(struct net_device *dev)
1137 {
1138 	const struct net_device_ops *ops = dev->netdev_ops;
1139 	int ret;
1140 
1141 	ASSERT_RTNL();
1142 
1143 	if (!netif_device_present(dev))
1144 		return -ENODEV;
1145 
1146 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1147 	ret = notifier_to_errno(ret);
1148 	if (ret)
1149 		return ret;
1150 
1151 	set_bit(__LINK_STATE_START, &dev->state);
1152 
1153 	if (ops->ndo_validate_addr)
1154 		ret = ops->ndo_validate_addr(dev);
1155 
1156 	if (!ret && ops->ndo_open)
1157 		ret = ops->ndo_open(dev);
1158 
1159 	if (ret)
1160 		clear_bit(__LINK_STATE_START, &dev->state);
1161 	else {
1162 		dev->flags |= IFF_UP;
1163 		net_dmaengine_get();
1164 		dev_set_rx_mode(dev);
1165 		dev_activate(dev);
1166 	}
1167 
1168 	return ret;
1169 }
1170 
1171 /**
1172  *	dev_open	- prepare an interface for use.
1173  *	@dev:	device to open
1174  *
1175  *	Takes a device from down to up state. The device's private open
1176  *	function is invoked and then the multicast lists are loaded. Finally
1177  *	the device is moved into the up state and a %NETDEV_UP message is
1178  *	sent to the netdev notifier chain.
1179  *
1180  *	Calling this function on an active interface is a nop. On a failure
1181  *	a negative errno code is returned.
1182  */
1183 int dev_open(struct net_device *dev)
1184 {
1185 	int ret;
1186 
1187 	if (dev->flags & IFF_UP)
1188 		return 0;
1189 
1190 	ret = __dev_open(dev);
1191 	if (ret < 0)
1192 		return ret;
1193 
1194 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1195 	call_netdevice_notifiers(NETDEV_UP, dev);
1196 
1197 	return ret;
1198 }
1199 EXPORT_SYMBOL(dev_open);
1200 
1201 static int __dev_close_many(struct list_head *head)
1202 {
1203 	struct net_device *dev;
1204 
1205 	ASSERT_RTNL();
1206 	might_sleep();
1207 
1208 	list_for_each_entry(dev, head, unreg_list) {
1209 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1210 
1211 		clear_bit(__LINK_STATE_START, &dev->state);
1212 
1213 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1214 		 * can be even on different cpu. So just clear netif_running().
1215 		 *
1216 		 * dev->stop() will invoke napi_disable() on all of it's
1217 		 * napi_struct instances on this device.
1218 		 */
1219 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1220 	}
1221 
1222 	dev_deactivate_many(head);
1223 
1224 	list_for_each_entry(dev, head, unreg_list) {
1225 		const struct net_device_ops *ops = dev->netdev_ops;
1226 
1227 		/*
1228 		 *	Call the device specific close. This cannot fail.
1229 		 *	Only if device is UP
1230 		 *
1231 		 *	We allow it to be called even after a DETACH hot-plug
1232 		 *	event.
1233 		 */
1234 		if (ops->ndo_stop)
1235 			ops->ndo_stop(dev);
1236 
1237 		dev->flags &= ~IFF_UP;
1238 		net_dmaengine_put();
1239 	}
1240 
1241 	return 0;
1242 }
1243 
1244 static int __dev_close(struct net_device *dev)
1245 {
1246 	int retval;
1247 	LIST_HEAD(single);
1248 
1249 	list_add(&dev->unreg_list, &single);
1250 	retval = __dev_close_many(&single);
1251 	list_del(&single);
1252 	return retval;
1253 }
1254 
1255 static int dev_close_many(struct list_head *head)
1256 {
1257 	struct net_device *dev, *tmp;
1258 	LIST_HEAD(tmp_list);
1259 
1260 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1261 		if (!(dev->flags & IFF_UP))
1262 			list_move(&dev->unreg_list, &tmp_list);
1263 
1264 	__dev_close_many(head);
1265 
1266 	list_for_each_entry(dev, head, unreg_list) {
1267 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1269 	}
1270 
1271 	/* rollback_registered_many needs the complete original list */
1272 	list_splice(&tmp_list, head);
1273 	return 0;
1274 }
1275 
1276 /**
1277  *	dev_close - shutdown an interface.
1278  *	@dev: device to shutdown
1279  *
1280  *	This function moves an active device into down state. A
1281  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1282  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1283  *	chain.
1284  */
1285 int dev_close(struct net_device *dev)
1286 {
1287 	LIST_HEAD(single);
1288 
1289 	list_add(&dev->unreg_list, &single);
1290 	dev_close_many(&single);
1291 	list_del(&single);
1292 	return 0;
1293 }
1294 EXPORT_SYMBOL(dev_close);
1295 
1296 
1297 /**
1298  *	dev_disable_lro - disable Large Receive Offload on a device
1299  *	@dev: device
1300  *
1301  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1302  *	called under RTNL.  This is needed if received packets may be
1303  *	forwarded to another interface.
1304  */
1305 void dev_disable_lro(struct net_device *dev)
1306 {
1307 	u32 flags;
1308 
1309 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1310 		flags = dev->ethtool_ops->get_flags(dev);
1311 	else
1312 		flags = ethtool_op_get_flags(dev);
1313 
1314 	if (!(flags & ETH_FLAG_LRO))
1315 		return;
1316 
1317 	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1318 	WARN_ON(dev->features & NETIF_F_LRO);
1319 }
1320 EXPORT_SYMBOL(dev_disable_lro);
1321 
1322 
1323 static int dev_boot_phase = 1;
1324 
1325 /**
1326  *	register_netdevice_notifier - register a network notifier block
1327  *	@nb: notifier
1328  *
1329  *	Register a notifier to be called when network device events occur.
1330  *	The notifier passed is linked into the kernel structures and must
1331  *	not be reused until it has been unregistered. A negative errno code
1332  *	is returned on a failure.
1333  *
1334  * 	When registered all registration and up events are replayed
1335  *	to the new notifier to allow device to have a race free
1336  *	view of the network device list.
1337  */
1338 
1339 int register_netdevice_notifier(struct notifier_block *nb)
1340 {
1341 	struct net_device *dev;
1342 	struct net_device *last;
1343 	struct net *net;
1344 	int err;
1345 
1346 	rtnl_lock();
1347 	err = raw_notifier_chain_register(&netdev_chain, nb);
1348 	if (err)
1349 		goto unlock;
1350 	if (dev_boot_phase)
1351 		goto unlock;
1352 	for_each_net(net) {
1353 		for_each_netdev(net, dev) {
1354 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1355 			err = notifier_to_errno(err);
1356 			if (err)
1357 				goto rollback;
1358 
1359 			if (!(dev->flags & IFF_UP))
1360 				continue;
1361 
1362 			nb->notifier_call(nb, NETDEV_UP, dev);
1363 		}
1364 	}
1365 
1366 unlock:
1367 	rtnl_unlock();
1368 	return err;
1369 
1370 rollback:
1371 	last = dev;
1372 	for_each_net(net) {
1373 		for_each_netdev(net, dev) {
1374 			if (dev == last)
1375 				break;
1376 
1377 			if (dev->flags & IFF_UP) {
1378 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1379 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1380 			}
1381 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1382 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1383 		}
1384 	}
1385 
1386 	raw_notifier_chain_unregister(&netdev_chain, nb);
1387 	goto unlock;
1388 }
1389 EXPORT_SYMBOL(register_netdevice_notifier);
1390 
1391 /**
1392  *	unregister_netdevice_notifier - unregister a network notifier block
1393  *	@nb: notifier
1394  *
1395  *	Unregister a notifier previously registered by
1396  *	register_netdevice_notifier(). The notifier is unlinked into the
1397  *	kernel structures and may then be reused. A negative errno code
1398  *	is returned on a failure.
1399  */
1400 
1401 int unregister_netdevice_notifier(struct notifier_block *nb)
1402 {
1403 	int err;
1404 
1405 	rtnl_lock();
1406 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1407 	rtnl_unlock();
1408 	return err;
1409 }
1410 EXPORT_SYMBOL(unregister_netdevice_notifier);
1411 
1412 /**
1413  *	call_netdevice_notifiers - call all network notifier blocks
1414  *      @val: value passed unmodified to notifier function
1415  *      @dev: net_device pointer passed unmodified to notifier function
1416  *
1417  *	Call all network notifier blocks.  Parameters and return value
1418  *	are as for raw_notifier_call_chain().
1419  */
1420 
1421 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1422 {
1423 	ASSERT_RTNL();
1424 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1425 }
1426 EXPORT_SYMBOL(call_netdevice_notifiers);
1427 
1428 /* When > 0 there are consumers of rx skb time stamps */
1429 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1430 
1431 void net_enable_timestamp(void)
1432 {
1433 	atomic_inc(&netstamp_needed);
1434 }
1435 EXPORT_SYMBOL(net_enable_timestamp);
1436 
1437 void net_disable_timestamp(void)
1438 {
1439 	atomic_dec(&netstamp_needed);
1440 }
1441 EXPORT_SYMBOL(net_disable_timestamp);
1442 
1443 static inline void net_timestamp_set(struct sk_buff *skb)
1444 {
1445 	if (atomic_read(&netstamp_needed))
1446 		__net_timestamp(skb);
1447 	else
1448 		skb->tstamp.tv64 = 0;
1449 }
1450 
1451 static inline void net_timestamp_check(struct sk_buff *skb)
1452 {
1453 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1454 		__net_timestamp(skb);
1455 }
1456 
1457 static inline bool is_skb_forwardable(struct net_device *dev,
1458 				      struct sk_buff *skb)
1459 {
1460 	unsigned int len;
1461 
1462 	if (!(dev->flags & IFF_UP))
1463 		return false;
1464 
1465 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1466 	if (skb->len <= len)
1467 		return true;
1468 
1469 	/* if TSO is enabled, we don't care about the length as the packet
1470 	 * could be forwarded without being segmented before
1471 	 */
1472 	if (skb_is_gso(skb))
1473 		return true;
1474 
1475 	return false;
1476 }
1477 
1478 /**
1479  * dev_forward_skb - loopback an skb to another netif
1480  *
1481  * @dev: destination network device
1482  * @skb: buffer to forward
1483  *
1484  * return values:
1485  *	NET_RX_SUCCESS	(no congestion)
1486  *	NET_RX_DROP     (packet was dropped, but freed)
1487  *
1488  * dev_forward_skb can be used for injecting an skb from the
1489  * start_xmit function of one device into the receive queue
1490  * of another device.
1491  *
1492  * The receiving device may be in another namespace, so
1493  * we have to clear all information in the skb that could
1494  * impact namespace isolation.
1495  */
1496 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1497 {
1498 	skb_orphan(skb);
1499 	nf_reset(skb);
1500 
1501 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1502 		atomic_long_inc(&dev->rx_dropped);
1503 		kfree_skb(skb);
1504 		return NET_RX_DROP;
1505 	}
1506 	skb_set_dev(skb, dev);
1507 	skb->tstamp.tv64 = 0;
1508 	skb->pkt_type = PACKET_HOST;
1509 	skb->protocol = eth_type_trans(skb, dev);
1510 	return netif_rx(skb);
1511 }
1512 EXPORT_SYMBOL_GPL(dev_forward_skb);
1513 
1514 static inline int deliver_skb(struct sk_buff *skb,
1515 			      struct packet_type *pt_prev,
1516 			      struct net_device *orig_dev)
1517 {
1518 	atomic_inc(&skb->users);
1519 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1520 }
1521 
1522 /*
1523  *	Support routine. Sends outgoing frames to any network
1524  *	taps currently in use.
1525  */
1526 
1527 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1528 {
1529 	struct packet_type *ptype;
1530 	struct sk_buff *skb2 = NULL;
1531 	struct packet_type *pt_prev = NULL;
1532 
1533 	rcu_read_lock();
1534 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1535 		/* Never send packets back to the socket
1536 		 * they originated from - MvS (miquels@drinkel.ow.org)
1537 		 */
1538 		if ((ptype->dev == dev || !ptype->dev) &&
1539 		    (ptype->af_packet_priv == NULL ||
1540 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1541 			if (pt_prev) {
1542 				deliver_skb(skb2, pt_prev, skb->dev);
1543 				pt_prev = ptype;
1544 				continue;
1545 			}
1546 
1547 			skb2 = skb_clone(skb, GFP_ATOMIC);
1548 			if (!skb2)
1549 				break;
1550 
1551 			net_timestamp_set(skb2);
1552 
1553 			/* skb->nh should be correctly
1554 			   set by sender, so that the second statement is
1555 			   just protection against buggy protocols.
1556 			 */
1557 			skb_reset_mac_header(skb2);
1558 
1559 			if (skb_network_header(skb2) < skb2->data ||
1560 			    skb2->network_header > skb2->tail) {
1561 				if (net_ratelimit())
1562 					printk(KERN_CRIT "protocol %04x is "
1563 					       "buggy, dev %s\n",
1564 					       ntohs(skb2->protocol),
1565 					       dev->name);
1566 				skb_reset_network_header(skb2);
1567 			}
1568 
1569 			skb2->transport_header = skb2->network_header;
1570 			skb2->pkt_type = PACKET_OUTGOING;
1571 			pt_prev = ptype;
1572 		}
1573 	}
1574 	if (pt_prev)
1575 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1576 	rcu_read_unlock();
1577 }
1578 
1579 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1580  * @dev: Network device
1581  * @txq: number of queues available
1582  *
1583  * If real_num_tx_queues is changed the tc mappings may no longer be
1584  * valid. To resolve this verify the tc mapping remains valid and if
1585  * not NULL the mapping. With no priorities mapping to this
1586  * offset/count pair it will no longer be used. In the worst case TC0
1587  * is invalid nothing can be done so disable priority mappings. If is
1588  * expected that drivers will fix this mapping if they can before
1589  * calling netif_set_real_num_tx_queues.
1590  */
1591 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1592 {
1593 	int i;
1594 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1595 
1596 	/* If TC0 is invalidated disable TC mapping */
1597 	if (tc->offset + tc->count > txq) {
1598 		pr_warning("Number of in use tx queues changed "
1599 			   "invalidating tc mappings. Priority "
1600 			   "traffic classification disabled!\n");
1601 		dev->num_tc = 0;
1602 		return;
1603 	}
1604 
1605 	/* Invalidated prio to tc mappings set to TC0 */
1606 	for (i = 1; i < TC_BITMASK + 1; i++) {
1607 		int q = netdev_get_prio_tc_map(dev, i);
1608 
1609 		tc = &dev->tc_to_txq[q];
1610 		if (tc->offset + tc->count > txq) {
1611 			pr_warning("Number of in use tx queues "
1612 				   "changed. Priority %i to tc "
1613 				   "mapping %i is no longer valid "
1614 				   "setting map to 0\n",
1615 				   i, q);
1616 			netdev_set_prio_tc_map(dev, i, 0);
1617 		}
1618 	}
1619 }
1620 
1621 /*
1622  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1623  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1624  */
1625 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1626 {
1627 	int rc;
1628 
1629 	if (txq < 1 || txq > dev->num_tx_queues)
1630 		return -EINVAL;
1631 
1632 	if (dev->reg_state == NETREG_REGISTERED ||
1633 	    dev->reg_state == NETREG_UNREGISTERING) {
1634 		ASSERT_RTNL();
1635 
1636 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1637 						  txq);
1638 		if (rc)
1639 			return rc;
1640 
1641 		if (dev->num_tc)
1642 			netif_setup_tc(dev, txq);
1643 
1644 		if (txq < dev->real_num_tx_queues)
1645 			qdisc_reset_all_tx_gt(dev, txq);
1646 	}
1647 
1648 	dev->real_num_tx_queues = txq;
1649 	return 0;
1650 }
1651 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1652 
1653 #ifdef CONFIG_RPS
1654 /**
1655  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1656  *	@dev: Network device
1657  *	@rxq: Actual number of RX queues
1658  *
1659  *	This must be called either with the rtnl_lock held or before
1660  *	registration of the net device.  Returns 0 on success, or a
1661  *	negative error code.  If called before registration, it always
1662  *	succeeds.
1663  */
1664 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1665 {
1666 	int rc;
1667 
1668 	if (rxq < 1 || rxq > dev->num_rx_queues)
1669 		return -EINVAL;
1670 
1671 	if (dev->reg_state == NETREG_REGISTERED) {
1672 		ASSERT_RTNL();
1673 
1674 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1675 						  rxq);
1676 		if (rc)
1677 			return rc;
1678 	}
1679 
1680 	dev->real_num_rx_queues = rxq;
1681 	return 0;
1682 }
1683 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1684 #endif
1685 
1686 static inline void __netif_reschedule(struct Qdisc *q)
1687 {
1688 	struct softnet_data *sd;
1689 	unsigned long flags;
1690 
1691 	local_irq_save(flags);
1692 	sd = &__get_cpu_var(softnet_data);
1693 	q->next_sched = NULL;
1694 	*sd->output_queue_tailp = q;
1695 	sd->output_queue_tailp = &q->next_sched;
1696 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1697 	local_irq_restore(flags);
1698 }
1699 
1700 void __netif_schedule(struct Qdisc *q)
1701 {
1702 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1703 		__netif_reschedule(q);
1704 }
1705 EXPORT_SYMBOL(__netif_schedule);
1706 
1707 void dev_kfree_skb_irq(struct sk_buff *skb)
1708 {
1709 	if (atomic_dec_and_test(&skb->users)) {
1710 		struct softnet_data *sd;
1711 		unsigned long flags;
1712 
1713 		local_irq_save(flags);
1714 		sd = &__get_cpu_var(softnet_data);
1715 		skb->next = sd->completion_queue;
1716 		sd->completion_queue = skb;
1717 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1718 		local_irq_restore(flags);
1719 	}
1720 }
1721 EXPORT_SYMBOL(dev_kfree_skb_irq);
1722 
1723 void dev_kfree_skb_any(struct sk_buff *skb)
1724 {
1725 	if (in_irq() || irqs_disabled())
1726 		dev_kfree_skb_irq(skb);
1727 	else
1728 		dev_kfree_skb(skb);
1729 }
1730 EXPORT_SYMBOL(dev_kfree_skb_any);
1731 
1732 
1733 /**
1734  * netif_device_detach - mark device as removed
1735  * @dev: network device
1736  *
1737  * Mark device as removed from system and therefore no longer available.
1738  */
1739 void netif_device_detach(struct net_device *dev)
1740 {
1741 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1742 	    netif_running(dev)) {
1743 		netif_tx_stop_all_queues(dev);
1744 	}
1745 }
1746 EXPORT_SYMBOL(netif_device_detach);
1747 
1748 /**
1749  * netif_device_attach - mark device as attached
1750  * @dev: network device
1751  *
1752  * Mark device as attached from system and restart if needed.
1753  */
1754 void netif_device_attach(struct net_device *dev)
1755 {
1756 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1757 	    netif_running(dev)) {
1758 		netif_tx_wake_all_queues(dev);
1759 		__netdev_watchdog_up(dev);
1760 	}
1761 }
1762 EXPORT_SYMBOL(netif_device_attach);
1763 
1764 /**
1765  * skb_dev_set -- assign a new device to a buffer
1766  * @skb: buffer for the new device
1767  * @dev: network device
1768  *
1769  * If an skb is owned by a device already, we have to reset
1770  * all data private to the namespace a device belongs to
1771  * before assigning it a new device.
1772  */
1773 #ifdef CONFIG_NET_NS
1774 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1775 {
1776 	skb_dst_drop(skb);
1777 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1778 		secpath_reset(skb);
1779 		nf_reset(skb);
1780 		skb_init_secmark(skb);
1781 		skb->mark = 0;
1782 		skb->priority = 0;
1783 		skb->nf_trace = 0;
1784 		skb->ipvs_property = 0;
1785 #ifdef CONFIG_NET_SCHED
1786 		skb->tc_index = 0;
1787 #endif
1788 	}
1789 	skb->dev = dev;
1790 }
1791 EXPORT_SYMBOL(skb_set_dev);
1792 #endif /* CONFIG_NET_NS */
1793 
1794 /*
1795  * Invalidate hardware checksum when packet is to be mangled, and
1796  * complete checksum manually on outgoing path.
1797  */
1798 int skb_checksum_help(struct sk_buff *skb)
1799 {
1800 	__wsum csum;
1801 	int ret = 0, offset;
1802 
1803 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1804 		goto out_set_summed;
1805 
1806 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1807 		/* Let GSO fix up the checksum. */
1808 		goto out_set_summed;
1809 	}
1810 
1811 	offset = skb_checksum_start_offset(skb);
1812 	BUG_ON(offset >= skb_headlen(skb));
1813 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1814 
1815 	offset += skb->csum_offset;
1816 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1817 
1818 	if (skb_cloned(skb) &&
1819 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1820 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1821 		if (ret)
1822 			goto out;
1823 	}
1824 
1825 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1826 out_set_summed:
1827 	skb->ip_summed = CHECKSUM_NONE;
1828 out:
1829 	return ret;
1830 }
1831 EXPORT_SYMBOL(skb_checksum_help);
1832 
1833 /**
1834  *	skb_gso_segment - Perform segmentation on skb.
1835  *	@skb: buffer to segment
1836  *	@features: features for the output path (see dev->features)
1837  *
1838  *	This function segments the given skb and returns a list of segments.
1839  *
1840  *	It may return NULL if the skb requires no segmentation.  This is
1841  *	only possible when GSO is used for verifying header integrity.
1842  */
1843 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1844 {
1845 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1846 	struct packet_type *ptype;
1847 	__be16 type = skb->protocol;
1848 	int vlan_depth = ETH_HLEN;
1849 	int err;
1850 
1851 	while (type == htons(ETH_P_8021Q)) {
1852 		struct vlan_hdr *vh;
1853 
1854 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1855 			return ERR_PTR(-EINVAL);
1856 
1857 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1858 		type = vh->h_vlan_encapsulated_proto;
1859 		vlan_depth += VLAN_HLEN;
1860 	}
1861 
1862 	skb_reset_mac_header(skb);
1863 	skb->mac_len = skb->network_header - skb->mac_header;
1864 	__skb_pull(skb, skb->mac_len);
1865 
1866 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1867 		struct net_device *dev = skb->dev;
1868 		struct ethtool_drvinfo info = {};
1869 
1870 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1871 			dev->ethtool_ops->get_drvinfo(dev, &info);
1872 
1873 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1874 		     info.driver, dev ? dev->features : 0L,
1875 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1876 		     skb->len, skb->data_len, skb->ip_summed);
1877 
1878 		if (skb_header_cloned(skb) &&
1879 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1880 			return ERR_PTR(err);
1881 	}
1882 
1883 	rcu_read_lock();
1884 	list_for_each_entry_rcu(ptype,
1885 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1886 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1887 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1888 				err = ptype->gso_send_check(skb);
1889 				segs = ERR_PTR(err);
1890 				if (err || skb_gso_ok(skb, features))
1891 					break;
1892 				__skb_push(skb, (skb->data -
1893 						 skb_network_header(skb)));
1894 			}
1895 			segs = ptype->gso_segment(skb, features);
1896 			break;
1897 		}
1898 	}
1899 	rcu_read_unlock();
1900 
1901 	__skb_push(skb, skb->data - skb_mac_header(skb));
1902 
1903 	return segs;
1904 }
1905 EXPORT_SYMBOL(skb_gso_segment);
1906 
1907 /* Take action when hardware reception checksum errors are detected. */
1908 #ifdef CONFIG_BUG
1909 void netdev_rx_csum_fault(struct net_device *dev)
1910 {
1911 	if (net_ratelimit()) {
1912 		printk(KERN_ERR "%s: hw csum failure.\n",
1913 			dev ? dev->name : "<unknown>");
1914 		dump_stack();
1915 	}
1916 }
1917 EXPORT_SYMBOL(netdev_rx_csum_fault);
1918 #endif
1919 
1920 /* Actually, we should eliminate this check as soon as we know, that:
1921  * 1. IOMMU is present and allows to map all the memory.
1922  * 2. No high memory really exists on this machine.
1923  */
1924 
1925 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1926 {
1927 #ifdef CONFIG_HIGHMEM
1928 	int i;
1929 	if (!(dev->features & NETIF_F_HIGHDMA)) {
1930 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1931 			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1932 				return 1;
1933 	}
1934 
1935 	if (PCI_DMA_BUS_IS_PHYS) {
1936 		struct device *pdev = dev->dev.parent;
1937 
1938 		if (!pdev)
1939 			return 0;
1940 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1941 			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1942 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1943 				return 1;
1944 		}
1945 	}
1946 #endif
1947 	return 0;
1948 }
1949 
1950 struct dev_gso_cb {
1951 	void (*destructor)(struct sk_buff *skb);
1952 };
1953 
1954 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1955 
1956 static void dev_gso_skb_destructor(struct sk_buff *skb)
1957 {
1958 	struct dev_gso_cb *cb;
1959 
1960 	do {
1961 		struct sk_buff *nskb = skb->next;
1962 
1963 		skb->next = nskb->next;
1964 		nskb->next = NULL;
1965 		kfree_skb(nskb);
1966 	} while (skb->next);
1967 
1968 	cb = DEV_GSO_CB(skb);
1969 	if (cb->destructor)
1970 		cb->destructor(skb);
1971 }
1972 
1973 /**
1974  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1975  *	@skb: buffer to segment
1976  *	@features: device features as applicable to this skb
1977  *
1978  *	This function segments the given skb and stores the list of segments
1979  *	in skb->next.
1980  */
1981 static int dev_gso_segment(struct sk_buff *skb, int features)
1982 {
1983 	struct sk_buff *segs;
1984 
1985 	segs = skb_gso_segment(skb, features);
1986 
1987 	/* Verifying header integrity only. */
1988 	if (!segs)
1989 		return 0;
1990 
1991 	if (IS_ERR(segs))
1992 		return PTR_ERR(segs);
1993 
1994 	skb->next = segs;
1995 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1996 	skb->destructor = dev_gso_skb_destructor;
1997 
1998 	return 0;
1999 }
2000 
2001 /*
2002  * Try to orphan skb early, right before transmission by the device.
2003  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2004  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2005  */
2006 static inline void skb_orphan_try(struct sk_buff *skb)
2007 {
2008 	struct sock *sk = skb->sk;
2009 
2010 	if (sk && !skb_shinfo(skb)->tx_flags) {
2011 		/* skb_tx_hash() wont be able to get sk.
2012 		 * We copy sk_hash into skb->rxhash
2013 		 */
2014 		if (!skb->rxhash)
2015 			skb->rxhash = sk->sk_hash;
2016 		skb_orphan(skb);
2017 	}
2018 }
2019 
2020 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2021 {
2022 	return ((features & NETIF_F_GEN_CSUM) ||
2023 		((features & NETIF_F_V4_CSUM) &&
2024 		 protocol == htons(ETH_P_IP)) ||
2025 		((features & NETIF_F_V6_CSUM) &&
2026 		 protocol == htons(ETH_P_IPV6)) ||
2027 		((features & NETIF_F_FCOE_CRC) &&
2028 		 protocol == htons(ETH_P_FCOE)));
2029 }
2030 
2031 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2032 {
2033 	if (!can_checksum_protocol(features, protocol)) {
2034 		features &= ~NETIF_F_ALL_CSUM;
2035 		features &= ~NETIF_F_SG;
2036 	} else if (illegal_highdma(skb->dev, skb)) {
2037 		features &= ~NETIF_F_SG;
2038 	}
2039 
2040 	return features;
2041 }
2042 
2043 u32 netif_skb_features(struct sk_buff *skb)
2044 {
2045 	__be16 protocol = skb->protocol;
2046 	u32 features = skb->dev->features;
2047 
2048 	if (protocol == htons(ETH_P_8021Q)) {
2049 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2050 		protocol = veh->h_vlan_encapsulated_proto;
2051 	} else if (!vlan_tx_tag_present(skb)) {
2052 		return harmonize_features(skb, protocol, features);
2053 	}
2054 
2055 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2056 
2057 	if (protocol != htons(ETH_P_8021Q)) {
2058 		return harmonize_features(skb, protocol, features);
2059 	} else {
2060 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2061 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2062 		return harmonize_features(skb, protocol, features);
2063 	}
2064 }
2065 EXPORT_SYMBOL(netif_skb_features);
2066 
2067 /*
2068  * Returns true if either:
2069  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2070  *	2. skb is fragmented and the device does not support SG, or if
2071  *	   at least one of fragments is in highmem and device does not
2072  *	   support DMA from it.
2073  */
2074 static inline int skb_needs_linearize(struct sk_buff *skb,
2075 				      int features)
2076 {
2077 	return skb_is_nonlinear(skb) &&
2078 			((skb_has_frag_list(skb) &&
2079 				!(features & NETIF_F_FRAGLIST)) ||
2080 			(skb_shinfo(skb)->nr_frags &&
2081 				!(features & NETIF_F_SG)));
2082 }
2083 
2084 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2085 			struct netdev_queue *txq)
2086 {
2087 	const struct net_device_ops *ops = dev->netdev_ops;
2088 	int rc = NETDEV_TX_OK;
2089 
2090 	if (likely(!skb->next)) {
2091 		u32 features;
2092 
2093 		/*
2094 		 * If device doesn't need skb->dst, release it right now while
2095 		 * its hot in this cpu cache
2096 		 */
2097 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2098 			skb_dst_drop(skb);
2099 
2100 		if (!list_empty(&ptype_all))
2101 			dev_queue_xmit_nit(skb, dev);
2102 
2103 		skb_orphan_try(skb);
2104 
2105 		features = netif_skb_features(skb);
2106 
2107 		if (vlan_tx_tag_present(skb) &&
2108 		    !(features & NETIF_F_HW_VLAN_TX)) {
2109 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2110 			if (unlikely(!skb))
2111 				goto out;
2112 
2113 			skb->vlan_tci = 0;
2114 		}
2115 
2116 		if (netif_needs_gso(skb, features)) {
2117 			if (unlikely(dev_gso_segment(skb, features)))
2118 				goto out_kfree_skb;
2119 			if (skb->next)
2120 				goto gso;
2121 		} else {
2122 			if (skb_needs_linearize(skb, features) &&
2123 			    __skb_linearize(skb))
2124 				goto out_kfree_skb;
2125 
2126 			/* If packet is not checksummed and device does not
2127 			 * support checksumming for this protocol, complete
2128 			 * checksumming here.
2129 			 */
2130 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2131 				skb_set_transport_header(skb,
2132 					skb_checksum_start_offset(skb));
2133 				if (!(features & NETIF_F_ALL_CSUM) &&
2134 				     skb_checksum_help(skb))
2135 					goto out_kfree_skb;
2136 			}
2137 		}
2138 
2139 		rc = ops->ndo_start_xmit(skb, dev);
2140 		trace_net_dev_xmit(skb, rc);
2141 		if (rc == NETDEV_TX_OK)
2142 			txq_trans_update(txq);
2143 		return rc;
2144 	}
2145 
2146 gso:
2147 	do {
2148 		struct sk_buff *nskb = skb->next;
2149 
2150 		skb->next = nskb->next;
2151 		nskb->next = NULL;
2152 
2153 		/*
2154 		 * If device doesn't need nskb->dst, release it right now while
2155 		 * its hot in this cpu cache
2156 		 */
2157 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2158 			skb_dst_drop(nskb);
2159 
2160 		rc = ops->ndo_start_xmit(nskb, dev);
2161 		trace_net_dev_xmit(nskb, rc);
2162 		if (unlikely(rc != NETDEV_TX_OK)) {
2163 			if (rc & ~NETDEV_TX_MASK)
2164 				goto out_kfree_gso_skb;
2165 			nskb->next = skb->next;
2166 			skb->next = nskb;
2167 			return rc;
2168 		}
2169 		txq_trans_update(txq);
2170 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2171 			return NETDEV_TX_BUSY;
2172 	} while (skb->next);
2173 
2174 out_kfree_gso_skb:
2175 	if (likely(skb->next == NULL))
2176 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2177 out_kfree_skb:
2178 	kfree_skb(skb);
2179 out:
2180 	return rc;
2181 }
2182 
2183 static u32 hashrnd __read_mostly;
2184 
2185 /*
2186  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2187  * to be used as a distribution range.
2188  */
2189 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2190 		  unsigned int num_tx_queues)
2191 {
2192 	u32 hash;
2193 	u16 qoffset = 0;
2194 	u16 qcount = num_tx_queues;
2195 
2196 	if (skb_rx_queue_recorded(skb)) {
2197 		hash = skb_get_rx_queue(skb);
2198 		while (unlikely(hash >= num_tx_queues))
2199 			hash -= num_tx_queues;
2200 		return hash;
2201 	}
2202 
2203 	if (dev->num_tc) {
2204 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2205 		qoffset = dev->tc_to_txq[tc].offset;
2206 		qcount = dev->tc_to_txq[tc].count;
2207 	}
2208 
2209 	if (skb->sk && skb->sk->sk_hash)
2210 		hash = skb->sk->sk_hash;
2211 	else
2212 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2213 	hash = jhash_1word(hash, hashrnd);
2214 
2215 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2216 }
2217 EXPORT_SYMBOL(__skb_tx_hash);
2218 
2219 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2220 {
2221 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2222 		if (net_ratelimit()) {
2223 			pr_warning("%s selects TX queue %d, but "
2224 				"real number of TX queues is %d\n",
2225 				dev->name, queue_index, dev->real_num_tx_queues);
2226 		}
2227 		return 0;
2228 	}
2229 	return queue_index;
2230 }
2231 
2232 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2233 {
2234 #ifdef CONFIG_XPS
2235 	struct xps_dev_maps *dev_maps;
2236 	struct xps_map *map;
2237 	int queue_index = -1;
2238 
2239 	rcu_read_lock();
2240 	dev_maps = rcu_dereference(dev->xps_maps);
2241 	if (dev_maps) {
2242 		map = rcu_dereference(
2243 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2244 		if (map) {
2245 			if (map->len == 1)
2246 				queue_index = map->queues[0];
2247 			else {
2248 				u32 hash;
2249 				if (skb->sk && skb->sk->sk_hash)
2250 					hash = skb->sk->sk_hash;
2251 				else
2252 					hash = (__force u16) skb->protocol ^
2253 					    skb->rxhash;
2254 				hash = jhash_1word(hash, hashrnd);
2255 				queue_index = map->queues[
2256 				    ((u64)hash * map->len) >> 32];
2257 			}
2258 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2259 				queue_index = -1;
2260 		}
2261 	}
2262 	rcu_read_unlock();
2263 
2264 	return queue_index;
2265 #else
2266 	return -1;
2267 #endif
2268 }
2269 
2270 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2271 					struct sk_buff *skb)
2272 {
2273 	int queue_index;
2274 	const struct net_device_ops *ops = dev->netdev_ops;
2275 
2276 	if (dev->real_num_tx_queues == 1)
2277 		queue_index = 0;
2278 	else if (ops->ndo_select_queue) {
2279 		queue_index = ops->ndo_select_queue(dev, skb);
2280 		queue_index = dev_cap_txqueue(dev, queue_index);
2281 	} else {
2282 		struct sock *sk = skb->sk;
2283 		queue_index = sk_tx_queue_get(sk);
2284 
2285 		if (queue_index < 0 || skb->ooo_okay ||
2286 		    queue_index >= dev->real_num_tx_queues) {
2287 			int old_index = queue_index;
2288 
2289 			queue_index = get_xps_queue(dev, skb);
2290 			if (queue_index < 0)
2291 				queue_index = skb_tx_hash(dev, skb);
2292 
2293 			if (queue_index != old_index && sk) {
2294 				struct dst_entry *dst =
2295 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2296 
2297 				if (dst && skb_dst(skb) == dst)
2298 					sk_tx_queue_set(sk, queue_index);
2299 			}
2300 		}
2301 	}
2302 
2303 	skb_set_queue_mapping(skb, queue_index);
2304 	return netdev_get_tx_queue(dev, queue_index);
2305 }
2306 
2307 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2308 				 struct net_device *dev,
2309 				 struct netdev_queue *txq)
2310 {
2311 	spinlock_t *root_lock = qdisc_lock(q);
2312 	bool contended;
2313 	int rc;
2314 
2315 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2316 	qdisc_calculate_pkt_len(skb, q);
2317 	/*
2318 	 * Heuristic to force contended enqueues to serialize on a
2319 	 * separate lock before trying to get qdisc main lock.
2320 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2321 	 * and dequeue packets faster.
2322 	 */
2323 	contended = qdisc_is_running(q);
2324 	if (unlikely(contended))
2325 		spin_lock(&q->busylock);
2326 
2327 	spin_lock(root_lock);
2328 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2329 		kfree_skb(skb);
2330 		rc = NET_XMIT_DROP;
2331 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2332 		   qdisc_run_begin(q)) {
2333 		/*
2334 		 * This is a work-conserving queue; there are no old skbs
2335 		 * waiting to be sent out; and the qdisc is not running -
2336 		 * xmit the skb directly.
2337 		 */
2338 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2339 			skb_dst_force(skb);
2340 
2341 		qdisc_bstats_update(q, skb);
2342 
2343 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2344 			if (unlikely(contended)) {
2345 				spin_unlock(&q->busylock);
2346 				contended = false;
2347 			}
2348 			__qdisc_run(q);
2349 		} else
2350 			qdisc_run_end(q);
2351 
2352 		rc = NET_XMIT_SUCCESS;
2353 	} else {
2354 		skb_dst_force(skb);
2355 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2356 		if (qdisc_run_begin(q)) {
2357 			if (unlikely(contended)) {
2358 				spin_unlock(&q->busylock);
2359 				contended = false;
2360 			}
2361 			__qdisc_run(q);
2362 		}
2363 	}
2364 	spin_unlock(root_lock);
2365 	if (unlikely(contended))
2366 		spin_unlock(&q->busylock);
2367 	return rc;
2368 }
2369 
2370 static DEFINE_PER_CPU(int, xmit_recursion);
2371 #define RECURSION_LIMIT 10
2372 
2373 /**
2374  *	dev_queue_xmit - transmit a buffer
2375  *	@skb: buffer to transmit
2376  *
2377  *	Queue a buffer for transmission to a network device. The caller must
2378  *	have set the device and priority and built the buffer before calling
2379  *	this function. The function can be called from an interrupt.
2380  *
2381  *	A negative errno code is returned on a failure. A success does not
2382  *	guarantee the frame will be transmitted as it may be dropped due
2383  *	to congestion or traffic shaping.
2384  *
2385  * -----------------------------------------------------------------------------------
2386  *      I notice this method can also return errors from the queue disciplines,
2387  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2388  *      be positive.
2389  *
2390  *      Regardless of the return value, the skb is consumed, so it is currently
2391  *      difficult to retry a send to this method.  (You can bump the ref count
2392  *      before sending to hold a reference for retry if you are careful.)
2393  *
2394  *      When calling this method, interrupts MUST be enabled.  This is because
2395  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2396  *          --BLG
2397  */
2398 int dev_queue_xmit(struct sk_buff *skb)
2399 {
2400 	struct net_device *dev = skb->dev;
2401 	struct netdev_queue *txq;
2402 	struct Qdisc *q;
2403 	int rc = -ENOMEM;
2404 
2405 	/* Disable soft irqs for various locks below. Also
2406 	 * stops preemption for RCU.
2407 	 */
2408 	rcu_read_lock_bh();
2409 
2410 	txq = dev_pick_tx(dev, skb);
2411 	q = rcu_dereference_bh(txq->qdisc);
2412 
2413 #ifdef CONFIG_NET_CLS_ACT
2414 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2415 #endif
2416 	trace_net_dev_queue(skb);
2417 	if (q->enqueue) {
2418 		rc = __dev_xmit_skb(skb, q, dev, txq);
2419 		goto out;
2420 	}
2421 
2422 	/* The device has no queue. Common case for software devices:
2423 	   loopback, all the sorts of tunnels...
2424 
2425 	   Really, it is unlikely that netif_tx_lock protection is necessary
2426 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2427 	   counters.)
2428 	   However, it is possible, that they rely on protection
2429 	   made by us here.
2430 
2431 	   Check this and shot the lock. It is not prone from deadlocks.
2432 	   Either shot noqueue qdisc, it is even simpler 8)
2433 	 */
2434 	if (dev->flags & IFF_UP) {
2435 		int cpu = smp_processor_id(); /* ok because BHs are off */
2436 
2437 		if (txq->xmit_lock_owner != cpu) {
2438 
2439 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2440 				goto recursion_alert;
2441 
2442 			HARD_TX_LOCK(dev, txq, cpu);
2443 
2444 			if (!netif_tx_queue_stopped(txq)) {
2445 				__this_cpu_inc(xmit_recursion);
2446 				rc = dev_hard_start_xmit(skb, dev, txq);
2447 				__this_cpu_dec(xmit_recursion);
2448 				if (dev_xmit_complete(rc)) {
2449 					HARD_TX_UNLOCK(dev, txq);
2450 					goto out;
2451 				}
2452 			}
2453 			HARD_TX_UNLOCK(dev, txq);
2454 			if (net_ratelimit())
2455 				printk(KERN_CRIT "Virtual device %s asks to "
2456 				       "queue packet!\n", dev->name);
2457 		} else {
2458 			/* Recursion is detected! It is possible,
2459 			 * unfortunately
2460 			 */
2461 recursion_alert:
2462 			if (net_ratelimit())
2463 				printk(KERN_CRIT "Dead loop on virtual device "
2464 				       "%s, fix it urgently!\n", dev->name);
2465 		}
2466 	}
2467 
2468 	rc = -ENETDOWN;
2469 	rcu_read_unlock_bh();
2470 
2471 	kfree_skb(skb);
2472 	return rc;
2473 out:
2474 	rcu_read_unlock_bh();
2475 	return rc;
2476 }
2477 EXPORT_SYMBOL(dev_queue_xmit);
2478 
2479 
2480 /*=======================================================================
2481 			Receiver routines
2482   =======================================================================*/
2483 
2484 int netdev_max_backlog __read_mostly = 1000;
2485 int netdev_tstamp_prequeue __read_mostly = 1;
2486 int netdev_budget __read_mostly = 300;
2487 int weight_p __read_mostly = 64;            /* old backlog weight */
2488 
2489 /* Called with irq disabled */
2490 static inline void ____napi_schedule(struct softnet_data *sd,
2491 				     struct napi_struct *napi)
2492 {
2493 	list_add_tail(&napi->poll_list, &sd->poll_list);
2494 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2495 }
2496 
2497 /*
2498  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2499  * and src/dst port numbers. Returns a non-zero hash number on success
2500  * and 0 on failure.
2501  */
2502 __u32 __skb_get_rxhash(struct sk_buff *skb)
2503 {
2504 	int nhoff, hash = 0, poff;
2505 	struct ipv6hdr *ip6;
2506 	struct iphdr *ip;
2507 	u8 ip_proto;
2508 	u32 addr1, addr2, ihl;
2509 	union {
2510 		u32 v32;
2511 		u16 v16[2];
2512 	} ports;
2513 
2514 	nhoff = skb_network_offset(skb);
2515 
2516 	switch (skb->protocol) {
2517 	case __constant_htons(ETH_P_IP):
2518 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2519 			goto done;
2520 
2521 		ip = (struct iphdr *) (skb->data + nhoff);
2522 		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2523 			ip_proto = 0;
2524 		else
2525 			ip_proto = ip->protocol;
2526 		addr1 = (__force u32) ip->saddr;
2527 		addr2 = (__force u32) ip->daddr;
2528 		ihl = ip->ihl;
2529 		break;
2530 	case __constant_htons(ETH_P_IPV6):
2531 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2532 			goto done;
2533 
2534 		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2535 		ip_proto = ip6->nexthdr;
2536 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2537 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2538 		ihl = (40 >> 2);
2539 		break;
2540 	default:
2541 		goto done;
2542 	}
2543 
2544 	ports.v32 = 0;
2545 	poff = proto_ports_offset(ip_proto);
2546 	if (poff >= 0) {
2547 		nhoff += ihl * 4 + poff;
2548 		if (pskb_may_pull(skb, nhoff + 4)) {
2549 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2550 			if (ports.v16[1] < ports.v16[0])
2551 				swap(ports.v16[0], ports.v16[1]);
2552 		}
2553 	}
2554 
2555 	/* get a consistent hash (same value on both flow directions) */
2556 	if (addr2 < addr1)
2557 		swap(addr1, addr2);
2558 
2559 	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2560 	if (!hash)
2561 		hash = 1;
2562 
2563 done:
2564 	return hash;
2565 }
2566 EXPORT_SYMBOL(__skb_get_rxhash);
2567 
2568 #ifdef CONFIG_RPS
2569 
2570 /* One global table that all flow-based protocols share. */
2571 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2572 EXPORT_SYMBOL(rps_sock_flow_table);
2573 
2574 static struct rps_dev_flow *
2575 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2576 	    struct rps_dev_flow *rflow, u16 next_cpu)
2577 {
2578 	u16 tcpu;
2579 
2580 	tcpu = rflow->cpu = next_cpu;
2581 	if (tcpu != RPS_NO_CPU) {
2582 #ifdef CONFIG_RFS_ACCEL
2583 		struct netdev_rx_queue *rxqueue;
2584 		struct rps_dev_flow_table *flow_table;
2585 		struct rps_dev_flow *old_rflow;
2586 		u32 flow_id;
2587 		u16 rxq_index;
2588 		int rc;
2589 
2590 		/* Should we steer this flow to a different hardware queue? */
2591 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2592 		    !(dev->features & NETIF_F_NTUPLE))
2593 			goto out;
2594 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2595 		if (rxq_index == skb_get_rx_queue(skb))
2596 			goto out;
2597 
2598 		rxqueue = dev->_rx + rxq_index;
2599 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2600 		if (!flow_table)
2601 			goto out;
2602 		flow_id = skb->rxhash & flow_table->mask;
2603 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2604 							rxq_index, flow_id);
2605 		if (rc < 0)
2606 			goto out;
2607 		old_rflow = rflow;
2608 		rflow = &flow_table->flows[flow_id];
2609 		rflow->cpu = next_cpu;
2610 		rflow->filter = rc;
2611 		if (old_rflow->filter == rflow->filter)
2612 			old_rflow->filter = RPS_NO_FILTER;
2613 	out:
2614 #endif
2615 		rflow->last_qtail =
2616 			per_cpu(softnet_data, tcpu).input_queue_head;
2617 	}
2618 
2619 	return rflow;
2620 }
2621 
2622 /*
2623  * get_rps_cpu is called from netif_receive_skb and returns the target
2624  * CPU from the RPS map of the receiving queue for a given skb.
2625  * rcu_read_lock must be held on entry.
2626  */
2627 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2628 		       struct rps_dev_flow **rflowp)
2629 {
2630 	struct netdev_rx_queue *rxqueue;
2631 	struct rps_map *map;
2632 	struct rps_dev_flow_table *flow_table;
2633 	struct rps_sock_flow_table *sock_flow_table;
2634 	int cpu = -1;
2635 	u16 tcpu;
2636 
2637 	if (skb_rx_queue_recorded(skb)) {
2638 		u16 index = skb_get_rx_queue(skb);
2639 		if (unlikely(index >= dev->real_num_rx_queues)) {
2640 			WARN_ONCE(dev->real_num_rx_queues > 1,
2641 				  "%s received packet on queue %u, but number "
2642 				  "of RX queues is %u\n",
2643 				  dev->name, index, dev->real_num_rx_queues);
2644 			goto done;
2645 		}
2646 		rxqueue = dev->_rx + index;
2647 	} else
2648 		rxqueue = dev->_rx;
2649 
2650 	map = rcu_dereference(rxqueue->rps_map);
2651 	if (map) {
2652 		if (map->len == 1 &&
2653 		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2654 			tcpu = map->cpus[0];
2655 			if (cpu_online(tcpu))
2656 				cpu = tcpu;
2657 			goto done;
2658 		}
2659 	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2660 		goto done;
2661 	}
2662 
2663 	skb_reset_network_header(skb);
2664 	if (!skb_get_rxhash(skb))
2665 		goto done;
2666 
2667 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2668 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2669 	if (flow_table && sock_flow_table) {
2670 		u16 next_cpu;
2671 		struct rps_dev_flow *rflow;
2672 
2673 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2674 		tcpu = rflow->cpu;
2675 
2676 		next_cpu = sock_flow_table->ents[skb->rxhash &
2677 		    sock_flow_table->mask];
2678 
2679 		/*
2680 		 * If the desired CPU (where last recvmsg was done) is
2681 		 * different from current CPU (one in the rx-queue flow
2682 		 * table entry), switch if one of the following holds:
2683 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2684 		 *   - Current CPU is offline.
2685 		 *   - The current CPU's queue tail has advanced beyond the
2686 		 *     last packet that was enqueued using this table entry.
2687 		 *     This guarantees that all previous packets for the flow
2688 		 *     have been dequeued, thus preserving in order delivery.
2689 		 */
2690 		if (unlikely(tcpu != next_cpu) &&
2691 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2692 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2693 		      rflow->last_qtail)) >= 0))
2694 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2695 
2696 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2697 			*rflowp = rflow;
2698 			cpu = tcpu;
2699 			goto done;
2700 		}
2701 	}
2702 
2703 	if (map) {
2704 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2705 
2706 		if (cpu_online(tcpu)) {
2707 			cpu = tcpu;
2708 			goto done;
2709 		}
2710 	}
2711 
2712 done:
2713 	return cpu;
2714 }
2715 
2716 #ifdef CONFIG_RFS_ACCEL
2717 
2718 /**
2719  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2720  * @dev: Device on which the filter was set
2721  * @rxq_index: RX queue index
2722  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2723  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2724  *
2725  * Drivers that implement ndo_rx_flow_steer() should periodically call
2726  * this function for each installed filter and remove the filters for
2727  * which it returns %true.
2728  */
2729 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2730 			 u32 flow_id, u16 filter_id)
2731 {
2732 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2733 	struct rps_dev_flow_table *flow_table;
2734 	struct rps_dev_flow *rflow;
2735 	bool expire = true;
2736 	int cpu;
2737 
2738 	rcu_read_lock();
2739 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2740 	if (flow_table && flow_id <= flow_table->mask) {
2741 		rflow = &flow_table->flows[flow_id];
2742 		cpu = ACCESS_ONCE(rflow->cpu);
2743 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2744 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2745 			   rflow->last_qtail) <
2746 		     (int)(10 * flow_table->mask)))
2747 			expire = false;
2748 	}
2749 	rcu_read_unlock();
2750 	return expire;
2751 }
2752 EXPORT_SYMBOL(rps_may_expire_flow);
2753 
2754 #endif /* CONFIG_RFS_ACCEL */
2755 
2756 /* Called from hardirq (IPI) context */
2757 static void rps_trigger_softirq(void *data)
2758 {
2759 	struct softnet_data *sd = data;
2760 
2761 	____napi_schedule(sd, &sd->backlog);
2762 	sd->received_rps++;
2763 }
2764 
2765 #endif /* CONFIG_RPS */
2766 
2767 /*
2768  * Check if this softnet_data structure is another cpu one
2769  * If yes, queue it to our IPI list and return 1
2770  * If no, return 0
2771  */
2772 static int rps_ipi_queued(struct softnet_data *sd)
2773 {
2774 #ifdef CONFIG_RPS
2775 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2776 
2777 	if (sd != mysd) {
2778 		sd->rps_ipi_next = mysd->rps_ipi_list;
2779 		mysd->rps_ipi_list = sd;
2780 
2781 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2782 		return 1;
2783 	}
2784 #endif /* CONFIG_RPS */
2785 	return 0;
2786 }
2787 
2788 /*
2789  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2790  * queue (may be a remote CPU queue).
2791  */
2792 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2793 			      unsigned int *qtail)
2794 {
2795 	struct softnet_data *sd;
2796 	unsigned long flags;
2797 
2798 	sd = &per_cpu(softnet_data, cpu);
2799 
2800 	local_irq_save(flags);
2801 
2802 	rps_lock(sd);
2803 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2804 		if (skb_queue_len(&sd->input_pkt_queue)) {
2805 enqueue:
2806 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2807 			input_queue_tail_incr_save(sd, qtail);
2808 			rps_unlock(sd);
2809 			local_irq_restore(flags);
2810 			return NET_RX_SUCCESS;
2811 		}
2812 
2813 		/* Schedule NAPI for backlog device
2814 		 * We can use non atomic operation since we own the queue lock
2815 		 */
2816 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2817 			if (!rps_ipi_queued(sd))
2818 				____napi_schedule(sd, &sd->backlog);
2819 		}
2820 		goto enqueue;
2821 	}
2822 
2823 	sd->dropped++;
2824 	rps_unlock(sd);
2825 
2826 	local_irq_restore(flags);
2827 
2828 	atomic_long_inc(&skb->dev->rx_dropped);
2829 	kfree_skb(skb);
2830 	return NET_RX_DROP;
2831 }
2832 
2833 /**
2834  *	netif_rx	-	post buffer to the network code
2835  *	@skb: buffer to post
2836  *
2837  *	This function receives a packet from a device driver and queues it for
2838  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2839  *	may be dropped during processing for congestion control or by the
2840  *	protocol layers.
2841  *
2842  *	return values:
2843  *	NET_RX_SUCCESS	(no congestion)
2844  *	NET_RX_DROP     (packet was dropped)
2845  *
2846  */
2847 
2848 int netif_rx(struct sk_buff *skb)
2849 {
2850 	int ret;
2851 
2852 	/* if netpoll wants it, pretend we never saw it */
2853 	if (netpoll_rx(skb))
2854 		return NET_RX_DROP;
2855 
2856 	if (netdev_tstamp_prequeue)
2857 		net_timestamp_check(skb);
2858 
2859 	trace_netif_rx(skb);
2860 #ifdef CONFIG_RPS
2861 	{
2862 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2863 		int cpu;
2864 
2865 		preempt_disable();
2866 		rcu_read_lock();
2867 
2868 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2869 		if (cpu < 0)
2870 			cpu = smp_processor_id();
2871 
2872 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2873 
2874 		rcu_read_unlock();
2875 		preempt_enable();
2876 	}
2877 #else
2878 	{
2879 		unsigned int qtail;
2880 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2881 		put_cpu();
2882 	}
2883 #endif
2884 	return ret;
2885 }
2886 EXPORT_SYMBOL(netif_rx);
2887 
2888 int netif_rx_ni(struct sk_buff *skb)
2889 {
2890 	int err;
2891 
2892 	preempt_disable();
2893 	err = netif_rx(skb);
2894 	if (local_softirq_pending())
2895 		do_softirq();
2896 	preempt_enable();
2897 
2898 	return err;
2899 }
2900 EXPORT_SYMBOL(netif_rx_ni);
2901 
2902 static void net_tx_action(struct softirq_action *h)
2903 {
2904 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2905 
2906 	if (sd->completion_queue) {
2907 		struct sk_buff *clist;
2908 
2909 		local_irq_disable();
2910 		clist = sd->completion_queue;
2911 		sd->completion_queue = NULL;
2912 		local_irq_enable();
2913 
2914 		while (clist) {
2915 			struct sk_buff *skb = clist;
2916 			clist = clist->next;
2917 
2918 			WARN_ON(atomic_read(&skb->users));
2919 			trace_kfree_skb(skb, net_tx_action);
2920 			__kfree_skb(skb);
2921 		}
2922 	}
2923 
2924 	if (sd->output_queue) {
2925 		struct Qdisc *head;
2926 
2927 		local_irq_disable();
2928 		head = sd->output_queue;
2929 		sd->output_queue = NULL;
2930 		sd->output_queue_tailp = &sd->output_queue;
2931 		local_irq_enable();
2932 
2933 		while (head) {
2934 			struct Qdisc *q = head;
2935 			spinlock_t *root_lock;
2936 
2937 			head = head->next_sched;
2938 
2939 			root_lock = qdisc_lock(q);
2940 			if (spin_trylock(root_lock)) {
2941 				smp_mb__before_clear_bit();
2942 				clear_bit(__QDISC_STATE_SCHED,
2943 					  &q->state);
2944 				qdisc_run(q);
2945 				spin_unlock(root_lock);
2946 			} else {
2947 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2948 					      &q->state)) {
2949 					__netif_reschedule(q);
2950 				} else {
2951 					smp_mb__before_clear_bit();
2952 					clear_bit(__QDISC_STATE_SCHED,
2953 						  &q->state);
2954 				}
2955 			}
2956 		}
2957 	}
2958 }
2959 
2960 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2961     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2962 /* This hook is defined here for ATM LANE */
2963 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2964 			     unsigned char *addr) __read_mostly;
2965 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2966 #endif
2967 
2968 #ifdef CONFIG_NET_CLS_ACT
2969 /* TODO: Maybe we should just force sch_ingress to be compiled in
2970  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2971  * a compare and 2 stores extra right now if we dont have it on
2972  * but have CONFIG_NET_CLS_ACT
2973  * NOTE: This doesn't stop any functionality; if you dont have
2974  * the ingress scheduler, you just can't add policies on ingress.
2975  *
2976  */
2977 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2978 {
2979 	struct net_device *dev = skb->dev;
2980 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2981 	int result = TC_ACT_OK;
2982 	struct Qdisc *q;
2983 
2984 	if (unlikely(MAX_RED_LOOP < ttl++)) {
2985 		if (net_ratelimit())
2986 			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2987 			       skb->skb_iif, dev->ifindex);
2988 		return TC_ACT_SHOT;
2989 	}
2990 
2991 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2992 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2993 
2994 	q = rxq->qdisc;
2995 	if (q != &noop_qdisc) {
2996 		spin_lock(qdisc_lock(q));
2997 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2998 			result = qdisc_enqueue_root(skb, q);
2999 		spin_unlock(qdisc_lock(q));
3000 	}
3001 
3002 	return result;
3003 }
3004 
3005 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3006 					 struct packet_type **pt_prev,
3007 					 int *ret, struct net_device *orig_dev)
3008 {
3009 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3010 
3011 	if (!rxq || rxq->qdisc == &noop_qdisc)
3012 		goto out;
3013 
3014 	if (*pt_prev) {
3015 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3016 		*pt_prev = NULL;
3017 	}
3018 
3019 	switch (ing_filter(skb, rxq)) {
3020 	case TC_ACT_SHOT:
3021 	case TC_ACT_STOLEN:
3022 		kfree_skb(skb);
3023 		return NULL;
3024 	}
3025 
3026 out:
3027 	skb->tc_verd = 0;
3028 	return skb;
3029 }
3030 #endif
3031 
3032 /**
3033  *	netdev_rx_handler_register - register receive handler
3034  *	@dev: device to register a handler for
3035  *	@rx_handler: receive handler to register
3036  *	@rx_handler_data: data pointer that is used by rx handler
3037  *
3038  *	Register a receive hander for a device. This handler will then be
3039  *	called from __netif_receive_skb. A negative errno code is returned
3040  *	on a failure.
3041  *
3042  *	The caller must hold the rtnl_mutex.
3043  *
3044  *	For a general description of rx_handler, see enum rx_handler_result.
3045  */
3046 int netdev_rx_handler_register(struct net_device *dev,
3047 			       rx_handler_func_t *rx_handler,
3048 			       void *rx_handler_data)
3049 {
3050 	ASSERT_RTNL();
3051 
3052 	if (dev->rx_handler)
3053 		return -EBUSY;
3054 
3055 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3056 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3057 
3058 	return 0;
3059 }
3060 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3061 
3062 /**
3063  *	netdev_rx_handler_unregister - unregister receive handler
3064  *	@dev: device to unregister a handler from
3065  *
3066  *	Unregister a receive hander from a device.
3067  *
3068  *	The caller must hold the rtnl_mutex.
3069  */
3070 void netdev_rx_handler_unregister(struct net_device *dev)
3071 {
3072 
3073 	ASSERT_RTNL();
3074 	rcu_assign_pointer(dev->rx_handler, NULL);
3075 	rcu_assign_pointer(dev->rx_handler_data, NULL);
3076 }
3077 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3078 
3079 static void vlan_on_bond_hook(struct sk_buff *skb)
3080 {
3081 	/*
3082 	 * Make sure ARP frames received on VLAN interfaces stacked on
3083 	 * bonding interfaces still make their way to any base bonding
3084 	 * device that may have registered for a specific ptype.
3085 	 */
3086 	if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
3087 	    vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
3088 	    skb->protocol == htons(ETH_P_ARP)) {
3089 		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
3090 
3091 		if (!skb2)
3092 			return;
3093 		skb2->dev = vlan_dev_real_dev(skb->dev);
3094 		netif_rx(skb2);
3095 	}
3096 }
3097 
3098 static int __netif_receive_skb(struct sk_buff *skb)
3099 {
3100 	struct packet_type *ptype, *pt_prev;
3101 	rx_handler_func_t *rx_handler;
3102 	struct net_device *orig_dev;
3103 	struct net_device *null_or_dev;
3104 	bool deliver_exact = false;
3105 	int ret = NET_RX_DROP;
3106 	__be16 type;
3107 
3108 	if (!netdev_tstamp_prequeue)
3109 		net_timestamp_check(skb);
3110 
3111 	trace_netif_receive_skb(skb);
3112 
3113 	/* if we've gotten here through NAPI, check netpoll */
3114 	if (netpoll_receive_skb(skb))
3115 		return NET_RX_DROP;
3116 
3117 	if (!skb->skb_iif)
3118 		skb->skb_iif = skb->dev->ifindex;
3119 	orig_dev = skb->dev;
3120 
3121 	skb_reset_network_header(skb);
3122 	skb_reset_transport_header(skb);
3123 	skb->mac_len = skb->network_header - skb->mac_header;
3124 
3125 	pt_prev = NULL;
3126 
3127 	rcu_read_lock();
3128 
3129 another_round:
3130 
3131 	__this_cpu_inc(softnet_data.processed);
3132 
3133 #ifdef CONFIG_NET_CLS_ACT
3134 	if (skb->tc_verd & TC_NCLS) {
3135 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3136 		goto ncls;
3137 	}
3138 #endif
3139 
3140 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3141 		if (!ptype->dev || ptype->dev == skb->dev) {
3142 			if (pt_prev)
3143 				ret = deliver_skb(skb, pt_prev, orig_dev);
3144 			pt_prev = ptype;
3145 		}
3146 	}
3147 
3148 #ifdef CONFIG_NET_CLS_ACT
3149 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3150 	if (!skb)
3151 		goto out;
3152 ncls:
3153 #endif
3154 
3155 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3156 	if (rx_handler) {
3157 		if (pt_prev) {
3158 			ret = deliver_skb(skb, pt_prev, orig_dev);
3159 			pt_prev = NULL;
3160 		}
3161 		switch (rx_handler(&skb)) {
3162 		case RX_HANDLER_CONSUMED:
3163 			goto out;
3164 		case RX_HANDLER_ANOTHER:
3165 			goto another_round;
3166 		case RX_HANDLER_EXACT:
3167 			deliver_exact = true;
3168 		case RX_HANDLER_PASS:
3169 			break;
3170 		default:
3171 			BUG();
3172 		}
3173 	}
3174 
3175 	if (vlan_tx_tag_present(skb)) {
3176 		if (pt_prev) {
3177 			ret = deliver_skb(skb, pt_prev, orig_dev);
3178 			pt_prev = NULL;
3179 		}
3180 		if (vlan_hwaccel_do_receive(&skb)) {
3181 			ret = __netif_receive_skb(skb);
3182 			goto out;
3183 		} else if (unlikely(!skb))
3184 			goto out;
3185 	}
3186 
3187 	vlan_on_bond_hook(skb);
3188 
3189 	/* deliver only exact match when indicated */
3190 	null_or_dev = deliver_exact ? skb->dev : NULL;
3191 
3192 	type = skb->protocol;
3193 	list_for_each_entry_rcu(ptype,
3194 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3195 		if (ptype->type == type &&
3196 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3197 		     ptype->dev == orig_dev)) {
3198 			if (pt_prev)
3199 				ret = deliver_skb(skb, pt_prev, orig_dev);
3200 			pt_prev = ptype;
3201 		}
3202 	}
3203 
3204 	if (pt_prev) {
3205 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3206 	} else {
3207 		atomic_long_inc(&skb->dev->rx_dropped);
3208 		kfree_skb(skb);
3209 		/* Jamal, now you will not able to escape explaining
3210 		 * me how you were going to use this. :-)
3211 		 */
3212 		ret = NET_RX_DROP;
3213 	}
3214 
3215 out:
3216 	rcu_read_unlock();
3217 	return ret;
3218 }
3219 
3220 /**
3221  *	netif_receive_skb - process receive buffer from network
3222  *	@skb: buffer to process
3223  *
3224  *	netif_receive_skb() is the main receive data processing function.
3225  *	It always succeeds. The buffer may be dropped during processing
3226  *	for congestion control or by the protocol layers.
3227  *
3228  *	This function may only be called from softirq context and interrupts
3229  *	should be enabled.
3230  *
3231  *	Return values (usually ignored):
3232  *	NET_RX_SUCCESS: no congestion
3233  *	NET_RX_DROP: packet was dropped
3234  */
3235 int netif_receive_skb(struct sk_buff *skb)
3236 {
3237 	if (netdev_tstamp_prequeue)
3238 		net_timestamp_check(skb);
3239 
3240 	if (skb_defer_rx_timestamp(skb))
3241 		return NET_RX_SUCCESS;
3242 
3243 #ifdef CONFIG_RPS
3244 	{
3245 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3246 		int cpu, ret;
3247 
3248 		rcu_read_lock();
3249 
3250 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3251 
3252 		if (cpu >= 0) {
3253 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3254 			rcu_read_unlock();
3255 		} else {
3256 			rcu_read_unlock();
3257 			ret = __netif_receive_skb(skb);
3258 		}
3259 
3260 		return ret;
3261 	}
3262 #else
3263 	return __netif_receive_skb(skb);
3264 #endif
3265 }
3266 EXPORT_SYMBOL(netif_receive_skb);
3267 
3268 /* Network device is going away, flush any packets still pending
3269  * Called with irqs disabled.
3270  */
3271 static void flush_backlog(void *arg)
3272 {
3273 	struct net_device *dev = arg;
3274 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3275 	struct sk_buff *skb, *tmp;
3276 
3277 	rps_lock(sd);
3278 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3279 		if (skb->dev == dev) {
3280 			__skb_unlink(skb, &sd->input_pkt_queue);
3281 			kfree_skb(skb);
3282 			input_queue_head_incr(sd);
3283 		}
3284 	}
3285 	rps_unlock(sd);
3286 
3287 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3288 		if (skb->dev == dev) {
3289 			__skb_unlink(skb, &sd->process_queue);
3290 			kfree_skb(skb);
3291 			input_queue_head_incr(sd);
3292 		}
3293 	}
3294 }
3295 
3296 static int napi_gro_complete(struct sk_buff *skb)
3297 {
3298 	struct packet_type *ptype;
3299 	__be16 type = skb->protocol;
3300 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3301 	int err = -ENOENT;
3302 
3303 	if (NAPI_GRO_CB(skb)->count == 1) {
3304 		skb_shinfo(skb)->gso_size = 0;
3305 		goto out;
3306 	}
3307 
3308 	rcu_read_lock();
3309 	list_for_each_entry_rcu(ptype, head, list) {
3310 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3311 			continue;
3312 
3313 		err = ptype->gro_complete(skb);
3314 		break;
3315 	}
3316 	rcu_read_unlock();
3317 
3318 	if (err) {
3319 		WARN_ON(&ptype->list == head);
3320 		kfree_skb(skb);
3321 		return NET_RX_SUCCESS;
3322 	}
3323 
3324 out:
3325 	return netif_receive_skb(skb);
3326 }
3327 
3328 inline void napi_gro_flush(struct napi_struct *napi)
3329 {
3330 	struct sk_buff *skb, *next;
3331 
3332 	for (skb = napi->gro_list; skb; skb = next) {
3333 		next = skb->next;
3334 		skb->next = NULL;
3335 		napi_gro_complete(skb);
3336 	}
3337 
3338 	napi->gro_count = 0;
3339 	napi->gro_list = NULL;
3340 }
3341 EXPORT_SYMBOL(napi_gro_flush);
3342 
3343 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3344 {
3345 	struct sk_buff **pp = NULL;
3346 	struct packet_type *ptype;
3347 	__be16 type = skb->protocol;
3348 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3349 	int same_flow;
3350 	int mac_len;
3351 	enum gro_result ret;
3352 
3353 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3354 		goto normal;
3355 
3356 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3357 		goto normal;
3358 
3359 	rcu_read_lock();
3360 	list_for_each_entry_rcu(ptype, head, list) {
3361 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3362 			continue;
3363 
3364 		skb_set_network_header(skb, skb_gro_offset(skb));
3365 		mac_len = skb->network_header - skb->mac_header;
3366 		skb->mac_len = mac_len;
3367 		NAPI_GRO_CB(skb)->same_flow = 0;
3368 		NAPI_GRO_CB(skb)->flush = 0;
3369 		NAPI_GRO_CB(skb)->free = 0;
3370 
3371 		pp = ptype->gro_receive(&napi->gro_list, skb);
3372 		break;
3373 	}
3374 	rcu_read_unlock();
3375 
3376 	if (&ptype->list == head)
3377 		goto normal;
3378 
3379 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3380 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3381 
3382 	if (pp) {
3383 		struct sk_buff *nskb = *pp;
3384 
3385 		*pp = nskb->next;
3386 		nskb->next = NULL;
3387 		napi_gro_complete(nskb);
3388 		napi->gro_count--;
3389 	}
3390 
3391 	if (same_flow)
3392 		goto ok;
3393 
3394 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3395 		goto normal;
3396 
3397 	napi->gro_count++;
3398 	NAPI_GRO_CB(skb)->count = 1;
3399 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3400 	skb->next = napi->gro_list;
3401 	napi->gro_list = skb;
3402 	ret = GRO_HELD;
3403 
3404 pull:
3405 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3406 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3407 
3408 		BUG_ON(skb->end - skb->tail < grow);
3409 
3410 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3411 
3412 		skb->tail += grow;
3413 		skb->data_len -= grow;
3414 
3415 		skb_shinfo(skb)->frags[0].page_offset += grow;
3416 		skb_shinfo(skb)->frags[0].size -= grow;
3417 
3418 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3419 			put_page(skb_shinfo(skb)->frags[0].page);
3420 			memmove(skb_shinfo(skb)->frags,
3421 				skb_shinfo(skb)->frags + 1,
3422 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3423 		}
3424 	}
3425 
3426 ok:
3427 	return ret;
3428 
3429 normal:
3430 	ret = GRO_NORMAL;
3431 	goto pull;
3432 }
3433 EXPORT_SYMBOL(dev_gro_receive);
3434 
3435 static inline gro_result_t
3436 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3437 {
3438 	struct sk_buff *p;
3439 
3440 	for (p = napi->gro_list; p; p = p->next) {
3441 		unsigned long diffs;
3442 
3443 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3444 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3445 		diffs |= compare_ether_header(skb_mac_header(p),
3446 					      skb_gro_mac_header(skb));
3447 		NAPI_GRO_CB(p)->same_flow = !diffs;
3448 		NAPI_GRO_CB(p)->flush = 0;
3449 	}
3450 
3451 	return dev_gro_receive(napi, skb);
3452 }
3453 
3454 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3455 {
3456 	switch (ret) {
3457 	case GRO_NORMAL:
3458 		if (netif_receive_skb(skb))
3459 			ret = GRO_DROP;
3460 		break;
3461 
3462 	case GRO_DROP:
3463 	case GRO_MERGED_FREE:
3464 		kfree_skb(skb);
3465 		break;
3466 
3467 	case GRO_HELD:
3468 	case GRO_MERGED:
3469 		break;
3470 	}
3471 
3472 	return ret;
3473 }
3474 EXPORT_SYMBOL(napi_skb_finish);
3475 
3476 void skb_gro_reset_offset(struct sk_buff *skb)
3477 {
3478 	NAPI_GRO_CB(skb)->data_offset = 0;
3479 	NAPI_GRO_CB(skb)->frag0 = NULL;
3480 	NAPI_GRO_CB(skb)->frag0_len = 0;
3481 
3482 	if (skb->mac_header == skb->tail &&
3483 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3484 		NAPI_GRO_CB(skb)->frag0 =
3485 			page_address(skb_shinfo(skb)->frags[0].page) +
3486 			skb_shinfo(skb)->frags[0].page_offset;
3487 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3488 	}
3489 }
3490 EXPORT_SYMBOL(skb_gro_reset_offset);
3491 
3492 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3493 {
3494 	skb_gro_reset_offset(skb);
3495 
3496 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3497 }
3498 EXPORT_SYMBOL(napi_gro_receive);
3499 
3500 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3501 {
3502 	__skb_pull(skb, skb_headlen(skb));
3503 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3504 	skb->vlan_tci = 0;
3505 	skb->dev = napi->dev;
3506 	skb->skb_iif = 0;
3507 
3508 	napi->skb = skb;
3509 }
3510 
3511 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3512 {
3513 	struct sk_buff *skb = napi->skb;
3514 
3515 	if (!skb) {
3516 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3517 		if (skb)
3518 			napi->skb = skb;
3519 	}
3520 	return skb;
3521 }
3522 EXPORT_SYMBOL(napi_get_frags);
3523 
3524 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3525 			       gro_result_t ret)
3526 {
3527 	switch (ret) {
3528 	case GRO_NORMAL:
3529 	case GRO_HELD:
3530 		skb->protocol = eth_type_trans(skb, skb->dev);
3531 
3532 		if (ret == GRO_HELD)
3533 			skb_gro_pull(skb, -ETH_HLEN);
3534 		else if (netif_receive_skb(skb))
3535 			ret = GRO_DROP;
3536 		break;
3537 
3538 	case GRO_DROP:
3539 	case GRO_MERGED_FREE:
3540 		napi_reuse_skb(napi, skb);
3541 		break;
3542 
3543 	case GRO_MERGED:
3544 		break;
3545 	}
3546 
3547 	return ret;
3548 }
3549 EXPORT_SYMBOL(napi_frags_finish);
3550 
3551 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3552 {
3553 	struct sk_buff *skb = napi->skb;
3554 	struct ethhdr *eth;
3555 	unsigned int hlen;
3556 	unsigned int off;
3557 
3558 	napi->skb = NULL;
3559 
3560 	skb_reset_mac_header(skb);
3561 	skb_gro_reset_offset(skb);
3562 
3563 	off = skb_gro_offset(skb);
3564 	hlen = off + sizeof(*eth);
3565 	eth = skb_gro_header_fast(skb, off);
3566 	if (skb_gro_header_hard(skb, hlen)) {
3567 		eth = skb_gro_header_slow(skb, hlen, off);
3568 		if (unlikely(!eth)) {
3569 			napi_reuse_skb(napi, skb);
3570 			skb = NULL;
3571 			goto out;
3572 		}
3573 	}
3574 
3575 	skb_gro_pull(skb, sizeof(*eth));
3576 
3577 	/*
3578 	 * This works because the only protocols we care about don't require
3579 	 * special handling.  We'll fix it up properly at the end.
3580 	 */
3581 	skb->protocol = eth->h_proto;
3582 
3583 out:
3584 	return skb;
3585 }
3586 EXPORT_SYMBOL(napi_frags_skb);
3587 
3588 gro_result_t napi_gro_frags(struct napi_struct *napi)
3589 {
3590 	struct sk_buff *skb = napi_frags_skb(napi);
3591 
3592 	if (!skb)
3593 		return GRO_DROP;
3594 
3595 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3596 }
3597 EXPORT_SYMBOL(napi_gro_frags);
3598 
3599 /*
3600  * net_rps_action sends any pending IPI's for rps.
3601  * Note: called with local irq disabled, but exits with local irq enabled.
3602  */
3603 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3604 {
3605 #ifdef CONFIG_RPS
3606 	struct softnet_data *remsd = sd->rps_ipi_list;
3607 
3608 	if (remsd) {
3609 		sd->rps_ipi_list = NULL;
3610 
3611 		local_irq_enable();
3612 
3613 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3614 		while (remsd) {
3615 			struct softnet_data *next = remsd->rps_ipi_next;
3616 
3617 			if (cpu_online(remsd->cpu))
3618 				__smp_call_function_single(remsd->cpu,
3619 							   &remsd->csd, 0);
3620 			remsd = next;
3621 		}
3622 	} else
3623 #endif
3624 		local_irq_enable();
3625 }
3626 
3627 static int process_backlog(struct napi_struct *napi, int quota)
3628 {
3629 	int work = 0;
3630 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3631 
3632 #ifdef CONFIG_RPS
3633 	/* Check if we have pending ipi, its better to send them now,
3634 	 * not waiting net_rx_action() end.
3635 	 */
3636 	if (sd->rps_ipi_list) {
3637 		local_irq_disable();
3638 		net_rps_action_and_irq_enable(sd);
3639 	}
3640 #endif
3641 	napi->weight = weight_p;
3642 	local_irq_disable();
3643 	while (work < quota) {
3644 		struct sk_buff *skb;
3645 		unsigned int qlen;
3646 
3647 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3648 			local_irq_enable();
3649 			__netif_receive_skb(skb);
3650 			local_irq_disable();
3651 			input_queue_head_incr(sd);
3652 			if (++work >= quota) {
3653 				local_irq_enable();
3654 				return work;
3655 			}
3656 		}
3657 
3658 		rps_lock(sd);
3659 		qlen = skb_queue_len(&sd->input_pkt_queue);
3660 		if (qlen)
3661 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3662 						   &sd->process_queue);
3663 
3664 		if (qlen < quota - work) {
3665 			/*
3666 			 * Inline a custom version of __napi_complete().
3667 			 * only current cpu owns and manipulates this napi,
3668 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3669 			 * we can use a plain write instead of clear_bit(),
3670 			 * and we dont need an smp_mb() memory barrier.
3671 			 */
3672 			list_del(&napi->poll_list);
3673 			napi->state = 0;
3674 
3675 			quota = work + qlen;
3676 		}
3677 		rps_unlock(sd);
3678 	}
3679 	local_irq_enable();
3680 
3681 	return work;
3682 }
3683 
3684 /**
3685  * __napi_schedule - schedule for receive
3686  * @n: entry to schedule
3687  *
3688  * The entry's receive function will be scheduled to run
3689  */
3690 void __napi_schedule(struct napi_struct *n)
3691 {
3692 	unsigned long flags;
3693 
3694 	local_irq_save(flags);
3695 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3696 	local_irq_restore(flags);
3697 }
3698 EXPORT_SYMBOL(__napi_schedule);
3699 
3700 void __napi_complete(struct napi_struct *n)
3701 {
3702 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3703 	BUG_ON(n->gro_list);
3704 
3705 	list_del(&n->poll_list);
3706 	smp_mb__before_clear_bit();
3707 	clear_bit(NAPI_STATE_SCHED, &n->state);
3708 }
3709 EXPORT_SYMBOL(__napi_complete);
3710 
3711 void napi_complete(struct napi_struct *n)
3712 {
3713 	unsigned long flags;
3714 
3715 	/*
3716 	 * don't let napi dequeue from the cpu poll list
3717 	 * just in case its running on a different cpu
3718 	 */
3719 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3720 		return;
3721 
3722 	napi_gro_flush(n);
3723 	local_irq_save(flags);
3724 	__napi_complete(n);
3725 	local_irq_restore(flags);
3726 }
3727 EXPORT_SYMBOL(napi_complete);
3728 
3729 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3730 		    int (*poll)(struct napi_struct *, int), int weight)
3731 {
3732 	INIT_LIST_HEAD(&napi->poll_list);
3733 	napi->gro_count = 0;
3734 	napi->gro_list = NULL;
3735 	napi->skb = NULL;
3736 	napi->poll = poll;
3737 	napi->weight = weight;
3738 	list_add(&napi->dev_list, &dev->napi_list);
3739 	napi->dev = dev;
3740 #ifdef CONFIG_NETPOLL
3741 	spin_lock_init(&napi->poll_lock);
3742 	napi->poll_owner = -1;
3743 #endif
3744 	set_bit(NAPI_STATE_SCHED, &napi->state);
3745 }
3746 EXPORT_SYMBOL(netif_napi_add);
3747 
3748 void netif_napi_del(struct napi_struct *napi)
3749 {
3750 	struct sk_buff *skb, *next;
3751 
3752 	list_del_init(&napi->dev_list);
3753 	napi_free_frags(napi);
3754 
3755 	for (skb = napi->gro_list; skb; skb = next) {
3756 		next = skb->next;
3757 		skb->next = NULL;
3758 		kfree_skb(skb);
3759 	}
3760 
3761 	napi->gro_list = NULL;
3762 	napi->gro_count = 0;
3763 }
3764 EXPORT_SYMBOL(netif_napi_del);
3765 
3766 static void net_rx_action(struct softirq_action *h)
3767 {
3768 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3769 	unsigned long time_limit = jiffies + 2;
3770 	int budget = netdev_budget;
3771 	void *have;
3772 
3773 	local_irq_disable();
3774 
3775 	while (!list_empty(&sd->poll_list)) {
3776 		struct napi_struct *n;
3777 		int work, weight;
3778 
3779 		/* If softirq window is exhuasted then punt.
3780 		 * Allow this to run for 2 jiffies since which will allow
3781 		 * an average latency of 1.5/HZ.
3782 		 */
3783 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3784 			goto softnet_break;
3785 
3786 		local_irq_enable();
3787 
3788 		/* Even though interrupts have been re-enabled, this
3789 		 * access is safe because interrupts can only add new
3790 		 * entries to the tail of this list, and only ->poll()
3791 		 * calls can remove this head entry from the list.
3792 		 */
3793 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3794 
3795 		have = netpoll_poll_lock(n);
3796 
3797 		weight = n->weight;
3798 
3799 		/* This NAPI_STATE_SCHED test is for avoiding a race
3800 		 * with netpoll's poll_napi().  Only the entity which
3801 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3802 		 * actually make the ->poll() call.  Therefore we avoid
3803 		 * accidentally calling ->poll() when NAPI is not scheduled.
3804 		 */
3805 		work = 0;
3806 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3807 			work = n->poll(n, weight);
3808 			trace_napi_poll(n);
3809 		}
3810 
3811 		WARN_ON_ONCE(work > weight);
3812 
3813 		budget -= work;
3814 
3815 		local_irq_disable();
3816 
3817 		/* Drivers must not modify the NAPI state if they
3818 		 * consume the entire weight.  In such cases this code
3819 		 * still "owns" the NAPI instance and therefore can
3820 		 * move the instance around on the list at-will.
3821 		 */
3822 		if (unlikely(work == weight)) {
3823 			if (unlikely(napi_disable_pending(n))) {
3824 				local_irq_enable();
3825 				napi_complete(n);
3826 				local_irq_disable();
3827 			} else
3828 				list_move_tail(&n->poll_list, &sd->poll_list);
3829 		}
3830 
3831 		netpoll_poll_unlock(have);
3832 	}
3833 out:
3834 	net_rps_action_and_irq_enable(sd);
3835 
3836 #ifdef CONFIG_NET_DMA
3837 	/*
3838 	 * There may not be any more sk_buffs coming right now, so push
3839 	 * any pending DMA copies to hardware
3840 	 */
3841 	dma_issue_pending_all();
3842 #endif
3843 
3844 	return;
3845 
3846 softnet_break:
3847 	sd->time_squeeze++;
3848 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3849 	goto out;
3850 }
3851 
3852 static gifconf_func_t *gifconf_list[NPROTO];
3853 
3854 /**
3855  *	register_gifconf	-	register a SIOCGIF handler
3856  *	@family: Address family
3857  *	@gifconf: Function handler
3858  *
3859  *	Register protocol dependent address dumping routines. The handler
3860  *	that is passed must not be freed or reused until it has been replaced
3861  *	by another handler.
3862  */
3863 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3864 {
3865 	if (family >= NPROTO)
3866 		return -EINVAL;
3867 	gifconf_list[family] = gifconf;
3868 	return 0;
3869 }
3870 EXPORT_SYMBOL(register_gifconf);
3871 
3872 
3873 /*
3874  *	Map an interface index to its name (SIOCGIFNAME)
3875  */
3876 
3877 /*
3878  *	We need this ioctl for efficient implementation of the
3879  *	if_indextoname() function required by the IPv6 API.  Without
3880  *	it, we would have to search all the interfaces to find a
3881  *	match.  --pb
3882  */
3883 
3884 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3885 {
3886 	struct net_device *dev;
3887 	struct ifreq ifr;
3888 
3889 	/*
3890 	 *	Fetch the caller's info block.
3891 	 */
3892 
3893 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3894 		return -EFAULT;
3895 
3896 	rcu_read_lock();
3897 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3898 	if (!dev) {
3899 		rcu_read_unlock();
3900 		return -ENODEV;
3901 	}
3902 
3903 	strcpy(ifr.ifr_name, dev->name);
3904 	rcu_read_unlock();
3905 
3906 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3907 		return -EFAULT;
3908 	return 0;
3909 }
3910 
3911 /*
3912  *	Perform a SIOCGIFCONF call. This structure will change
3913  *	size eventually, and there is nothing I can do about it.
3914  *	Thus we will need a 'compatibility mode'.
3915  */
3916 
3917 static int dev_ifconf(struct net *net, char __user *arg)
3918 {
3919 	struct ifconf ifc;
3920 	struct net_device *dev;
3921 	char __user *pos;
3922 	int len;
3923 	int total;
3924 	int i;
3925 
3926 	/*
3927 	 *	Fetch the caller's info block.
3928 	 */
3929 
3930 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3931 		return -EFAULT;
3932 
3933 	pos = ifc.ifc_buf;
3934 	len = ifc.ifc_len;
3935 
3936 	/*
3937 	 *	Loop over the interfaces, and write an info block for each.
3938 	 */
3939 
3940 	total = 0;
3941 	for_each_netdev(net, dev) {
3942 		for (i = 0; i < NPROTO; i++) {
3943 			if (gifconf_list[i]) {
3944 				int done;
3945 				if (!pos)
3946 					done = gifconf_list[i](dev, NULL, 0);
3947 				else
3948 					done = gifconf_list[i](dev, pos + total,
3949 							       len - total);
3950 				if (done < 0)
3951 					return -EFAULT;
3952 				total += done;
3953 			}
3954 		}
3955 	}
3956 
3957 	/*
3958 	 *	All done.  Write the updated control block back to the caller.
3959 	 */
3960 	ifc.ifc_len = total;
3961 
3962 	/*
3963 	 * 	Both BSD and Solaris return 0 here, so we do too.
3964 	 */
3965 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3966 }
3967 
3968 #ifdef CONFIG_PROC_FS
3969 /*
3970  *	This is invoked by the /proc filesystem handler to display a device
3971  *	in detail.
3972  */
3973 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3974 	__acquires(RCU)
3975 {
3976 	struct net *net = seq_file_net(seq);
3977 	loff_t off;
3978 	struct net_device *dev;
3979 
3980 	rcu_read_lock();
3981 	if (!*pos)
3982 		return SEQ_START_TOKEN;
3983 
3984 	off = 1;
3985 	for_each_netdev_rcu(net, dev)
3986 		if (off++ == *pos)
3987 			return dev;
3988 
3989 	return NULL;
3990 }
3991 
3992 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3993 {
3994 	struct net_device *dev = v;
3995 
3996 	if (v == SEQ_START_TOKEN)
3997 		dev = first_net_device_rcu(seq_file_net(seq));
3998 	else
3999 		dev = next_net_device_rcu(dev);
4000 
4001 	++*pos;
4002 	return dev;
4003 }
4004 
4005 void dev_seq_stop(struct seq_file *seq, void *v)
4006 	__releases(RCU)
4007 {
4008 	rcu_read_unlock();
4009 }
4010 
4011 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4012 {
4013 	struct rtnl_link_stats64 temp;
4014 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4015 
4016 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4017 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4018 		   dev->name, stats->rx_bytes, stats->rx_packets,
4019 		   stats->rx_errors,
4020 		   stats->rx_dropped + stats->rx_missed_errors,
4021 		   stats->rx_fifo_errors,
4022 		   stats->rx_length_errors + stats->rx_over_errors +
4023 		    stats->rx_crc_errors + stats->rx_frame_errors,
4024 		   stats->rx_compressed, stats->multicast,
4025 		   stats->tx_bytes, stats->tx_packets,
4026 		   stats->tx_errors, stats->tx_dropped,
4027 		   stats->tx_fifo_errors, stats->collisions,
4028 		   stats->tx_carrier_errors +
4029 		    stats->tx_aborted_errors +
4030 		    stats->tx_window_errors +
4031 		    stats->tx_heartbeat_errors,
4032 		   stats->tx_compressed);
4033 }
4034 
4035 /*
4036  *	Called from the PROCfs module. This now uses the new arbitrary sized
4037  *	/proc/net interface to create /proc/net/dev
4038  */
4039 static int dev_seq_show(struct seq_file *seq, void *v)
4040 {
4041 	if (v == SEQ_START_TOKEN)
4042 		seq_puts(seq, "Inter-|   Receive                            "
4043 			      "                    |  Transmit\n"
4044 			      " face |bytes    packets errs drop fifo frame "
4045 			      "compressed multicast|bytes    packets errs "
4046 			      "drop fifo colls carrier compressed\n");
4047 	else
4048 		dev_seq_printf_stats(seq, v);
4049 	return 0;
4050 }
4051 
4052 static struct softnet_data *softnet_get_online(loff_t *pos)
4053 {
4054 	struct softnet_data *sd = NULL;
4055 
4056 	while (*pos < nr_cpu_ids)
4057 		if (cpu_online(*pos)) {
4058 			sd = &per_cpu(softnet_data, *pos);
4059 			break;
4060 		} else
4061 			++*pos;
4062 	return sd;
4063 }
4064 
4065 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4066 {
4067 	return softnet_get_online(pos);
4068 }
4069 
4070 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4071 {
4072 	++*pos;
4073 	return softnet_get_online(pos);
4074 }
4075 
4076 static void softnet_seq_stop(struct seq_file *seq, void *v)
4077 {
4078 }
4079 
4080 static int softnet_seq_show(struct seq_file *seq, void *v)
4081 {
4082 	struct softnet_data *sd = v;
4083 
4084 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4085 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4086 		   0, 0, 0, 0, /* was fastroute */
4087 		   sd->cpu_collision, sd->received_rps);
4088 	return 0;
4089 }
4090 
4091 static const struct seq_operations dev_seq_ops = {
4092 	.start = dev_seq_start,
4093 	.next  = dev_seq_next,
4094 	.stop  = dev_seq_stop,
4095 	.show  = dev_seq_show,
4096 };
4097 
4098 static int dev_seq_open(struct inode *inode, struct file *file)
4099 {
4100 	return seq_open_net(inode, file, &dev_seq_ops,
4101 			    sizeof(struct seq_net_private));
4102 }
4103 
4104 static const struct file_operations dev_seq_fops = {
4105 	.owner	 = THIS_MODULE,
4106 	.open    = dev_seq_open,
4107 	.read    = seq_read,
4108 	.llseek  = seq_lseek,
4109 	.release = seq_release_net,
4110 };
4111 
4112 static const struct seq_operations softnet_seq_ops = {
4113 	.start = softnet_seq_start,
4114 	.next  = softnet_seq_next,
4115 	.stop  = softnet_seq_stop,
4116 	.show  = softnet_seq_show,
4117 };
4118 
4119 static int softnet_seq_open(struct inode *inode, struct file *file)
4120 {
4121 	return seq_open(file, &softnet_seq_ops);
4122 }
4123 
4124 static const struct file_operations softnet_seq_fops = {
4125 	.owner	 = THIS_MODULE,
4126 	.open    = softnet_seq_open,
4127 	.read    = seq_read,
4128 	.llseek  = seq_lseek,
4129 	.release = seq_release,
4130 };
4131 
4132 static void *ptype_get_idx(loff_t pos)
4133 {
4134 	struct packet_type *pt = NULL;
4135 	loff_t i = 0;
4136 	int t;
4137 
4138 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4139 		if (i == pos)
4140 			return pt;
4141 		++i;
4142 	}
4143 
4144 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4145 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4146 			if (i == pos)
4147 				return pt;
4148 			++i;
4149 		}
4150 	}
4151 	return NULL;
4152 }
4153 
4154 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4155 	__acquires(RCU)
4156 {
4157 	rcu_read_lock();
4158 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4159 }
4160 
4161 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4162 {
4163 	struct packet_type *pt;
4164 	struct list_head *nxt;
4165 	int hash;
4166 
4167 	++*pos;
4168 	if (v == SEQ_START_TOKEN)
4169 		return ptype_get_idx(0);
4170 
4171 	pt = v;
4172 	nxt = pt->list.next;
4173 	if (pt->type == htons(ETH_P_ALL)) {
4174 		if (nxt != &ptype_all)
4175 			goto found;
4176 		hash = 0;
4177 		nxt = ptype_base[0].next;
4178 	} else
4179 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4180 
4181 	while (nxt == &ptype_base[hash]) {
4182 		if (++hash >= PTYPE_HASH_SIZE)
4183 			return NULL;
4184 		nxt = ptype_base[hash].next;
4185 	}
4186 found:
4187 	return list_entry(nxt, struct packet_type, list);
4188 }
4189 
4190 static void ptype_seq_stop(struct seq_file *seq, void *v)
4191 	__releases(RCU)
4192 {
4193 	rcu_read_unlock();
4194 }
4195 
4196 static int ptype_seq_show(struct seq_file *seq, void *v)
4197 {
4198 	struct packet_type *pt = v;
4199 
4200 	if (v == SEQ_START_TOKEN)
4201 		seq_puts(seq, "Type Device      Function\n");
4202 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4203 		if (pt->type == htons(ETH_P_ALL))
4204 			seq_puts(seq, "ALL ");
4205 		else
4206 			seq_printf(seq, "%04x", ntohs(pt->type));
4207 
4208 		seq_printf(seq, " %-8s %pF\n",
4209 			   pt->dev ? pt->dev->name : "", pt->func);
4210 	}
4211 
4212 	return 0;
4213 }
4214 
4215 static const struct seq_operations ptype_seq_ops = {
4216 	.start = ptype_seq_start,
4217 	.next  = ptype_seq_next,
4218 	.stop  = ptype_seq_stop,
4219 	.show  = ptype_seq_show,
4220 };
4221 
4222 static int ptype_seq_open(struct inode *inode, struct file *file)
4223 {
4224 	return seq_open_net(inode, file, &ptype_seq_ops,
4225 			sizeof(struct seq_net_private));
4226 }
4227 
4228 static const struct file_operations ptype_seq_fops = {
4229 	.owner	 = THIS_MODULE,
4230 	.open    = ptype_seq_open,
4231 	.read    = seq_read,
4232 	.llseek  = seq_lseek,
4233 	.release = seq_release_net,
4234 };
4235 
4236 
4237 static int __net_init dev_proc_net_init(struct net *net)
4238 {
4239 	int rc = -ENOMEM;
4240 
4241 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4242 		goto out;
4243 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4244 		goto out_dev;
4245 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4246 		goto out_softnet;
4247 
4248 	if (wext_proc_init(net))
4249 		goto out_ptype;
4250 	rc = 0;
4251 out:
4252 	return rc;
4253 out_ptype:
4254 	proc_net_remove(net, "ptype");
4255 out_softnet:
4256 	proc_net_remove(net, "softnet_stat");
4257 out_dev:
4258 	proc_net_remove(net, "dev");
4259 	goto out;
4260 }
4261 
4262 static void __net_exit dev_proc_net_exit(struct net *net)
4263 {
4264 	wext_proc_exit(net);
4265 
4266 	proc_net_remove(net, "ptype");
4267 	proc_net_remove(net, "softnet_stat");
4268 	proc_net_remove(net, "dev");
4269 }
4270 
4271 static struct pernet_operations __net_initdata dev_proc_ops = {
4272 	.init = dev_proc_net_init,
4273 	.exit = dev_proc_net_exit,
4274 };
4275 
4276 static int __init dev_proc_init(void)
4277 {
4278 	return register_pernet_subsys(&dev_proc_ops);
4279 }
4280 #else
4281 #define dev_proc_init() 0
4282 #endif	/* CONFIG_PROC_FS */
4283 
4284 
4285 /**
4286  *	netdev_set_master	-	set up master pointer
4287  *	@slave: slave device
4288  *	@master: new master device
4289  *
4290  *	Changes the master device of the slave. Pass %NULL to break the
4291  *	bonding. The caller must hold the RTNL semaphore. On a failure
4292  *	a negative errno code is returned. On success the reference counts
4293  *	are adjusted and the function returns zero.
4294  */
4295 int netdev_set_master(struct net_device *slave, struct net_device *master)
4296 {
4297 	struct net_device *old = slave->master;
4298 
4299 	ASSERT_RTNL();
4300 
4301 	if (master) {
4302 		if (old)
4303 			return -EBUSY;
4304 		dev_hold(master);
4305 	}
4306 
4307 	slave->master = master;
4308 
4309 	if (old) {
4310 		synchronize_net();
4311 		dev_put(old);
4312 	}
4313 	return 0;
4314 }
4315 EXPORT_SYMBOL(netdev_set_master);
4316 
4317 /**
4318  *	netdev_set_bond_master	-	set up bonding master/slave pair
4319  *	@slave: slave device
4320  *	@master: new master device
4321  *
4322  *	Changes the master device of the slave. Pass %NULL to break the
4323  *	bonding. The caller must hold the RTNL semaphore. On a failure
4324  *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4325  *	to the routing socket and the function returns zero.
4326  */
4327 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4328 {
4329 	int err;
4330 
4331 	ASSERT_RTNL();
4332 
4333 	err = netdev_set_master(slave, master);
4334 	if (err)
4335 		return err;
4336 	if (master)
4337 		slave->flags |= IFF_SLAVE;
4338 	else
4339 		slave->flags &= ~IFF_SLAVE;
4340 
4341 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4342 	return 0;
4343 }
4344 EXPORT_SYMBOL(netdev_set_bond_master);
4345 
4346 static void dev_change_rx_flags(struct net_device *dev, int flags)
4347 {
4348 	const struct net_device_ops *ops = dev->netdev_ops;
4349 
4350 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4351 		ops->ndo_change_rx_flags(dev, flags);
4352 }
4353 
4354 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4355 {
4356 	unsigned short old_flags = dev->flags;
4357 	uid_t uid;
4358 	gid_t gid;
4359 
4360 	ASSERT_RTNL();
4361 
4362 	dev->flags |= IFF_PROMISC;
4363 	dev->promiscuity += inc;
4364 	if (dev->promiscuity == 0) {
4365 		/*
4366 		 * Avoid overflow.
4367 		 * If inc causes overflow, untouch promisc and return error.
4368 		 */
4369 		if (inc < 0)
4370 			dev->flags &= ~IFF_PROMISC;
4371 		else {
4372 			dev->promiscuity -= inc;
4373 			printk(KERN_WARNING "%s: promiscuity touches roof, "
4374 				"set promiscuity failed, promiscuity feature "
4375 				"of device might be broken.\n", dev->name);
4376 			return -EOVERFLOW;
4377 		}
4378 	}
4379 	if (dev->flags != old_flags) {
4380 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4381 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4382 							       "left");
4383 		if (audit_enabled) {
4384 			current_uid_gid(&uid, &gid);
4385 			audit_log(current->audit_context, GFP_ATOMIC,
4386 				AUDIT_ANOM_PROMISCUOUS,
4387 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4388 				dev->name, (dev->flags & IFF_PROMISC),
4389 				(old_flags & IFF_PROMISC),
4390 				audit_get_loginuid(current),
4391 				uid, gid,
4392 				audit_get_sessionid(current));
4393 		}
4394 
4395 		dev_change_rx_flags(dev, IFF_PROMISC);
4396 	}
4397 	return 0;
4398 }
4399 
4400 /**
4401  *	dev_set_promiscuity	- update promiscuity count on a device
4402  *	@dev: device
4403  *	@inc: modifier
4404  *
4405  *	Add or remove promiscuity from a device. While the count in the device
4406  *	remains above zero the interface remains promiscuous. Once it hits zero
4407  *	the device reverts back to normal filtering operation. A negative inc
4408  *	value is used to drop promiscuity on the device.
4409  *	Return 0 if successful or a negative errno code on error.
4410  */
4411 int dev_set_promiscuity(struct net_device *dev, int inc)
4412 {
4413 	unsigned short old_flags = dev->flags;
4414 	int err;
4415 
4416 	err = __dev_set_promiscuity(dev, inc);
4417 	if (err < 0)
4418 		return err;
4419 	if (dev->flags != old_flags)
4420 		dev_set_rx_mode(dev);
4421 	return err;
4422 }
4423 EXPORT_SYMBOL(dev_set_promiscuity);
4424 
4425 /**
4426  *	dev_set_allmulti	- update allmulti count on a device
4427  *	@dev: device
4428  *	@inc: modifier
4429  *
4430  *	Add or remove reception of all multicast frames to a device. While the
4431  *	count in the device remains above zero the interface remains listening
4432  *	to all interfaces. Once it hits zero the device reverts back to normal
4433  *	filtering operation. A negative @inc value is used to drop the counter
4434  *	when releasing a resource needing all multicasts.
4435  *	Return 0 if successful or a negative errno code on error.
4436  */
4437 
4438 int dev_set_allmulti(struct net_device *dev, int inc)
4439 {
4440 	unsigned short old_flags = dev->flags;
4441 
4442 	ASSERT_RTNL();
4443 
4444 	dev->flags |= IFF_ALLMULTI;
4445 	dev->allmulti += inc;
4446 	if (dev->allmulti == 0) {
4447 		/*
4448 		 * Avoid overflow.
4449 		 * If inc causes overflow, untouch allmulti and return error.
4450 		 */
4451 		if (inc < 0)
4452 			dev->flags &= ~IFF_ALLMULTI;
4453 		else {
4454 			dev->allmulti -= inc;
4455 			printk(KERN_WARNING "%s: allmulti touches roof, "
4456 				"set allmulti failed, allmulti feature of "
4457 				"device might be broken.\n", dev->name);
4458 			return -EOVERFLOW;
4459 		}
4460 	}
4461 	if (dev->flags ^ old_flags) {
4462 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4463 		dev_set_rx_mode(dev);
4464 	}
4465 	return 0;
4466 }
4467 EXPORT_SYMBOL(dev_set_allmulti);
4468 
4469 /*
4470  *	Upload unicast and multicast address lists to device and
4471  *	configure RX filtering. When the device doesn't support unicast
4472  *	filtering it is put in promiscuous mode while unicast addresses
4473  *	are present.
4474  */
4475 void __dev_set_rx_mode(struct net_device *dev)
4476 {
4477 	const struct net_device_ops *ops = dev->netdev_ops;
4478 
4479 	/* dev_open will call this function so the list will stay sane. */
4480 	if (!(dev->flags&IFF_UP))
4481 		return;
4482 
4483 	if (!netif_device_present(dev))
4484 		return;
4485 
4486 	if (ops->ndo_set_rx_mode)
4487 		ops->ndo_set_rx_mode(dev);
4488 	else {
4489 		/* Unicast addresses changes may only happen under the rtnl,
4490 		 * therefore calling __dev_set_promiscuity here is safe.
4491 		 */
4492 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4493 			__dev_set_promiscuity(dev, 1);
4494 			dev->uc_promisc = 1;
4495 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4496 			__dev_set_promiscuity(dev, -1);
4497 			dev->uc_promisc = 0;
4498 		}
4499 
4500 		if (ops->ndo_set_multicast_list)
4501 			ops->ndo_set_multicast_list(dev);
4502 	}
4503 }
4504 
4505 void dev_set_rx_mode(struct net_device *dev)
4506 {
4507 	netif_addr_lock_bh(dev);
4508 	__dev_set_rx_mode(dev);
4509 	netif_addr_unlock_bh(dev);
4510 }
4511 
4512 /**
4513  *	dev_get_flags - get flags reported to userspace
4514  *	@dev: device
4515  *
4516  *	Get the combination of flag bits exported through APIs to userspace.
4517  */
4518 unsigned dev_get_flags(const struct net_device *dev)
4519 {
4520 	unsigned flags;
4521 
4522 	flags = (dev->flags & ~(IFF_PROMISC |
4523 				IFF_ALLMULTI |
4524 				IFF_RUNNING |
4525 				IFF_LOWER_UP |
4526 				IFF_DORMANT)) |
4527 		(dev->gflags & (IFF_PROMISC |
4528 				IFF_ALLMULTI));
4529 
4530 	if (netif_running(dev)) {
4531 		if (netif_oper_up(dev))
4532 			flags |= IFF_RUNNING;
4533 		if (netif_carrier_ok(dev))
4534 			flags |= IFF_LOWER_UP;
4535 		if (netif_dormant(dev))
4536 			flags |= IFF_DORMANT;
4537 	}
4538 
4539 	return flags;
4540 }
4541 EXPORT_SYMBOL(dev_get_flags);
4542 
4543 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4544 {
4545 	int old_flags = dev->flags;
4546 	int ret;
4547 
4548 	ASSERT_RTNL();
4549 
4550 	/*
4551 	 *	Set the flags on our device.
4552 	 */
4553 
4554 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4555 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4556 			       IFF_AUTOMEDIA)) |
4557 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4558 				    IFF_ALLMULTI));
4559 
4560 	/*
4561 	 *	Load in the correct multicast list now the flags have changed.
4562 	 */
4563 
4564 	if ((old_flags ^ flags) & IFF_MULTICAST)
4565 		dev_change_rx_flags(dev, IFF_MULTICAST);
4566 
4567 	dev_set_rx_mode(dev);
4568 
4569 	/*
4570 	 *	Have we downed the interface. We handle IFF_UP ourselves
4571 	 *	according to user attempts to set it, rather than blindly
4572 	 *	setting it.
4573 	 */
4574 
4575 	ret = 0;
4576 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4577 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4578 
4579 		if (!ret)
4580 			dev_set_rx_mode(dev);
4581 	}
4582 
4583 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4584 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4585 
4586 		dev->gflags ^= IFF_PROMISC;
4587 		dev_set_promiscuity(dev, inc);
4588 	}
4589 
4590 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4591 	   is important. Some (broken) drivers set IFF_PROMISC, when
4592 	   IFF_ALLMULTI is requested not asking us and not reporting.
4593 	 */
4594 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4595 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4596 
4597 		dev->gflags ^= IFF_ALLMULTI;
4598 		dev_set_allmulti(dev, inc);
4599 	}
4600 
4601 	return ret;
4602 }
4603 
4604 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4605 {
4606 	unsigned int changes = dev->flags ^ old_flags;
4607 
4608 	if (changes & IFF_UP) {
4609 		if (dev->flags & IFF_UP)
4610 			call_netdevice_notifiers(NETDEV_UP, dev);
4611 		else
4612 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4613 	}
4614 
4615 	if (dev->flags & IFF_UP &&
4616 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4617 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4618 }
4619 
4620 /**
4621  *	dev_change_flags - change device settings
4622  *	@dev: device
4623  *	@flags: device state flags
4624  *
4625  *	Change settings on device based state flags. The flags are
4626  *	in the userspace exported format.
4627  */
4628 int dev_change_flags(struct net_device *dev, unsigned flags)
4629 {
4630 	int ret, changes;
4631 	int old_flags = dev->flags;
4632 
4633 	ret = __dev_change_flags(dev, flags);
4634 	if (ret < 0)
4635 		return ret;
4636 
4637 	changes = old_flags ^ dev->flags;
4638 	if (changes)
4639 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4640 
4641 	__dev_notify_flags(dev, old_flags);
4642 	return ret;
4643 }
4644 EXPORT_SYMBOL(dev_change_flags);
4645 
4646 /**
4647  *	dev_set_mtu - Change maximum transfer unit
4648  *	@dev: device
4649  *	@new_mtu: new transfer unit
4650  *
4651  *	Change the maximum transfer size of the network device.
4652  */
4653 int dev_set_mtu(struct net_device *dev, int new_mtu)
4654 {
4655 	const struct net_device_ops *ops = dev->netdev_ops;
4656 	int err;
4657 
4658 	if (new_mtu == dev->mtu)
4659 		return 0;
4660 
4661 	/*	MTU must be positive.	 */
4662 	if (new_mtu < 0)
4663 		return -EINVAL;
4664 
4665 	if (!netif_device_present(dev))
4666 		return -ENODEV;
4667 
4668 	err = 0;
4669 	if (ops->ndo_change_mtu)
4670 		err = ops->ndo_change_mtu(dev, new_mtu);
4671 	else
4672 		dev->mtu = new_mtu;
4673 
4674 	if (!err && dev->flags & IFF_UP)
4675 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4676 	return err;
4677 }
4678 EXPORT_SYMBOL(dev_set_mtu);
4679 
4680 /**
4681  *	dev_set_group - Change group this device belongs to
4682  *	@dev: device
4683  *	@new_group: group this device should belong to
4684  */
4685 void dev_set_group(struct net_device *dev, int new_group)
4686 {
4687 	dev->group = new_group;
4688 }
4689 EXPORT_SYMBOL(dev_set_group);
4690 
4691 /**
4692  *	dev_set_mac_address - Change Media Access Control Address
4693  *	@dev: device
4694  *	@sa: new address
4695  *
4696  *	Change the hardware (MAC) address of the device
4697  */
4698 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4699 {
4700 	const struct net_device_ops *ops = dev->netdev_ops;
4701 	int err;
4702 
4703 	if (!ops->ndo_set_mac_address)
4704 		return -EOPNOTSUPP;
4705 	if (sa->sa_family != dev->type)
4706 		return -EINVAL;
4707 	if (!netif_device_present(dev))
4708 		return -ENODEV;
4709 	err = ops->ndo_set_mac_address(dev, sa);
4710 	if (!err)
4711 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4712 	return err;
4713 }
4714 EXPORT_SYMBOL(dev_set_mac_address);
4715 
4716 /*
4717  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4718  */
4719 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4720 {
4721 	int err;
4722 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4723 
4724 	if (!dev)
4725 		return -ENODEV;
4726 
4727 	switch (cmd) {
4728 	case SIOCGIFFLAGS:	/* Get interface flags */
4729 		ifr->ifr_flags = (short) dev_get_flags(dev);
4730 		return 0;
4731 
4732 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4733 				   (currently unused) */
4734 		ifr->ifr_metric = 0;
4735 		return 0;
4736 
4737 	case SIOCGIFMTU:	/* Get the MTU of a device */
4738 		ifr->ifr_mtu = dev->mtu;
4739 		return 0;
4740 
4741 	case SIOCGIFHWADDR:
4742 		if (!dev->addr_len)
4743 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4744 		else
4745 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4746 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4747 		ifr->ifr_hwaddr.sa_family = dev->type;
4748 		return 0;
4749 
4750 	case SIOCGIFSLAVE:
4751 		err = -EINVAL;
4752 		break;
4753 
4754 	case SIOCGIFMAP:
4755 		ifr->ifr_map.mem_start = dev->mem_start;
4756 		ifr->ifr_map.mem_end   = dev->mem_end;
4757 		ifr->ifr_map.base_addr = dev->base_addr;
4758 		ifr->ifr_map.irq       = dev->irq;
4759 		ifr->ifr_map.dma       = dev->dma;
4760 		ifr->ifr_map.port      = dev->if_port;
4761 		return 0;
4762 
4763 	case SIOCGIFINDEX:
4764 		ifr->ifr_ifindex = dev->ifindex;
4765 		return 0;
4766 
4767 	case SIOCGIFTXQLEN:
4768 		ifr->ifr_qlen = dev->tx_queue_len;
4769 		return 0;
4770 
4771 	default:
4772 		/* dev_ioctl() should ensure this case
4773 		 * is never reached
4774 		 */
4775 		WARN_ON(1);
4776 		err = -ENOTTY;
4777 		break;
4778 
4779 	}
4780 	return err;
4781 }
4782 
4783 /*
4784  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4785  */
4786 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4787 {
4788 	int err;
4789 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4790 	const struct net_device_ops *ops;
4791 
4792 	if (!dev)
4793 		return -ENODEV;
4794 
4795 	ops = dev->netdev_ops;
4796 
4797 	switch (cmd) {
4798 	case SIOCSIFFLAGS:	/* Set interface flags */
4799 		return dev_change_flags(dev, ifr->ifr_flags);
4800 
4801 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4802 				   (currently unused) */
4803 		return -EOPNOTSUPP;
4804 
4805 	case SIOCSIFMTU:	/* Set the MTU of a device */
4806 		return dev_set_mtu(dev, ifr->ifr_mtu);
4807 
4808 	case SIOCSIFHWADDR:
4809 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4810 
4811 	case SIOCSIFHWBROADCAST:
4812 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4813 			return -EINVAL;
4814 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4815 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4816 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4817 		return 0;
4818 
4819 	case SIOCSIFMAP:
4820 		if (ops->ndo_set_config) {
4821 			if (!netif_device_present(dev))
4822 				return -ENODEV;
4823 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4824 		}
4825 		return -EOPNOTSUPP;
4826 
4827 	case SIOCADDMULTI:
4828 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4829 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4830 			return -EINVAL;
4831 		if (!netif_device_present(dev))
4832 			return -ENODEV;
4833 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4834 
4835 	case SIOCDELMULTI:
4836 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4837 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4838 			return -EINVAL;
4839 		if (!netif_device_present(dev))
4840 			return -ENODEV;
4841 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4842 
4843 	case SIOCSIFTXQLEN:
4844 		if (ifr->ifr_qlen < 0)
4845 			return -EINVAL;
4846 		dev->tx_queue_len = ifr->ifr_qlen;
4847 		return 0;
4848 
4849 	case SIOCSIFNAME:
4850 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4851 		return dev_change_name(dev, ifr->ifr_newname);
4852 
4853 	/*
4854 	 *	Unknown or private ioctl
4855 	 */
4856 	default:
4857 		if ((cmd >= SIOCDEVPRIVATE &&
4858 		    cmd <= SIOCDEVPRIVATE + 15) ||
4859 		    cmd == SIOCBONDENSLAVE ||
4860 		    cmd == SIOCBONDRELEASE ||
4861 		    cmd == SIOCBONDSETHWADDR ||
4862 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4863 		    cmd == SIOCBONDINFOQUERY ||
4864 		    cmd == SIOCBONDCHANGEACTIVE ||
4865 		    cmd == SIOCGMIIPHY ||
4866 		    cmd == SIOCGMIIREG ||
4867 		    cmd == SIOCSMIIREG ||
4868 		    cmd == SIOCBRADDIF ||
4869 		    cmd == SIOCBRDELIF ||
4870 		    cmd == SIOCSHWTSTAMP ||
4871 		    cmd == SIOCWANDEV) {
4872 			err = -EOPNOTSUPP;
4873 			if (ops->ndo_do_ioctl) {
4874 				if (netif_device_present(dev))
4875 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4876 				else
4877 					err = -ENODEV;
4878 			}
4879 		} else
4880 			err = -EINVAL;
4881 
4882 	}
4883 	return err;
4884 }
4885 
4886 /*
4887  *	This function handles all "interface"-type I/O control requests. The actual
4888  *	'doing' part of this is dev_ifsioc above.
4889  */
4890 
4891 /**
4892  *	dev_ioctl	-	network device ioctl
4893  *	@net: the applicable net namespace
4894  *	@cmd: command to issue
4895  *	@arg: pointer to a struct ifreq in user space
4896  *
4897  *	Issue ioctl functions to devices. This is normally called by the
4898  *	user space syscall interfaces but can sometimes be useful for
4899  *	other purposes. The return value is the return from the syscall if
4900  *	positive or a negative errno code on error.
4901  */
4902 
4903 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4904 {
4905 	struct ifreq ifr;
4906 	int ret;
4907 	char *colon;
4908 
4909 	/* One special case: SIOCGIFCONF takes ifconf argument
4910 	   and requires shared lock, because it sleeps writing
4911 	   to user space.
4912 	 */
4913 
4914 	if (cmd == SIOCGIFCONF) {
4915 		rtnl_lock();
4916 		ret = dev_ifconf(net, (char __user *) arg);
4917 		rtnl_unlock();
4918 		return ret;
4919 	}
4920 	if (cmd == SIOCGIFNAME)
4921 		return dev_ifname(net, (struct ifreq __user *)arg);
4922 
4923 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4924 		return -EFAULT;
4925 
4926 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4927 
4928 	colon = strchr(ifr.ifr_name, ':');
4929 	if (colon)
4930 		*colon = 0;
4931 
4932 	/*
4933 	 *	See which interface the caller is talking about.
4934 	 */
4935 
4936 	switch (cmd) {
4937 	/*
4938 	 *	These ioctl calls:
4939 	 *	- can be done by all.
4940 	 *	- atomic and do not require locking.
4941 	 *	- return a value
4942 	 */
4943 	case SIOCGIFFLAGS:
4944 	case SIOCGIFMETRIC:
4945 	case SIOCGIFMTU:
4946 	case SIOCGIFHWADDR:
4947 	case SIOCGIFSLAVE:
4948 	case SIOCGIFMAP:
4949 	case SIOCGIFINDEX:
4950 	case SIOCGIFTXQLEN:
4951 		dev_load(net, ifr.ifr_name);
4952 		rcu_read_lock();
4953 		ret = dev_ifsioc_locked(net, &ifr, cmd);
4954 		rcu_read_unlock();
4955 		if (!ret) {
4956 			if (colon)
4957 				*colon = ':';
4958 			if (copy_to_user(arg, &ifr,
4959 					 sizeof(struct ifreq)))
4960 				ret = -EFAULT;
4961 		}
4962 		return ret;
4963 
4964 	case SIOCETHTOOL:
4965 		dev_load(net, ifr.ifr_name);
4966 		rtnl_lock();
4967 		ret = dev_ethtool(net, &ifr);
4968 		rtnl_unlock();
4969 		if (!ret) {
4970 			if (colon)
4971 				*colon = ':';
4972 			if (copy_to_user(arg, &ifr,
4973 					 sizeof(struct ifreq)))
4974 				ret = -EFAULT;
4975 		}
4976 		return ret;
4977 
4978 	/*
4979 	 *	These ioctl calls:
4980 	 *	- require superuser power.
4981 	 *	- require strict serialization.
4982 	 *	- return a value
4983 	 */
4984 	case SIOCGMIIPHY:
4985 	case SIOCGMIIREG:
4986 	case SIOCSIFNAME:
4987 		if (!capable(CAP_NET_ADMIN))
4988 			return -EPERM;
4989 		dev_load(net, ifr.ifr_name);
4990 		rtnl_lock();
4991 		ret = dev_ifsioc(net, &ifr, cmd);
4992 		rtnl_unlock();
4993 		if (!ret) {
4994 			if (colon)
4995 				*colon = ':';
4996 			if (copy_to_user(arg, &ifr,
4997 					 sizeof(struct ifreq)))
4998 				ret = -EFAULT;
4999 		}
5000 		return ret;
5001 
5002 	/*
5003 	 *	These ioctl calls:
5004 	 *	- require superuser power.
5005 	 *	- require strict serialization.
5006 	 *	- do not return a value
5007 	 */
5008 	case SIOCSIFFLAGS:
5009 	case SIOCSIFMETRIC:
5010 	case SIOCSIFMTU:
5011 	case SIOCSIFMAP:
5012 	case SIOCSIFHWADDR:
5013 	case SIOCSIFSLAVE:
5014 	case SIOCADDMULTI:
5015 	case SIOCDELMULTI:
5016 	case SIOCSIFHWBROADCAST:
5017 	case SIOCSIFTXQLEN:
5018 	case SIOCSMIIREG:
5019 	case SIOCBONDENSLAVE:
5020 	case SIOCBONDRELEASE:
5021 	case SIOCBONDSETHWADDR:
5022 	case SIOCBONDCHANGEACTIVE:
5023 	case SIOCBRADDIF:
5024 	case SIOCBRDELIF:
5025 	case SIOCSHWTSTAMP:
5026 		if (!capable(CAP_NET_ADMIN))
5027 			return -EPERM;
5028 		/* fall through */
5029 	case SIOCBONDSLAVEINFOQUERY:
5030 	case SIOCBONDINFOQUERY:
5031 		dev_load(net, ifr.ifr_name);
5032 		rtnl_lock();
5033 		ret = dev_ifsioc(net, &ifr, cmd);
5034 		rtnl_unlock();
5035 		return ret;
5036 
5037 	case SIOCGIFMEM:
5038 		/* Get the per device memory space. We can add this but
5039 		 * currently do not support it */
5040 	case SIOCSIFMEM:
5041 		/* Set the per device memory buffer space.
5042 		 * Not applicable in our case */
5043 	case SIOCSIFLINK:
5044 		return -ENOTTY;
5045 
5046 	/*
5047 	 *	Unknown or private ioctl.
5048 	 */
5049 	default:
5050 		if (cmd == SIOCWANDEV ||
5051 		    (cmd >= SIOCDEVPRIVATE &&
5052 		     cmd <= SIOCDEVPRIVATE + 15)) {
5053 			dev_load(net, ifr.ifr_name);
5054 			rtnl_lock();
5055 			ret = dev_ifsioc(net, &ifr, cmd);
5056 			rtnl_unlock();
5057 			if (!ret && copy_to_user(arg, &ifr,
5058 						 sizeof(struct ifreq)))
5059 				ret = -EFAULT;
5060 			return ret;
5061 		}
5062 		/* Take care of Wireless Extensions */
5063 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5064 			return wext_handle_ioctl(net, &ifr, cmd, arg);
5065 		return -ENOTTY;
5066 	}
5067 }
5068 
5069 
5070 /**
5071  *	dev_new_index	-	allocate an ifindex
5072  *	@net: the applicable net namespace
5073  *
5074  *	Returns a suitable unique value for a new device interface
5075  *	number.  The caller must hold the rtnl semaphore or the
5076  *	dev_base_lock to be sure it remains unique.
5077  */
5078 static int dev_new_index(struct net *net)
5079 {
5080 	static int ifindex;
5081 	for (;;) {
5082 		if (++ifindex <= 0)
5083 			ifindex = 1;
5084 		if (!__dev_get_by_index(net, ifindex))
5085 			return ifindex;
5086 	}
5087 }
5088 
5089 /* Delayed registration/unregisteration */
5090 static LIST_HEAD(net_todo_list);
5091 
5092 static void net_set_todo(struct net_device *dev)
5093 {
5094 	list_add_tail(&dev->todo_list, &net_todo_list);
5095 }
5096 
5097 static void rollback_registered_many(struct list_head *head)
5098 {
5099 	struct net_device *dev, *tmp;
5100 
5101 	BUG_ON(dev_boot_phase);
5102 	ASSERT_RTNL();
5103 
5104 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5105 		/* Some devices call without registering
5106 		 * for initialization unwind. Remove those
5107 		 * devices and proceed with the remaining.
5108 		 */
5109 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5110 			pr_debug("unregister_netdevice: device %s/%p never "
5111 				 "was registered\n", dev->name, dev);
5112 
5113 			WARN_ON(1);
5114 			list_del(&dev->unreg_list);
5115 			continue;
5116 		}
5117 
5118 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5119 	}
5120 
5121 	/* If device is running, close it first. */
5122 	dev_close_many(head);
5123 
5124 	list_for_each_entry(dev, head, unreg_list) {
5125 		/* And unlink it from device chain. */
5126 		unlist_netdevice(dev);
5127 
5128 		dev->reg_state = NETREG_UNREGISTERING;
5129 	}
5130 
5131 	synchronize_net();
5132 
5133 	list_for_each_entry(dev, head, unreg_list) {
5134 		/* Shutdown queueing discipline. */
5135 		dev_shutdown(dev);
5136 
5137 
5138 		/* Notify protocols, that we are about to destroy
5139 		   this device. They should clean all the things.
5140 		*/
5141 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5142 
5143 		if (!dev->rtnl_link_ops ||
5144 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5145 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5146 
5147 		/*
5148 		 *	Flush the unicast and multicast chains
5149 		 */
5150 		dev_uc_flush(dev);
5151 		dev_mc_flush(dev);
5152 
5153 		if (dev->netdev_ops->ndo_uninit)
5154 			dev->netdev_ops->ndo_uninit(dev);
5155 
5156 		/* Notifier chain MUST detach us from master device. */
5157 		WARN_ON(dev->master);
5158 
5159 		/* Remove entries from kobject tree */
5160 		netdev_unregister_kobject(dev);
5161 	}
5162 
5163 	/* Process any work delayed until the end of the batch */
5164 	dev = list_first_entry(head, struct net_device, unreg_list);
5165 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5166 
5167 	rcu_barrier();
5168 
5169 	list_for_each_entry(dev, head, unreg_list)
5170 		dev_put(dev);
5171 }
5172 
5173 static void rollback_registered(struct net_device *dev)
5174 {
5175 	LIST_HEAD(single);
5176 
5177 	list_add(&dev->unreg_list, &single);
5178 	rollback_registered_many(&single);
5179 	list_del(&single);
5180 }
5181 
5182 u32 netdev_fix_features(struct net_device *dev, u32 features)
5183 {
5184 	/* Fix illegal checksum combinations */
5185 	if ((features & NETIF_F_HW_CSUM) &&
5186 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5187 		netdev_info(dev, "mixed HW and IP checksum settings.\n");
5188 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5189 	}
5190 
5191 	if ((features & NETIF_F_NO_CSUM) &&
5192 	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5193 		netdev_info(dev, "mixed no checksumming and other settings.\n");
5194 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5195 	}
5196 
5197 	/* Fix illegal SG+CSUM combinations. */
5198 	if ((features & NETIF_F_SG) &&
5199 	    !(features & NETIF_F_ALL_CSUM)) {
5200 		netdev_info(dev,
5201 			    "Dropping NETIF_F_SG since no checksum feature.\n");
5202 		features &= ~NETIF_F_SG;
5203 	}
5204 
5205 	/* TSO requires that SG is present as well. */
5206 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5207 		netdev_info(dev, "Dropping TSO features since no SG feature.\n");
5208 		features &= ~NETIF_F_ALL_TSO;
5209 	}
5210 
5211 	/* TSO ECN requires that TSO is present as well. */
5212 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5213 		features &= ~NETIF_F_TSO_ECN;
5214 
5215 	/* Software GSO depends on SG. */
5216 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5217 		netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5218 		features &= ~NETIF_F_GSO;
5219 	}
5220 
5221 	/* UFO needs SG and checksumming */
5222 	if (features & NETIF_F_UFO) {
5223 		/* maybe split UFO into V4 and V6? */
5224 		if (!((features & NETIF_F_GEN_CSUM) ||
5225 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5226 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5227 			netdev_info(dev,
5228 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5229 			features &= ~NETIF_F_UFO;
5230 		}
5231 
5232 		if (!(features & NETIF_F_SG)) {
5233 			netdev_info(dev,
5234 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5235 			features &= ~NETIF_F_UFO;
5236 		}
5237 	}
5238 
5239 	return features;
5240 }
5241 EXPORT_SYMBOL(netdev_fix_features);
5242 
5243 void netdev_update_features(struct net_device *dev)
5244 {
5245 	u32 features;
5246 	int err = 0;
5247 
5248 	features = netdev_get_wanted_features(dev);
5249 
5250 	if (dev->netdev_ops->ndo_fix_features)
5251 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5252 
5253 	/* driver might be less strict about feature dependencies */
5254 	features = netdev_fix_features(dev, features);
5255 
5256 	if (dev->features == features)
5257 		return;
5258 
5259 	netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n",
5260 		dev->features, features);
5261 
5262 	if (dev->netdev_ops->ndo_set_features)
5263 		err = dev->netdev_ops->ndo_set_features(dev, features);
5264 
5265 	if (!err)
5266 		dev->features = features;
5267 	else if (err < 0)
5268 		netdev_err(dev,
5269 			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5270 			err, features, dev->features);
5271 }
5272 EXPORT_SYMBOL(netdev_update_features);
5273 
5274 /**
5275  *	netif_stacked_transfer_operstate -	transfer operstate
5276  *	@rootdev: the root or lower level device to transfer state from
5277  *	@dev: the device to transfer operstate to
5278  *
5279  *	Transfer operational state from root to device. This is normally
5280  *	called when a stacking relationship exists between the root
5281  *	device and the device(a leaf device).
5282  */
5283 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5284 					struct net_device *dev)
5285 {
5286 	if (rootdev->operstate == IF_OPER_DORMANT)
5287 		netif_dormant_on(dev);
5288 	else
5289 		netif_dormant_off(dev);
5290 
5291 	if (netif_carrier_ok(rootdev)) {
5292 		if (!netif_carrier_ok(dev))
5293 			netif_carrier_on(dev);
5294 	} else {
5295 		if (netif_carrier_ok(dev))
5296 			netif_carrier_off(dev);
5297 	}
5298 }
5299 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5300 
5301 #ifdef CONFIG_RPS
5302 static int netif_alloc_rx_queues(struct net_device *dev)
5303 {
5304 	unsigned int i, count = dev->num_rx_queues;
5305 	struct netdev_rx_queue *rx;
5306 
5307 	BUG_ON(count < 1);
5308 
5309 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5310 	if (!rx) {
5311 		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5312 		return -ENOMEM;
5313 	}
5314 	dev->_rx = rx;
5315 
5316 	for (i = 0; i < count; i++)
5317 		rx[i].dev = dev;
5318 	return 0;
5319 }
5320 #endif
5321 
5322 static void netdev_init_one_queue(struct net_device *dev,
5323 				  struct netdev_queue *queue, void *_unused)
5324 {
5325 	/* Initialize queue lock */
5326 	spin_lock_init(&queue->_xmit_lock);
5327 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5328 	queue->xmit_lock_owner = -1;
5329 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5330 	queue->dev = dev;
5331 }
5332 
5333 static int netif_alloc_netdev_queues(struct net_device *dev)
5334 {
5335 	unsigned int count = dev->num_tx_queues;
5336 	struct netdev_queue *tx;
5337 
5338 	BUG_ON(count < 1);
5339 
5340 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5341 	if (!tx) {
5342 		pr_err("netdev: Unable to allocate %u tx queues.\n",
5343 		       count);
5344 		return -ENOMEM;
5345 	}
5346 	dev->_tx = tx;
5347 
5348 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5349 	spin_lock_init(&dev->tx_global_lock);
5350 
5351 	return 0;
5352 }
5353 
5354 /**
5355  *	register_netdevice	- register a network device
5356  *	@dev: device to register
5357  *
5358  *	Take a completed network device structure and add it to the kernel
5359  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5360  *	chain. 0 is returned on success. A negative errno code is returned
5361  *	on a failure to set up the device, or if the name is a duplicate.
5362  *
5363  *	Callers must hold the rtnl semaphore. You may want
5364  *	register_netdev() instead of this.
5365  *
5366  *	BUGS:
5367  *	The locking appears insufficient to guarantee two parallel registers
5368  *	will not get the same name.
5369  */
5370 
5371 int register_netdevice(struct net_device *dev)
5372 {
5373 	int ret;
5374 	struct net *net = dev_net(dev);
5375 
5376 	BUG_ON(dev_boot_phase);
5377 	ASSERT_RTNL();
5378 
5379 	might_sleep();
5380 
5381 	/* When net_device's are persistent, this will be fatal. */
5382 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5383 	BUG_ON(!net);
5384 
5385 	spin_lock_init(&dev->addr_list_lock);
5386 	netdev_set_addr_lockdep_class(dev);
5387 
5388 	dev->iflink = -1;
5389 
5390 	/* Init, if this function is available */
5391 	if (dev->netdev_ops->ndo_init) {
5392 		ret = dev->netdev_ops->ndo_init(dev);
5393 		if (ret) {
5394 			if (ret > 0)
5395 				ret = -EIO;
5396 			goto out;
5397 		}
5398 	}
5399 
5400 	ret = dev_get_valid_name(dev, dev->name, 0);
5401 	if (ret)
5402 		goto err_uninit;
5403 
5404 	dev->ifindex = dev_new_index(net);
5405 	if (dev->iflink == -1)
5406 		dev->iflink = dev->ifindex;
5407 
5408 	/* Transfer changeable features to wanted_features and enable
5409 	 * software offloads (GSO and GRO).
5410 	 */
5411 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5412 	dev->features |= NETIF_F_SOFT_FEATURES;
5413 	dev->wanted_features = dev->features & dev->hw_features;
5414 
5415 	/* Avoid warning from netdev_fix_features() for GSO without SG */
5416 	if (!(dev->wanted_features & NETIF_F_SG)) {
5417 		dev->wanted_features &= ~NETIF_F_GSO;
5418 		dev->features &= ~NETIF_F_GSO;
5419 	}
5420 
5421 	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5422 	 * vlan_dev_init() will do the dev->features check, so these features
5423 	 * are enabled only if supported by underlying device.
5424 	 */
5425 	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5426 
5427 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5428 	ret = notifier_to_errno(ret);
5429 	if (ret)
5430 		goto err_uninit;
5431 
5432 	ret = netdev_register_kobject(dev);
5433 	if (ret)
5434 		goto err_uninit;
5435 	dev->reg_state = NETREG_REGISTERED;
5436 
5437 	netdev_update_features(dev);
5438 
5439 	/*
5440 	 *	Default initial state at registry is that the
5441 	 *	device is present.
5442 	 */
5443 
5444 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5445 
5446 	dev_init_scheduler(dev);
5447 	dev_hold(dev);
5448 	list_netdevice(dev);
5449 
5450 	/* Notify protocols, that a new device appeared. */
5451 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5452 	ret = notifier_to_errno(ret);
5453 	if (ret) {
5454 		rollback_registered(dev);
5455 		dev->reg_state = NETREG_UNREGISTERED;
5456 	}
5457 	/*
5458 	 *	Prevent userspace races by waiting until the network
5459 	 *	device is fully setup before sending notifications.
5460 	 */
5461 	if (!dev->rtnl_link_ops ||
5462 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5463 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5464 
5465 out:
5466 	return ret;
5467 
5468 err_uninit:
5469 	if (dev->netdev_ops->ndo_uninit)
5470 		dev->netdev_ops->ndo_uninit(dev);
5471 	goto out;
5472 }
5473 EXPORT_SYMBOL(register_netdevice);
5474 
5475 /**
5476  *	init_dummy_netdev	- init a dummy network device for NAPI
5477  *	@dev: device to init
5478  *
5479  *	This takes a network device structure and initialize the minimum
5480  *	amount of fields so it can be used to schedule NAPI polls without
5481  *	registering a full blown interface. This is to be used by drivers
5482  *	that need to tie several hardware interfaces to a single NAPI
5483  *	poll scheduler due to HW limitations.
5484  */
5485 int init_dummy_netdev(struct net_device *dev)
5486 {
5487 	/* Clear everything. Note we don't initialize spinlocks
5488 	 * are they aren't supposed to be taken by any of the
5489 	 * NAPI code and this dummy netdev is supposed to be
5490 	 * only ever used for NAPI polls
5491 	 */
5492 	memset(dev, 0, sizeof(struct net_device));
5493 
5494 	/* make sure we BUG if trying to hit standard
5495 	 * register/unregister code path
5496 	 */
5497 	dev->reg_state = NETREG_DUMMY;
5498 
5499 	/* NAPI wants this */
5500 	INIT_LIST_HEAD(&dev->napi_list);
5501 
5502 	/* a dummy interface is started by default */
5503 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5504 	set_bit(__LINK_STATE_START, &dev->state);
5505 
5506 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5507 	 * because users of this 'device' dont need to change
5508 	 * its refcount.
5509 	 */
5510 
5511 	return 0;
5512 }
5513 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5514 
5515 
5516 /**
5517  *	register_netdev	- register a network device
5518  *	@dev: device to register
5519  *
5520  *	Take a completed network device structure and add it to the kernel
5521  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5522  *	chain. 0 is returned on success. A negative errno code is returned
5523  *	on a failure to set up the device, or if the name is a duplicate.
5524  *
5525  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5526  *	and expands the device name if you passed a format string to
5527  *	alloc_netdev.
5528  */
5529 int register_netdev(struct net_device *dev)
5530 {
5531 	int err;
5532 
5533 	rtnl_lock();
5534 
5535 	/*
5536 	 * If the name is a format string the caller wants us to do a
5537 	 * name allocation.
5538 	 */
5539 	if (strchr(dev->name, '%')) {
5540 		err = dev_alloc_name(dev, dev->name);
5541 		if (err < 0)
5542 			goto out;
5543 	}
5544 
5545 	err = register_netdevice(dev);
5546 out:
5547 	rtnl_unlock();
5548 	return err;
5549 }
5550 EXPORT_SYMBOL(register_netdev);
5551 
5552 int netdev_refcnt_read(const struct net_device *dev)
5553 {
5554 	int i, refcnt = 0;
5555 
5556 	for_each_possible_cpu(i)
5557 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5558 	return refcnt;
5559 }
5560 EXPORT_SYMBOL(netdev_refcnt_read);
5561 
5562 /*
5563  * netdev_wait_allrefs - wait until all references are gone.
5564  *
5565  * This is called when unregistering network devices.
5566  *
5567  * Any protocol or device that holds a reference should register
5568  * for netdevice notification, and cleanup and put back the
5569  * reference if they receive an UNREGISTER event.
5570  * We can get stuck here if buggy protocols don't correctly
5571  * call dev_put.
5572  */
5573 static void netdev_wait_allrefs(struct net_device *dev)
5574 {
5575 	unsigned long rebroadcast_time, warning_time;
5576 	int refcnt;
5577 
5578 	linkwatch_forget_dev(dev);
5579 
5580 	rebroadcast_time = warning_time = jiffies;
5581 	refcnt = netdev_refcnt_read(dev);
5582 
5583 	while (refcnt != 0) {
5584 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5585 			rtnl_lock();
5586 
5587 			/* Rebroadcast unregister notification */
5588 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5589 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5590 			 * should have already handle it the first time */
5591 
5592 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5593 				     &dev->state)) {
5594 				/* We must not have linkwatch events
5595 				 * pending on unregister. If this
5596 				 * happens, we simply run the queue
5597 				 * unscheduled, resulting in a noop
5598 				 * for this device.
5599 				 */
5600 				linkwatch_run_queue();
5601 			}
5602 
5603 			__rtnl_unlock();
5604 
5605 			rebroadcast_time = jiffies;
5606 		}
5607 
5608 		msleep(250);
5609 
5610 		refcnt = netdev_refcnt_read(dev);
5611 
5612 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5613 			printk(KERN_EMERG "unregister_netdevice: "
5614 			       "waiting for %s to become free. Usage "
5615 			       "count = %d\n",
5616 			       dev->name, refcnt);
5617 			warning_time = jiffies;
5618 		}
5619 	}
5620 }
5621 
5622 /* The sequence is:
5623  *
5624  *	rtnl_lock();
5625  *	...
5626  *	register_netdevice(x1);
5627  *	register_netdevice(x2);
5628  *	...
5629  *	unregister_netdevice(y1);
5630  *	unregister_netdevice(y2);
5631  *      ...
5632  *	rtnl_unlock();
5633  *	free_netdev(y1);
5634  *	free_netdev(y2);
5635  *
5636  * We are invoked by rtnl_unlock().
5637  * This allows us to deal with problems:
5638  * 1) We can delete sysfs objects which invoke hotplug
5639  *    without deadlocking with linkwatch via keventd.
5640  * 2) Since we run with the RTNL semaphore not held, we can sleep
5641  *    safely in order to wait for the netdev refcnt to drop to zero.
5642  *
5643  * We must not return until all unregister events added during
5644  * the interval the lock was held have been completed.
5645  */
5646 void netdev_run_todo(void)
5647 {
5648 	struct list_head list;
5649 
5650 	/* Snapshot list, allow later requests */
5651 	list_replace_init(&net_todo_list, &list);
5652 
5653 	__rtnl_unlock();
5654 
5655 	while (!list_empty(&list)) {
5656 		struct net_device *dev
5657 			= list_first_entry(&list, struct net_device, todo_list);
5658 		list_del(&dev->todo_list);
5659 
5660 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5661 			printk(KERN_ERR "network todo '%s' but state %d\n",
5662 			       dev->name, dev->reg_state);
5663 			dump_stack();
5664 			continue;
5665 		}
5666 
5667 		dev->reg_state = NETREG_UNREGISTERED;
5668 
5669 		on_each_cpu(flush_backlog, dev, 1);
5670 
5671 		netdev_wait_allrefs(dev);
5672 
5673 		/* paranoia */
5674 		BUG_ON(netdev_refcnt_read(dev));
5675 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5676 		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5677 		WARN_ON(dev->dn_ptr);
5678 
5679 		if (dev->destructor)
5680 			dev->destructor(dev);
5681 
5682 		/* Free network device */
5683 		kobject_put(&dev->dev.kobj);
5684 	}
5685 }
5686 
5687 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5688  * fields in the same order, with only the type differing.
5689  */
5690 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5691 				    const struct net_device_stats *netdev_stats)
5692 {
5693 #if BITS_PER_LONG == 64
5694         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5695         memcpy(stats64, netdev_stats, sizeof(*stats64));
5696 #else
5697 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5698 	const unsigned long *src = (const unsigned long *)netdev_stats;
5699 	u64 *dst = (u64 *)stats64;
5700 
5701 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5702 		     sizeof(*stats64) / sizeof(u64));
5703 	for (i = 0; i < n; i++)
5704 		dst[i] = src[i];
5705 #endif
5706 }
5707 
5708 /**
5709  *	dev_get_stats	- get network device statistics
5710  *	@dev: device to get statistics from
5711  *	@storage: place to store stats
5712  *
5713  *	Get network statistics from device. Return @storage.
5714  *	The device driver may provide its own method by setting
5715  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5716  *	otherwise the internal statistics structure is used.
5717  */
5718 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5719 					struct rtnl_link_stats64 *storage)
5720 {
5721 	const struct net_device_ops *ops = dev->netdev_ops;
5722 
5723 	if (ops->ndo_get_stats64) {
5724 		memset(storage, 0, sizeof(*storage));
5725 		ops->ndo_get_stats64(dev, storage);
5726 	} else if (ops->ndo_get_stats) {
5727 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5728 	} else {
5729 		netdev_stats_to_stats64(storage, &dev->stats);
5730 	}
5731 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5732 	return storage;
5733 }
5734 EXPORT_SYMBOL(dev_get_stats);
5735 
5736 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5737 {
5738 	struct netdev_queue *queue = dev_ingress_queue(dev);
5739 
5740 #ifdef CONFIG_NET_CLS_ACT
5741 	if (queue)
5742 		return queue;
5743 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5744 	if (!queue)
5745 		return NULL;
5746 	netdev_init_one_queue(dev, queue, NULL);
5747 	queue->qdisc = &noop_qdisc;
5748 	queue->qdisc_sleeping = &noop_qdisc;
5749 	rcu_assign_pointer(dev->ingress_queue, queue);
5750 #endif
5751 	return queue;
5752 }
5753 
5754 /**
5755  *	alloc_netdev_mqs - allocate network device
5756  *	@sizeof_priv:	size of private data to allocate space for
5757  *	@name:		device name format string
5758  *	@setup:		callback to initialize device
5759  *	@txqs:		the number of TX subqueues to allocate
5760  *	@rxqs:		the number of RX subqueues to allocate
5761  *
5762  *	Allocates a struct net_device with private data area for driver use
5763  *	and performs basic initialization.  Also allocates subquue structs
5764  *	for each queue on the device.
5765  */
5766 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5767 		void (*setup)(struct net_device *),
5768 		unsigned int txqs, unsigned int rxqs)
5769 {
5770 	struct net_device *dev;
5771 	size_t alloc_size;
5772 	struct net_device *p;
5773 
5774 	BUG_ON(strlen(name) >= sizeof(dev->name));
5775 
5776 	if (txqs < 1) {
5777 		pr_err("alloc_netdev: Unable to allocate device "
5778 		       "with zero queues.\n");
5779 		return NULL;
5780 	}
5781 
5782 #ifdef CONFIG_RPS
5783 	if (rxqs < 1) {
5784 		pr_err("alloc_netdev: Unable to allocate device "
5785 		       "with zero RX queues.\n");
5786 		return NULL;
5787 	}
5788 #endif
5789 
5790 	alloc_size = sizeof(struct net_device);
5791 	if (sizeof_priv) {
5792 		/* ensure 32-byte alignment of private area */
5793 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5794 		alloc_size += sizeof_priv;
5795 	}
5796 	/* ensure 32-byte alignment of whole construct */
5797 	alloc_size += NETDEV_ALIGN - 1;
5798 
5799 	p = kzalloc(alloc_size, GFP_KERNEL);
5800 	if (!p) {
5801 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5802 		return NULL;
5803 	}
5804 
5805 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5806 	dev->padded = (char *)dev - (char *)p;
5807 
5808 	dev->pcpu_refcnt = alloc_percpu(int);
5809 	if (!dev->pcpu_refcnt)
5810 		goto free_p;
5811 
5812 	if (dev_addr_init(dev))
5813 		goto free_pcpu;
5814 
5815 	dev_mc_init(dev);
5816 	dev_uc_init(dev);
5817 
5818 	dev_net_set(dev, &init_net);
5819 
5820 	dev->gso_max_size = GSO_MAX_SIZE;
5821 
5822 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5823 	dev->ethtool_ntuple_list.count = 0;
5824 	INIT_LIST_HEAD(&dev->napi_list);
5825 	INIT_LIST_HEAD(&dev->unreg_list);
5826 	INIT_LIST_HEAD(&dev->link_watch_list);
5827 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5828 	setup(dev);
5829 
5830 	dev->num_tx_queues = txqs;
5831 	dev->real_num_tx_queues = txqs;
5832 	if (netif_alloc_netdev_queues(dev))
5833 		goto free_all;
5834 
5835 #ifdef CONFIG_RPS
5836 	dev->num_rx_queues = rxqs;
5837 	dev->real_num_rx_queues = rxqs;
5838 	if (netif_alloc_rx_queues(dev))
5839 		goto free_all;
5840 #endif
5841 
5842 	strcpy(dev->name, name);
5843 	dev->group = INIT_NETDEV_GROUP;
5844 	return dev;
5845 
5846 free_all:
5847 	free_netdev(dev);
5848 	return NULL;
5849 
5850 free_pcpu:
5851 	free_percpu(dev->pcpu_refcnt);
5852 	kfree(dev->_tx);
5853 #ifdef CONFIG_RPS
5854 	kfree(dev->_rx);
5855 #endif
5856 
5857 free_p:
5858 	kfree(p);
5859 	return NULL;
5860 }
5861 EXPORT_SYMBOL(alloc_netdev_mqs);
5862 
5863 /**
5864  *	free_netdev - free network device
5865  *	@dev: device
5866  *
5867  *	This function does the last stage of destroying an allocated device
5868  * 	interface. The reference to the device object is released.
5869  *	If this is the last reference then it will be freed.
5870  */
5871 void free_netdev(struct net_device *dev)
5872 {
5873 	struct napi_struct *p, *n;
5874 
5875 	release_net(dev_net(dev));
5876 
5877 	kfree(dev->_tx);
5878 #ifdef CONFIG_RPS
5879 	kfree(dev->_rx);
5880 #endif
5881 
5882 	kfree(rcu_dereference_raw(dev->ingress_queue));
5883 
5884 	/* Flush device addresses */
5885 	dev_addr_flush(dev);
5886 
5887 	/* Clear ethtool n-tuple list */
5888 	ethtool_ntuple_flush(dev);
5889 
5890 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5891 		netif_napi_del(p);
5892 
5893 	free_percpu(dev->pcpu_refcnt);
5894 	dev->pcpu_refcnt = NULL;
5895 
5896 	/*  Compatibility with error handling in drivers */
5897 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5898 		kfree((char *)dev - dev->padded);
5899 		return;
5900 	}
5901 
5902 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5903 	dev->reg_state = NETREG_RELEASED;
5904 
5905 	/* will free via device release */
5906 	put_device(&dev->dev);
5907 }
5908 EXPORT_SYMBOL(free_netdev);
5909 
5910 /**
5911  *	synchronize_net -  Synchronize with packet receive processing
5912  *
5913  *	Wait for packets currently being received to be done.
5914  *	Does not block later packets from starting.
5915  */
5916 void synchronize_net(void)
5917 {
5918 	might_sleep();
5919 	synchronize_rcu();
5920 }
5921 EXPORT_SYMBOL(synchronize_net);
5922 
5923 /**
5924  *	unregister_netdevice_queue - remove device from the kernel
5925  *	@dev: device
5926  *	@head: list
5927  *
5928  *	This function shuts down a device interface and removes it
5929  *	from the kernel tables.
5930  *	If head not NULL, device is queued to be unregistered later.
5931  *
5932  *	Callers must hold the rtnl semaphore.  You may want
5933  *	unregister_netdev() instead of this.
5934  */
5935 
5936 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5937 {
5938 	ASSERT_RTNL();
5939 
5940 	if (head) {
5941 		list_move_tail(&dev->unreg_list, head);
5942 	} else {
5943 		rollback_registered(dev);
5944 		/* Finish processing unregister after unlock */
5945 		net_set_todo(dev);
5946 	}
5947 }
5948 EXPORT_SYMBOL(unregister_netdevice_queue);
5949 
5950 /**
5951  *	unregister_netdevice_many - unregister many devices
5952  *	@head: list of devices
5953  */
5954 void unregister_netdevice_many(struct list_head *head)
5955 {
5956 	struct net_device *dev;
5957 
5958 	if (!list_empty(head)) {
5959 		rollback_registered_many(head);
5960 		list_for_each_entry(dev, head, unreg_list)
5961 			net_set_todo(dev);
5962 	}
5963 }
5964 EXPORT_SYMBOL(unregister_netdevice_many);
5965 
5966 /**
5967  *	unregister_netdev - remove device from the kernel
5968  *	@dev: device
5969  *
5970  *	This function shuts down a device interface and removes it
5971  *	from the kernel tables.
5972  *
5973  *	This is just a wrapper for unregister_netdevice that takes
5974  *	the rtnl semaphore.  In general you want to use this and not
5975  *	unregister_netdevice.
5976  */
5977 void unregister_netdev(struct net_device *dev)
5978 {
5979 	rtnl_lock();
5980 	unregister_netdevice(dev);
5981 	rtnl_unlock();
5982 }
5983 EXPORT_SYMBOL(unregister_netdev);
5984 
5985 /**
5986  *	dev_change_net_namespace - move device to different nethost namespace
5987  *	@dev: device
5988  *	@net: network namespace
5989  *	@pat: If not NULL name pattern to try if the current device name
5990  *	      is already taken in the destination network namespace.
5991  *
5992  *	This function shuts down a device interface and moves it
5993  *	to a new network namespace. On success 0 is returned, on
5994  *	a failure a netagive errno code is returned.
5995  *
5996  *	Callers must hold the rtnl semaphore.
5997  */
5998 
5999 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6000 {
6001 	int err;
6002 
6003 	ASSERT_RTNL();
6004 
6005 	/* Don't allow namespace local devices to be moved. */
6006 	err = -EINVAL;
6007 	if (dev->features & NETIF_F_NETNS_LOCAL)
6008 		goto out;
6009 
6010 	/* Ensure the device has been registrered */
6011 	err = -EINVAL;
6012 	if (dev->reg_state != NETREG_REGISTERED)
6013 		goto out;
6014 
6015 	/* Get out if there is nothing todo */
6016 	err = 0;
6017 	if (net_eq(dev_net(dev), net))
6018 		goto out;
6019 
6020 	/* Pick the destination device name, and ensure
6021 	 * we can use it in the destination network namespace.
6022 	 */
6023 	err = -EEXIST;
6024 	if (__dev_get_by_name(net, dev->name)) {
6025 		/* We get here if we can't use the current device name */
6026 		if (!pat)
6027 			goto out;
6028 		if (dev_get_valid_name(dev, pat, 1))
6029 			goto out;
6030 	}
6031 
6032 	/*
6033 	 * And now a mini version of register_netdevice unregister_netdevice.
6034 	 */
6035 
6036 	/* If device is running close it first. */
6037 	dev_close(dev);
6038 
6039 	/* And unlink it from device chain */
6040 	err = -ENODEV;
6041 	unlist_netdevice(dev);
6042 
6043 	synchronize_net();
6044 
6045 	/* Shutdown queueing discipline. */
6046 	dev_shutdown(dev);
6047 
6048 	/* Notify protocols, that we are about to destroy
6049 	   this device. They should clean all the things.
6050 
6051 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6052 	   This is wanted because this way 8021q and macvlan know
6053 	   the device is just moving and can keep their slaves up.
6054 	*/
6055 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6056 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6057 
6058 	/*
6059 	 *	Flush the unicast and multicast chains
6060 	 */
6061 	dev_uc_flush(dev);
6062 	dev_mc_flush(dev);
6063 
6064 	/* Actually switch the network namespace */
6065 	dev_net_set(dev, net);
6066 
6067 	/* If there is an ifindex conflict assign a new one */
6068 	if (__dev_get_by_index(net, dev->ifindex)) {
6069 		int iflink = (dev->iflink == dev->ifindex);
6070 		dev->ifindex = dev_new_index(net);
6071 		if (iflink)
6072 			dev->iflink = dev->ifindex;
6073 	}
6074 
6075 	/* Fixup kobjects */
6076 	err = device_rename(&dev->dev, dev->name);
6077 	WARN_ON(err);
6078 
6079 	/* Add the device back in the hashes */
6080 	list_netdevice(dev);
6081 
6082 	/* Notify protocols, that a new device appeared. */
6083 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6084 
6085 	/*
6086 	 *	Prevent userspace races by waiting until the network
6087 	 *	device is fully setup before sending notifications.
6088 	 */
6089 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6090 
6091 	synchronize_net();
6092 	err = 0;
6093 out:
6094 	return err;
6095 }
6096 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6097 
6098 static int dev_cpu_callback(struct notifier_block *nfb,
6099 			    unsigned long action,
6100 			    void *ocpu)
6101 {
6102 	struct sk_buff **list_skb;
6103 	struct sk_buff *skb;
6104 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6105 	struct softnet_data *sd, *oldsd;
6106 
6107 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6108 		return NOTIFY_OK;
6109 
6110 	local_irq_disable();
6111 	cpu = smp_processor_id();
6112 	sd = &per_cpu(softnet_data, cpu);
6113 	oldsd = &per_cpu(softnet_data, oldcpu);
6114 
6115 	/* Find end of our completion_queue. */
6116 	list_skb = &sd->completion_queue;
6117 	while (*list_skb)
6118 		list_skb = &(*list_skb)->next;
6119 	/* Append completion queue from offline CPU. */
6120 	*list_skb = oldsd->completion_queue;
6121 	oldsd->completion_queue = NULL;
6122 
6123 	/* Append output queue from offline CPU. */
6124 	if (oldsd->output_queue) {
6125 		*sd->output_queue_tailp = oldsd->output_queue;
6126 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6127 		oldsd->output_queue = NULL;
6128 		oldsd->output_queue_tailp = &oldsd->output_queue;
6129 	}
6130 
6131 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6132 	local_irq_enable();
6133 
6134 	/* Process offline CPU's input_pkt_queue */
6135 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6136 		netif_rx(skb);
6137 		input_queue_head_incr(oldsd);
6138 	}
6139 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6140 		netif_rx(skb);
6141 		input_queue_head_incr(oldsd);
6142 	}
6143 
6144 	return NOTIFY_OK;
6145 }
6146 
6147 
6148 /**
6149  *	netdev_increment_features - increment feature set by one
6150  *	@all: current feature set
6151  *	@one: new feature set
6152  *	@mask: mask feature set
6153  *
6154  *	Computes a new feature set after adding a device with feature set
6155  *	@one to the master device with current feature set @all.  Will not
6156  *	enable anything that is off in @mask. Returns the new feature set.
6157  */
6158 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6159 {
6160 	/* If device needs checksumming, downgrade to it. */
6161 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6162 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6163 	else if (mask & NETIF_F_ALL_CSUM) {
6164 		/* If one device supports v4/v6 checksumming, set for all. */
6165 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6166 		    !(all & NETIF_F_GEN_CSUM)) {
6167 			all &= ~NETIF_F_ALL_CSUM;
6168 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6169 		}
6170 
6171 		/* If one device supports hw checksumming, set for all. */
6172 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6173 			all &= ~NETIF_F_ALL_CSUM;
6174 			all |= NETIF_F_HW_CSUM;
6175 		}
6176 	}
6177 
6178 	one |= NETIF_F_ALL_CSUM;
6179 
6180 	one |= all & NETIF_F_ONE_FOR_ALL;
6181 	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6182 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
6183 
6184 	return all;
6185 }
6186 EXPORT_SYMBOL(netdev_increment_features);
6187 
6188 static struct hlist_head *netdev_create_hash(void)
6189 {
6190 	int i;
6191 	struct hlist_head *hash;
6192 
6193 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6194 	if (hash != NULL)
6195 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6196 			INIT_HLIST_HEAD(&hash[i]);
6197 
6198 	return hash;
6199 }
6200 
6201 /* Initialize per network namespace state */
6202 static int __net_init netdev_init(struct net *net)
6203 {
6204 	INIT_LIST_HEAD(&net->dev_base_head);
6205 
6206 	net->dev_name_head = netdev_create_hash();
6207 	if (net->dev_name_head == NULL)
6208 		goto err_name;
6209 
6210 	net->dev_index_head = netdev_create_hash();
6211 	if (net->dev_index_head == NULL)
6212 		goto err_idx;
6213 
6214 	return 0;
6215 
6216 err_idx:
6217 	kfree(net->dev_name_head);
6218 err_name:
6219 	return -ENOMEM;
6220 }
6221 
6222 /**
6223  *	netdev_drivername - network driver for the device
6224  *	@dev: network device
6225  *	@buffer: buffer for resulting name
6226  *	@len: size of buffer
6227  *
6228  *	Determine network driver for device.
6229  */
6230 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6231 {
6232 	const struct device_driver *driver;
6233 	const struct device *parent;
6234 
6235 	if (len <= 0 || !buffer)
6236 		return buffer;
6237 	buffer[0] = 0;
6238 
6239 	parent = dev->dev.parent;
6240 
6241 	if (!parent)
6242 		return buffer;
6243 
6244 	driver = parent->driver;
6245 	if (driver && driver->name)
6246 		strlcpy(buffer, driver->name, len);
6247 	return buffer;
6248 }
6249 
6250 static int __netdev_printk(const char *level, const struct net_device *dev,
6251 			   struct va_format *vaf)
6252 {
6253 	int r;
6254 
6255 	if (dev && dev->dev.parent)
6256 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6257 			       netdev_name(dev), vaf);
6258 	else if (dev)
6259 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6260 	else
6261 		r = printk("%s(NULL net_device): %pV", level, vaf);
6262 
6263 	return r;
6264 }
6265 
6266 int netdev_printk(const char *level, const struct net_device *dev,
6267 		  const char *format, ...)
6268 {
6269 	struct va_format vaf;
6270 	va_list args;
6271 	int r;
6272 
6273 	va_start(args, format);
6274 
6275 	vaf.fmt = format;
6276 	vaf.va = &args;
6277 
6278 	r = __netdev_printk(level, dev, &vaf);
6279 	va_end(args);
6280 
6281 	return r;
6282 }
6283 EXPORT_SYMBOL(netdev_printk);
6284 
6285 #define define_netdev_printk_level(func, level)			\
6286 int func(const struct net_device *dev, const char *fmt, ...)	\
6287 {								\
6288 	int r;							\
6289 	struct va_format vaf;					\
6290 	va_list args;						\
6291 								\
6292 	va_start(args, fmt);					\
6293 								\
6294 	vaf.fmt = fmt;						\
6295 	vaf.va = &args;						\
6296 								\
6297 	r = __netdev_printk(level, dev, &vaf);			\
6298 	va_end(args);						\
6299 								\
6300 	return r;						\
6301 }								\
6302 EXPORT_SYMBOL(func);
6303 
6304 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6305 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6306 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6307 define_netdev_printk_level(netdev_err, KERN_ERR);
6308 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6309 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6310 define_netdev_printk_level(netdev_info, KERN_INFO);
6311 
6312 static void __net_exit netdev_exit(struct net *net)
6313 {
6314 	kfree(net->dev_name_head);
6315 	kfree(net->dev_index_head);
6316 }
6317 
6318 static struct pernet_operations __net_initdata netdev_net_ops = {
6319 	.init = netdev_init,
6320 	.exit = netdev_exit,
6321 };
6322 
6323 static void __net_exit default_device_exit(struct net *net)
6324 {
6325 	struct net_device *dev, *aux;
6326 	/*
6327 	 * Push all migratable network devices back to the
6328 	 * initial network namespace
6329 	 */
6330 	rtnl_lock();
6331 	for_each_netdev_safe(net, dev, aux) {
6332 		int err;
6333 		char fb_name[IFNAMSIZ];
6334 
6335 		/* Ignore unmoveable devices (i.e. loopback) */
6336 		if (dev->features & NETIF_F_NETNS_LOCAL)
6337 			continue;
6338 
6339 		/* Leave virtual devices for the generic cleanup */
6340 		if (dev->rtnl_link_ops)
6341 			continue;
6342 
6343 		/* Push remaining network devices to init_net */
6344 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6345 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6346 		if (err) {
6347 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6348 				__func__, dev->name, err);
6349 			BUG();
6350 		}
6351 	}
6352 	rtnl_unlock();
6353 }
6354 
6355 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6356 {
6357 	/* At exit all network devices most be removed from a network
6358 	 * namespace.  Do this in the reverse order of registration.
6359 	 * Do this across as many network namespaces as possible to
6360 	 * improve batching efficiency.
6361 	 */
6362 	struct net_device *dev;
6363 	struct net *net;
6364 	LIST_HEAD(dev_kill_list);
6365 
6366 	rtnl_lock();
6367 	list_for_each_entry(net, net_list, exit_list) {
6368 		for_each_netdev_reverse(net, dev) {
6369 			if (dev->rtnl_link_ops)
6370 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6371 			else
6372 				unregister_netdevice_queue(dev, &dev_kill_list);
6373 		}
6374 	}
6375 	unregister_netdevice_many(&dev_kill_list);
6376 	list_del(&dev_kill_list);
6377 	rtnl_unlock();
6378 }
6379 
6380 static struct pernet_operations __net_initdata default_device_ops = {
6381 	.exit = default_device_exit,
6382 	.exit_batch = default_device_exit_batch,
6383 };
6384 
6385 /*
6386  *	Initialize the DEV module. At boot time this walks the device list and
6387  *	unhooks any devices that fail to initialise (normally hardware not
6388  *	present) and leaves us with a valid list of present and active devices.
6389  *
6390  */
6391 
6392 /*
6393  *       This is called single threaded during boot, so no need
6394  *       to take the rtnl semaphore.
6395  */
6396 static int __init net_dev_init(void)
6397 {
6398 	int i, rc = -ENOMEM;
6399 
6400 	BUG_ON(!dev_boot_phase);
6401 
6402 	if (dev_proc_init())
6403 		goto out;
6404 
6405 	if (netdev_kobject_init())
6406 		goto out;
6407 
6408 	INIT_LIST_HEAD(&ptype_all);
6409 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6410 		INIT_LIST_HEAD(&ptype_base[i]);
6411 
6412 	if (register_pernet_subsys(&netdev_net_ops))
6413 		goto out;
6414 
6415 	/*
6416 	 *	Initialise the packet receive queues.
6417 	 */
6418 
6419 	for_each_possible_cpu(i) {
6420 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6421 
6422 		memset(sd, 0, sizeof(*sd));
6423 		skb_queue_head_init(&sd->input_pkt_queue);
6424 		skb_queue_head_init(&sd->process_queue);
6425 		sd->completion_queue = NULL;
6426 		INIT_LIST_HEAD(&sd->poll_list);
6427 		sd->output_queue = NULL;
6428 		sd->output_queue_tailp = &sd->output_queue;
6429 #ifdef CONFIG_RPS
6430 		sd->csd.func = rps_trigger_softirq;
6431 		sd->csd.info = sd;
6432 		sd->csd.flags = 0;
6433 		sd->cpu = i;
6434 #endif
6435 
6436 		sd->backlog.poll = process_backlog;
6437 		sd->backlog.weight = weight_p;
6438 		sd->backlog.gro_list = NULL;
6439 		sd->backlog.gro_count = 0;
6440 	}
6441 
6442 	dev_boot_phase = 0;
6443 
6444 	/* The loopback device is special if any other network devices
6445 	 * is present in a network namespace the loopback device must
6446 	 * be present. Since we now dynamically allocate and free the
6447 	 * loopback device ensure this invariant is maintained by
6448 	 * keeping the loopback device as the first device on the
6449 	 * list of network devices.  Ensuring the loopback devices
6450 	 * is the first device that appears and the last network device
6451 	 * that disappears.
6452 	 */
6453 	if (register_pernet_device(&loopback_net_ops))
6454 		goto out;
6455 
6456 	if (register_pernet_device(&default_device_ops))
6457 		goto out;
6458 
6459 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6460 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6461 
6462 	hotcpu_notifier(dev_cpu_callback, 0);
6463 	dst_init();
6464 	dev_mcast_init();
6465 	rc = 0;
6466 out:
6467 	return rc;
6468 }
6469 
6470 subsys_initcall(net_dev_init);
6471 
6472 static int __init initialize_hashrnd(void)
6473 {
6474 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6475 	return 0;
6476 }
6477 
6478 late_initcall_sync(initialize_hashrnd);
6479 
6480