xref: /linux/net/core/dev.c (revision 492c826b9facefa84995f4dea917e301b5ee0884)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 #include <linux/cpu_rmap.h>
136 
137 #include "net-sysfs.h"
138 
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
141 
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
144 
145 /*
146  *	The list of packet types we will receive (as opposed to discard)
147  *	and the routines to invoke.
148  *
149  *	Why 16? Because with 16 the only overlap we get on a hash of the
150  *	low nibble of the protocol value is RARP/SNAP/X.25.
151  *
152  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
153  *             sure which should go first, but I bet it won't make much
154  *             difference if we are running VLANs.  The good news is that
155  *             this protocol won't be in the list unless compiled in, so
156  *             the average user (w/out VLANs) will not be adversely affected.
157  *             --BLG
158  *
159  *		0800	IP
160  *		8100    802.1Q VLAN
161  *		0001	802.3
162  *		0002	AX.25
163  *		0004	802.2
164  *		8035	RARP
165  *		0005	SNAP
166  *		0805	X.25
167  *		0806	ARP
168  *		8137	IPX
169  *		0009	Localtalk
170  *		86DD	IPv6
171  */
172 
173 #define PTYPE_HASH_SIZE	(16)
174 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
175 
176 static DEFINE_SPINLOCK(ptype_lock);
177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178 static struct list_head ptype_all __read_mostly;	/* Taps */
179 
180 /*
181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
182  * semaphore.
183  *
184  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
185  *
186  * Writers must hold the rtnl semaphore while they loop through the
187  * dev_base_head list, and hold dev_base_lock for writing when they do the
188  * actual updates.  This allows pure readers to access the list even
189  * while a writer is preparing to update it.
190  *
191  * To put it another way, dev_base_lock is held for writing only to
192  * protect against pure readers; the rtnl semaphore provides the
193  * protection against other writers.
194  *
195  * For example usages, see register_netdevice() and
196  * unregister_netdevice(), both of which must be called with the
197  * rtnl semaphore held.
198  */
199 DEFINE_RWLOCK(dev_base_lock);
200 EXPORT_SYMBOL(dev_base_lock);
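/*
 * Example (illustrative sketch, not taken from this file): a pure reader
 * walking the device list in either of the two ways described above.
 * "example_dump_names" is a made-up helper name.
 */
#if 0
static void example_dump_names(struct net *net)
{
	struct net_device *dev;

	/* Either take dev_base_lock for reading ... */
	read_lock(&dev_base_lock);
	for_each_netdev(net, dev)
		printk(KERN_DEBUG "dev: %s\n", dev->name);
	read_unlock(&dev_base_lock);

	/* ... or walk the list under RCU. */
	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		printk(KERN_DEBUG "dev: %s\n", dev->name);
	rcu_read_unlock();
}
#endif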
201 
202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
203 {
204 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
205 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
206 }
207 
208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
209 {
210 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
211 }
212 
213 static inline void rps_lock(struct softnet_data *sd)
214 {
215 #ifdef CONFIG_RPS
216 	spin_lock(&sd->input_pkt_queue.lock);
217 #endif
218 }
219 
220 static inline void rps_unlock(struct softnet_data *sd)
221 {
222 #ifdef CONFIG_RPS
223 	spin_unlock(&sd->input_pkt_queue.lock);
224 #endif
225 }
226 
227 /* Device list insertion */
228 static int list_netdevice(struct net_device *dev)
229 {
230 	struct net *net = dev_net(dev);
231 
232 	ASSERT_RTNL();
233 
234 	write_lock_bh(&dev_base_lock);
235 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
236 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
237 	hlist_add_head_rcu(&dev->index_hlist,
238 			   dev_index_hash(net, dev->ifindex));
239 	write_unlock_bh(&dev_base_lock);
240 	return 0;
241 }
242 
243 /* Device list removal
244  * caller must respect a RCU grace period before freeing/reusing dev
245  */
246 static void unlist_netdevice(struct net_device *dev)
247 {
248 	ASSERT_RTNL();
249 
250 	/* Unlink dev from the device chain */
251 	write_lock_bh(&dev_base_lock);
252 	list_del_rcu(&dev->dev_list);
253 	hlist_del_rcu(&dev->name_hlist);
254 	hlist_del_rcu(&dev->index_hlist);
255 	write_unlock_bh(&dev_base_lock);
256 }
257 
258 /*
259  *	Our notifier list
260  */
261 
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263 
264 /*
265  *	Device drivers call our routines to queue packets here. We empty the
266  *	queue in the local softnet handler.
267  */
268 
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271 
272 #ifdef CONFIG_LOCKDEP
273 /*
274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275  * according to dev->type
276  */
277 static const unsigned short netdev_lock_type[] =
278 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
291 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
292 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
293 	 ARPHRD_VOID, ARPHRD_NONE};
294 
295 static const char *const netdev_lock_name[] =
296 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
309 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
310 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
311 	 "_xmit_VOID", "_xmit_NONE"};
312 
313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
315 
316 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
317 {
318 	int i;
319 
320 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
321 		if (netdev_lock_type[i] == dev_type)
322 			return i;
323 	/* the last key is used by default */
324 	return ARRAY_SIZE(netdev_lock_type) - 1;
325 }
326 
327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328 						 unsigned short dev_type)
329 {
330 	int i;
331 
332 	i = netdev_lock_pos(dev_type);
333 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
334 				   netdev_lock_name[i]);
335 }
336 
337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
338 {
339 	int i;
340 
341 	i = netdev_lock_pos(dev->type);
342 	lockdep_set_class_and_name(&dev->addr_list_lock,
343 				   &netdev_addr_lock_key[i],
344 				   netdev_lock_name[i]);
345 }
346 #else
347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
348 						 unsigned short dev_type)
349 {
350 }
351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352 {
353 }
354 #endif
355 
356 /*******************************************************************************
357 
358 		Protocol management and registration routines
359 
360 *******************************************************************************/
361 
362 /*
363  *	Add a protocol ID to the list. Now that the input handler is
364  *	smarter we can dispense with all the messy stuff that used to be
365  *	here.
366  *
367  *	BEWARE!!! Protocol handlers that mangle input packets
368  *	MUST BE last in the hash buckets, and the walk over protocol
369  *	handlers MUST start from the promiscuous ptype_all chain in
370  *	net_bh.  This is true now, do not change it.
371  *	Explanation: if a protocol handler that mangles packets were
372  *	first in the list, it could not tell that the packet is cloned
373  *	and should be copied-on-write, so it would modify the clone in
374  *	place and subsequent readers would get a broken packet.
375  *							--ANK (980803)
376  */
377 
378 static inline struct list_head *ptype_head(const struct packet_type *pt)
379 {
380 	if (pt->type == htons(ETH_P_ALL))
381 		return &ptype_all;
382 	else
383 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384 }
385 
386 /**
387  *	dev_add_pack - add packet handler
388  *	@pt: packet type declaration
389  *
390  *	Add a protocol handler to the networking stack. The passed &packet_type
391  *	is linked into kernel lists and may not be freed until it has been
392  *	removed from the kernel lists.
393  *
394  *	This call does not sleep, therefore it cannot guarantee that
395  *	all CPUs that are in the middle of receiving packets will see
396  *	the new packet type (until the next received packet).
397  */
398 
399 void dev_add_pack(struct packet_type *pt)
400 {
401 	struct list_head *head = ptype_head(pt);
402 
403 	spin_lock(&ptype_lock);
404 	list_add_rcu(&pt->list, head);
405 	spin_unlock(&ptype_lock);
406 }
407 EXPORT_SYMBOL(dev_add_pack);
408 
409 /**
410  *	__dev_remove_pack	 - remove packet handler
411  *	@pt: packet type declaration
412  *
413  *	Remove a protocol handler that was previously added to the kernel
414  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
415  *	from the kernel lists and can be freed or reused once this function
416  *	returns.
417  *
418  *	The packet type might still be in use by receivers
419  *	and must not be freed until after all the CPUs have gone
420  *	through a quiescent state.
421  */
422 void __dev_remove_pack(struct packet_type *pt)
423 {
424 	struct list_head *head = ptype_head(pt);
425 	struct packet_type *pt1;
426 
427 	spin_lock(&ptype_lock);
428 
429 	list_for_each_entry(pt1, head, list) {
430 		if (pt == pt1) {
431 			list_del_rcu(&pt->list);
432 			goto out;
433 		}
434 	}
435 
436 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
437 out:
438 	spin_unlock(&ptype_lock);
439 }
440 EXPORT_SYMBOL(__dev_remove_pack);
441 
442 /**
443  *	dev_remove_pack	 - remove packet handler
444  *	@pt: packet type declaration
445  *
446  *	Remove a protocol handler that was previously added to the kernel
447  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
448  *	from the kernel lists and can be freed or reused once this function
449  *	returns.
450  *
451  *	This call sleeps to guarantee that no CPU is looking at the packet
452  *	type after return.
453  */
454 void dev_remove_pack(struct packet_type *pt)
455 {
456 	__dev_remove_pack(pt);
457 
458 	synchronize_net();
459 }
460 EXPORT_SYMBOL(dev_remove_pack);
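/*
 * Example (illustrative sketch, not taken from this file): registering and
 * removing a packet tap with the routines above.  The "example_*" names are
 * made up; a real handler would do something useful with the skb.
 */
#if 0
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	kfree_skb(skb);			/* this sketch just drops the frame */
	return NET_RX_DROP;
}

static struct packet_type example_pt __read_mostly = {
	.type	= cpu_to_be16(ETH_P_ALL),	/* tap: goes on ptype_all */
	.func	= example_rcv,
};

static void example_register(void)
{
	dev_add_pack(&example_pt);
}

static void example_unregister(void)
{
	dev_remove_pack(&example_pt);	/* sleeps in synchronize_net() */
}
#endif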
461 
462 /******************************************************************************
463 
464 		      Device Boot-time Settings Routines
465 
466 *******************************************************************************/
467 
468 /* Boot time configuration table */
469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
470 
471 /**
472  *	netdev_boot_setup_add	- add new setup entry
473  *	@name: name of the device
474  *	@map: configured settings for the device
475  *
476  *	Adds a new setup entry to the dev_boot_setup list.  The function
477  *	returns 0 on error and 1 on success.  This is a generic routine for
478  *	all netdevices.
479  */
480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
481 {
482 	struct netdev_boot_setup *s;
483 	int i;
484 
485 	s = dev_boot_setup;
486 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
487 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
488 			memset(s[i].name, 0, sizeof(s[i].name));
489 			strlcpy(s[i].name, name, IFNAMSIZ);
490 			memcpy(&s[i].map, map, sizeof(s[i].map));
491 			break;
492 		}
493 	}
494 
495 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
496 }
497 
498 /**
499  *	netdev_boot_setup_check	- check boot time settings
500  *	@dev: the netdevice
501  *
502  *	Check boot time settings for the device.
503  *	Any settings found are applied to the device so that they can
504  *	be used later during device probing.
505  *	Returns 1 if settings were found, 0 otherwise.
506  */
507 int netdev_boot_setup_check(struct net_device *dev)
508 {
509 	struct netdev_boot_setup *s = dev_boot_setup;
510 	int i;
511 
512 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
513 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
514 		    !strcmp(dev->name, s[i].name)) {
515 			dev->irq 	= s[i].map.irq;
516 			dev->base_addr 	= s[i].map.base_addr;
517 			dev->mem_start 	= s[i].map.mem_start;
518 			dev->mem_end 	= s[i].map.mem_end;
519 			return 1;
520 		}
521 	}
522 	return 0;
523 }
524 EXPORT_SYMBOL(netdev_boot_setup_check);
525 
526 
527 /**
528  *	netdev_boot_base	- get address from boot time settings
529  *	@prefix: prefix for network device
530  *	@unit: id for network device
531  *
532  *	Check boot time settings for the base address of the device.
533  *	Returns the configured base address, 1 if the device is already
534  *	registered (so it should not be probed), or 0 if no settings
535  *	were found.
536  */
537 unsigned long netdev_boot_base(const char *prefix, int unit)
538 {
539 	const struct netdev_boot_setup *s = dev_boot_setup;
540 	char name[IFNAMSIZ];
541 	int i;
542 
543 	sprintf(name, "%s%d", prefix, unit);
544 
545 	/*
546 	 * If device already registered then return base of 1
547 	 * to indicate not to probe for this interface
548 	 */
549 	if (__dev_get_by_name(&init_net, name))
550 		return 1;
551 
552 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
553 		if (!strcmp(name, s[i].name))
554 			return s[i].map.base_addr;
555 	return 0;
556 }
557 
558 /*
559  * Saves the settings configured at boot time for any netdevice.
560  */
561 int __init netdev_boot_setup(char *str)
562 {
563 	int ints[5];
564 	struct ifmap map;
565 
566 	str = get_options(str, ARRAY_SIZE(ints), ints);
567 	if (!str || !*str)
568 		return 0;
569 
570 	/* Save settings */
571 	memset(&map, 0, sizeof(map));
572 	if (ints[0] > 0)
573 		map.irq = ints[1];
574 	if (ints[0] > 1)
575 		map.base_addr = ints[2];
576 	if (ints[0] > 2)
577 		map.mem_start = ints[3];
578 	if (ints[0] > 3)
579 		map.mem_end = ints[4];
580 
581 	/* Add new entry to the list */
582 	return netdev_boot_setup_add(str, &map);
583 }
584 
585 __setup("netdev=", netdev_boot_setup);
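/*
 * Example (assumed format, inferred from the parsing above): with the
 * __setup() hook, a kernel command line could carry something like
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * i.e. <irq>,<base_addr>,<mem_start>,<mem_end> followed by the interface
 * name, with trailing integer values optional.
 */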
586 
587 /*******************************************************************************
588 
589 			    Device Interface Subroutines
590 
591 *******************************************************************************/
592 
593 /**
594  *	__dev_get_by_name	- find a device by its name
595  *	@net: the applicable net namespace
596  *	@name: name to find
597  *
598  *	Find an interface by name. Must be called under RTNL semaphore
599  *	or @dev_base_lock. If the name is found a pointer to the device
600  *	is returned. If the name is not found then %NULL is returned. The
601  *	reference counters are not incremented so the caller must be
602  *	careful with locks.
603  */
604 
605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
606 {
607 	struct hlist_node *p;
608 	struct net_device *dev;
609 	struct hlist_head *head = dev_name_hash(net, name);
610 
611 	hlist_for_each_entry(dev, p, head, name_hlist)
612 		if (!strncmp(dev->name, name, IFNAMSIZ))
613 			return dev;
614 
615 	return NULL;
616 }
617 EXPORT_SYMBOL(__dev_get_by_name);
618 
619 /**
620  *	dev_get_by_name_rcu	- find a device by its name
621  *	@net: the applicable net namespace
622  *	@name: name to find
623  *
624  *	Find an interface by name.
625  *	If the name is found a pointer to the device is returned.
626  * 	If the name is not found then %NULL is returned.
627  *	The reference counters are not incremented so the caller must be
628  *	careful with locks. The caller must hold RCU lock.
629  */
630 
631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
632 {
633 	struct hlist_node *p;
634 	struct net_device *dev;
635 	struct hlist_head *head = dev_name_hash(net, name);
636 
637 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
638 		if (!strncmp(dev->name, name, IFNAMSIZ))
639 			return dev;
640 
641 	return NULL;
642 }
643 EXPORT_SYMBOL(dev_get_by_name_rcu);
644 
645 /**
646  *	dev_get_by_name		- find a device by its name
647  *	@net: the applicable net namespace
648  *	@name: name to find
649  *
650  *	Find an interface by name. This can be called from any
651  *	context and does its own locking. The returned handle has
652  *	the usage count incremented and the caller must use dev_put() to
653  *	release it when it is no longer needed. %NULL is returned if no
654  *	matching device is found.
655  */
656 
657 struct net_device *dev_get_by_name(struct net *net, const char *name)
658 {
659 	struct net_device *dev;
660 
661 	rcu_read_lock();
662 	dev = dev_get_by_name_rcu(net, name);
663 	if (dev)
664 		dev_hold(dev);
665 	rcu_read_unlock();
666 	return dev;
667 }
668 EXPORT_SYMBOL(dev_get_by_name);
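/*
 * Example (illustrative sketch, not taken from this file): the two lookup
 * styles above.  The "example_is_up*" names are made up.
 */
#if 0
/* Refcounted lookup: usable outside RCU/RTNL, but must dev_put(). */
static bool example_is_up(struct net *net, const char *name)
{
	struct net_device *dev = dev_get_by_name(net, name);
	bool up = false;

	if (dev) {
		up = netif_running(dev);
		dev_put(dev);
	}
	return up;
}

/* RCU lookup: cheaper, but the pointer is only valid inside the section. */
static bool example_is_up_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	bool up = false;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		up = netif_running(dev);
	rcu_read_unlock();
	return up;
}
#endif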
669 
670 /**
671  *	__dev_get_by_index - find a device by its ifindex
672  *	@net: the applicable net namespace
673  *	@ifindex: index of device
674  *
675  *	Search for an interface by index. Returns a pointer to the
676  *	device, or %NULL if it is not found. The device has not
677  *	had its reference counter increased so the caller must be careful
678  *	about locking. The caller must hold either the RTNL semaphore
679  *	or @dev_base_lock.
680  */
681 
682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
683 {
684 	struct hlist_node *p;
685 	struct net_device *dev;
686 	struct hlist_head *head = dev_index_hash(net, ifindex);
687 
688 	hlist_for_each_entry(dev, p, head, index_hlist)
689 		if (dev->ifindex == ifindex)
690 			return dev;
691 
692 	return NULL;
693 }
694 EXPORT_SYMBOL(__dev_get_by_index);
695 
696 /**
697  *	dev_get_by_index_rcu - find a device by its ifindex
698  *	@net: the applicable net namespace
699  *	@ifindex: index of device
700  *
701  *	Search for an interface by index. Returns a pointer to the
702  *	device, or %NULL if it is not found. The device has not
703  *	had its reference counter increased so the caller must be careful
704  *	about locking. The caller must hold RCU lock.
705  */
706 
707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
708 {
709 	struct hlist_node *p;
710 	struct net_device *dev;
711 	struct hlist_head *head = dev_index_hash(net, ifindex);
712 
713 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
714 		if (dev->ifindex == ifindex)
715 			return dev;
716 
717 	return NULL;
718 }
719 EXPORT_SYMBOL(dev_get_by_index_rcu);
720 
721 
722 /**
723  *	dev_get_by_index - find a device by its ifindex
724  *	@net: the applicable net namespace
725  *	@ifindex: index of device
726  *
727  *	Search for an interface by index. Returns a pointer to the
728  *	device, or NULL if it is not found. The device returned has
729  *	had a reference added and the pointer is safe until the user calls
730  *	dev_put to indicate they have finished with it.
731  */
732 
733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
734 {
735 	struct net_device *dev;
736 
737 	rcu_read_lock();
738 	dev = dev_get_by_index_rcu(net, ifindex);
739 	if (dev)
740 		dev_hold(dev);
741 	rcu_read_unlock();
742 	return dev;
743 }
744 EXPORT_SYMBOL(dev_get_by_index);
745 
746 /**
747  *	dev_getbyhwaddr_rcu - find a device by its hardware address
748  *	@net: the applicable net namespace
749  *	@type: media type of device
750  *	@ha: hardware address
751  *
752  *	Search for an interface by MAC address. Returns a pointer to the
753  *	device, or NULL if it is not found.
754  *	The caller must hold RCU or RTNL.
755  *	The returned device has not had its ref count increased
756  *	and the caller must therefore be careful about locking
757  *
758  */
759 
760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
761 				       const char *ha)
762 {
763 	struct net_device *dev;
764 
765 	for_each_netdev_rcu(net, dev)
766 		if (dev->type == type &&
767 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
768 			return dev;
769 
770 	return NULL;
771 }
772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
773 
774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
775 {
776 	struct net_device *dev;
777 
778 	ASSERT_RTNL();
779 	for_each_netdev(net, dev)
780 		if (dev->type == type)
781 			return dev;
782 
783 	return NULL;
784 }
785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
786 
787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
788 {
789 	struct net_device *dev, *ret = NULL;
790 
791 	rcu_read_lock();
792 	for_each_netdev_rcu(net, dev)
793 		if (dev->type == type) {
794 			dev_hold(dev);
795 			ret = dev;
796 			break;
797 		}
798 	rcu_read_unlock();
799 	return ret;
800 }
801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
802 
803 /**
804  *	dev_get_by_flags_rcu - find any device with given flags
805  *	@net: the applicable net namespace
806  *	@if_flags: IFF_* values
807  *	@mask: bitmask of bits in if_flags to check
808  *
809  *	Search for any interface with the given flags. Returns a pointer to
810  *	the first matching device, or NULL if none is found. Must be called inside
811  *	rcu_read_lock(), and result refcount is unchanged.
812  */
813 
814 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
815 				    unsigned short mask)
816 {
817 	struct net_device *dev, *ret;
818 
819 	ret = NULL;
820 	for_each_netdev_rcu(net, dev) {
821 		if (((dev->flags ^ if_flags) & mask) == 0) {
822 			ret = dev;
823 			break;
824 		}
825 	}
826 	return ret;
827 }
828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
829 
830 /**
831  *	dev_valid_name - check if name is okay for network device
832  *	@name: name string
833  *
834  *	Network device names need to be valid file names
835  *	to allow sysfs to work.  We also disallow any kind of
836  *	whitespace.
837  */
838 int dev_valid_name(const char *name)
839 {
840 	if (*name == '\0')
841 		return 0;
842 	if (strlen(name) >= IFNAMSIZ)
843 		return 0;
844 	if (!strcmp(name, ".") || !strcmp(name, ".."))
845 		return 0;
846 
847 	while (*name) {
848 		if (*name == '/' || isspace(*name))
849 			return 0;
850 		name++;
851 	}
852 	return 1;
853 }
854 EXPORT_SYMBOL(dev_valid_name);
855 
856 /**
857  *	__dev_alloc_name - allocate a name for a device
858  *	@net: network namespace to allocate the device name in
859  *	@name: name format string
860  *	@buf:  scratch buffer and result name string
861  *
862  *	Passed a format string - eg "lt%d" - it will try to find a suitable
863  *	id. It scans list of devices to build up a free map, then chooses
864  *	the first empty slot. The caller must hold the dev_base or rtnl lock
865  *	while allocating the name and adding the device in order to avoid
866  *	duplicates.
867  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
868  *	Returns the number of the unit assigned or a negative errno code.
869  */
870 
871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
872 {
873 	int i = 0;
874 	const char *p;
875 	const int max_netdevices = 8*PAGE_SIZE;
876 	unsigned long *inuse;
877 	struct net_device *d;
878 
879 	p = strnchr(name, IFNAMSIZ-1, '%');
880 	if (p) {
881 		/*
882 		 * Verify the string as this thing may have come from
883 		 * the user.  There must be exactly one "%d" and no other "%"
884 		 * characters.
885 		 */
886 		if (p[1] != 'd' || strchr(p + 2, '%'))
887 			return -EINVAL;
888 
889 		/* Use one page as a bit array of possible slots */
890 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
891 		if (!inuse)
892 			return -ENOMEM;
893 
894 		for_each_netdev(net, d) {
895 			if (!sscanf(d->name, name, &i))
896 				continue;
897 			if (i < 0 || i >= max_netdevices)
898 				continue;
899 
900 			/*  avoid cases where sscanf is not exact inverse of printf */
901 			snprintf(buf, IFNAMSIZ, name, i);
902 			if (!strncmp(buf, d->name, IFNAMSIZ))
903 				set_bit(i, inuse);
904 		}
905 
906 		i = find_first_zero_bit(inuse, max_netdevices);
907 		free_page((unsigned long) inuse);
908 	}
909 
910 	if (buf != name)
911 		snprintf(buf, IFNAMSIZ, name, i);
912 	if (!__dev_get_by_name(net, buf))
913 		return i;
914 
915 	/* It is possible to run out of possible slots
916 	 * when the name is long and there isn't enough space left
917 	 * for the digits, or if all bits are used.
918 	 */
919 	return -ENFILE;
920 }
921 
922 /**
923  *	dev_alloc_name - allocate a name for a device
924  *	@dev: device
925  *	@name: name format string
926  *
927  *	Passed a format string - eg "lt%d" - it will try to find a suitable
928  *	id. It scans list of devices to build up a free map, then chooses
929  *	the first empty slot. The caller must hold the dev_base or rtnl lock
930  *	while allocating the name and adding the device in order to avoid
931  *	duplicates.
932  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
933  *	Returns the number of the unit assigned or a negative errno code.
934  */
935 
936 int dev_alloc_name(struct net_device *dev, const char *name)
937 {
938 	char buf[IFNAMSIZ];
939 	struct net *net;
940 	int ret;
941 
942 	BUG_ON(!dev_net(dev));
943 	net = dev_net(dev);
944 	ret = __dev_alloc_name(net, name, buf);
945 	if (ret >= 0)
946 		strlcpy(dev->name, buf, IFNAMSIZ);
947 	return ret;
948 }
949 EXPORT_SYMBOL(dev_alloc_name);
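/*
 * Example (illustrative sketch, not taken from this file): typical use from
 * a driver's probe path, before registering the device.  "example_name_device"
 * is a made-up helper name.
 */
#if 0
static int example_name_device(struct net_device *dev)
{
	int unit = dev_alloc_name(dev, "eth%d");	/* lowest free ethN */

	if (unit < 0)
		return unit;	/* e.g. -EINVAL or -ENFILE */
	/* dev->name now holds e.g. "eth2" and unit == 2 */
	return 0;
}
#endif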
950 
951 static int dev_get_valid_name(struct net_device *dev, const char *name)
952 {
953 	struct net *net;
954 
955 	BUG_ON(!dev_net(dev));
956 	net = dev_net(dev);
957 
958 	if (!dev_valid_name(name))
959 		return -EINVAL;
960 
961 	if (strchr(name, '%'))
962 		return dev_alloc_name(dev, name);
963 	else if (__dev_get_by_name(net, name))
964 		return -EEXIST;
965 	else if (dev->name != name)
966 		strlcpy(dev->name, name, IFNAMSIZ);
967 
968 	return 0;
969 }
970 
971 /**
972  *	dev_change_name - change name of a device
973  *	@dev: device
974  *	@newname: name (or format string) must be at least IFNAMSIZ
975  *
976  *	Change the name of a device.  A format string such as "eth%d"
977  *	can be passed for wildcarding.
978  */
979 int dev_change_name(struct net_device *dev, const char *newname)
980 {
981 	char oldname[IFNAMSIZ];
982 	int err = 0;
983 	int ret;
984 	struct net *net;
985 
986 	ASSERT_RTNL();
987 	BUG_ON(!dev_net(dev));
988 
989 	net = dev_net(dev);
990 	if (dev->flags & IFF_UP)
991 		return -EBUSY;
992 
993 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
994 		return 0;
995 
996 	memcpy(oldname, dev->name, IFNAMSIZ);
997 
998 	err = dev_get_valid_name(dev, newname);
999 	if (err < 0)
1000 		return err;
1001 
1002 rollback:
1003 	ret = device_rename(&dev->dev, dev->name);
1004 	if (ret) {
1005 		memcpy(dev->name, oldname, IFNAMSIZ);
1006 		return ret;
1007 	}
1008 
1009 	write_lock_bh(&dev_base_lock);
1010 	hlist_del_rcu(&dev->name_hlist);
1011 	write_unlock_bh(&dev_base_lock);
1012 
1013 	synchronize_rcu();
1014 
1015 	write_lock_bh(&dev_base_lock);
1016 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017 	write_unlock_bh(&dev_base_lock);
1018 
1019 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020 	ret = notifier_to_errno(ret);
1021 
1022 	if (ret) {
1023 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1024 		if (err >= 0) {
1025 			err = ret;
1026 			memcpy(dev->name, oldname, IFNAMSIZ);
1027 			goto rollback;
1028 		} else {
1029 			printk(KERN_ERR
1030 			       "%s: name change rollback failed: %d.\n",
1031 			       dev->name, ret);
1032 		}
1033 	}
1034 
1035 	return err;
1036 }
1037 
1038 /**
1039  *	dev_set_alias - change ifalias of a device
1040  *	@dev: device
1041  *	@alias: name up to IFALIASZ
1042  *	@len: limit of bytes to copy from @alias
1043  *
1044  *	Set ifalias for a device.
1045  */
1046 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047 {
1048 	ASSERT_RTNL();
1049 
1050 	if (len >= IFALIASZ)
1051 		return -EINVAL;
1052 
1053 	if (!len) {
1054 		if (dev->ifalias) {
1055 			kfree(dev->ifalias);
1056 			dev->ifalias = NULL;
1057 		}
1058 		return 0;
1059 	}
1060 
1061 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062 	if (!dev->ifalias)
1063 		return -ENOMEM;
1064 
1065 	strlcpy(dev->ifalias, alias, len+1);
1066 	return len;
1067 }
1068 
1069 
1070 /**
1071  *	netdev_features_change - device changes features
1072  *	@dev: device to cause notification
1073  *
1074  *	Called to indicate a device has changed features.
1075  */
1076 void netdev_features_change(struct net_device *dev)
1077 {
1078 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079 }
1080 EXPORT_SYMBOL(netdev_features_change);
1081 
1082 /**
1083  *	netdev_state_change - device changes state
1084  *	@dev: device to cause notification
1085  *
1086  *	Called to indicate a device has changed state. This function calls
1087  *	the notifier chains for netdev_chain and sends a NEWLINK message
1088  *	to the routing socket.
1089  */
1090 void netdev_state_change(struct net_device *dev)
1091 {
1092 	if (dev->flags & IFF_UP) {
1093 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095 	}
1096 }
1097 EXPORT_SYMBOL(netdev_state_change);
1098 
1099 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1100 {
1101 	return call_netdevice_notifiers(event, dev);
1102 }
1103 EXPORT_SYMBOL(netdev_bonding_change);
1104 
1105 /**
1106  *	dev_load 	- load a network module
1107  *	@net: the applicable net namespace
1108  *	@name: name of interface
1109  *
1110  *	If a network interface is not present and the process has suitable
1111  *	privileges this function loads the module. If module loading is not
1112  *	available in this kernel then it becomes a nop.
1113  */
1114 
1115 void dev_load(struct net *net, const char *name)
1116 {
1117 	struct net_device *dev;
1118 	int no_module;
1119 
1120 	rcu_read_lock();
1121 	dev = dev_get_by_name_rcu(net, name);
1122 	rcu_read_unlock();
1123 
1124 	no_module = !dev;
1125 	if (no_module && capable(CAP_NET_ADMIN))
1126 		no_module = request_module("netdev-%s", name);
1127 	if (no_module && capable(CAP_SYS_MODULE)) {
1128 		if (!request_module("%s", name))
1129 			pr_err("Loading kernel module for a network device "
1130 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1131 "instead\n", name);
1132 	}
1133 }
1134 EXPORT_SYMBOL(dev_load);
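/*
 * Example (illustrative sketch, not taken from this file): a module providing
 * a virtual interface "foo0" (a made-up name) can declare the alias that
 * dev_load() requests, so an ioctl referencing "foo0" can auto-load it.
 */
#if 0
MODULE_ALIAS("netdev-foo0");
#endif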
1135 
1136 static int __dev_open(struct net_device *dev)
1137 {
1138 	const struct net_device_ops *ops = dev->netdev_ops;
1139 	int ret;
1140 
1141 	ASSERT_RTNL();
1142 
1143 	if (!netif_device_present(dev))
1144 		return -ENODEV;
1145 
1146 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1147 	ret = notifier_to_errno(ret);
1148 	if (ret)
1149 		return ret;
1150 
1151 	set_bit(__LINK_STATE_START, &dev->state);
1152 
1153 	if (ops->ndo_validate_addr)
1154 		ret = ops->ndo_validate_addr(dev);
1155 
1156 	if (!ret && ops->ndo_open)
1157 		ret = ops->ndo_open(dev);
1158 
1159 	if (ret)
1160 		clear_bit(__LINK_STATE_START, &dev->state);
1161 	else {
1162 		dev->flags |= IFF_UP;
1163 		net_dmaengine_get();
1164 		dev_set_rx_mode(dev);
1165 		dev_activate(dev);
1166 	}
1167 
1168 	return ret;
1169 }
1170 
1171 /**
1172  *	dev_open	- prepare an interface for use.
1173  *	@dev:	device to open
1174  *
1175  *	Takes a device from down to up state. The device's private open
1176  *	function is invoked and then the multicast lists are loaded. Finally
1177  *	the device is moved into the up state and a %NETDEV_UP message is
1178  *	sent to the netdev notifier chain.
1179  *
1180  *	Calling this function on an active interface is a nop. On a failure
1181  *	a negative errno code is returned.
1182  */
1183 int dev_open(struct net_device *dev)
1184 {
1185 	int ret;
1186 
1187 	if (dev->flags & IFF_UP)
1188 		return 0;
1189 
1190 	ret = __dev_open(dev);
1191 	if (ret < 0)
1192 		return ret;
1193 
1194 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1195 	call_netdevice_notifiers(NETDEV_UP, dev);
1196 
1197 	return ret;
1198 }
1199 EXPORT_SYMBOL(dev_open);
1200 
1201 static int __dev_close_many(struct list_head *head)
1202 {
1203 	struct net_device *dev;
1204 
1205 	ASSERT_RTNL();
1206 	might_sleep();
1207 
1208 	list_for_each_entry(dev, head, unreg_list) {
1209 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1210 
1211 		clear_bit(__LINK_STATE_START, &dev->state);
1212 
1213 		/* Synchronize to scheduled poll. We cannot touch the poll list;
1214 		 * it may even be on a different cpu. So just clear netif_running().
1215 		 *
1216 		 * dev->stop() will invoke napi_disable() on all of its
1217 		 * napi_struct instances on this device.
1218 		 */
1219 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1220 	}
1221 
1222 	dev_deactivate_many(head);
1223 
1224 	list_for_each_entry(dev, head, unreg_list) {
1225 		const struct net_device_ops *ops = dev->netdev_ops;
1226 
1227 		/*
1228 		 *	Call the device specific close. This cannot fail.
1229 		 *	It is only done if the device is UP.
1230 		 *
1231 		 *	We allow it to be called even after a DETACH hot-plug
1232 		 *	event.
1233 		 */
1234 		if (ops->ndo_stop)
1235 			ops->ndo_stop(dev);
1236 
1237 		dev->flags &= ~IFF_UP;
1238 		net_dmaengine_put();
1239 	}
1240 
1241 	return 0;
1242 }
1243 
1244 static int __dev_close(struct net_device *dev)
1245 {
1246 	int retval;
1247 	LIST_HEAD(single);
1248 
1249 	list_add(&dev->unreg_list, &single);
1250 	retval = __dev_close_many(&single);
1251 	list_del(&single);
1252 	return retval;
1253 }
1254 
1255 static int dev_close_many(struct list_head *head)
1256 {
1257 	struct net_device *dev, *tmp;
1258 	LIST_HEAD(tmp_list);
1259 
1260 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1261 		if (!(dev->flags & IFF_UP))
1262 			list_move(&dev->unreg_list, &tmp_list);
1263 
1264 	__dev_close_many(head);
1265 
1266 	list_for_each_entry(dev, head, unreg_list) {
1267 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1269 	}
1270 
1271 	/* rollback_registered_many needs the complete original list */
1272 	list_splice(&tmp_list, head);
1273 	return 0;
1274 }
1275 
1276 /**
1277  *	dev_close - shutdown an interface.
1278  *	@dev: device to shutdown
1279  *
1280  *	This function moves an active device into down state. A
1281  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1282  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1283  *	chain.
1284  */
1285 int dev_close(struct net_device *dev)
1286 {
1287 	if (dev->flags & IFF_UP) {
1288 		LIST_HEAD(single);
1289 
1290 		list_add(&dev->unreg_list, &single);
1291 		dev_close_many(&single);
1292 		list_del(&single);
1293 	}
1294 	return 0;
1295 }
1296 EXPORT_SYMBOL(dev_close);
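/*
 * Example (illustrative sketch, not taken from this file): bringing an
 * interface up or down from kernel code; both calls require the RTNL lock.
 * "example_toggle" is a made-up helper name.
 */
#if 0
static int example_toggle(struct net_device *dev, bool up)
{
	int err;

	rtnl_lock();
	err = up ? dev_open(dev) : dev_close(dev);
	rtnl_unlock();
	return err;
}
#endif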
1297 
1298 
1299 /**
1300  *	dev_disable_lro - disable Large Receive Offload on a device
1301  *	@dev: device
1302  *
1303  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1304  *	called under RTNL.  This is needed if received packets may be
1305  *	forwarded to another interface.
1306  */
1307 void dev_disable_lro(struct net_device *dev)
1308 {
1309 	u32 flags;
1310 
1311 	/*
1312 	 * If we're trying to disable lro on a vlan device
1313 	 * use the underlying physical device instead
1314 	 */
1315 	if (is_vlan_dev(dev))
1316 		dev = vlan_dev_real_dev(dev);
1317 
1318 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1319 		flags = dev->ethtool_ops->get_flags(dev);
1320 	else
1321 		flags = ethtool_op_get_flags(dev);
1322 
1323 	if (!(flags & ETH_FLAG_LRO))
1324 		return;
1325 
1326 	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1327 	if (unlikely(dev->features & NETIF_F_LRO))
1328 		netdev_WARN(dev, "failed to disable LRO!\n");
1329 }
1330 EXPORT_SYMBOL(dev_disable_lro);
1331 
1332 
1333 static int dev_boot_phase = 1;
1334 
1335 /**
1336  *	register_netdevice_notifier - register a network notifier block
1337  *	@nb: notifier
1338  *
1339  *	Register a notifier to be called when network device events occur.
1340  *	The notifier passed is linked into the kernel structures and must
1341  *	not be reused until it has been unregistered. A negative errno code
1342  *	is returned on a failure.
1343  *
1344  *	When registered, all registration and up events are replayed
1345  *	to the new notifier to allow it to have a race-free
1346  *	view of the network device list.
1347  */
1348 
1349 int register_netdevice_notifier(struct notifier_block *nb)
1350 {
1351 	struct net_device *dev;
1352 	struct net_device *last;
1353 	struct net *net;
1354 	int err;
1355 
1356 	rtnl_lock();
1357 	err = raw_notifier_chain_register(&netdev_chain, nb);
1358 	if (err)
1359 		goto unlock;
1360 	if (dev_boot_phase)
1361 		goto unlock;
1362 	for_each_net(net) {
1363 		for_each_netdev(net, dev) {
1364 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1365 			err = notifier_to_errno(err);
1366 			if (err)
1367 				goto rollback;
1368 
1369 			if (!(dev->flags & IFF_UP))
1370 				continue;
1371 
1372 			nb->notifier_call(nb, NETDEV_UP, dev);
1373 		}
1374 	}
1375 
1376 unlock:
1377 	rtnl_unlock();
1378 	return err;
1379 
1380 rollback:
1381 	last = dev;
1382 	for_each_net(net) {
1383 		for_each_netdev(net, dev) {
1384 			if (dev == last)
1385 				break;
1386 
1387 			if (dev->flags & IFF_UP) {
1388 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1389 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1390 			}
1391 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1392 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1393 		}
1394 	}
1395 
1396 	raw_notifier_chain_unregister(&netdev_chain, nb);
1397 	goto unlock;
1398 }
1399 EXPORT_SYMBOL(register_netdevice_notifier);
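/*
 * Example (illustrative sketch, not taken from this file): a minimal notifier.
 * The "example_*" names are made up; the void * argument is the affected
 * net_device, as passed by call_netdevice_notifiers() below.
 */
#if 0
static int example_event(struct notifier_block *nb, unsigned long event,
			 void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		printk(KERN_DEBUG "%s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		printk(KERN_DEBUG "%s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call = example_event,
};

/* register_netdevice_notifier(&example_nb) replays REGISTER/UP events;   */
/* unregister_netdevice_notifier(&example_nb) removes the notifier again. */
#endif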
1400 
1401 /**
1402  *	unregister_netdevice_notifier - unregister a network notifier block
1403  *	@nb: notifier
1404  *
1405  *	Unregister a notifier previously registered by
1406  *	register_netdevice_notifier(). The notifier is unlinked from the
1407  *	kernel structures and may then be reused. A negative errno code
1408  *	is returned on a failure.
1409  */
1410 
1411 int unregister_netdevice_notifier(struct notifier_block *nb)
1412 {
1413 	int err;
1414 
1415 	rtnl_lock();
1416 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1417 	rtnl_unlock();
1418 	return err;
1419 }
1420 EXPORT_SYMBOL(unregister_netdevice_notifier);
1421 
1422 /**
1423  *	call_netdevice_notifiers - call all network notifier blocks
1424  *      @val: value passed unmodified to notifier function
1425  *      @dev: net_device pointer passed unmodified to notifier function
1426  *
1427  *	Call all network notifier blocks.  Parameters and return value
1428  *	are as for raw_notifier_call_chain().
1429  */
1430 
1431 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1432 {
1433 	ASSERT_RTNL();
1434 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1435 }
1436 EXPORT_SYMBOL(call_netdevice_notifiers);
1437 
1438 /* When > 0 there are consumers of rx skb time stamps */
1439 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1440 
1441 void net_enable_timestamp(void)
1442 {
1443 	atomic_inc(&netstamp_needed);
1444 }
1445 EXPORT_SYMBOL(net_enable_timestamp);
1446 
1447 void net_disable_timestamp(void)
1448 {
1449 	atomic_dec(&netstamp_needed);
1450 }
1451 EXPORT_SYMBOL(net_disable_timestamp);
1452 
1453 static inline void net_timestamp_set(struct sk_buff *skb)
1454 {
1455 	if (atomic_read(&netstamp_needed))
1456 		__net_timestamp(skb);
1457 	else
1458 		skb->tstamp.tv64 = 0;
1459 }
1460 
1461 static inline void net_timestamp_check(struct sk_buff *skb)
1462 {
1463 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1464 		__net_timestamp(skb);
1465 }
1466 
1467 static inline bool is_skb_forwardable(struct net_device *dev,
1468 				      struct sk_buff *skb)
1469 {
1470 	unsigned int len;
1471 
1472 	if (!(dev->flags & IFF_UP))
1473 		return false;
1474 
1475 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1476 	if (skb->len <= len)
1477 		return true;
1478 
1479 	/* if TSO is enabled, we don't care about the length as the packet
1480 	 * could be forwarded without being segmented before
1481 	 */
1482 	if (skb_is_gso(skb))
1483 		return true;
1484 
1485 	return false;
1486 }
1487 
1488 /**
1489  * dev_forward_skb - loopback an skb to another netif
1490  *
1491  * @dev: destination network device
1492  * @skb: buffer to forward
1493  *
1494  * return values:
1495  *	NET_RX_SUCCESS	(no congestion)
1496  *	NET_RX_DROP     (packet was dropped, but freed)
1497  *
1498  * dev_forward_skb can be used for injecting an skb from the
1499  * start_xmit function of one device into the receive queue
1500  * of another device.
1501  *
1502  * The receiving device may be in another namespace, so
1503  * we have to clear all information in the skb that could
1504  * impact namespace isolation.
1505  */
1506 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1507 {
1508 	skb_orphan(skb);
1509 	nf_reset(skb);
1510 
1511 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1512 		atomic_long_inc(&dev->rx_dropped);
1513 		kfree_skb(skb);
1514 		return NET_RX_DROP;
1515 	}
1516 	skb_set_dev(skb, dev);
1517 	skb->tstamp.tv64 = 0;
1518 	skb->pkt_type = PACKET_HOST;
1519 	skb->protocol = eth_type_trans(skb, dev);
1520 	return netif_rx(skb);
1521 }
1522 EXPORT_SYMBOL_GPL(dev_forward_skb);
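/*
 * Example (illustrative sketch, not taken from this file): how a veth-style
 * driver might use dev_forward_skb() from its transmit path.  "example_xmit"
 * is a made-up helper and takes the peer device as a parameter for brevity;
 * a real ndo_start_xmit() would look up its peer itself.
 */
#if 0
static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev,
				struct net_device *peer)
{
	/* dev_forward_skb() consumes the skb and returns NET_RX_*. */
	dev_forward_skb(peer, skb);
	return NETDEV_TX_OK;
}
#endif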
1523 
1524 static inline int deliver_skb(struct sk_buff *skb,
1525 			      struct packet_type *pt_prev,
1526 			      struct net_device *orig_dev)
1527 {
1528 	atomic_inc(&skb->users);
1529 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1530 }
1531 
1532 /*
1533  *	Support routine. Sends outgoing frames to any network
1534  *	taps currently in use.
1535  */
1536 
1537 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1538 {
1539 	struct packet_type *ptype;
1540 	struct sk_buff *skb2 = NULL;
1541 	struct packet_type *pt_prev = NULL;
1542 
1543 	rcu_read_lock();
1544 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1545 		/* Never send packets back to the socket
1546 		 * they originated from - MvS (miquels@drinkel.ow.org)
1547 		 */
1548 		if ((ptype->dev == dev || !ptype->dev) &&
1549 		    (ptype->af_packet_priv == NULL ||
1550 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1551 			if (pt_prev) {
1552 				deliver_skb(skb2, pt_prev, skb->dev);
1553 				pt_prev = ptype;
1554 				continue;
1555 			}
1556 
1557 			skb2 = skb_clone(skb, GFP_ATOMIC);
1558 			if (!skb2)
1559 				break;
1560 
1561 			net_timestamp_set(skb2);
1562 
1563 			/* skb->nh should be correctly
1564 			   set by sender, so that the second statement is
1565 			   just protection against buggy protocols.
1566 			 */
1567 			skb_reset_mac_header(skb2);
1568 
1569 			if (skb_network_header(skb2) < skb2->data ||
1570 			    skb2->network_header > skb2->tail) {
1571 				if (net_ratelimit())
1572 					printk(KERN_CRIT "protocol %04x is "
1573 					       "buggy, dev %s\n",
1574 					       ntohs(skb2->protocol),
1575 					       dev->name);
1576 				skb_reset_network_header(skb2);
1577 			}
1578 
1579 			skb2->transport_header = skb2->network_header;
1580 			skb2->pkt_type = PACKET_OUTGOING;
1581 			pt_prev = ptype;
1582 		}
1583 	}
1584 	if (pt_prev)
1585 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1586 	rcu_read_unlock();
1587 }
1588 
1589 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1590  * @dev: Network device
1591  * @txq: number of queues available
1592  *
1593  * If real_num_tx_queues is changed the tc mappings may no longer be
1594  * valid. To resolve this, verify that each tc mapping remains valid and,
1595  * if it does not, zero the mapping. With no priorities mapping to the
1596  * offset/count pair it will no longer be used. In the worst case TC0
1597  * itself is invalid and nothing can be done, so priority mappings are
1598  * disabled altogether. It is expected that drivers will fix this mapping
1599  * if they can before calling netif_set_real_num_tx_queues.
1600  */
1601 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1602 {
1603 	int i;
1604 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1605 
1606 	/* If TC0 is invalidated disable TC mapping */
1607 	if (tc->offset + tc->count > txq) {
1608 		pr_warning("Number of in use tx queues changed "
1609 			   "invalidating tc mappings. Priority "
1610 			   "traffic classification disabled!\n");
1611 		dev->num_tc = 0;
1612 		return;
1613 	}
1614 
1615 	/* Invalidated prio to tc mappings set to TC0 */
1616 	for (i = 1; i < TC_BITMASK + 1; i++) {
1617 		int q = netdev_get_prio_tc_map(dev, i);
1618 
1619 		tc = &dev->tc_to_txq[q];
1620 		if (tc->offset + tc->count > txq) {
1621 			pr_warning("Number of in use tx queues "
1622 				   "changed. Priority %i to tc "
1623 				   "mapping %i is no longer valid "
1624 				   "setting map to 0\n",
1625 				   i, q);
1626 			netdev_set_prio_tc_map(dev, i, 0);
1627 		}
1628 	}
1629 }
1630 
1631 /*
1632  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1633  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1634  */
1635 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1636 {
1637 	int rc;
1638 
1639 	if (txq < 1 || txq > dev->num_tx_queues)
1640 		return -EINVAL;
1641 
1642 	if (dev->reg_state == NETREG_REGISTERED ||
1643 	    dev->reg_state == NETREG_UNREGISTERING) {
1644 		ASSERT_RTNL();
1645 
1646 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1647 						  txq);
1648 		if (rc)
1649 			return rc;
1650 
1651 		if (dev->num_tc)
1652 			netif_setup_tc(dev, txq);
1653 
1654 		if (txq < dev->real_num_tx_queues)
1655 			qdisc_reset_all_tx_gt(dev, txq);
1656 	}
1657 
1658 	dev->real_num_tx_queues = txq;
1659 	return 0;
1660 }
1661 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1662 
1663 #ifdef CONFIG_RPS
1664 /**
1665  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1666  *	@dev: Network device
1667  *	@rxq: Actual number of RX queues
1668  *
1669  *	This must be called either with the rtnl_lock held or before
1670  *	registration of the net device.  Returns 0 on success, or a
1671  *	negative error code.  If called before registration, it always
1672  *	succeeds.
1673  */
1674 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1675 {
1676 	int rc;
1677 
1678 	if (rxq < 1 || rxq > dev->num_rx_queues)
1679 		return -EINVAL;
1680 
1681 	if (dev->reg_state == NETREG_REGISTERED) {
1682 		ASSERT_RTNL();
1683 
1684 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1685 						  rxq);
1686 		if (rc)
1687 			return rc;
1688 	}
1689 
1690 	dev->real_num_rx_queues = rxq;
1691 	return 0;
1692 }
1693 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1694 #endif
1695 
1696 static inline void __netif_reschedule(struct Qdisc *q)
1697 {
1698 	struct softnet_data *sd;
1699 	unsigned long flags;
1700 
1701 	local_irq_save(flags);
1702 	sd = &__get_cpu_var(softnet_data);
1703 	q->next_sched = NULL;
1704 	*sd->output_queue_tailp = q;
1705 	sd->output_queue_tailp = &q->next_sched;
1706 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1707 	local_irq_restore(flags);
1708 }
1709 
1710 void __netif_schedule(struct Qdisc *q)
1711 {
1712 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1713 		__netif_reschedule(q);
1714 }
1715 EXPORT_SYMBOL(__netif_schedule);
1716 
1717 void dev_kfree_skb_irq(struct sk_buff *skb)
1718 {
1719 	if (atomic_dec_and_test(&skb->users)) {
1720 		struct softnet_data *sd;
1721 		unsigned long flags;
1722 
1723 		local_irq_save(flags);
1724 		sd = &__get_cpu_var(softnet_data);
1725 		skb->next = sd->completion_queue;
1726 		sd->completion_queue = skb;
1727 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1728 		local_irq_restore(flags);
1729 	}
1730 }
1731 EXPORT_SYMBOL(dev_kfree_skb_irq);
1732 
1733 void dev_kfree_skb_any(struct sk_buff *skb)
1734 {
1735 	if (in_irq() || irqs_disabled())
1736 		dev_kfree_skb_irq(skb);
1737 	else
1738 		dev_kfree_skb(skb);
1739 }
1740 EXPORT_SYMBOL(dev_kfree_skb_any);
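/*
 * Example (illustrative sketch, not taken from this file): which free to use
 * in which context.  The "example_*" names are made up.
 */
#if 0
/* From a TX-completion hard interrupt handler: */
static void example_tx_complete(struct sk_buff *skb)
{
	dev_kfree_skb_irq(skb);		/* defers the free to the NET_TX softirq */
}

/* From code that may run in either process or interrupt context: */
static void example_free(struct sk_buff *skb)
{
	dev_kfree_skb_any(skb);		/* picks _irq or plain dev_kfree_skb */
}
#endif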
1741 
1742 
1743 /**
1744  * netif_device_detach - mark device as removed
1745  * @dev: network device
1746  *
1747  * Mark device as removed from system and therefore no longer available.
1748  */
1749 void netif_device_detach(struct net_device *dev)
1750 {
1751 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1752 	    netif_running(dev)) {
1753 		netif_tx_stop_all_queues(dev);
1754 	}
1755 }
1756 EXPORT_SYMBOL(netif_device_detach);
1757 
1758 /**
1759  * netif_device_attach - mark device as attached
1760  * @dev: network device
1761  *
1762  * Mark device as attached to the system and restart it if needed.
1763  */
1764 void netif_device_attach(struct net_device *dev)
1765 {
1766 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1767 	    netif_running(dev)) {
1768 		netif_tx_wake_all_queues(dev);
1769 		__netdev_watchdog_up(dev);
1770 	}
1771 }
1772 EXPORT_SYMBOL(netif_device_attach);
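/*
 * Example (illustrative sketch, not taken from this file): the usual
 * detach/attach pairing in a PCI driver's legacy suspend/resume hooks.
 * The "example_*" names are made up.
 */
#if 0
static int example_suspend(struct pci_dev *pdev, pm_message_t state)
{
	struct net_device *dev = pci_get_drvdata(pdev);

	netif_device_detach(dev);	/* stops all TX queues if running */
	/* ... quiesce hardware, save state ... */
	return 0;
}

static int example_resume(struct pci_dev *pdev)
{
	struct net_device *dev = pci_get_drvdata(pdev);

	/* ... restore hardware state ... */
	netif_device_attach(dev);	/* wakes queues, rearms watchdog */
	return 0;
}
#endif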
1773 
1774 /**
1775  * skb_set_dev - assign a new device to a buffer
1776  * @skb: buffer for the new device
1777  * @dev: network device
1778  *
1779  * If an skb is owned by a device already, we have to reset
1780  * all data private to the namespace a device belongs to
1781  * before assigning it a new device.
1782  */
1783 #ifdef CONFIG_NET_NS
1784 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1785 {
1786 	skb_dst_drop(skb);
1787 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1788 		secpath_reset(skb);
1789 		nf_reset(skb);
1790 		skb_init_secmark(skb);
1791 		skb->mark = 0;
1792 		skb->priority = 0;
1793 		skb->nf_trace = 0;
1794 		skb->ipvs_property = 0;
1795 #ifdef CONFIG_NET_SCHED
1796 		skb->tc_index = 0;
1797 #endif
1798 	}
1799 	skb->dev = dev;
1800 }
1801 EXPORT_SYMBOL(skb_set_dev);
1802 #endif /* CONFIG_NET_NS */
1803 
1804 /*
1805  * Invalidate hardware checksum when packet is to be mangled, and
1806  * complete checksum manually on outgoing path.
1807  */
1808 int skb_checksum_help(struct sk_buff *skb)
1809 {
1810 	__wsum csum;
1811 	int ret = 0, offset;
1812 
1813 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1814 		goto out_set_summed;
1815 
1816 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1817 		/* Let GSO fix up the checksum. */
1818 		goto out_set_summed;
1819 	}
1820 
1821 	offset = skb_checksum_start_offset(skb);
1822 	BUG_ON(offset >= skb_headlen(skb));
1823 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1824 
1825 	offset += skb->csum_offset;
1826 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1827 
1828 	if (skb_cloned(skb) &&
1829 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1830 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1831 		if (ret)
1832 			goto out;
1833 	}
1834 
1835 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1836 out_set_summed:
1837 	skb->ip_summed = CHECKSUM_NONE;
1838 out:
1839 	return ret;
1840 }
1841 EXPORT_SYMBOL(skb_checksum_help);
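/*
 * Example (illustrative sketch, not taken from this file): completing the
 * checksum in software before handing a CHECKSUM_PARTIAL skb to hardware
 * that cannot offload it.  "example_fixup_csum" is a made-up helper name.
 */
#if 0
static int example_fixup_csum(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
		return -EIO;	/* could not complete the checksum */
	return 0;
}
#endif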
1842 
1843 /**
1844  *	skb_gso_segment - Perform segmentation on skb.
1845  *	@skb: buffer to segment
1846  *	@features: features for the output path (see dev->features)
1847  *
1848  *	This function segments the given skb and returns a list of segments.
1849  *
1850  *	It may return NULL if the skb requires no segmentation.  This is
1851  *	only possible when GSO is used for verifying header integrity.
1852  */
1853 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1854 {
1855 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1856 	struct packet_type *ptype;
1857 	__be16 type = skb->protocol;
1858 	int vlan_depth = ETH_HLEN;
1859 	int err;
1860 
1861 	while (type == htons(ETH_P_8021Q)) {
1862 		struct vlan_hdr *vh;
1863 
1864 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1865 			return ERR_PTR(-EINVAL);
1866 
1867 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1868 		type = vh->h_vlan_encapsulated_proto;
1869 		vlan_depth += VLAN_HLEN;
1870 	}
1871 
1872 	skb_reset_mac_header(skb);
1873 	skb->mac_len = skb->network_header - skb->mac_header;
1874 	__skb_pull(skb, skb->mac_len);
1875 
1876 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1877 		struct net_device *dev = skb->dev;
1878 		struct ethtool_drvinfo info = {};
1879 
1880 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1881 			dev->ethtool_ops->get_drvinfo(dev, &info);
1882 
1883 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1884 		     info.driver, dev ? dev->features : 0L,
1885 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1886 		     skb->len, skb->data_len, skb->ip_summed);
1887 
1888 		if (skb_header_cloned(skb) &&
1889 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1890 			return ERR_PTR(err);
1891 	}
1892 
1893 	rcu_read_lock();
1894 	list_for_each_entry_rcu(ptype,
1895 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1896 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1897 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1898 				err = ptype->gso_send_check(skb);
1899 				segs = ERR_PTR(err);
1900 				if (err || skb_gso_ok(skb, features))
1901 					break;
1902 				__skb_push(skb, (skb->data -
1903 						 skb_network_header(skb)));
1904 			}
1905 			segs = ptype->gso_segment(skb, features);
1906 			break;
1907 		}
1908 	}
1909 	rcu_read_unlock();
1910 
1911 	__skb_push(skb, skb->data - skb_mac_header(skb));
1912 
1913 	return segs;
1914 }
1915 EXPORT_SYMBOL(skb_gso_segment);
1916 
1917 /* Take action when hardware reception checksum errors are detected. */
1918 #ifdef CONFIG_BUG
1919 void netdev_rx_csum_fault(struct net_device *dev)
1920 {
1921 	if (net_ratelimit()) {
1922 		printk(KERN_ERR "%s: hw csum failure.\n",
1923 			dev ? dev->name : "<unknown>");
1924 		dump_stack();
1925 	}
1926 }
1927 EXPORT_SYMBOL(netdev_rx_csum_fault);
1928 #endif
1929 
1930 /* Actually, we should eliminate this check as soon as we know that:
1931  * 1. An IOMMU is present and allows us to map all the memory.
1932  * 2. No high memory really exists on this machine.
1933  */
1934 
1935 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1936 {
1937 #ifdef CONFIG_HIGHMEM
1938 	int i;
1939 	if (!(dev->features & NETIF_F_HIGHDMA)) {
1940 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1941 			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1942 				return 1;
1943 	}
1944 
1945 	if (PCI_DMA_BUS_IS_PHYS) {
1946 		struct device *pdev = dev->dev.parent;
1947 
1948 		if (!pdev)
1949 			return 0;
1950 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1951 			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1952 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1953 				return 1;
1954 		}
1955 	}
1956 #endif
1957 	return 0;
1958 }
1959 
1960 struct dev_gso_cb {
1961 	void (*destructor)(struct sk_buff *skb);
1962 };
1963 
1964 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1965 
1966 static void dev_gso_skb_destructor(struct sk_buff *skb)
1967 {
1968 	struct dev_gso_cb *cb;
1969 
1970 	do {
1971 		struct sk_buff *nskb = skb->next;
1972 
1973 		skb->next = nskb->next;
1974 		nskb->next = NULL;
1975 		kfree_skb(nskb);
1976 	} while (skb->next);
1977 
1978 	cb = DEV_GSO_CB(skb);
1979 	if (cb->destructor)
1980 		cb->destructor(skb);
1981 }
1982 
1983 /**
1984  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1985  *	@skb: buffer to segment
1986  *	@features: device features as applicable to this skb
1987  *
1988  *	This function segments the given skb and stores the list of segments
1989  *	in skb->next.
1990  */
1991 static int dev_gso_segment(struct sk_buff *skb, int features)
1992 {
1993 	struct sk_buff *segs;
1994 
1995 	segs = skb_gso_segment(skb, features);
1996 
1997 	/* Verifying header integrity only. */
1998 	if (!segs)
1999 		return 0;
2000 
2001 	if (IS_ERR(segs))
2002 		return PTR_ERR(segs);
2003 
2004 	skb->next = segs;
2005 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2006 	skb->destructor = dev_gso_skb_destructor;
2007 
2008 	return 0;
2009 }
2010 
2011 /*
2012  * Try to orphan skb early, right before transmission by the device.
2013  * We cannot orphan the skb if a tx timestamp is requested or the sk reference
2014  * is needed at the driver level for other reasons, e.g. see net/can/raw.c
2015  */
2016 static inline void skb_orphan_try(struct sk_buff *skb)
2017 {
2018 	struct sock *sk = skb->sk;
2019 
2020 	if (sk && !skb_shinfo(skb)->tx_flags) {
2021 		/* skb_tx_hash() won't be able to get the sk.
2022 		 * We copy sk_hash into skb->rxhash
2023 		 */
2024 		if (!skb->rxhash)
2025 			skb->rxhash = sk->sk_hash;
2026 		skb_orphan(skb);
2027 	}
2028 }
2029 
2030 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2031 {
2032 	return ((features & NETIF_F_GEN_CSUM) ||
2033 		((features & NETIF_F_V4_CSUM) &&
2034 		 protocol == htons(ETH_P_IP)) ||
2035 		((features & NETIF_F_V6_CSUM) &&
2036 		 protocol == htons(ETH_P_IPV6)) ||
2037 		((features & NETIF_F_FCOE_CRC) &&
2038 		 protocol == htons(ETH_P_FCOE)));
2039 }
2040 
2041 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2042 {
2043 	if (!can_checksum_protocol(features, protocol)) {
2044 		features &= ~NETIF_F_ALL_CSUM;
2045 		features &= ~NETIF_F_SG;
2046 	} else if (illegal_highdma(skb->dev, skb)) {
2047 		features &= ~NETIF_F_SG;
2048 	}
2049 
2050 	return features;
2051 }
2052 
2053 u32 netif_skb_features(struct sk_buff *skb)
2054 {
2055 	__be16 protocol = skb->protocol;
2056 	u32 features = skb->dev->features;
2057 
2058 	if (protocol == htons(ETH_P_8021Q)) {
2059 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2060 		protocol = veh->h_vlan_encapsulated_proto;
2061 	} else if (!vlan_tx_tag_present(skb)) {
2062 		return harmonize_features(skb, protocol, features);
2063 	}
2064 
2065 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2066 
2067 	if (protocol != htons(ETH_P_8021Q)) {
2068 		return harmonize_features(skb, protocol, features);
2069 	} else {
2070 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2071 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2072 		return harmonize_features(skb, protocol, features);
2073 	}
2074 }
2075 EXPORT_SYMBOL(netif_skb_features);
2076 
2077 /*
2078  * Returns true if either:
2079  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2080  *	2. skb is fragmented and the device does not support SG, or if
2081  *	   at least one of the fragments is in highmem and the device does not
2082  *	   support DMA from it.
2083  */
2084 static inline int skb_needs_linearize(struct sk_buff *skb,
2085 				      int features)
2086 {
2087 	return skb_is_nonlinear(skb) &&
2088 			((skb_has_frag_list(skb) &&
2089 				!(features & NETIF_F_FRAGLIST)) ||
2090 			(skb_shinfo(skb)->nr_frags &&
2091 				!(features & NETIF_F_SG)));
2092 }
2093 
2094 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2095 			struct netdev_queue *txq)
2096 {
2097 	const struct net_device_ops *ops = dev->netdev_ops;
2098 	int rc = NETDEV_TX_OK;
2099 
2100 	if (likely(!skb->next)) {
2101 		u32 features;
2102 
2103 		/*
2104 		 * If the device doesn't need skb->dst, release it right now while
2105 		 * it's hot in this CPU's cache.
2106 		 */
2107 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2108 			skb_dst_drop(skb);
2109 
2110 		if (!list_empty(&ptype_all))
2111 			dev_queue_xmit_nit(skb, dev);
2112 
2113 		skb_orphan_try(skb);
2114 
2115 		features = netif_skb_features(skb);
2116 
2117 		if (vlan_tx_tag_present(skb) &&
2118 		    !(features & NETIF_F_HW_VLAN_TX)) {
2119 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2120 			if (unlikely(!skb))
2121 				goto out;
2122 
2123 			skb->vlan_tci = 0;
2124 		}
2125 
2126 		if (netif_needs_gso(skb, features)) {
2127 			if (unlikely(dev_gso_segment(skb, features)))
2128 				goto out_kfree_skb;
2129 			if (skb->next)
2130 				goto gso;
2131 		} else {
2132 			if (skb_needs_linearize(skb, features) &&
2133 			    __skb_linearize(skb))
2134 				goto out_kfree_skb;
2135 
2136 			/* If packet is not checksummed and device does not
2137 			 * support checksumming for this protocol, complete
2138 			 * checksumming here.
2139 			 */
2140 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2141 				skb_set_transport_header(skb,
2142 					skb_checksum_start_offset(skb));
2143 				if (!(features & NETIF_F_ALL_CSUM) &&
2144 				     skb_checksum_help(skb))
2145 					goto out_kfree_skb;
2146 			}
2147 		}
2148 
2149 		rc = ops->ndo_start_xmit(skb, dev);
2150 		trace_net_dev_xmit(skb, rc);
2151 		if (rc == NETDEV_TX_OK)
2152 			txq_trans_update(txq);
2153 		return rc;
2154 	}
2155 
2156 gso:
2157 	do {
2158 		struct sk_buff *nskb = skb->next;
2159 
2160 		skb->next = nskb->next;
2161 		nskb->next = NULL;
2162 
2163 		/*
2164 		 * If the device doesn't need nskb->dst, release it right now while
2165 		 * it's hot in this CPU's cache.
2166 		 */
2167 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2168 			skb_dst_drop(nskb);
2169 
2170 		rc = ops->ndo_start_xmit(nskb, dev);
2171 		trace_net_dev_xmit(nskb, rc);
2172 		if (unlikely(rc != NETDEV_TX_OK)) {
2173 			if (rc & ~NETDEV_TX_MASK)
2174 				goto out_kfree_gso_skb;
2175 			nskb->next = skb->next;
2176 			skb->next = nskb;
2177 			return rc;
2178 		}
2179 		txq_trans_update(txq);
2180 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2181 			return NETDEV_TX_BUSY;
2182 	} while (skb->next);
2183 
2184 out_kfree_gso_skb:
2185 	if (likely(skb->next == NULL))
2186 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2187 out_kfree_skb:
2188 	kfree_skb(skb);
2189 out:
2190 	return rc;
2191 }
2192 
2193 static u32 hashrnd __read_mostly;
2194 
2195 /*
2196  * Returns a Tx hash based on the given packet descriptor and the number of
2197  * Tx queues to be used as a distribution range.
2198  */
2199 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2200 		  unsigned int num_tx_queues)
2201 {
2202 	u32 hash;
2203 	u16 qoffset = 0;
2204 	u16 qcount = num_tx_queues;
2205 
2206 	if (skb_rx_queue_recorded(skb)) {
2207 		hash = skb_get_rx_queue(skb);
2208 		while (unlikely(hash >= num_tx_queues))
2209 			hash -= num_tx_queues;
2210 		return hash;
2211 	}
2212 
2213 	if (dev->num_tc) {
2214 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2215 		qoffset = dev->tc_to_txq[tc].offset;
2216 		qcount = dev->tc_to_txq[tc].count;
2217 	}
2218 
2219 	if (skb->sk && skb->sk->sk_hash)
2220 		hash = skb->sk->sk_hash;
2221 	else
2222 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2223 	hash = jhash_1word(hash, hashrnd);
2224 
2225 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2226 }
2227 EXPORT_SYMBOL(__skb_tx_hash);
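
/*
 * Example sketch (illustrative only, not kernel code): an ndo_select_queue()
 * implementation that pins a hypothetical control protocol to queue 0 and
 * spreads everything else with skb_tx_hash(), the wrapper that applies
 * __skb_tx_hash() over the device's real_num_tx_queues.
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb)
{
	if (skb->protocol == htons(ETH_P_PAUSE))
		return 0;

	return skb_tx_hash(dev, skb);
}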
2228 
2229 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2230 {
2231 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2232 		if (net_ratelimit()) {
2233 			pr_warning("%s selects TX queue %d, but "
2234 				"real number of TX queues is %d\n",
2235 				dev->name, queue_index, dev->real_num_tx_queues);
2236 		}
2237 		return 0;
2238 	}
2239 	return queue_index;
2240 }
2241 
2242 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2243 {
2244 #ifdef CONFIG_XPS
2245 	struct xps_dev_maps *dev_maps;
2246 	struct xps_map *map;
2247 	int queue_index = -1;
2248 
2249 	rcu_read_lock();
2250 	dev_maps = rcu_dereference(dev->xps_maps);
2251 	if (dev_maps) {
2252 		map = rcu_dereference(
2253 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2254 		if (map) {
2255 			if (map->len == 1)
2256 				queue_index = map->queues[0];
2257 			else {
2258 				u32 hash;
2259 				if (skb->sk && skb->sk->sk_hash)
2260 					hash = skb->sk->sk_hash;
2261 				else
2262 					hash = (__force u16) skb->protocol ^
2263 					    skb->rxhash;
2264 				hash = jhash_1word(hash, hashrnd);
2265 				queue_index = map->queues[
2266 				    ((u64)hash * map->len) >> 32];
2267 			}
2268 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2269 				queue_index = -1;
2270 		}
2271 	}
2272 	rcu_read_unlock();
2273 
2274 	return queue_index;
2275 #else
2276 	return -1;
2277 #endif
2278 }
2279 
2280 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2281 					struct sk_buff *skb)
2282 {
2283 	int queue_index;
2284 	const struct net_device_ops *ops = dev->netdev_ops;
2285 
2286 	if (dev->real_num_tx_queues == 1)
2287 		queue_index = 0;
2288 	else if (ops->ndo_select_queue) {
2289 		queue_index = ops->ndo_select_queue(dev, skb);
2290 		queue_index = dev_cap_txqueue(dev, queue_index);
2291 	} else {
2292 		struct sock *sk = skb->sk;
2293 		queue_index = sk_tx_queue_get(sk);
2294 
2295 		if (queue_index < 0 || skb->ooo_okay ||
2296 		    queue_index >= dev->real_num_tx_queues) {
2297 			int old_index = queue_index;
2298 
2299 			queue_index = get_xps_queue(dev, skb);
2300 			if (queue_index < 0)
2301 				queue_index = skb_tx_hash(dev, skb);
2302 
2303 			if (queue_index != old_index && sk) {
2304 				struct dst_entry *dst =
2305 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2306 
2307 				if (dst && skb_dst(skb) == dst)
2308 					sk_tx_queue_set(sk, queue_index);
2309 			}
2310 		}
2311 	}
2312 
2313 	skb_set_queue_mapping(skb, queue_index);
2314 	return netdev_get_tx_queue(dev, queue_index);
2315 }
2316 
2317 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2318 				 struct net_device *dev,
2319 				 struct netdev_queue *txq)
2320 {
2321 	spinlock_t *root_lock = qdisc_lock(q);
2322 	bool contended;
2323 	int rc;
2324 
2325 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2326 	qdisc_calculate_pkt_len(skb, q);
2327 	/*
2328 	 * Heuristic to force contended enqueues to serialize on a
2329 	 * separate lock before trying to get the qdisc main lock.
2330 	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2331 	 * and dequeue packets faster.
2332 	 */
2333 	contended = qdisc_is_running(q);
2334 	if (unlikely(contended))
2335 		spin_lock(&q->busylock);
2336 
2337 	spin_lock(root_lock);
2338 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2339 		kfree_skb(skb);
2340 		rc = NET_XMIT_DROP;
2341 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2342 		   qdisc_run_begin(q)) {
2343 		/*
2344 		 * This is a work-conserving queue; there are no old skbs
2345 		 * waiting to be sent out; and the qdisc is not running -
2346 		 * xmit the skb directly.
2347 		 */
2348 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2349 			skb_dst_force(skb);
2350 
2351 		qdisc_bstats_update(q, skb);
2352 
2353 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2354 			if (unlikely(contended)) {
2355 				spin_unlock(&q->busylock);
2356 				contended = false;
2357 			}
2358 			__qdisc_run(q);
2359 		} else
2360 			qdisc_run_end(q);
2361 
2362 		rc = NET_XMIT_SUCCESS;
2363 	} else {
2364 		skb_dst_force(skb);
2365 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2366 		if (qdisc_run_begin(q)) {
2367 			if (unlikely(contended)) {
2368 				spin_unlock(&q->busylock);
2369 				contended = false;
2370 			}
2371 			__qdisc_run(q);
2372 		}
2373 	}
2374 	spin_unlock(root_lock);
2375 	if (unlikely(contended))
2376 		spin_unlock(&q->busylock);
2377 	return rc;
2378 }
2379 
2380 static DEFINE_PER_CPU(int, xmit_recursion);
2381 #define RECURSION_LIMIT 10
2382 
2383 /**
2384  *	dev_queue_xmit - transmit a buffer
2385  *	@skb: buffer to transmit
2386  *
2387  *	Queue a buffer for transmission to a network device. The caller must
2388  *	have set the device and priority and built the buffer before calling
2389  *	this function. The function can be called from an interrupt.
2390  *
2391  *	A negative errno code is returned on a failure. A success does not
2392  *	guarantee the frame will be transmitted as it may be dropped due
2393  *	to congestion or traffic shaping.
2394  *
2395  * -----------------------------------------------------------------------------------
2396  *      I notice this method can also return errors from the queue disciplines,
2397  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2398  *      be positive.
2399  *
2400  *      Regardless of the return value, the skb is consumed, so it is currently
2401  *      difficult to retry a send to this method.  (You can bump the ref count
2402  *      before sending to hold a reference for retry if you are careful.)
2403  *
2404  *      When calling this method, interrupts MUST be enabled.  This is because
2405  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2406  *          --BLG
2407  */
2408 int dev_queue_xmit(struct sk_buff *skb)
2409 {
2410 	struct net_device *dev = skb->dev;
2411 	struct netdev_queue *txq;
2412 	struct Qdisc *q;
2413 	int rc = -ENOMEM;
2414 
2415 	/* Disable soft irqs for various locks below. Also
2416 	 * stops preemption for RCU.
2417 	 */
2418 	rcu_read_lock_bh();
2419 
2420 	txq = dev_pick_tx(dev, skb);
2421 	q = rcu_dereference_bh(txq->qdisc);
2422 
2423 #ifdef CONFIG_NET_CLS_ACT
2424 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2425 #endif
2426 	trace_net_dev_queue(skb);
2427 	if (q->enqueue) {
2428 		rc = __dev_xmit_skb(skb, q, dev, txq);
2429 		goto out;
2430 	}
2431 
2432 	/* The device has no queue. Common case for software devices:
2433 	   loopback, all sorts of tunnels...
2434 
2435 	   Really, it is unlikely that netif_tx_lock protection is necessary
2436 	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2437 	   counters.)
2438 	   However, it is possible that they rely on the protection
2439 	   we provide here.
2440 
2441 	   Check this and take the lock; it is not prone to deadlocks.
2442 	   Alternatively, use the noqueue qdisc, which is even simpler 8)
2443 	 */
2444 	if (dev->flags & IFF_UP) {
2445 		int cpu = smp_processor_id(); /* ok because BHs are off */
2446 
2447 		if (txq->xmit_lock_owner != cpu) {
2448 
2449 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2450 				goto recursion_alert;
2451 
2452 			HARD_TX_LOCK(dev, txq, cpu);
2453 
2454 			if (!netif_tx_queue_stopped(txq)) {
2455 				__this_cpu_inc(xmit_recursion);
2456 				rc = dev_hard_start_xmit(skb, dev, txq);
2457 				__this_cpu_dec(xmit_recursion);
2458 				if (dev_xmit_complete(rc)) {
2459 					HARD_TX_UNLOCK(dev, txq);
2460 					goto out;
2461 				}
2462 			}
2463 			HARD_TX_UNLOCK(dev, txq);
2464 			if (net_ratelimit())
2465 				printk(KERN_CRIT "Virtual device %s asks to "
2466 				       "queue packet!\n", dev->name);
2467 		} else {
2468 			/* Recursion is detected! It is possible,
2469 			/* Recursion detected! It is possible,
2470 			 * unfortunately. */
2471 recursion_alert:
2472 			if (net_ratelimit())
2473 				printk(KERN_CRIT "Dead loop on virtual device "
2474 				       "%s, fix it urgently!\n", dev->name);
2475 		}
2476 	}
2477 
2478 	rc = -ENETDOWN;
2479 	rcu_read_unlock_bh();
2480 
2481 	kfree_skb(skb);
2482 	return rc;
2483 out:
2484 	rcu_read_unlock_bh();
2485 	return rc;
2486 }
2487 EXPORT_SYMBOL(dev_queue_xmit);
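
/*
 * Example sketch (illustrative only, not kernel code): how a protocol or test
 * module typically hands a fully built frame to dev_queue_xmit().  The payload
 * and protocol choice here are arbitrary placeholders; only the skb handling
 * pattern is the point.
 */
static int example_xmit_frame(struct net_device *dev, const u8 *dst_mac,
			      const void *payload, unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));	/* headroom for the link-layer header */
	skb_reset_network_header(skb);
	memcpy(skb_put(skb, len), payload, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);
	if (dev_hard_header(skb, dev, ETH_P_IP, dst_mac, NULL, skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* The skb is consumed whatever the return value is. */
	return dev_queue_xmit(skb);
}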
2488 
2489 
2490 /*=======================================================================
2491 			Receiver routines
2492   =======================================================================*/
2493 
2494 int netdev_max_backlog __read_mostly = 1000;
2495 int netdev_tstamp_prequeue __read_mostly = 1;
2496 int netdev_budget __read_mostly = 300;
2497 int weight_p __read_mostly = 64;            /* old backlog weight */
2498 
2499 /* Called with irq disabled */
2500 static inline void ____napi_schedule(struct softnet_data *sd,
2501 				     struct napi_struct *napi)
2502 {
2503 	list_add_tail(&napi->poll_list, &sd->poll_list);
2504 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2505 }
2506 
2507 /*
2508  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2509  * and src/dst port numbers. Returns a non-zero hash number on success
2510  * and 0 on failure.
2511  */
2512 __u32 __skb_get_rxhash(struct sk_buff *skb)
2513 {
2514 	int nhoff, hash = 0, poff;
2515 	const struct ipv6hdr *ip6;
2516 	const struct iphdr *ip;
2517 	u8 ip_proto;
2518 	u32 addr1, addr2, ihl;
2519 	union {
2520 		u32 v32;
2521 		u16 v16[2];
2522 	} ports;
2523 
2524 	nhoff = skb_network_offset(skb);
2525 
2526 	switch (skb->protocol) {
2527 	case __constant_htons(ETH_P_IP):
2528 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2529 			goto done;
2530 
2531 		ip = (const struct iphdr *) (skb->data + nhoff);
2532 		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2533 			ip_proto = 0;
2534 		else
2535 			ip_proto = ip->protocol;
2536 		addr1 = (__force u32) ip->saddr;
2537 		addr2 = (__force u32) ip->daddr;
2538 		ihl = ip->ihl;
2539 		break;
2540 	case __constant_htons(ETH_P_IPV6):
2541 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2542 			goto done;
2543 
2544 		ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2545 		ip_proto = ip6->nexthdr;
2546 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2547 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2548 		ihl = (40 >> 2);
2549 		break;
2550 	default:
2551 		goto done;
2552 	}
2553 
2554 	ports.v32 = 0;
2555 	poff = proto_ports_offset(ip_proto);
2556 	if (poff >= 0) {
2557 		nhoff += ihl * 4 + poff;
2558 		if (pskb_may_pull(skb, nhoff + 4)) {
2559 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2560 			if (ports.v16[1] < ports.v16[0])
2561 				swap(ports.v16[0], ports.v16[1]);
2562 		}
2563 	}
2564 
2565 	/* get a consistent hash (same value on both flow directions) */
2566 	if (addr2 < addr1)
2567 		swap(addr1, addr2);
2568 
2569 	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2570 	if (!hash)
2571 		hash = 1;
2572 
2573 done:
2574 	return hash;
2575 }
2576 EXPORT_SYMBOL(__skb_get_rxhash);
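
/*
 * Example sketch (illustrative only, not kernel code): callers normally use
 * the skb_get_rxhash() wrapper, which only computes __skb_get_rxhash() when
 * no hash (e.g. from hardware RSS) is already stored in skb->rxhash.  The
 * helper name and the table mask are hypothetical.
 */
static u32 example_flow_bucket(struct sk_buff *skb, u32 table_mask)
{
	return skb_get_rxhash(skb) & table_mask;
}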
2577 
2578 #ifdef CONFIG_RPS
2579 
2580 /* One global table that all flow-based protocols share. */
2581 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2582 EXPORT_SYMBOL(rps_sock_flow_table);
2583 
2584 static struct rps_dev_flow *
2585 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2586 	    struct rps_dev_flow *rflow, u16 next_cpu)
2587 {
2588 	u16 tcpu;
2589 
2590 	tcpu = rflow->cpu = next_cpu;
2591 	if (tcpu != RPS_NO_CPU) {
2592 #ifdef CONFIG_RFS_ACCEL
2593 		struct netdev_rx_queue *rxqueue;
2594 		struct rps_dev_flow_table *flow_table;
2595 		struct rps_dev_flow *old_rflow;
2596 		u32 flow_id;
2597 		u16 rxq_index;
2598 		int rc;
2599 
2600 		/* Should we steer this flow to a different hardware queue? */
2601 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2602 		    !(dev->features & NETIF_F_NTUPLE))
2603 			goto out;
2604 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2605 		if (rxq_index == skb_get_rx_queue(skb))
2606 			goto out;
2607 
2608 		rxqueue = dev->_rx + rxq_index;
2609 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2610 		if (!flow_table)
2611 			goto out;
2612 		flow_id = skb->rxhash & flow_table->mask;
2613 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2614 							rxq_index, flow_id);
2615 		if (rc < 0)
2616 			goto out;
2617 		old_rflow = rflow;
2618 		rflow = &flow_table->flows[flow_id];
2619 		rflow->cpu = next_cpu;
2620 		rflow->filter = rc;
2621 		if (old_rflow->filter == rflow->filter)
2622 			old_rflow->filter = RPS_NO_FILTER;
2623 	out:
2624 #endif
2625 		rflow->last_qtail =
2626 			per_cpu(softnet_data, tcpu).input_queue_head;
2627 	}
2628 
2629 	return rflow;
2630 }
2631 
2632 /*
2633  * get_rps_cpu is called from netif_receive_skb and returns the target
2634  * CPU from the RPS map of the receiving queue for a given skb.
2635  * rcu_read_lock must be held on entry.
2636  */
2637 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2638 		       struct rps_dev_flow **rflowp)
2639 {
2640 	struct netdev_rx_queue *rxqueue;
2641 	struct rps_map *map;
2642 	struct rps_dev_flow_table *flow_table;
2643 	struct rps_sock_flow_table *sock_flow_table;
2644 	int cpu = -1;
2645 	u16 tcpu;
2646 
2647 	if (skb_rx_queue_recorded(skb)) {
2648 		u16 index = skb_get_rx_queue(skb);
2649 		if (unlikely(index >= dev->real_num_rx_queues)) {
2650 			WARN_ONCE(dev->real_num_rx_queues > 1,
2651 				  "%s received packet on queue %u, but number "
2652 				  "of RX queues is %u\n",
2653 				  dev->name, index, dev->real_num_rx_queues);
2654 			goto done;
2655 		}
2656 		rxqueue = dev->_rx + index;
2657 	} else
2658 		rxqueue = dev->_rx;
2659 
2660 	map = rcu_dereference(rxqueue->rps_map);
2661 	if (map) {
2662 		if (map->len == 1 &&
2663 		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2664 			tcpu = map->cpus[0];
2665 			if (cpu_online(tcpu))
2666 				cpu = tcpu;
2667 			goto done;
2668 		}
2669 	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2670 		goto done;
2671 	}
2672 
2673 	skb_reset_network_header(skb);
2674 	if (!skb_get_rxhash(skb))
2675 		goto done;
2676 
2677 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2678 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2679 	if (flow_table && sock_flow_table) {
2680 		u16 next_cpu;
2681 		struct rps_dev_flow *rflow;
2682 
2683 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2684 		tcpu = rflow->cpu;
2685 
2686 		next_cpu = sock_flow_table->ents[skb->rxhash &
2687 		    sock_flow_table->mask];
2688 
2689 		/*
2690 		 * If the desired CPU (where last recvmsg was done) is
2691 		 * different from current CPU (one in the rx-queue flow
2692 		 * table entry), switch if one of the following holds:
2693 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2694 		 *   - Current CPU is offline.
2695 		 *   - The current CPU's queue tail has advanced beyond the
2696 		 *     last packet that was enqueued using this table entry.
2697 		 *     This guarantees that all previous packets for the flow
2698 		 *     have been dequeued, thus preserving in order delivery.
2699 		 */
2700 		if (unlikely(tcpu != next_cpu) &&
2701 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2702 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2703 		      rflow->last_qtail)) >= 0))
2704 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2705 
2706 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2707 			*rflowp = rflow;
2708 			cpu = tcpu;
2709 			goto done;
2710 		}
2711 	}
2712 
2713 	if (map) {
2714 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2715 
2716 		if (cpu_online(tcpu)) {
2717 			cpu = tcpu;
2718 			goto done;
2719 		}
2720 	}
2721 
2722 done:
2723 	return cpu;
2724 }
2725 
2726 #ifdef CONFIG_RFS_ACCEL
2727 
2728 /**
2729  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2730  * @dev: Device on which the filter was set
2731  * @rxq_index: RX queue index
2732  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2733  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2734  *
2735  * Drivers that implement ndo_rx_flow_steer() should periodically call
2736  * this function for each installed filter and remove the filters for
2737  * which it returns %true.
2738  */
2739 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2740 			 u32 flow_id, u16 filter_id)
2741 {
2742 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2743 	struct rps_dev_flow_table *flow_table;
2744 	struct rps_dev_flow *rflow;
2745 	bool expire = true;
2746 	int cpu;
2747 
2748 	rcu_read_lock();
2749 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2750 	if (flow_table && flow_id <= flow_table->mask) {
2751 		rflow = &flow_table->flows[flow_id];
2752 		cpu = ACCESS_ONCE(rflow->cpu);
2753 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2754 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2755 			   rflow->last_qtail) <
2756 		     (int)(10 * flow_table->mask)))
2757 			expire = false;
2758 	}
2759 	rcu_read_unlock();
2760 	return expire;
2761 }
2762 EXPORT_SYMBOL(rps_may_expire_flow);
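
/*
 * Example sketch (illustrative only, not kernel code): the periodic scan the
 * comment above asks for.  The filter table layout is hypothetical; a real
 * driver would also remove the hardware filter before reusing the slot.
 */
struct example_rfs_filter {
	bool	in_use;
	u16	rxq_index;
	u16	filter_id;	/* value returned from ndo_rx_flow_steer() */
	u32	flow_id;
};

static void example_expire_rfs_filters(struct net_device *dev,
					struct example_rfs_filter *tbl,
					u16 n_filters)
{
	u16 i;

	for (i = 0; i < n_filters; i++) {
		if (!tbl[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id))
			tbl[i].in_use = false;
	}
}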
2763 
2764 #endif /* CONFIG_RFS_ACCEL */
2765 
2766 /* Called from hardirq (IPI) context */
2767 static void rps_trigger_softirq(void *data)
2768 {
2769 	struct softnet_data *sd = data;
2770 
2771 	____napi_schedule(sd, &sd->backlog);
2772 	sd->received_rps++;
2773 }
2774 
2775 #endif /* CONFIG_RPS */
2776 
2777 /*
2778  * Check whether this softnet_data structure belongs to another CPU.
2779  * If so, queue it on our IPI list and return 1;
2780  * otherwise return 0.
2781  */
2782 static int rps_ipi_queued(struct softnet_data *sd)
2783 {
2784 #ifdef CONFIG_RPS
2785 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2786 
2787 	if (sd != mysd) {
2788 		sd->rps_ipi_next = mysd->rps_ipi_list;
2789 		mysd->rps_ipi_list = sd;
2790 
2791 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2792 		return 1;
2793 	}
2794 #endif /* CONFIG_RPS */
2795 	return 0;
2796 }
2797 
2798 /*
2799  * enqueue_to_backlog is called to queue an skb on a per-CPU backlog
2800  * queue (which may be a remote CPU's queue).
2801  */
2802 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2803 			      unsigned int *qtail)
2804 {
2805 	struct softnet_data *sd;
2806 	unsigned long flags;
2807 
2808 	sd = &per_cpu(softnet_data, cpu);
2809 
2810 	local_irq_save(flags);
2811 
2812 	rps_lock(sd);
2813 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2814 		if (skb_queue_len(&sd->input_pkt_queue)) {
2815 enqueue:
2816 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2817 			input_queue_tail_incr_save(sd, qtail);
2818 			rps_unlock(sd);
2819 			local_irq_restore(flags);
2820 			return NET_RX_SUCCESS;
2821 		}
2822 
2823 		/* Schedule NAPI for backlog device
2824 		 * We can use a non-atomic operation since we own the queue lock.
2825 		 */
2826 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2827 			if (!rps_ipi_queued(sd))
2828 				____napi_schedule(sd, &sd->backlog);
2829 		}
2830 		goto enqueue;
2831 	}
2832 
2833 	sd->dropped++;
2834 	rps_unlock(sd);
2835 
2836 	local_irq_restore(flags);
2837 
2838 	atomic_long_inc(&skb->dev->rx_dropped);
2839 	kfree_skb(skb);
2840 	return NET_RX_DROP;
2841 }
2842 
2843 /**
2844  *	netif_rx	-	post buffer to the network code
2845  *	@skb: buffer to post
2846  *
2847  *	This function receives a packet from a device driver and queues it for
2848  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2849  *	may be dropped during processing for congestion control or by the
2850  *	protocol layers.
2851  *
2852  *	return values:
2853  *	NET_RX_SUCCESS	(no congestion)
2854  *	NET_RX_DROP     (packet was dropped)
2855  *
2856  */
2857 
2858 int netif_rx(struct sk_buff *skb)
2859 {
2860 	int ret;
2861 
2862 	/* if netpoll wants it, pretend we never saw it */
2863 	if (netpoll_rx(skb))
2864 		return NET_RX_DROP;
2865 
2866 	if (netdev_tstamp_prequeue)
2867 		net_timestamp_check(skb);
2868 
2869 	trace_netif_rx(skb);
2870 #ifdef CONFIG_RPS
2871 	{
2872 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2873 		int cpu;
2874 
2875 		preempt_disable();
2876 		rcu_read_lock();
2877 
2878 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2879 		if (cpu < 0)
2880 			cpu = smp_processor_id();
2881 
2882 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2883 
2884 		rcu_read_unlock();
2885 		preempt_enable();
2886 	}
2887 #else
2888 	{
2889 		unsigned int qtail;
2890 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2891 		put_cpu();
2892 	}
2893 #endif
2894 	return ret;
2895 }
2896 EXPORT_SYMBOL(netif_rx);
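
/*
 * Example sketch (illustrative only, not kernel code): a simple non-NAPI
 * receive path that copies a received frame into an skb and hands it to
 * netif_rx() from the device's interrupt handler.
 */
static void example_rx_frame(struct net_device *dev, const void *data,
			     unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev */
	netif_rx(skb);					/* queue on the per-CPU backlog */
}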
2897 
2898 int netif_rx_ni(struct sk_buff *skb)
2899 {
2900 	int err;
2901 
2902 	preempt_disable();
2903 	err = netif_rx(skb);
2904 	if (local_softirq_pending())
2905 		do_softirq();
2906 	preempt_enable();
2907 
2908 	return err;
2909 }
2910 EXPORT_SYMBOL(netif_rx_ni);
2911 
2912 static void net_tx_action(struct softirq_action *h)
2913 {
2914 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2915 
2916 	if (sd->completion_queue) {
2917 		struct sk_buff *clist;
2918 
2919 		local_irq_disable();
2920 		clist = sd->completion_queue;
2921 		sd->completion_queue = NULL;
2922 		local_irq_enable();
2923 
2924 		while (clist) {
2925 			struct sk_buff *skb = clist;
2926 			clist = clist->next;
2927 
2928 			WARN_ON(atomic_read(&skb->users));
2929 			trace_kfree_skb(skb, net_tx_action);
2930 			__kfree_skb(skb);
2931 		}
2932 	}
2933 
2934 	if (sd->output_queue) {
2935 		struct Qdisc *head;
2936 
2937 		local_irq_disable();
2938 		head = sd->output_queue;
2939 		sd->output_queue = NULL;
2940 		sd->output_queue_tailp = &sd->output_queue;
2941 		local_irq_enable();
2942 
2943 		while (head) {
2944 			struct Qdisc *q = head;
2945 			spinlock_t *root_lock;
2946 
2947 			head = head->next_sched;
2948 
2949 			root_lock = qdisc_lock(q);
2950 			if (spin_trylock(root_lock)) {
2951 				smp_mb__before_clear_bit();
2952 				clear_bit(__QDISC_STATE_SCHED,
2953 					  &q->state);
2954 				qdisc_run(q);
2955 				spin_unlock(root_lock);
2956 			} else {
2957 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2958 					      &q->state)) {
2959 					__netif_reschedule(q);
2960 				} else {
2961 					smp_mb__before_clear_bit();
2962 					clear_bit(__QDISC_STATE_SCHED,
2963 						  &q->state);
2964 				}
2965 			}
2966 		}
2967 	}
2968 }
2969 
2970 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2971     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2972 /* This hook is defined here for ATM LANE */
2973 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2974 			     unsigned char *addr) __read_mostly;
2975 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2976 #endif
2977 
2978 #ifdef CONFIG_NET_CLS_ACT
2979 /* TODO: Maybe we should just force sch_ingress to be compiled in
2980  * whenever CONFIG_NET_CLS_ACT is.  Otherwise we currently pay for a few
2981  * useless instructions (a compare and two extra stores) when it is not
2982  * enabled but CONFIG_NET_CLS_ACT is.
2983  * NOTE: This doesn't remove any functionality; if you don't have
2984  * the ingress scheduler, you just can't add policies on ingress.
2985  *
2986  */
2987 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2988 {
2989 	struct net_device *dev = skb->dev;
2990 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2991 	int result = TC_ACT_OK;
2992 	struct Qdisc *q;
2993 
2994 	if (unlikely(MAX_RED_LOOP < ttl++)) {
2995 		if (net_ratelimit())
2996 			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
2997 			       skb->skb_iif, dev->ifindex);
2998 		return TC_ACT_SHOT;
2999 	}
3000 
3001 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3002 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3003 
3004 	q = rxq->qdisc;
3005 	if (q != &noop_qdisc) {
3006 		spin_lock(qdisc_lock(q));
3007 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3008 			result = qdisc_enqueue_root(skb, q);
3009 		spin_unlock(qdisc_lock(q));
3010 	}
3011 
3012 	return result;
3013 }
3014 
3015 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3016 					 struct packet_type **pt_prev,
3017 					 int *ret, struct net_device *orig_dev)
3018 {
3019 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3020 
3021 	if (!rxq || rxq->qdisc == &noop_qdisc)
3022 		goto out;
3023 
3024 	if (*pt_prev) {
3025 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3026 		*pt_prev = NULL;
3027 	}
3028 
3029 	switch (ing_filter(skb, rxq)) {
3030 	case TC_ACT_SHOT:
3031 	case TC_ACT_STOLEN:
3032 		kfree_skb(skb);
3033 		return NULL;
3034 	}
3035 
3036 out:
3037 	skb->tc_verd = 0;
3038 	return skb;
3039 }
3040 #endif
3041 
3042 /**
3043  *	netdev_rx_handler_register - register receive handler
3044  *	@dev: device to register a handler for
3045  *	@rx_handler: receive handler to register
3046  *	@rx_handler_data: data pointer that is used by rx handler
3047  *
3048  *	Register a receive handler for a device. This handler will then be
3049  *	called from __netif_receive_skb. A negative errno code is returned
3050  *	on a failure.
3051  *
3052  *	The caller must hold the rtnl_mutex.
3053  *
3054  *	For a general description of rx_handler, see enum rx_handler_result.
3055  */
3056 int netdev_rx_handler_register(struct net_device *dev,
3057 			       rx_handler_func_t *rx_handler,
3058 			       void *rx_handler_data)
3059 {
3060 	ASSERT_RTNL();
3061 
3062 	if (dev->rx_handler)
3063 		return -EBUSY;
3064 
3065 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3066 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3067 
3068 	return 0;
3069 }
3070 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
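
/*
 * Example sketch (illustrative only, not kernel code): how a stacking driver
 * in the bridge/macvlan mould would attach an rx_handler to a lower device.
 * The port structure and attach helper are hypothetical; only the
 * netdev_rx_handler_register() call and the RTNL requirement are real.
 */
struct example_port {
	struct net_device *lower_dev;
};

static rx_handler_result_t example_rx_handler(struct sk_buff **pskb)
{
	/* A real handler would consume or redirect *pskb here. */
	return RX_HANDLER_PASS;
}

static int example_attach_port(struct example_port *port,
			       struct net_device *lower_dev)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(lower_dev, example_rx_handler, port);
	rtnl_unlock();
	if (!err)
		port->lower_dev = lower_dev;
	return err;
}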
3071 
3072 /**
3073  *	netdev_rx_handler_unregister - unregister receive handler
3074  *	@dev: device to unregister a handler from
3075  *
3076  *	Unregister a receive handler from a device.
3077  *
3078  *	The caller must hold the rtnl_mutex.
3079  */
3080 void netdev_rx_handler_unregister(struct net_device *dev)
3081 {
3082 
3083 	ASSERT_RTNL();
3084 	rcu_assign_pointer(dev->rx_handler, NULL);
3085 	rcu_assign_pointer(dev->rx_handler_data, NULL);
3086 }
3087 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3088 
3089 static int __netif_receive_skb(struct sk_buff *skb)
3090 {
3091 	struct packet_type *ptype, *pt_prev;
3092 	rx_handler_func_t *rx_handler;
3093 	struct net_device *orig_dev;
3094 	struct net_device *null_or_dev;
3095 	bool deliver_exact = false;
3096 	int ret = NET_RX_DROP;
3097 	__be16 type;
3098 
3099 	if (!netdev_tstamp_prequeue)
3100 		net_timestamp_check(skb);
3101 
3102 	trace_netif_receive_skb(skb);
3103 
3104 	/* if we've gotten here through NAPI, check netpoll */
3105 	if (netpoll_receive_skb(skb))
3106 		return NET_RX_DROP;
3107 
3108 	if (!skb->skb_iif)
3109 		skb->skb_iif = skb->dev->ifindex;
3110 	orig_dev = skb->dev;
3111 
3112 	skb_reset_network_header(skb);
3113 	skb_reset_transport_header(skb);
3114 	skb->mac_len = skb->network_header - skb->mac_header;
3115 
3116 	pt_prev = NULL;
3117 
3118 	rcu_read_lock();
3119 
3120 another_round:
3121 
3122 	__this_cpu_inc(softnet_data.processed);
3123 
3124 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3125 		skb = vlan_untag(skb);
3126 		if (unlikely(!skb))
3127 			goto out;
3128 	}
3129 
3130 #ifdef CONFIG_NET_CLS_ACT
3131 	if (skb->tc_verd & TC_NCLS) {
3132 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3133 		goto ncls;
3134 	}
3135 #endif
3136 
3137 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3138 		if (!ptype->dev || ptype->dev == skb->dev) {
3139 			if (pt_prev)
3140 				ret = deliver_skb(skb, pt_prev, orig_dev);
3141 			pt_prev = ptype;
3142 		}
3143 	}
3144 
3145 #ifdef CONFIG_NET_CLS_ACT
3146 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3147 	if (!skb)
3148 		goto out;
3149 ncls:
3150 #endif
3151 
3152 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3153 	if (rx_handler) {
3154 		if (pt_prev) {
3155 			ret = deliver_skb(skb, pt_prev, orig_dev);
3156 			pt_prev = NULL;
3157 		}
3158 		switch (rx_handler(&skb)) {
3159 		case RX_HANDLER_CONSUMED:
3160 			goto out;
3161 		case RX_HANDLER_ANOTHER:
3162 			goto another_round;
3163 		case RX_HANDLER_EXACT:
3164 			deliver_exact = true;
3165 		case RX_HANDLER_PASS:
3166 			break;
3167 		default:
3168 			BUG();
3169 		}
3170 	}
3171 
3172 	if (vlan_tx_tag_present(skb)) {
3173 		if (pt_prev) {
3174 			ret = deliver_skb(skb, pt_prev, orig_dev);
3175 			pt_prev = NULL;
3176 		}
3177 		if (vlan_do_receive(&skb)) {
3178 			ret = __netif_receive_skb(skb);
3179 			goto out;
3180 		} else if (unlikely(!skb))
3181 			goto out;
3182 	}
3183 
3184 	/* deliver only exact match when indicated */
3185 	null_or_dev = deliver_exact ? skb->dev : NULL;
3186 
3187 	type = skb->protocol;
3188 	list_for_each_entry_rcu(ptype,
3189 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3190 		if (ptype->type == type &&
3191 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3192 		     ptype->dev == orig_dev)) {
3193 			if (pt_prev)
3194 				ret = deliver_skb(skb, pt_prev, orig_dev);
3195 			pt_prev = ptype;
3196 		}
3197 	}
3198 
3199 	if (pt_prev) {
3200 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3201 	} else {
3202 		atomic_long_inc(&skb->dev->rx_dropped);
3203 		kfree_skb(skb);
3204 		/* Jamal, now you will not be able to escape explaining
3205 		 * to me how you were going to use this. :-)
3206 		 */
3207 		ret = NET_RX_DROP;
3208 	}
3209 
3210 out:
3211 	rcu_read_unlock();
3212 	return ret;
3213 }
3214 
3215 /**
3216  *	netif_receive_skb - process receive buffer from network
3217  *	@skb: buffer to process
3218  *
3219  *	netif_receive_skb() is the main receive data processing function.
3220  *	It always succeeds. The buffer may be dropped during processing
3221  *	for congestion control or by the protocol layers.
3222  *
3223  *	This function may only be called from softirq context and interrupts
3224  *	should be enabled.
3225  *
3226  *	Return values (usually ignored):
3227  *	NET_RX_SUCCESS: no congestion
3228  *	NET_RX_DROP: packet was dropped
3229  */
3230 int netif_receive_skb(struct sk_buff *skb)
3231 {
3232 	if (netdev_tstamp_prequeue)
3233 		net_timestamp_check(skb);
3234 
3235 	if (skb_defer_rx_timestamp(skb))
3236 		return NET_RX_SUCCESS;
3237 
3238 #ifdef CONFIG_RPS
3239 	{
3240 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3241 		int cpu, ret;
3242 
3243 		rcu_read_lock();
3244 
3245 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3246 
3247 		if (cpu >= 0) {
3248 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3249 			rcu_read_unlock();
3250 		} else {
3251 			rcu_read_unlock();
3252 			ret = __netif_receive_skb(skb);
3253 		}
3254 
3255 		return ret;
3256 	}
3257 #else
3258 	return __netif_receive_skb(skb);
3259 #endif
3260 }
3261 EXPORT_SYMBOL(netif_receive_skb);
3262 
3263 /* Network device is going away; flush any packets still pending.
3264  * Called with irqs disabled.
3265  */
3266 static void flush_backlog(void *arg)
3267 {
3268 	struct net_device *dev = arg;
3269 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3270 	struct sk_buff *skb, *tmp;
3271 
3272 	rps_lock(sd);
3273 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3274 		if (skb->dev == dev) {
3275 			__skb_unlink(skb, &sd->input_pkt_queue);
3276 			kfree_skb(skb);
3277 			input_queue_head_incr(sd);
3278 		}
3279 	}
3280 	rps_unlock(sd);
3281 
3282 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3283 		if (skb->dev == dev) {
3284 			__skb_unlink(skb, &sd->process_queue);
3285 			kfree_skb(skb);
3286 			input_queue_head_incr(sd);
3287 		}
3288 	}
3289 }
3290 
3291 static int napi_gro_complete(struct sk_buff *skb)
3292 {
3293 	struct packet_type *ptype;
3294 	__be16 type = skb->protocol;
3295 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3296 	int err = -ENOENT;
3297 
3298 	if (NAPI_GRO_CB(skb)->count == 1) {
3299 		skb_shinfo(skb)->gso_size = 0;
3300 		goto out;
3301 	}
3302 
3303 	rcu_read_lock();
3304 	list_for_each_entry_rcu(ptype, head, list) {
3305 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3306 			continue;
3307 
3308 		err = ptype->gro_complete(skb);
3309 		break;
3310 	}
3311 	rcu_read_unlock();
3312 
3313 	if (err) {
3314 		WARN_ON(&ptype->list == head);
3315 		kfree_skb(skb);
3316 		return NET_RX_SUCCESS;
3317 	}
3318 
3319 out:
3320 	return netif_receive_skb(skb);
3321 }
3322 
3323 inline void napi_gro_flush(struct napi_struct *napi)
3324 {
3325 	struct sk_buff *skb, *next;
3326 
3327 	for (skb = napi->gro_list; skb; skb = next) {
3328 		next = skb->next;
3329 		skb->next = NULL;
3330 		napi_gro_complete(skb);
3331 	}
3332 
3333 	napi->gro_count = 0;
3334 	napi->gro_list = NULL;
3335 }
3336 EXPORT_SYMBOL(napi_gro_flush);
3337 
3338 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3339 {
3340 	struct sk_buff **pp = NULL;
3341 	struct packet_type *ptype;
3342 	__be16 type = skb->protocol;
3343 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3344 	int same_flow;
3345 	int mac_len;
3346 	enum gro_result ret;
3347 
3348 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3349 		goto normal;
3350 
3351 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3352 		goto normal;
3353 
3354 	rcu_read_lock();
3355 	list_for_each_entry_rcu(ptype, head, list) {
3356 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3357 			continue;
3358 
3359 		skb_set_network_header(skb, skb_gro_offset(skb));
3360 		mac_len = skb->network_header - skb->mac_header;
3361 		skb->mac_len = mac_len;
3362 		NAPI_GRO_CB(skb)->same_flow = 0;
3363 		NAPI_GRO_CB(skb)->flush = 0;
3364 		NAPI_GRO_CB(skb)->free = 0;
3365 
3366 		pp = ptype->gro_receive(&napi->gro_list, skb);
3367 		break;
3368 	}
3369 	rcu_read_unlock();
3370 
3371 	if (&ptype->list == head)
3372 		goto normal;
3373 
3374 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3375 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3376 
3377 	if (pp) {
3378 		struct sk_buff *nskb = *pp;
3379 
3380 		*pp = nskb->next;
3381 		nskb->next = NULL;
3382 		napi_gro_complete(nskb);
3383 		napi->gro_count--;
3384 	}
3385 
3386 	if (same_flow)
3387 		goto ok;
3388 
3389 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3390 		goto normal;
3391 
3392 	napi->gro_count++;
3393 	NAPI_GRO_CB(skb)->count = 1;
3394 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3395 	skb->next = napi->gro_list;
3396 	napi->gro_list = skb;
3397 	ret = GRO_HELD;
3398 
3399 pull:
3400 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3401 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3402 
3403 		BUG_ON(skb->end - skb->tail < grow);
3404 
3405 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3406 
3407 		skb->tail += grow;
3408 		skb->data_len -= grow;
3409 
3410 		skb_shinfo(skb)->frags[0].page_offset += grow;
3411 		skb_shinfo(skb)->frags[0].size -= grow;
3412 
3413 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3414 			put_page(skb_shinfo(skb)->frags[0].page);
3415 			memmove(skb_shinfo(skb)->frags,
3416 				skb_shinfo(skb)->frags + 1,
3417 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3418 		}
3419 	}
3420 
3421 ok:
3422 	return ret;
3423 
3424 normal:
3425 	ret = GRO_NORMAL;
3426 	goto pull;
3427 }
3428 EXPORT_SYMBOL(dev_gro_receive);
3429 
3430 static inline gro_result_t
3431 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3432 {
3433 	struct sk_buff *p;
3434 
3435 	for (p = napi->gro_list; p; p = p->next) {
3436 		unsigned long diffs;
3437 
3438 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3439 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3440 		diffs |= compare_ether_header(skb_mac_header(p),
3441 					      skb_gro_mac_header(skb));
3442 		NAPI_GRO_CB(p)->same_flow = !diffs;
3443 		NAPI_GRO_CB(p)->flush = 0;
3444 	}
3445 
3446 	return dev_gro_receive(napi, skb);
3447 }
3448 
3449 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3450 {
3451 	switch (ret) {
3452 	case GRO_NORMAL:
3453 		if (netif_receive_skb(skb))
3454 			ret = GRO_DROP;
3455 		break;
3456 
3457 	case GRO_DROP:
3458 	case GRO_MERGED_FREE:
3459 		kfree_skb(skb);
3460 		break;
3461 
3462 	case GRO_HELD:
3463 	case GRO_MERGED:
3464 		break;
3465 	}
3466 
3467 	return ret;
3468 }
3469 EXPORT_SYMBOL(napi_skb_finish);
3470 
3471 void skb_gro_reset_offset(struct sk_buff *skb)
3472 {
3473 	NAPI_GRO_CB(skb)->data_offset = 0;
3474 	NAPI_GRO_CB(skb)->frag0 = NULL;
3475 	NAPI_GRO_CB(skb)->frag0_len = 0;
3476 
3477 	if (skb->mac_header == skb->tail &&
3478 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3479 		NAPI_GRO_CB(skb)->frag0 =
3480 			page_address(skb_shinfo(skb)->frags[0].page) +
3481 			skb_shinfo(skb)->frags[0].page_offset;
3482 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3483 	}
3484 }
3485 EXPORT_SYMBOL(skb_gro_reset_offset);
3486 
3487 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3488 {
3489 	skb_gro_reset_offset(skb);
3490 
3491 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3492 }
3493 EXPORT_SYMBOL(napi_gro_receive);
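
/*
 * Example sketch (illustrative only, not kernel code): the canonical NAPI poll
 * loop that feeds completed receive buffers into GRO.  example_next_rx_skb()
 * is a hypothetical stand-in for reading the device's RX ring.
 */
static struct sk_buff *example_next_rx_skb(struct napi_struct *napi)
{
	return NULL;	/* a real driver pulls the next completed buffer here */
}

static int example_napi_poll(struct napi_struct *napi, int budget)
{
	int work = 0;

	while (work < budget) {
		struct sk_buff *skb = example_next_rx_skb(napi);

		if (!skb)
			break;
		napi_gro_receive(napi, skb);
		work++;
	}

	if (work < budget) {
		napi_complete(napi);
		/* a real driver re-enables its RX interrupt here */
	}
	return work;
}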
3494 
3495 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3496 {
3497 	__skb_pull(skb, skb_headlen(skb));
3498 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3499 	skb->vlan_tci = 0;
3500 	skb->dev = napi->dev;
3501 	skb->skb_iif = 0;
3502 
3503 	napi->skb = skb;
3504 }
3505 
3506 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3507 {
3508 	struct sk_buff *skb = napi->skb;
3509 
3510 	if (!skb) {
3511 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3512 		if (skb)
3513 			napi->skb = skb;
3514 	}
3515 	return skb;
3516 }
3517 EXPORT_SYMBOL(napi_get_frags);
3518 
3519 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3520 			       gro_result_t ret)
3521 {
3522 	switch (ret) {
3523 	case GRO_NORMAL:
3524 	case GRO_HELD:
3525 		skb->protocol = eth_type_trans(skb, skb->dev);
3526 
3527 		if (ret == GRO_HELD)
3528 			skb_gro_pull(skb, -ETH_HLEN);
3529 		else if (netif_receive_skb(skb))
3530 			ret = GRO_DROP;
3531 		break;
3532 
3533 	case GRO_DROP:
3534 	case GRO_MERGED_FREE:
3535 		napi_reuse_skb(napi, skb);
3536 		break;
3537 
3538 	case GRO_MERGED:
3539 		break;
3540 	}
3541 
3542 	return ret;
3543 }
3544 EXPORT_SYMBOL(napi_frags_finish);
3545 
3546 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3547 {
3548 	struct sk_buff *skb = napi->skb;
3549 	struct ethhdr *eth;
3550 	unsigned int hlen;
3551 	unsigned int off;
3552 
3553 	napi->skb = NULL;
3554 
3555 	skb_reset_mac_header(skb);
3556 	skb_gro_reset_offset(skb);
3557 
3558 	off = skb_gro_offset(skb);
3559 	hlen = off + sizeof(*eth);
3560 	eth = skb_gro_header_fast(skb, off);
3561 	if (skb_gro_header_hard(skb, hlen)) {
3562 		eth = skb_gro_header_slow(skb, hlen, off);
3563 		if (unlikely(!eth)) {
3564 			napi_reuse_skb(napi, skb);
3565 			skb = NULL;
3566 			goto out;
3567 		}
3568 	}
3569 
3570 	skb_gro_pull(skb, sizeof(*eth));
3571 
3572 	/*
3573 	 * This works because the only protocols we care about don't require
3574 	 * special handling.  We'll fix it up properly at the end.
3575 	 */
3576 	skb->protocol = eth->h_proto;
3577 
3578 out:
3579 	return skb;
3580 }
3581 EXPORT_SYMBOL(napi_frags_skb);
3582 
3583 gro_result_t napi_gro_frags(struct napi_struct *napi)
3584 {
3585 	struct sk_buff *skb = napi_frags_skb(napi);
3586 
3587 	if (!skb)
3588 		return GRO_DROP;
3589 
3590 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3591 }
3592 EXPORT_SYMBOL(napi_gro_frags);
3593 
3594 /*
3595  * net_rps_action sends any pending IPIs for RPS.
3596  * Note: called with local irq disabled, but exits with local irq enabled.
3597  */
3598 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3599 {
3600 #ifdef CONFIG_RPS
3601 	struct softnet_data *remsd = sd->rps_ipi_list;
3602 
3603 	if (remsd) {
3604 		sd->rps_ipi_list = NULL;
3605 
3606 		local_irq_enable();
3607 
3608 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3609 		while (remsd) {
3610 			struct softnet_data *next = remsd->rps_ipi_next;
3611 
3612 			if (cpu_online(remsd->cpu))
3613 				__smp_call_function_single(remsd->cpu,
3614 							   &remsd->csd, 0);
3615 			remsd = next;
3616 		}
3617 	} else
3618 #endif
3619 		local_irq_enable();
3620 }
3621 
3622 static int process_backlog(struct napi_struct *napi, int quota)
3623 {
3624 	int work = 0;
3625 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3626 
3627 #ifdef CONFIG_RPS
3628 	/* Check if we have pending IPIs; it's better to send them now
3629 	 * rather than waiting for net_rx_action() to end.
3630 	 */
3631 	if (sd->rps_ipi_list) {
3632 		local_irq_disable();
3633 		net_rps_action_and_irq_enable(sd);
3634 	}
3635 #endif
3636 	napi->weight = weight_p;
3637 	local_irq_disable();
3638 	while (work < quota) {
3639 		struct sk_buff *skb;
3640 		unsigned int qlen;
3641 
3642 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3643 			local_irq_enable();
3644 			__netif_receive_skb(skb);
3645 			local_irq_disable();
3646 			input_queue_head_incr(sd);
3647 			if (++work >= quota) {
3648 				local_irq_enable();
3649 				return work;
3650 			}
3651 		}
3652 
3653 		rps_lock(sd);
3654 		qlen = skb_queue_len(&sd->input_pkt_queue);
3655 		if (qlen)
3656 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3657 						   &sd->process_queue);
3658 
3659 		if (qlen < quota - work) {
3660 			/*
3661 			 * Inline a custom version of __napi_complete().
3662 			 * Only the current CPU owns and manipulates this napi,
3663 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3664 			 * so we can use a plain write instead of clear_bit()
3665 			 * and we don't need an smp_mb() memory barrier.
3666 			 */
3667 			list_del(&napi->poll_list);
3668 			napi->state = 0;
3669 
3670 			quota = work + qlen;
3671 		}
3672 		rps_unlock(sd);
3673 	}
3674 	local_irq_enable();
3675 
3676 	return work;
3677 }
3678 
3679 /**
3680  * __napi_schedule - schedule for receive
3681  * @n: entry to schedule
3682  *
3683  * The entry's receive function will be scheduled to run
3684  */
3685 void __napi_schedule(struct napi_struct *n)
3686 {
3687 	unsigned long flags;
3688 
3689 	local_irq_save(flags);
3690 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3691 	local_irq_restore(flags);
3692 }
3693 EXPORT_SYMBOL(__napi_schedule);
3694 
3695 void __napi_complete(struct napi_struct *n)
3696 {
3697 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3698 	BUG_ON(n->gro_list);
3699 
3700 	list_del(&n->poll_list);
3701 	smp_mb__before_clear_bit();
3702 	clear_bit(NAPI_STATE_SCHED, &n->state);
3703 }
3704 EXPORT_SYMBOL(__napi_complete);
3705 
3706 void napi_complete(struct napi_struct *n)
3707 {
3708 	unsigned long flags;
3709 
3710 	/*
3711 	 * Don't let napi dequeue from the CPU poll list,
3712 	 * just in case it's running on a different CPU.
3713 	 */
3714 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3715 		return;
3716 
3717 	napi_gro_flush(n);
3718 	local_irq_save(flags);
3719 	__napi_complete(n);
3720 	local_irq_restore(flags);
3721 }
3722 EXPORT_SYMBOL(napi_complete);
3723 
3724 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3725 		    int (*poll)(struct napi_struct *, int), int weight)
3726 {
3727 	INIT_LIST_HEAD(&napi->poll_list);
3728 	napi->gro_count = 0;
3729 	napi->gro_list = NULL;
3730 	napi->skb = NULL;
3731 	napi->poll = poll;
3732 	napi->weight = weight;
3733 	list_add(&napi->dev_list, &dev->napi_list);
3734 	napi->dev = dev;
3735 #ifdef CONFIG_NETPOLL
3736 	spin_lock_init(&napi->poll_lock);
3737 	napi->poll_owner = -1;
3738 #endif
3739 	set_bit(NAPI_STATE_SCHED, &napi->state);
3740 }
3741 EXPORT_SYMBOL(netif_napi_add);
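
/*
 * Example sketch (illustrative only, not kernel code): registering a NAPI
 * instance at probe time and scheduling it from the interrupt handler.  The
 * poll stub and IRQ wiring are hypothetical; <linux/interrupt.h> is assumed
 * to be available for irqreturn_t.
 */
static int example_poll_stub(struct napi_struct *napi, int budget)
{
	napi_complete(napi);	/* no work in this stub */
	return 0;
}

static void example_setup_napi(struct net_device *dev, struct napi_struct *napi)
{
	netif_napi_add(dev, napi, example_poll_stub, 64);	/* 64 is the usual weight */
	napi_enable(napi);
}

static irqreturn_t example_isr(int irq, void *dev_id)
{
	struct napi_struct *napi = dev_id;

	/* a real driver masks its RX interrupt before scheduling */
	if (napi_schedule_prep(napi))
		__napi_schedule(napi);
	return IRQ_HANDLED;
}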
3742 
3743 void netif_napi_del(struct napi_struct *napi)
3744 {
3745 	struct sk_buff *skb, *next;
3746 
3747 	list_del_init(&napi->dev_list);
3748 	napi_free_frags(napi);
3749 
3750 	for (skb = napi->gro_list; skb; skb = next) {
3751 		next = skb->next;
3752 		skb->next = NULL;
3753 		kfree_skb(skb);
3754 	}
3755 
3756 	napi->gro_list = NULL;
3757 	napi->gro_count = 0;
3758 }
3759 EXPORT_SYMBOL(netif_napi_del);
3760 
3761 static void net_rx_action(struct softirq_action *h)
3762 {
3763 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3764 	unsigned long time_limit = jiffies + 2;
3765 	int budget = netdev_budget;
3766 	void *have;
3767 
3768 	local_irq_disable();
3769 
3770 	while (!list_empty(&sd->poll_list)) {
3771 		struct napi_struct *n;
3772 		int work, weight;
3773 
3774 		/* If the softirq window is exhausted then punt.
3775 		 * Allow this to run for 2 jiffies, which allows
3776 		 * an average latency of 1.5/HZ.
3777 		 */
3778 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3779 			goto softnet_break;
3780 
3781 		local_irq_enable();
3782 
3783 		/* Even though interrupts have been re-enabled, this
3784 		 * access is safe because interrupts can only add new
3785 		 * entries to the tail of this list, and only ->poll()
3786 		 * calls can remove this head entry from the list.
3787 		 */
3788 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3789 
3790 		have = netpoll_poll_lock(n);
3791 
3792 		weight = n->weight;
3793 
3794 		/* This NAPI_STATE_SCHED test is for avoiding a race
3795 		 * with netpoll's poll_napi().  Only the entity which
3796 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3797 		 * actually make the ->poll() call.  Therefore we avoid
3798 		 * accidentally calling ->poll() when NAPI is not scheduled.
3799 		 */
3800 		work = 0;
3801 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3802 			work = n->poll(n, weight);
3803 			trace_napi_poll(n);
3804 		}
3805 
3806 		WARN_ON_ONCE(work > weight);
3807 
3808 		budget -= work;
3809 
3810 		local_irq_disable();
3811 
3812 		/* Drivers must not modify the NAPI state if they
3813 		 * consume the entire weight.  In such cases this code
3814 		 * still "owns" the NAPI instance and therefore can
3815 		 * move the instance around on the list at-will.
3816 		 */
3817 		if (unlikely(work == weight)) {
3818 			if (unlikely(napi_disable_pending(n))) {
3819 				local_irq_enable();
3820 				napi_complete(n);
3821 				local_irq_disable();
3822 			} else
3823 				list_move_tail(&n->poll_list, &sd->poll_list);
3824 		}
3825 
3826 		netpoll_poll_unlock(have);
3827 	}
3828 out:
3829 	net_rps_action_and_irq_enable(sd);
3830 
3831 #ifdef CONFIG_NET_DMA
3832 	/*
3833 	 * There may not be any more sk_buffs coming right now, so push
3834 	 * any pending DMA copies to hardware
3835 	 */
3836 	dma_issue_pending_all();
3837 #endif
3838 
3839 	return;
3840 
3841 softnet_break:
3842 	sd->time_squeeze++;
3843 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3844 	goto out;
3845 }
3846 
3847 static gifconf_func_t *gifconf_list[NPROTO];
3848 
3849 /**
3850  *	register_gifconf	-	register a SIOCGIF handler
3851  *	@family: Address family
3852  *	@gifconf: Function handler
3853  *
3854  *	Register protocol dependent address dumping routines. The handler
3855  *	that is passed must not be freed or reused until it has been replaced
3856  *	by another handler.
3857  */
3858 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3859 {
3860 	if (family >= NPROTO)
3861 		return -EINVAL;
3862 	gifconf_list[family] = gifconf;
3863 	return 0;
3864 }
3865 EXPORT_SYMBOL(register_gifconf);
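/*
 * Illustrative usage (editor's sketch): an address family registers its
 * SIOCGIFCONF helper once at init time, e.g. IPv4 registering a handler
 * for PF_INET.  inet_gifconf() stands for whatever gifconf_func_t the
 * protocol provides.
 *
 *	register_gifconf(PF_INET, inet_gifconf);
 */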
3866 
3867 
3868 /*
3869  *	Map an interface index to its name (SIOCGIFNAME)
3870  */
3871 
3872 /*
3873  *	We need this ioctl for efficient implementation of the
3874  *	if_indextoname() function required by the IPv6 API.  Without
3875  *	it, we would have to search all the interfaces to find a
3876  *	match.  --pb
3877  */
3878 
3879 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3880 {
3881 	struct net_device *dev;
3882 	struct ifreq ifr;
3883 
3884 	/*
3885 	 *	Fetch the caller's info block.
3886 	 */
3887 
3888 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3889 		return -EFAULT;
3890 
3891 	rcu_read_lock();
3892 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3893 	if (!dev) {
3894 		rcu_read_unlock();
3895 		return -ENODEV;
3896 	}
3897 
3898 	strcpy(ifr.ifr_name, dev->name);
3899 	rcu_read_unlock();
3900 
3901 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3902 		return -EFAULT;
3903 	return 0;
3904 }
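/*
 * Illustrative user-space counterpart (editor's sketch): this is roughly
 * what an if_indextoname() implementation does with the SIOCGIFNAME ioctl
 * handled above.  Includes and error handling are trimmed; "index" is the
 * interface index being resolved.
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	ifr.ifr_ifindex = index;
 *	if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)
 *		printf("index %d is %s\n", index, ifr.ifr_name);
 *	close(fd);
 */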
3905 
3906 /*
3907  *	Perform a SIOCGIFCONF call. This structure will change
3908  *	size eventually, and there is nothing I can do about it.
3909  *	Thus we will need a 'compatibility mode'.
3910  */
3911 
3912 static int dev_ifconf(struct net *net, char __user *arg)
3913 {
3914 	struct ifconf ifc;
3915 	struct net_device *dev;
3916 	char __user *pos;
3917 	int len;
3918 	int total;
3919 	int i;
3920 
3921 	/*
3922 	 *	Fetch the caller's info block.
3923 	 */
3924 
3925 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3926 		return -EFAULT;
3927 
3928 	pos = ifc.ifc_buf;
3929 	len = ifc.ifc_len;
3930 
3931 	/*
3932 	 *	Loop over the interfaces, and write an info block for each.
3933 	 */
3934 
3935 	total = 0;
3936 	for_each_netdev(net, dev) {
3937 		for (i = 0; i < NPROTO; i++) {
3938 			if (gifconf_list[i]) {
3939 				int done;
3940 				if (!pos)
3941 					done = gifconf_list[i](dev, NULL, 0);
3942 				else
3943 					done = gifconf_list[i](dev, pos + total,
3944 							       len - total);
3945 				if (done < 0)
3946 					return -EFAULT;
3947 				total += done;
3948 			}
3949 		}
3950 	}
3951 
3952 	/*
3953 	 *	All done.  Write the updated control block back to the caller.
3954 	 */
3955 	ifc.ifc_len = total;
3956 
3957 	/*
3958 	 * 	Both BSD and Solaris return 0 here, so we do too.
3959 	 */
3960 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3961 }
3962 
3963 #ifdef CONFIG_PROC_FS
3964 /*
3965  *	This is invoked by the /proc filesystem handler to display a device
3966  *	in detail.
3967  */
3968 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3969 	__acquires(RCU)
3970 {
3971 	struct net *net = seq_file_net(seq);
3972 	loff_t off;
3973 	struct net_device *dev;
3974 
3975 	rcu_read_lock();
3976 	if (!*pos)
3977 		return SEQ_START_TOKEN;
3978 
3979 	off = 1;
3980 	for_each_netdev_rcu(net, dev)
3981 		if (off++ == *pos)
3982 			return dev;
3983 
3984 	return NULL;
3985 }
3986 
3987 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3988 {
3989 	struct net_device *dev = v;
3990 
3991 	if (v == SEQ_START_TOKEN)
3992 		dev = first_net_device_rcu(seq_file_net(seq));
3993 	else
3994 		dev = next_net_device_rcu(dev);
3995 
3996 	++*pos;
3997 	return dev;
3998 }
3999 
4000 void dev_seq_stop(struct seq_file *seq, void *v)
4001 	__releases(RCU)
4002 {
4003 	rcu_read_unlock();
4004 }
4005 
4006 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4007 {
4008 	struct rtnl_link_stats64 temp;
4009 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4010 
4011 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4012 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4013 		   dev->name, stats->rx_bytes, stats->rx_packets,
4014 		   stats->rx_errors,
4015 		   stats->rx_dropped + stats->rx_missed_errors,
4016 		   stats->rx_fifo_errors,
4017 		   stats->rx_length_errors + stats->rx_over_errors +
4018 		    stats->rx_crc_errors + stats->rx_frame_errors,
4019 		   stats->rx_compressed, stats->multicast,
4020 		   stats->tx_bytes, stats->tx_packets,
4021 		   stats->tx_errors, stats->tx_dropped,
4022 		   stats->tx_fifo_errors, stats->collisions,
4023 		   stats->tx_carrier_errors +
4024 		    stats->tx_aborted_errors +
4025 		    stats->tx_window_errors +
4026 		    stats->tx_heartbeat_errors,
4027 		   stats->tx_compressed);
4028 }
4029 
4030 /*
4031  *	Called from the PROCfs module. This now uses the new arbitrary sized
4032  *	/proc/net interface to create /proc/net/dev
4033  */
4034 static int dev_seq_show(struct seq_file *seq, void *v)
4035 {
4036 	if (v == SEQ_START_TOKEN)
4037 		seq_puts(seq, "Inter-|   Receive                            "
4038 			      "                    |  Transmit\n"
4039 			      " face |bytes    packets errs drop fifo frame "
4040 			      "compressed multicast|bytes    packets errs "
4041 			      "drop fifo colls carrier compressed\n");
4042 	else
4043 		dev_seq_printf_stats(seq, v);
4044 	return 0;
4045 }
4046 
4047 static struct softnet_data *softnet_get_online(loff_t *pos)
4048 {
4049 	struct softnet_data *sd = NULL;
4050 
4051 	while (*pos < nr_cpu_ids)
4052 		if (cpu_online(*pos)) {
4053 			sd = &per_cpu(softnet_data, *pos);
4054 			break;
4055 		} else
4056 			++*pos;
4057 	return sd;
4058 }
4059 
4060 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4061 {
4062 	return softnet_get_online(pos);
4063 }
4064 
4065 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4066 {
4067 	++*pos;
4068 	return softnet_get_online(pos);
4069 }
4070 
4071 static void softnet_seq_stop(struct seq_file *seq, void *v)
4072 {
4073 }
4074 
4075 static int softnet_seq_show(struct seq_file *seq, void *v)
4076 {
4077 	struct softnet_data *sd = v;
4078 
4079 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4080 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4081 		   0, 0, 0, 0, /* was fastroute */
4082 		   sd->cpu_collision, sd->received_rps);
4083 	return 0;
4084 }
4085 
4086 static const struct seq_operations dev_seq_ops = {
4087 	.start = dev_seq_start,
4088 	.next  = dev_seq_next,
4089 	.stop  = dev_seq_stop,
4090 	.show  = dev_seq_show,
4091 };
4092 
4093 static int dev_seq_open(struct inode *inode, struct file *file)
4094 {
4095 	return seq_open_net(inode, file, &dev_seq_ops,
4096 			    sizeof(struct seq_net_private));
4097 }
4098 
4099 static const struct file_operations dev_seq_fops = {
4100 	.owner	 = THIS_MODULE,
4101 	.open    = dev_seq_open,
4102 	.read    = seq_read,
4103 	.llseek  = seq_lseek,
4104 	.release = seq_release_net,
4105 };
4106 
4107 static const struct seq_operations softnet_seq_ops = {
4108 	.start = softnet_seq_start,
4109 	.next  = softnet_seq_next,
4110 	.stop  = softnet_seq_stop,
4111 	.show  = softnet_seq_show,
4112 };
4113 
4114 static int softnet_seq_open(struct inode *inode, struct file *file)
4115 {
4116 	return seq_open(file, &softnet_seq_ops);
4117 }
4118 
4119 static const struct file_operations softnet_seq_fops = {
4120 	.owner	 = THIS_MODULE,
4121 	.open    = softnet_seq_open,
4122 	.read    = seq_read,
4123 	.llseek  = seq_lseek,
4124 	.release = seq_release,
4125 };
4126 
4127 static void *ptype_get_idx(loff_t pos)
4128 {
4129 	struct packet_type *pt = NULL;
4130 	loff_t i = 0;
4131 	int t;
4132 
4133 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4134 		if (i == pos)
4135 			return pt;
4136 		++i;
4137 	}
4138 
4139 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4140 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4141 			if (i == pos)
4142 				return pt;
4143 			++i;
4144 		}
4145 	}
4146 	return NULL;
4147 }
4148 
4149 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4150 	__acquires(RCU)
4151 {
4152 	rcu_read_lock();
4153 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4154 }
4155 
4156 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4157 {
4158 	struct packet_type *pt;
4159 	struct list_head *nxt;
4160 	int hash;
4161 
4162 	++*pos;
4163 	if (v == SEQ_START_TOKEN)
4164 		return ptype_get_idx(0);
4165 
4166 	pt = v;
4167 	nxt = pt->list.next;
4168 	if (pt->type == htons(ETH_P_ALL)) {
4169 		if (nxt != &ptype_all)
4170 			goto found;
4171 		hash = 0;
4172 		nxt = ptype_base[0].next;
4173 	} else
4174 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4175 
4176 	while (nxt == &ptype_base[hash]) {
4177 		if (++hash >= PTYPE_HASH_SIZE)
4178 			return NULL;
4179 		nxt = ptype_base[hash].next;
4180 	}
4181 found:
4182 	return list_entry(nxt, struct packet_type, list);
4183 }
4184 
4185 static void ptype_seq_stop(struct seq_file *seq, void *v)
4186 	__releases(RCU)
4187 {
4188 	rcu_read_unlock();
4189 }
4190 
4191 static int ptype_seq_show(struct seq_file *seq, void *v)
4192 {
4193 	struct packet_type *pt = v;
4194 
4195 	if (v == SEQ_START_TOKEN)
4196 		seq_puts(seq, "Type Device      Function\n");
4197 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4198 		if (pt->type == htons(ETH_P_ALL))
4199 			seq_puts(seq, "ALL ");
4200 		else
4201 			seq_printf(seq, "%04x", ntohs(pt->type));
4202 
4203 		seq_printf(seq, " %-8s %pF\n",
4204 			   pt->dev ? pt->dev->name : "", pt->func);
4205 	}
4206 
4207 	return 0;
4208 }
4209 
4210 static const struct seq_operations ptype_seq_ops = {
4211 	.start = ptype_seq_start,
4212 	.next  = ptype_seq_next,
4213 	.stop  = ptype_seq_stop,
4214 	.show  = ptype_seq_show,
4215 };
4216 
4217 static int ptype_seq_open(struct inode *inode, struct file *file)
4218 {
4219 	return seq_open_net(inode, file, &ptype_seq_ops,
4220 			sizeof(struct seq_net_private));
4221 }
4222 
4223 static const struct file_operations ptype_seq_fops = {
4224 	.owner	 = THIS_MODULE,
4225 	.open    = ptype_seq_open,
4226 	.read    = seq_read,
4227 	.llseek  = seq_lseek,
4228 	.release = seq_release_net,
4229 };
4230 
4231 
4232 static int __net_init dev_proc_net_init(struct net *net)
4233 {
4234 	int rc = -ENOMEM;
4235 
4236 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4237 		goto out;
4238 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4239 		goto out_dev;
4240 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4241 		goto out_softnet;
4242 
4243 	if (wext_proc_init(net))
4244 		goto out_ptype;
4245 	rc = 0;
4246 out:
4247 	return rc;
4248 out_ptype:
4249 	proc_net_remove(net, "ptype");
4250 out_softnet:
4251 	proc_net_remove(net, "softnet_stat");
4252 out_dev:
4253 	proc_net_remove(net, "dev");
4254 	goto out;
4255 }
4256 
4257 static void __net_exit dev_proc_net_exit(struct net *net)
4258 {
4259 	wext_proc_exit(net);
4260 
4261 	proc_net_remove(net, "ptype");
4262 	proc_net_remove(net, "softnet_stat");
4263 	proc_net_remove(net, "dev");
4264 }
4265 
4266 static struct pernet_operations __net_initdata dev_proc_ops = {
4267 	.init = dev_proc_net_init,
4268 	.exit = dev_proc_net_exit,
4269 };
4270 
4271 static int __init dev_proc_init(void)
4272 {
4273 	return register_pernet_subsys(&dev_proc_ops);
4274 }
4275 #else
4276 #define dev_proc_init() 0
4277 #endif	/* CONFIG_PROC_FS */
4278 
4279 
4280 /**
4281  *	netdev_set_master	-	set up master pointer
4282  *	@slave: slave device
4283  *	@master: new master device
4284  *
4285  *	Changes the master device of the slave. Pass %NULL to break the
4286  *	bonding. The caller must hold the RTNL semaphore. On a failure
4287  *	a negative errno code is returned. On success the reference counts
4288  *	are adjusted and the function returns zero.
4289  */
4290 int netdev_set_master(struct net_device *slave, struct net_device *master)
4291 {
4292 	struct net_device *old = slave->master;
4293 
4294 	ASSERT_RTNL();
4295 
4296 	if (master) {
4297 		if (old)
4298 			return -EBUSY;
4299 		dev_hold(master);
4300 	}
4301 
4302 	slave->master = master;
4303 
4304 	if (old)
4305 		dev_put(old);
4306 	return 0;
4307 }
4308 EXPORT_SYMBOL(netdev_set_master);
4309 
4310 /**
4311  *	netdev_set_bond_master	-	set up bonding master/slave pair
4312  *	@slave: slave device
4313  *	@master: new master device
4314  *
4315  *	Changes the master device of the slave. Pass %NULL to break the
4316  *	bonding. The caller must hold the RTNL semaphore. On a failure
4317  *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4318  *	to the routing socket and the function returns zero.
4319  */
4320 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4321 {
4322 	int err;
4323 
4324 	ASSERT_RTNL();
4325 
4326 	err = netdev_set_master(slave, master);
4327 	if (err)
4328 		return err;
4329 	if (master)
4330 		slave->flags |= IFF_SLAVE;
4331 	else
4332 		slave->flags &= ~IFF_SLAVE;
4333 
4334 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4335 	return 0;
4336 }
4337 EXPORT_SYMBOL(netdev_set_bond_master);
4338 
4339 static void dev_change_rx_flags(struct net_device *dev, int flags)
4340 {
4341 	const struct net_device_ops *ops = dev->netdev_ops;
4342 
4343 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4344 		ops->ndo_change_rx_flags(dev, flags);
4345 }
4346 
4347 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4348 {
4349 	unsigned short old_flags = dev->flags;
4350 	uid_t uid;
4351 	gid_t gid;
4352 
4353 	ASSERT_RTNL();
4354 
4355 	dev->flags |= IFF_PROMISC;
4356 	dev->promiscuity += inc;
4357 	if (dev->promiscuity == 0) {
4358 		/*
4359 		 * Avoid overflow.
4360 		 * If inc causes overflow, leave promisc untouched and return an error.
4361 		 */
4362 		if (inc < 0)
4363 			dev->flags &= ~IFF_PROMISC;
4364 		else {
4365 			dev->promiscuity -= inc;
4366 			printk(KERN_WARNING "%s: promiscuity touches roof, "
4367 				"set promiscuity failed, promiscuity feature "
4368 				"of device might be broken.\n", dev->name);
4369 			return -EOVERFLOW;
4370 		}
4371 	}
4372 	if (dev->flags != old_flags) {
4373 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4374 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4375 							       "left");
4376 		if (audit_enabled) {
4377 			current_uid_gid(&uid, &gid);
4378 			audit_log(current->audit_context, GFP_ATOMIC,
4379 				AUDIT_ANOM_PROMISCUOUS,
4380 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4381 				dev->name, (dev->flags & IFF_PROMISC),
4382 				(old_flags & IFF_PROMISC),
4383 				audit_get_loginuid(current),
4384 				uid, gid,
4385 				audit_get_sessionid(current));
4386 		}
4387 
4388 		dev_change_rx_flags(dev, IFF_PROMISC);
4389 	}
4390 	return 0;
4391 }
4392 
4393 /**
4394  *	dev_set_promiscuity	- update promiscuity count on a device
4395  *	@dev: device
4396  *	@inc: modifier
4397  *
4398  *	Add or remove promiscuity from a device. While the count in the device
4399  *	remains above zero the interface remains promiscuous. Once it hits zero
4400  *	the device reverts back to normal filtering operation. A negative inc
4401  *	value is used to drop promiscuity on the device.
4402  *	Return 0 if successful or a negative errno code on error.
4403  */
4404 int dev_set_promiscuity(struct net_device *dev, int inc)
4405 {
4406 	unsigned short old_flags = dev->flags;
4407 	int err;
4408 
4409 	err = __dev_set_promiscuity(dev, inc);
4410 	if (err < 0)
4411 		return err;
4412 	if (dev->flags != old_flags)
4413 		dev_set_rx_mode(dev);
4414 	return err;
4415 }
4416 EXPORT_SYMBOL(dev_set_promiscuity);
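/*
 * Illustrative usage (editor's sketch): packet-capture style users bump the
 * promiscuity count while they want to see all traffic and drop it again
 * when done, under RTNL.  Pairing +1 with -1 is what keeps the counting
 * semantics correct when several listeners coexist on one device.
 *
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, 1);	// start seeing all frames
 *	...
 *	dev_set_promiscuity(dev, -1);	// drop our reference again
 *	rtnl_unlock();
 */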
4417 
4418 /**
4419  *	dev_set_allmulti	- update allmulti count on a device
4420  *	@dev: device
4421  *	@inc: modifier
4422  *
4423  *	Add or remove reception of all multicast frames to a device. While the
4424  *	count in the device remains above zero the interface keeps listening
4425  *	to all multicast frames. Once it hits zero the device reverts back to normal
4426  *	filtering operation. A negative @inc value is used to drop the counter
4427  *	when releasing a resource needing all multicasts.
4428  *	Return 0 if successful or a negative errno code on error.
4429  */
4430 
4431 int dev_set_allmulti(struct net_device *dev, int inc)
4432 {
4433 	unsigned short old_flags = dev->flags;
4434 
4435 	ASSERT_RTNL();
4436 
4437 	dev->flags |= IFF_ALLMULTI;
4438 	dev->allmulti += inc;
4439 	if (dev->allmulti == 0) {
4440 		/*
4441 		 * Avoid overflow.
4442 		 * If inc causes overflow, untouch allmulti and return error.
4443 		 * If inc causes overflow, leave allmulti untouched and return an error.
4444 		if (inc < 0)
4445 			dev->flags &= ~IFF_ALLMULTI;
4446 		else {
4447 			dev->allmulti -= inc;
4448 			printk(KERN_WARNING "%s: allmulti touches roof, "
4449 				"set allmulti failed, allmulti feature of "
4450 				"device might be broken.\n", dev->name);
4451 			return -EOVERFLOW;
4452 		}
4453 	}
4454 	if (dev->flags ^ old_flags) {
4455 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4456 		dev_set_rx_mode(dev);
4457 	}
4458 	return 0;
4459 }
4460 EXPORT_SYMBOL(dev_set_allmulti);
4461 
4462 /*
4463  *	Upload unicast and multicast address lists to device and
4464  *	configure RX filtering. When the device doesn't support unicast
4465  *	filtering it is put in promiscuous mode while unicast addresses
4466  *	are present.
4467  */
4468 void __dev_set_rx_mode(struct net_device *dev)
4469 {
4470 	const struct net_device_ops *ops = dev->netdev_ops;
4471 
4472 	/* dev_open will call this function so the list will stay sane. */
4473 	if (!(dev->flags&IFF_UP))
4474 		return;
4475 
4476 	if (!netif_device_present(dev))
4477 		return;
4478 
4479 	if (ops->ndo_set_rx_mode)
4480 		ops->ndo_set_rx_mode(dev);
4481 	else {
4482 		/* Unicast address changes may only happen under the rtnl,
4483 		 * therefore calling __dev_set_promiscuity here is safe.
4484 		 */
4485 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4486 			__dev_set_promiscuity(dev, 1);
4487 			dev->uc_promisc = 1;
4488 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4489 			__dev_set_promiscuity(dev, -1);
4490 			dev->uc_promisc = 0;
4491 		}
4492 
4493 		if (ops->ndo_set_multicast_list)
4494 			ops->ndo_set_multicast_list(dev);
4495 	}
4496 }
4497 
4498 void dev_set_rx_mode(struct net_device *dev)
4499 {
4500 	netif_addr_lock_bh(dev);
4501 	__dev_set_rx_mode(dev);
4502 	netif_addr_unlock_bh(dev);
4503 }
4504 
4505 /**
4506  *	dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4507  *	@dev: device
4508  *	@cmd: memory area for ethtool_ops::get_settings() result
4509  *
4510  *	The cmd arg is initialized properly (cleared and
4511  *	ethtool_cmd::cmd field set to ETHTOOL_GSET).
4512  *
4513  *	Return device's ethtool_ops::get_settings() result value or
4514  *	-EOPNOTSUPP when device doesn't expose
4515  *	ethtool_ops::get_settings() operation.
4516  */
4517 int dev_ethtool_get_settings(struct net_device *dev,
4518 			     struct ethtool_cmd *cmd)
4519 {
4520 	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4521 		return -EOPNOTSUPP;
4522 
4523 	memset(cmd, 0, sizeof(struct ethtool_cmd));
4524 	cmd->cmd = ETHTOOL_GSET;
4525 	return dev->ethtool_ops->get_settings(dev, cmd);
4526 }
4527 EXPORT_SYMBOL(dev_ethtool_get_settings);
4528 
4529 /**
4530  *	dev_get_flags - get flags reported to userspace
4531  *	@dev: device
4532  *
4533  *	Get the combination of flag bits exported through APIs to userspace.
4534  */
4535 unsigned dev_get_flags(const struct net_device *dev)
4536 {
4537 	unsigned flags;
4538 
4539 	flags = (dev->flags & ~(IFF_PROMISC |
4540 				IFF_ALLMULTI |
4541 				IFF_RUNNING |
4542 				IFF_LOWER_UP |
4543 				IFF_DORMANT)) |
4544 		(dev->gflags & (IFF_PROMISC |
4545 				IFF_ALLMULTI));
4546 
4547 	if (netif_running(dev)) {
4548 		if (netif_oper_up(dev))
4549 			flags |= IFF_RUNNING;
4550 		if (netif_carrier_ok(dev))
4551 			flags |= IFF_LOWER_UP;
4552 		if (netif_dormant(dev))
4553 			flags |= IFF_DORMANT;
4554 	}
4555 
4556 	return flags;
4557 }
4558 EXPORT_SYMBOL(dev_get_flags);
4559 
4560 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4561 {
4562 	int old_flags = dev->flags;
4563 	int ret;
4564 
4565 	ASSERT_RTNL();
4566 
4567 	/*
4568 	 *	Set the flags on our device.
4569 	 */
4570 
4571 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4572 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4573 			       IFF_AUTOMEDIA)) |
4574 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4575 				    IFF_ALLMULTI));
4576 
4577 	/*
4578 	 *	Load in the correct multicast list now the flags have changed.
4579 	 */
4580 
4581 	if ((old_flags ^ flags) & IFF_MULTICAST)
4582 		dev_change_rx_flags(dev, IFF_MULTICAST);
4583 
4584 	dev_set_rx_mode(dev);
4585 
4586 	/*
4587 	 *	Have we downed the interface? We handle IFF_UP ourselves
4588 	 *	according to user attempts to set it, rather than blindly
4589 	 *	setting it.
4590 	 */
4591 
4592 	ret = 0;
4593 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4594 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4595 
4596 		if (!ret)
4597 			dev_set_rx_mode(dev);
4598 	}
4599 
4600 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4601 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4602 
4603 		dev->gflags ^= IFF_PROMISC;
4604 		dev_set_promiscuity(dev, inc);
4605 	}
4606 
4607 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4608 	   is important. Some (broken) drivers set IFF_PROMISC when
4609 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4610 	 */
4611 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4612 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4613 
4614 		dev->gflags ^= IFF_ALLMULTI;
4615 		dev_set_allmulti(dev, inc);
4616 	}
4617 
4618 	return ret;
4619 }
4620 
4621 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4622 {
4623 	unsigned int changes = dev->flags ^ old_flags;
4624 
4625 	if (changes & IFF_UP) {
4626 		if (dev->flags & IFF_UP)
4627 			call_netdevice_notifiers(NETDEV_UP, dev);
4628 		else
4629 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4630 	}
4631 
4632 	if (dev->flags & IFF_UP &&
4633 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4634 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4635 }
4636 
4637 /**
4638  *	dev_change_flags - change device settings
4639  *	@dev: device
4640  *	@flags: device state flags
4641  *
4642  *	Change settings on device based state flags. The flags are
4643  *	in the userspace exported format.
4644  */
4645 int dev_change_flags(struct net_device *dev, unsigned flags)
4646 {
4647 	int ret, changes;
4648 	int old_flags = dev->flags;
4649 
4650 	ret = __dev_change_flags(dev, flags);
4651 	if (ret < 0)
4652 		return ret;
4653 
4654 	changes = old_flags ^ dev->flags;
4655 	if (changes)
4656 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4657 
4658 	__dev_notify_flags(dev, old_flags);
4659 	return ret;
4660 }
4661 EXPORT_SYMBOL(dev_change_flags);
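/*
 * Illustrative usage (editor's sketch): this is the in-kernel equivalent of
 * "ip link set dev ethX up": take the current flags, set IFF_UP and let
 * dev_change_flags() do the open and notification work.  Must run under
 * RTNL.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */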
4662 
4663 /**
4664  *	dev_set_mtu - Change maximum transfer unit
4665  *	@dev: device
4666  *	@new_mtu: new transfer unit
4667  *
4668  *	Change the maximum transfer size of the network device.
4669  */
4670 int dev_set_mtu(struct net_device *dev, int new_mtu)
4671 {
4672 	const struct net_device_ops *ops = dev->netdev_ops;
4673 	int err;
4674 
4675 	if (new_mtu == dev->mtu)
4676 		return 0;
4677 
4678 	/*	MTU must be positive.	 */
4679 	if (new_mtu < 0)
4680 		return -EINVAL;
4681 
4682 	if (!netif_device_present(dev))
4683 		return -ENODEV;
4684 
4685 	err = 0;
4686 	if (ops->ndo_change_mtu)
4687 		err = ops->ndo_change_mtu(dev, new_mtu);
4688 	else
4689 		dev->mtu = new_mtu;
4690 
4691 	if (!err && dev->flags & IFF_UP)
4692 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4693 	return err;
4694 }
4695 EXPORT_SYMBOL(dev_set_mtu);
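/*
 * Illustrative usage (editor's sketch): callers such as the SIOCSIFMTU
 * handler below simply pass the requested value; the driver's
 * ndo_change_mtu() (if present) decides whether, say, a jumbo MTU is
 * acceptable.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);	// request jumbo frames
 *	rtnl_unlock();
 */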
4696 
4697 /**
4698  *	dev_set_group - Change group this device belongs to
4699  *	@dev: device
4700  *	@new_group: group this device should belong to
4701  */
4702 void dev_set_group(struct net_device *dev, int new_group)
4703 {
4704 	dev->group = new_group;
4705 }
4706 EXPORT_SYMBOL(dev_set_group);
4707 
4708 /**
4709  *	dev_set_mac_address - Change Media Access Control Address
4710  *	@dev: device
4711  *	@sa: new address
4712  *
4713  *	Change the hardware (MAC) address of the device
4714  */
4715 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4716 {
4717 	const struct net_device_ops *ops = dev->netdev_ops;
4718 	int err;
4719 
4720 	if (!ops->ndo_set_mac_address)
4721 		return -EOPNOTSUPP;
4722 	if (sa->sa_family != dev->type)
4723 		return -EINVAL;
4724 	if (!netif_device_present(dev))
4725 		return -ENODEV;
4726 	err = ops->ndo_set_mac_address(dev, sa);
4727 	if (!err)
4728 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4729 	return err;
4730 }
4731 EXPORT_SYMBOL(dev_set_mac_address);
4732 
4733 /*
4734  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4735  */
4736 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4737 {
4738 	int err;
4739 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4740 
4741 	if (!dev)
4742 		return -ENODEV;
4743 
4744 	switch (cmd) {
4745 	case SIOCGIFFLAGS:	/* Get interface flags */
4746 		ifr->ifr_flags = (short) dev_get_flags(dev);
4747 		return 0;
4748 
4749 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4750 				   (currently unused) */
4751 		ifr->ifr_metric = 0;
4752 		return 0;
4753 
4754 	case SIOCGIFMTU:	/* Get the MTU of a device */
4755 		ifr->ifr_mtu = dev->mtu;
4756 		return 0;
4757 
4758 	case SIOCGIFHWADDR:
4759 		if (!dev->addr_len)
4760 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4761 		else
4762 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4763 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4764 		ifr->ifr_hwaddr.sa_family = dev->type;
4765 		return 0;
4766 
4767 	case SIOCGIFSLAVE:
4768 		err = -EINVAL;
4769 		break;
4770 
4771 	case SIOCGIFMAP:
4772 		ifr->ifr_map.mem_start = dev->mem_start;
4773 		ifr->ifr_map.mem_end   = dev->mem_end;
4774 		ifr->ifr_map.base_addr = dev->base_addr;
4775 		ifr->ifr_map.irq       = dev->irq;
4776 		ifr->ifr_map.dma       = dev->dma;
4777 		ifr->ifr_map.port      = dev->if_port;
4778 		return 0;
4779 
4780 	case SIOCGIFINDEX:
4781 		ifr->ifr_ifindex = dev->ifindex;
4782 		return 0;
4783 
4784 	case SIOCGIFTXQLEN:
4785 		ifr->ifr_qlen = dev->tx_queue_len;
4786 		return 0;
4787 
4788 	default:
4789 		/* dev_ioctl() should ensure this case
4790 		 * is never reached
4791 		 */
4792 		WARN_ON(1);
4793 		err = -ENOTTY;
4794 		break;
4795 
4796 	}
4797 	return err;
4798 }
4799 
4800 /*
4801  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4802  */
4803 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4804 {
4805 	int err;
4806 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4807 	const struct net_device_ops *ops;
4808 
4809 	if (!dev)
4810 		return -ENODEV;
4811 
4812 	ops = dev->netdev_ops;
4813 
4814 	switch (cmd) {
4815 	case SIOCSIFFLAGS:	/* Set interface flags */
4816 		return dev_change_flags(dev, ifr->ifr_flags);
4817 
4818 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4819 				   (currently unused) */
4820 		return -EOPNOTSUPP;
4821 
4822 	case SIOCSIFMTU:	/* Set the MTU of a device */
4823 		return dev_set_mtu(dev, ifr->ifr_mtu);
4824 
4825 	case SIOCSIFHWADDR:
4826 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4827 
4828 	case SIOCSIFHWBROADCAST:
4829 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4830 			return -EINVAL;
4831 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4832 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4833 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4834 		return 0;
4835 
4836 	case SIOCSIFMAP:
4837 		if (ops->ndo_set_config) {
4838 			if (!netif_device_present(dev))
4839 				return -ENODEV;
4840 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4841 		}
4842 		return -EOPNOTSUPP;
4843 
4844 	case SIOCADDMULTI:
4845 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4846 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4847 			return -EINVAL;
4848 		if (!netif_device_present(dev))
4849 			return -ENODEV;
4850 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4851 
4852 	case SIOCDELMULTI:
4853 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4854 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4855 			return -EINVAL;
4856 		if (!netif_device_present(dev))
4857 			return -ENODEV;
4858 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4859 
4860 	case SIOCSIFTXQLEN:
4861 		if (ifr->ifr_qlen < 0)
4862 			return -EINVAL;
4863 		dev->tx_queue_len = ifr->ifr_qlen;
4864 		return 0;
4865 
4866 	case SIOCSIFNAME:
4867 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4868 		return dev_change_name(dev, ifr->ifr_newname);
4869 
4870 	/*
4871 	 *	Unknown or private ioctl
4872 	 */
4873 	default:
4874 		if ((cmd >= SIOCDEVPRIVATE &&
4875 		    cmd <= SIOCDEVPRIVATE + 15) ||
4876 		    cmd == SIOCBONDENSLAVE ||
4877 		    cmd == SIOCBONDRELEASE ||
4878 		    cmd == SIOCBONDSETHWADDR ||
4879 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4880 		    cmd == SIOCBONDINFOQUERY ||
4881 		    cmd == SIOCBONDCHANGEACTIVE ||
4882 		    cmd == SIOCGMIIPHY ||
4883 		    cmd == SIOCGMIIREG ||
4884 		    cmd == SIOCSMIIREG ||
4885 		    cmd == SIOCBRADDIF ||
4886 		    cmd == SIOCBRDELIF ||
4887 		    cmd == SIOCSHWTSTAMP ||
4888 		    cmd == SIOCWANDEV) {
4889 			err = -EOPNOTSUPP;
4890 			if (ops->ndo_do_ioctl) {
4891 				if (netif_device_present(dev))
4892 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4893 				else
4894 					err = -ENODEV;
4895 			}
4896 		} else
4897 			err = -EINVAL;
4898 
4899 	}
4900 	return err;
4901 }
4902 
4903 /*
4904  *	This function handles all "interface"-type I/O control requests. The actual
4905  *	'doing' part of this is dev_ifsioc above.
4906  */
4907 
4908 /**
4909  *	dev_ioctl	-	network device ioctl
4910  *	@net: the applicable net namespace
4911  *	@cmd: command to issue
4912  *	@arg: pointer to a struct ifreq in user space
4913  *
4914  *	Issue ioctl functions to devices. This is normally called by the
4915  *	user space syscall interfaces but can sometimes be useful for
4916  *	other purposes. The return value is the return from the syscall if
4917  *	positive or a negative errno code on error.
4918  */
4919 
4920 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4921 {
4922 	struct ifreq ifr;
4923 	int ret;
4924 	char *colon;
4925 
4926 	/* One special case: SIOCGIFCONF takes ifconf argument
4927 	   and requires shared lock, because it sleeps writing
4928 	   to user space.
4929 	 */
4930 
4931 	if (cmd == SIOCGIFCONF) {
4932 		rtnl_lock();
4933 		ret = dev_ifconf(net, (char __user *) arg);
4934 		rtnl_unlock();
4935 		return ret;
4936 	}
4937 	if (cmd == SIOCGIFNAME)
4938 		return dev_ifname(net, (struct ifreq __user *)arg);
4939 
4940 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4941 		return -EFAULT;
4942 
4943 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4944 
4945 	colon = strchr(ifr.ifr_name, ':');
4946 	if (colon)
4947 		*colon = 0;
4948 
4949 	/*
4950 	 *	See which interface the caller is talking about.
4951 	 */
4952 
4953 	switch (cmd) {
4954 	/*
4955 	 *	These ioctl calls:
4956 	 *	- can be done by all.
4957 	 *	- atomic and do not require locking.
4958 	 *	- return a value
4959 	 */
4960 	case SIOCGIFFLAGS:
4961 	case SIOCGIFMETRIC:
4962 	case SIOCGIFMTU:
4963 	case SIOCGIFHWADDR:
4964 	case SIOCGIFSLAVE:
4965 	case SIOCGIFMAP:
4966 	case SIOCGIFINDEX:
4967 	case SIOCGIFTXQLEN:
4968 		dev_load(net, ifr.ifr_name);
4969 		rcu_read_lock();
4970 		ret = dev_ifsioc_locked(net, &ifr, cmd);
4971 		rcu_read_unlock();
4972 		if (!ret) {
4973 			if (colon)
4974 				*colon = ':';
4975 			if (copy_to_user(arg, &ifr,
4976 					 sizeof(struct ifreq)))
4977 				ret = -EFAULT;
4978 		}
4979 		return ret;
4980 
4981 	case SIOCETHTOOL:
4982 		dev_load(net, ifr.ifr_name);
4983 		rtnl_lock();
4984 		ret = dev_ethtool(net, &ifr);
4985 		rtnl_unlock();
4986 		if (!ret) {
4987 			if (colon)
4988 				*colon = ':';
4989 			if (copy_to_user(arg, &ifr,
4990 					 sizeof(struct ifreq)))
4991 				ret = -EFAULT;
4992 		}
4993 		return ret;
4994 
4995 	/*
4996 	 *	These ioctl calls:
4997 	 *	- require superuser power.
4998 	 *	- require strict serialization.
4999 	 *	- return a value
5000 	 */
5001 	case SIOCGMIIPHY:
5002 	case SIOCGMIIREG:
5003 	case SIOCSIFNAME:
5004 		if (!capable(CAP_NET_ADMIN))
5005 			return -EPERM;
5006 		dev_load(net, ifr.ifr_name);
5007 		rtnl_lock();
5008 		ret = dev_ifsioc(net, &ifr, cmd);
5009 		rtnl_unlock();
5010 		if (!ret) {
5011 			if (colon)
5012 				*colon = ':';
5013 			if (copy_to_user(arg, &ifr,
5014 					 sizeof(struct ifreq)))
5015 				ret = -EFAULT;
5016 		}
5017 		return ret;
5018 
5019 	/*
5020 	 *	These ioctl calls:
5021 	 *	- require superuser power.
5022 	 *	- require strict serialization.
5023 	 *	- do not return a value
5024 	 */
5025 	case SIOCSIFFLAGS:
5026 	case SIOCSIFMETRIC:
5027 	case SIOCSIFMTU:
5028 	case SIOCSIFMAP:
5029 	case SIOCSIFHWADDR:
5030 	case SIOCSIFSLAVE:
5031 	case SIOCADDMULTI:
5032 	case SIOCDELMULTI:
5033 	case SIOCSIFHWBROADCAST:
5034 	case SIOCSIFTXQLEN:
5035 	case SIOCSMIIREG:
5036 	case SIOCBONDENSLAVE:
5037 	case SIOCBONDRELEASE:
5038 	case SIOCBONDSETHWADDR:
5039 	case SIOCBONDCHANGEACTIVE:
5040 	case SIOCBRADDIF:
5041 	case SIOCBRDELIF:
5042 	case SIOCSHWTSTAMP:
5043 		if (!capable(CAP_NET_ADMIN))
5044 			return -EPERM;
5045 		/* fall through */
5046 	case SIOCBONDSLAVEINFOQUERY:
5047 	case SIOCBONDINFOQUERY:
5048 		dev_load(net, ifr.ifr_name);
5049 		rtnl_lock();
5050 		ret = dev_ifsioc(net, &ifr, cmd);
5051 		rtnl_unlock();
5052 		return ret;
5053 
5054 	case SIOCGIFMEM:
5055 		/* Get the per device memory space. We can add this but
5056 		 * currently do not support it */
5057 	case SIOCSIFMEM:
5058 		/* Set the per device memory buffer space.
5059 		 * Not applicable in our case */
5060 	case SIOCSIFLINK:
5061 		return -ENOTTY;
5062 
5063 	/*
5064 	 *	Unknown or private ioctl.
5065 	 */
5066 	default:
5067 		if (cmd == SIOCWANDEV ||
5068 		    (cmd >= SIOCDEVPRIVATE &&
5069 		     cmd <= SIOCDEVPRIVATE + 15)) {
5070 			dev_load(net, ifr.ifr_name);
5071 			rtnl_lock();
5072 			ret = dev_ifsioc(net, &ifr, cmd);
5073 			rtnl_unlock();
5074 			if (!ret && copy_to_user(arg, &ifr,
5075 						 sizeof(struct ifreq)))
5076 				ret = -EFAULT;
5077 			return ret;
5078 		}
5079 		/* Take care of Wireless Extensions */
5080 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5081 			return wext_handle_ioctl(net, &ifr, cmd, arg);
5082 		return -ENOTTY;
5083 	}
5084 }
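/*
 * Illustrative user-space view (editor's sketch): the "get" ioctls routed
 * through dev_ifsioc_locked() above are what tools such as ifconfig issue.
 * Querying an MTU, for example, looks roughly like this (includes and error
 * handling trimmed):
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu %d\n", ifr.ifr_mtu);
 *	close(fd);
 */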
5085 
5086 
5087 /**
5088  *	dev_new_index	-	allocate an ifindex
5089  *	@net: the applicable net namespace
5090  *
5091  *	Returns a suitable unique value for a new device interface
5092  *	number.  The caller must hold the rtnl semaphore or the
5093  *	dev_base_lock to be sure it remains unique.
5094  */
5095 static int dev_new_index(struct net *net)
5096 {
5097 	static int ifindex;
5098 	for (;;) {
5099 		if (++ifindex <= 0)
5100 			ifindex = 1;
5101 		if (!__dev_get_by_index(net, ifindex))
5102 			return ifindex;
5103 	}
5104 }
5105 
5106 /* Delayed registration/unregistration */
5107 static LIST_HEAD(net_todo_list);
5108 
5109 static void net_set_todo(struct net_device *dev)
5110 {
5111 	list_add_tail(&dev->todo_list, &net_todo_list);
5112 }
5113 
5114 static void rollback_registered_many(struct list_head *head)
5115 {
5116 	struct net_device *dev, *tmp;
5117 
5118 	BUG_ON(dev_boot_phase);
5119 	ASSERT_RTNL();
5120 
5121 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5122 		/* Some devices call unregister without ever having
5123 		 * registered, e.g. to unwind a failed initialization.
5124 		 * Remove those devices and proceed with the remaining.
5125 		 */
5126 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5127 			pr_debug("unregister_netdevice: device %s/%p never "
5128 				 "was registered\n", dev->name, dev);
5129 
5130 			WARN_ON(1);
5131 			list_del(&dev->unreg_list);
5132 			continue;
5133 		}
5134 		dev->dismantle = true;
5135 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5136 	}
5137 
5138 	/* If device is running, close it first. */
5139 	dev_close_many(head);
5140 
5141 	list_for_each_entry(dev, head, unreg_list) {
5142 		/* And unlink it from device chain. */
5143 		unlist_netdevice(dev);
5144 
5145 		dev->reg_state = NETREG_UNREGISTERING;
5146 	}
5147 
5148 	synchronize_net();
5149 
5150 	list_for_each_entry(dev, head, unreg_list) {
5151 		/* Shutdown queueing discipline. */
5152 		dev_shutdown(dev);
5153 
5154 
5155 		/* Notify protocols, that we are about to destroy
5156 		   this device. They should clean all the things.
5157 		*/
5158 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5159 
5160 		if (!dev->rtnl_link_ops ||
5161 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5162 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5163 
5164 		/*
5165 		 *	Flush the unicast and multicast chains
5166 		 */
5167 		dev_uc_flush(dev);
5168 		dev_mc_flush(dev);
5169 
5170 		if (dev->netdev_ops->ndo_uninit)
5171 			dev->netdev_ops->ndo_uninit(dev);
5172 
5173 		/* Notifier chain MUST detach us from master device. */
5174 		WARN_ON(dev->master);
5175 
5176 		/* Remove entries from kobject tree */
5177 		netdev_unregister_kobject(dev);
5178 	}
5179 
5180 	/* Process any work delayed until the end of the batch */
5181 	dev = list_first_entry(head, struct net_device, unreg_list);
5182 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5183 
5184 	rcu_barrier();
5185 
5186 	list_for_each_entry(dev, head, unreg_list)
5187 		dev_put(dev);
5188 }
5189 
5190 static void rollback_registered(struct net_device *dev)
5191 {
5192 	LIST_HEAD(single);
5193 
5194 	list_add(&dev->unreg_list, &single);
5195 	rollback_registered_many(&single);
5196 	list_del(&single);
5197 }
5198 
5199 u32 netdev_fix_features(struct net_device *dev, u32 features)
5200 {
5201 	/* Fix illegal checksum combinations */
5202 	if ((features & NETIF_F_HW_CSUM) &&
5203 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5204 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5205 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5206 	}
5207 
5208 	if ((features & NETIF_F_NO_CSUM) &&
5209 	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5210 		netdev_warn(dev, "mixed no checksumming and other settings.\n");
5211 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5212 	}
5213 
5214 	/* Fix illegal SG+CSUM combinations. */
5215 	if ((features & NETIF_F_SG) &&
5216 	    !(features & NETIF_F_ALL_CSUM)) {
5217 		netdev_dbg(dev,
5218 			"Dropping NETIF_F_SG since no checksum feature.\n");
5219 		features &= ~NETIF_F_SG;
5220 	}
5221 
5222 	/* TSO requires that SG is present as well. */
5223 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5224 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5225 		features &= ~NETIF_F_ALL_TSO;
5226 	}
5227 
5228 	/* TSO ECN requires that TSO is present as well. */
5229 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5230 		features &= ~NETIF_F_TSO_ECN;
5231 
5232 	/* Software GSO depends on SG. */
5233 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5234 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5235 		features &= ~NETIF_F_GSO;
5236 	}
5237 
5238 	/* UFO needs SG and checksumming */
5239 	if (features & NETIF_F_UFO) {
5240 		/* maybe split UFO into V4 and V6? */
5241 		if (!((features & NETIF_F_GEN_CSUM) ||
5242 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5243 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5244 			netdev_dbg(dev,
5245 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5246 			features &= ~NETIF_F_UFO;
5247 		}
5248 
5249 		if (!(features & NETIF_F_SG)) {
5250 			netdev_dbg(dev,
5251 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5252 			features &= ~NETIF_F_UFO;
5253 		}
5254 	}
5255 
5256 	return features;
5257 }
5258 EXPORT_SYMBOL(netdev_fix_features);
5259 
5260 int __netdev_update_features(struct net_device *dev)
5261 {
5262 	u32 features;
5263 	int err = 0;
5264 
5265 	ASSERT_RTNL();
5266 
5267 	features = netdev_get_wanted_features(dev);
5268 
5269 	if (dev->netdev_ops->ndo_fix_features)
5270 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5271 
5272 	/* driver might be less strict about feature dependencies */
5273 	features = netdev_fix_features(dev, features);
5274 
5275 	if (dev->features == features)
5276 		return 0;
5277 
5278 	netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5279 		dev->features, features);
5280 
5281 	if (dev->netdev_ops->ndo_set_features)
5282 		err = dev->netdev_ops->ndo_set_features(dev, features);
5283 
5284 	if (unlikely(err < 0)) {
5285 		netdev_err(dev,
5286 			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5287 			err, features, dev->features);
5288 		return -1;
5289 	}
5290 
5291 	if (!err)
5292 		dev->features = features;
5293 
5294 	return 1;
5295 }
5296 
5297 /**
5298  *	netdev_update_features - recalculate device features
5299  *	@dev: the device to check
5300  *
5301  *	Recalculate dev->features set and send notifications if it
5302  *	has changed. Should be called after driver or hardware dependent
5303  *	conditions might have changed that influence the features.
5304  */
5305 void netdev_update_features(struct net_device *dev)
5306 {
5307 	if (__netdev_update_features(dev))
5308 		netdev_features_change(dev);
5309 }
5310 EXPORT_SYMBOL(netdev_update_features);
5311 
5312 /**
5313  *	netdev_change_features - recalculate device features
5314  *	@dev: the device to check
5315  *
5316  *	Recalculate dev->features set and send notifications even
5317  *	if they have not changed. Should be called instead of
5318  *	netdev_update_features() if also dev->vlan_features might
5319  *	have changed to allow the changes to be propagated to stacked
5320  *	VLAN devices.
5321  */
5322 void netdev_change_features(struct net_device *dev)
5323 {
5324 	__netdev_update_features(dev);
5325 	netdev_features_change(dev);
5326 }
5327 EXPORT_SYMBOL(netdev_change_features);
5328 
5329 /**
5330  *	netif_stacked_transfer_operstate -	transfer operstate
5331  *	@rootdev: the root or lower level device to transfer state from
5332  *	@dev: the device to transfer operstate to
5333  *
5334  *	Transfer operational state from root to device. This is normally
5335  *	called when a stacking relationship exists between the root
5336  *	device and the device (a leaf device).
5337  */
5338 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5339 					struct net_device *dev)
5340 {
5341 	if (rootdev->operstate == IF_OPER_DORMANT)
5342 		netif_dormant_on(dev);
5343 	else
5344 		netif_dormant_off(dev);
5345 
5346 	if (netif_carrier_ok(rootdev)) {
5347 		if (!netif_carrier_ok(dev))
5348 			netif_carrier_on(dev);
5349 	} else {
5350 		if (netif_carrier_ok(dev))
5351 			netif_carrier_off(dev);
5352 	}
5353 }
5354 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5355 
5356 #ifdef CONFIG_RPS
5357 static int netif_alloc_rx_queues(struct net_device *dev)
5358 {
5359 	unsigned int i, count = dev->num_rx_queues;
5360 	struct netdev_rx_queue *rx;
5361 
5362 	BUG_ON(count < 1);
5363 
5364 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5365 	if (!rx) {
5366 		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5367 		return -ENOMEM;
5368 	}
5369 	dev->_rx = rx;
5370 
5371 	for (i = 0; i < count; i++)
5372 		rx[i].dev = dev;
5373 	return 0;
5374 }
5375 #endif
5376 
5377 static void netdev_init_one_queue(struct net_device *dev,
5378 				  struct netdev_queue *queue, void *_unused)
5379 {
5380 	/* Initialize queue lock */
5381 	spin_lock_init(&queue->_xmit_lock);
5382 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5383 	queue->xmit_lock_owner = -1;
5384 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5385 	queue->dev = dev;
5386 }
5387 
5388 static int netif_alloc_netdev_queues(struct net_device *dev)
5389 {
5390 	unsigned int count = dev->num_tx_queues;
5391 	struct netdev_queue *tx;
5392 
5393 	BUG_ON(count < 1);
5394 
5395 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5396 	if (!tx) {
5397 		pr_err("netdev: Unable to allocate %u tx queues.\n",
5398 		       count);
5399 		return -ENOMEM;
5400 	}
5401 	dev->_tx = tx;
5402 
5403 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5404 	spin_lock_init(&dev->tx_global_lock);
5405 
5406 	return 0;
5407 }
5408 
5409 /**
5410  *	register_netdevice	- register a network device
5411  *	@dev: device to register
5412  *
5413  *	Take a completed network device structure and add it to the kernel
5414  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5415  *	chain. 0 is returned on success. A negative errno code is returned
5416  *	on a failure to set up the device, or if the name is a duplicate.
5417  *
5418  *	Callers must hold the rtnl semaphore. You may want
5419  *	register_netdev() instead of this.
5420  *
5421  *	BUGS:
5422  *	The locking appears insufficient to guarantee two parallel registers
5423  *	will not get the same name.
5424  */
5425 
5426 int register_netdevice(struct net_device *dev)
5427 {
5428 	int ret;
5429 	struct net *net = dev_net(dev);
5430 
5431 	BUG_ON(dev_boot_phase);
5432 	ASSERT_RTNL();
5433 
5434 	might_sleep();
5435 
5436 	/* When net_device's are persistent, this will be fatal. */
5437 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5438 	BUG_ON(!net);
5439 
5440 	spin_lock_init(&dev->addr_list_lock);
5441 	netdev_set_addr_lockdep_class(dev);
5442 
5443 	dev->iflink = -1;
5444 
5445 	ret = dev_get_valid_name(dev, dev->name);
5446 	if (ret < 0)
5447 		goto out;
5448 
5449 	/* Init, if this function is available */
5450 	if (dev->netdev_ops->ndo_init) {
5451 		ret = dev->netdev_ops->ndo_init(dev);
5452 		if (ret) {
5453 			if (ret > 0)
5454 				ret = -EIO;
5455 			goto out;
5456 		}
5457 	}
5458 
5459 	dev->ifindex = dev_new_index(net);
5460 	if (dev->iflink == -1)
5461 		dev->iflink = dev->ifindex;
5462 
5463 	/* Transfer changeable features to wanted_features and enable
5464 	 * software offloads (GSO and GRO).
5465 	 */
5466 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5467 	dev->features |= NETIF_F_SOFT_FEATURES;
5468 	dev->wanted_features = dev->features & dev->hw_features;
5469 
5470 	/* Turn on no cache copy if HW is doing checksum */
5471 	dev->hw_features |= NETIF_F_NOCACHE_COPY;
5472 	if ((dev->features & NETIF_F_ALL_CSUM) &&
5473 	    !(dev->features & NETIF_F_NO_CSUM)) {
5474 		dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5475 		dev->features |= NETIF_F_NOCACHE_COPY;
5476 	}
5477 
5478 	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default;
5479 	 * vlan_dev_init() will do the dev->features check, so these features
5480 	 * are enabled only if supported by the underlying device.
5481 	 */
5482 	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5483 
5484 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5485 	ret = notifier_to_errno(ret);
5486 	if (ret)
5487 		goto err_uninit;
5488 
5489 	ret = netdev_register_kobject(dev);
5490 	if (ret)
5491 		goto err_uninit;
5492 	dev->reg_state = NETREG_REGISTERED;
5493 
5494 	__netdev_update_features(dev);
5495 
5496 	/*
5497 	 *	Default initial state at registration is that the
5498 	 *	device is present.
5499 	 */
5500 
5501 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5502 
5503 	dev_init_scheduler(dev);
5504 	dev_hold(dev);
5505 	list_netdevice(dev);
5506 
5507 	/* Notify protocols, that a new device appeared. */
5508 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5509 	ret = notifier_to_errno(ret);
5510 	if (ret) {
5511 		rollback_registered(dev);
5512 		dev->reg_state = NETREG_UNREGISTERED;
5513 	}
5514 	/*
5515 	 *	Prevent userspace races by waiting until the network
5516 	 *	device is fully set up before sending notifications.
5517 	 */
5518 	if (!dev->rtnl_link_ops ||
5519 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5520 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5521 
5522 out:
5523 	return ret;
5524 
5525 err_uninit:
5526 	if (dev->netdev_ops->ndo_uninit)
5527 		dev->netdev_ops->ndo_uninit(dev);
5528 	goto out;
5529 }
5530 EXPORT_SYMBOL(register_netdevice);
5531 
5532 /**
5533  *	init_dummy_netdev	- init a dummy network device for NAPI
5534  *	@dev: device to init
5535  *
5536  *	This takes a network device structure and initialize the minimum
5537  *	amount of fields so it can be used to schedule NAPI polls without
5538  *	registering a full blown interface. This is to be used by drivers
5539  *	that need to tie several hardware interfaces to a single NAPI
5540  *	poll scheduler due to HW limitations.
5541  */
5542 int init_dummy_netdev(struct net_device *dev)
5543 {
5544 	/* Clear everything. Note we don't initialize spinlocks
5545 	 * as they aren't supposed to be taken by any of the
5546 	 * NAPI code and this dummy netdev is supposed to be
5547 	 * used only for NAPI polls.
5548 	 */
5549 	memset(dev, 0, sizeof(struct net_device));
5550 
5551 	/* make sure we BUG if trying to hit standard
5552 	 * register/unregister code path
5553 	 */
5554 	dev->reg_state = NETREG_DUMMY;
5555 
5556 	/* NAPI wants this */
5557 	INIT_LIST_HEAD(&dev->napi_list);
5558 
5559 	/* a dummy interface is started by default */
5560 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5561 	set_bit(__LINK_STATE_START, &dev->state);
5562 
5563 	/* Note: we don't allocate pcpu_refcnt for dummy devices,
5564 	 * because users of this 'device' don't need to change
5565 	 * its refcount.
5566 	 */
5567 
5568 	return 0;
5569 }
5570 EXPORT_SYMBOL_GPL(init_dummy_netdev);
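/*
 * Illustrative usage (editor's sketch): a driver whose hardware funnels
 * several interfaces through one interrupt can hang its NAPI context off a
 * dummy netdev instead of any of the real ones.  "priv->dummy_dev",
 * "priv->napi" and mydrv_poll() are hypothetical.
 *
 *	init_dummy_netdev(&priv->dummy_dev);
 *	netif_napi_add(&priv->dummy_dev, &priv->napi, mydrv_poll, 64);
 */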
5571 
5572 
5573 /**
5574  *	register_netdev	- register a network device
5575  *	@dev: device to register
5576  *
5577  *	Take a completed network device structure and add it to the kernel
5578  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5579  *	chain. 0 is returned on success. A negative errno code is returned
5580  *	on a failure to set up the device, or if the name is a duplicate.
5581  *
5582  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5583  *	and expands the device name if you passed a format string to
5584  *	alloc_netdev.
5585  */
5586 int register_netdev(struct net_device *dev)
5587 {
5588 	int err;
5589 
5590 	rtnl_lock();
5591 	err = register_netdevice(dev);
5592 	rtnl_unlock();
5593 	return err;
5594 }
5595 EXPORT_SYMBOL(register_netdev);
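/*
 * Illustrative usage (editor's sketch): the usual probe-time sequence is to
 * allocate the device together with its private area, fill in the ops and
 * only then register it.  mydrv_netdev_ops and struct mydrv_priv are
 * hypothetical.
 *
 *	netdev = alloc_etherdev(sizeof(struct mydrv_priv));
 *	if (!netdev)
 *		return -ENOMEM;
 *	netdev->netdev_ops = &mydrv_netdev_ops;
 *	err = register_netdev(netdev);
 *	if (err) {
 *		free_netdev(netdev);
 *		return err;
 *	}
 */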
5596 
5597 int netdev_refcnt_read(const struct net_device *dev)
5598 {
5599 	int i, refcnt = 0;
5600 
5601 	for_each_possible_cpu(i)
5602 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5603 	return refcnt;
5604 }
5605 EXPORT_SYMBOL(netdev_refcnt_read);
5606 
5607 /*
5608  * netdev_wait_allrefs - wait until all references are gone.
5609  *
5610  * This is called when unregistering network devices.
5611  *
5612  * Any protocol or device that holds a reference should register
5613  * for netdevice notification, and cleanup and put back the
5614  * reference if they receive an UNREGISTER event.
5615  * We can get stuck here if buggy protocols don't correctly
5616  * call dev_put.
5617  */
5618 static void netdev_wait_allrefs(struct net_device *dev)
5619 {
5620 	unsigned long rebroadcast_time, warning_time;
5621 	int refcnt;
5622 
5623 	linkwatch_forget_dev(dev);
5624 
5625 	rebroadcast_time = warning_time = jiffies;
5626 	refcnt = netdev_refcnt_read(dev);
5627 
5628 	while (refcnt != 0) {
5629 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5630 			rtnl_lock();
5631 
5632 			/* Rebroadcast unregister notification */
5633 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5634 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5635 			 * should have already handled it the first time */
5636 
5637 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5638 				     &dev->state)) {
5639 				/* We must not have linkwatch events
5640 				 * pending on unregister. If this
5641 				 * happens, we simply run the queue
5642 				 * unscheduled, resulting in a noop
5643 				 * for this device.
5644 				 */
5645 				linkwatch_run_queue();
5646 			}
5647 
5648 			__rtnl_unlock();
5649 
5650 			rebroadcast_time = jiffies;
5651 		}
5652 
5653 		msleep(250);
5654 
5655 		refcnt = netdev_refcnt_read(dev);
5656 
5657 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5658 			printk(KERN_EMERG "unregister_netdevice: "
5659 			       "waiting for %s to become free. Usage "
5660 			       "count = %d\n",
5661 			       dev->name, refcnt);
5662 			warning_time = jiffies;
5663 		}
5664 	}
5665 }
5666 
5667 /* The sequence is:
5668  *
5669  *	rtnl_lock();
5670  *	...
5671  *	register_netdevice(x1);
5672  *	register_netdevice(x2);
5673  *	...
5674  *	unregister_netdevice(y1);
5675  *	unregister_netdevice(y2);
5676  *      ...
5677  *	rtnl_unlock();
5678  *	free_netdev(y1);
5679  *	free_netdev(y2);
5680  *
5681  * We are invoked by rtnl_unlock().
5682  * This allows us to deal with problems:
5683  * 1) We can delete sysfs objects which invoke hotplug
5684  *    without deadlocking with linkwatch via keventd.
5685  * 2) Since we run with the RTNL semaphore not held, we can sleep
5686  *    safely in order to wait for the netdev refcnt to drop to zero.
5687  *
5688  * We must not return until all unregister events added during
5689  * the interval the lock was held have been completed.
5690  */
5691 void netdev_run_todo(void)
5692 {
5693 	struct list_head list;
5694 
5695 	/* Snapshot list, allow later requests */
5696 	list_replace_init(&net_todo_list, &list);
5697 
5698 	__rtnl_unlock();
5699 
5700 	while (!list_empty(&list)) {
5701 		struct net_device *dev
5702 			= list_first_entry(&list, struct net_device, todo_list);
5703 		list_del(&dev->todo_list);
5704 
5705 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5706 			printk(KERN_ERR "network todo '%s' but state %d\n",
5707 			       dev->name, dev->reg_state);
5708 			dump_stack();
5709 			continue;
5710 		}
5711 
5712 		dev->reg_state = NETREG_UNREGISTERED;
5713 
5714 		on_each_cpu(flush_backlog, dev, 1);
5715 
5716 		netdev_wait_allrefs(dev);
5717 
5718 		/* paranoia */
5719 		BUG_ON(netdev_refcnt_read(dev));
5720 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5721 		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5722 		WARN_ON(dev->dn_ptr);
5723 
5724 		if (dev->destructor)
5725 			dev->destructor(dev);
5726 
5727 		/* Free network device */
5728 		kobject_put(&dev->dev.kobj);
5729 	}
5730 }
5731 
5732 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5733  * fields in the same order, with only the type differing.
5734  */
5735 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5736 				    const struct net_device_stats *netdev_stats)
5737 {
5738 #if BITS_PER_LONG == 64
5739 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5740 	memcpy(stats64, netdev_stats, sizeof(*stats64));
5741 #else
5742 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5743 	const unsigned long *src = (const unsigned long *)netdev_stats;
5744 	u64 *dst = (u64 *)stats64;
5745 
5746 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5747 		     sizeof(*stats64) / sizeof(u64));
5748 	for (i = 0; i < n; i++)
5749 		dst[i] = src[i];
5750 #endif
5751 }
5752 
5753 /**
5754  *	dev_get_stats	- get network device statistics
5755  *	@dev: device to get statistics from
5756  *	@storage: place to store stats
5757  *
5758  *	Get network statistics from device. Return @storage.
5759  *	The device driver may provide its own method by setting
5760  *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5761  *	otherwise the internal statistics structure is used.
5762  */
5763 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5764 					struct rtnl_link_stats64 *storage)
5765 {
5766 	const struct net_device_ops *ops = dev->netdev_ops;
5767 
5768 	if (ops->ndo_get_stats64) {
5769 		memset(storage, 0, sizeof(*storage));
5770 		ops->ndo_get_stats64(dev, storage);
5771 	} else if (ops->ndo_get_stats) {
5772 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5773 	} else {
5774 		netdev_stats_to_stats64(storage, &dev->stats);
5775 	}
5776 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5777 	return storage;
5778 }
5779 EXPORT_SYMBOL(dev_get_stats);
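
/* Illustrative sketch, guarded out of the build: dev_get_stats() fills a
 * caller-provided struct rtnl_link_stats64, so an on-stack buffer is enough.
 * example_log_rx() is a hypothetical name.
 */
#if 0
static void example_log_rx(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	netdev_info(dev, "rx %llu packets, %llu dropped\n",
		    (unsigned long long)stats.rx_packets,
		    (unsigned long long)stats.rx_dropped);
}
#endif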
5780 
5781 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5782 {
5783 	struct netdev_queue *queue = dev_ingress_queue(dev);
5784 
5785 #ifdef CONFIG_NET_CLS_ACT
5786 	if (queue)
5787 		return queue;
5788 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5789 	if (!queue)
5790 		return NULL;
5791 	netdev_init_one_queue(dev, queue, NULL);
5792 	queue->qdisc = &noop_qdisc;
5793 	queue->qdisc_sleeping = &noop_qdisc;
5794 	rcu_assign_pointer(dev->ingress_queue, queue);
5795 #endif
5796 	return queue;
5797 }
5798 
5799 /**
5800  *	alloc_netdev_mqs - allocate network device
5801  *	@sizeof_priv:	size of private data to allocate space for
5802  *	@name:		device name format string
5803  *	@setup:		callback to initialize device
5804  *	@txqs:		the number of TX subqueues to allocate
5805  *	@rxqs:		the number of RX subqueues to allocate
5806  *
5807  *	Allocates a struct net_device with private data area for driver use
5808  *	and performs basic initialization.  Also allocates subqueue structs
5809  *	for each queue on the device.
5810  */
5811 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5812 		void (*setup)(struct net_device *),
5813 		unsigned int txqs, unsigned int rxqs)
5814 {
5815 	struct net_device *dev;
5816 	size_t alloc_size;
5817 	struct net_device *p;
5818 
5819 	BUG_ON(strlen(name) >= sizeof(dev->name));
5820 
5821 	if (txqs < 1) {
5822 		pr_err("alloc_netdev: Unable to allocate device "
5823 		       "with zero queues.\n");
5824 		return NULL;
5825 	}
5826 
5827 #ifdef CONFIG_RPS
5828 	if (rxqs < 1) {
5829 		pr_err("alloc_netdev: Unable to allocate device "
5830 		       "with zero RX queues.\n");
5831 		return NULL;
5832 	}
5833 #endif
5834 
5835 	alloc_size = sizeof(struct net_device);
5836 	if (sizeof_priv) {
5837 		/* ensure 32-byte alignment of private area */
5838 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5839 		alloc_size += sizeof_priv;
5840 	}
5841 	/* ensure 32-byte alignment of whole construct */
5842 	alloc_size += NETDEV_ALIGN - 1;
5843 
5844 	p = kzalloc(alloc_size, GFP_KERNEL);
5845 	if (!p) {
5846 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5847 		return NULL;
5848 	}
5849 
5850 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5851 	dev->padded = (char *)dev - (char *)p;
5852 
5853 	dev->pcpu_refcnt = alloc_percpu(int);
5854 	if (!dev->pcpu_refcnt)
5855 		goto free_p;
5856 
5857 	if (dev_addr_init(dev))
5858 		goto free_pcpu;
5859 
5860 	dev_mc_init(dev);
5861 	dev_uc_init(dev);
5862 
5863 	dev_net_set(dev, &init_net);
5864 
5865 	dev->gso_max_size = GSO_MAX_SIZE;
5866 
5867 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5868 	dev->ethtool_ntuple_list.count = 0;
5869 	INIT_LIST_HEAD(&dev->napi_list);
5870 	INIT_LIST_HEAD(&dev->unreg_list);
5871 	INIT_LIST_HEAD(&dev->link_watch_list);
5872 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5873 	setup(dev);
5874 
5875 	dev->num_tx_queues = txqs;
5876 	dev->real_num_tx_queues = txqs;
5877 	if (netif_alloc_netdev_queues(dev))
5878 		goto free_all;
5879 
5880 #ifdef CONFIG_RPS
5881 	dev->num_rx_queues = rxqs;
5882 	dev->real_num_rx_queues = rxqs;
5883 	if (netif_alloc_rx_queues(dev))
5884 		goto free_all;
5885 #endif
5886 
5887 	strcpy(dev->name, name);
5888 	dev->group = INIT_NETDEV_GROUP;
5889 	return dev;
5890 
5891 free_all:
5892 	free_netdev(dev);
5893 	return NULL;
5894 
5895 free_pcpu:
5896 	free_percpu(dev->pcpu_refcnt);
5897 	kfree(dev->_tx);
5898 #ifdef CONFIG_RPS
5899 	kfree(dev->_rx);
5900 #endif
5901 
5902 free_p:
5903 	kfree(p);
5904 	return NULL;
5905 }
5906 EXPORT_SYMBOL(alloc_netdev_mqs);
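
/* Illustrative sketch, guarded out of the build: a minimal driver allocating
 * a multiqueue Ethernet device and registering it.  The example_* names are
 * hypothetical, and ether_setup() is assumed to come from
 * <linux/etherdevice.h>.
 */
#if 0
struct example_priv {
	int dummy;
};

static void example_setup(struct net_device *dev)
{
	ether_setup(dev);		/* sensible Ethernet defaults */
}

static struct net_device *example_create(void)
{
	struct net_device *dev;

	dev = alloc_netdev_mqs(sizeof(struct example_priv), "example%d",
			       example_setup, 4, 4);
	if (!dev)
		return NULL;

	if (register_netdev(dev)) {	/* takes and releases the RTNL */
		free_netdev(dev);	/* free_netdev() handles this error path */
		return NULL;
	}
	return dev;
}
#endif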
5907 
5908 /**
5909  *	free_netdev - free network device
5910  *	@dev: device
5911  *
5912  *	This function does the last stage of destroying an allocated device
5913  * 	interface. The reference to the device object is released.
5914  *	If this is the last reference then it will be freed.
5915  */
5916 void free_netdev(struct net_device *dev)
5917 {
5918 	struct napi_struct *p, *n;
5919 
5920 	release_net(dev_net(dev));
5921 
5922 	kfree(dev->_tx);
5923 #ifdef CONFIG_RPS
5924 	kfree(dev->_rx);
5925 #endif
5926 
5927 	kfree(rcu_dereference_raw(dev->ingress_queue));
5928 
5929 	/* Flush device addresses */
5930 	dev_addr_flush(dev);
5931 
5932 	/* Clear ethtool n-tuple list */
5933 	ethtool_ntuple_flush(dev);
5934 
5935 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5936 		netif_napi_del(p);
5937 
5938 	free_percpu(dev->pcpu_refcnt);
5939 	dev->pcpu_refcnt = NULL;
5940 
5941 	/*  Compatibility with error handling in drivers */
5942 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5943 		kfree((char *)dev - dev->padded);
5944 		return;
5945 	}
5946 
5947 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5948 	dev->reg_state = NETREG_RELEASED;
5949 
5950 	/* will free via device release */
5951 	put_device(&dev->dev);
5952 }
5953 EXPORT_SYMBOL(free_netdev);
5954 
5955 /**
5956  *	synchronize_net -  Synchronize with packet receive processing
5957  *
5958  *	Wait for packets currently being received to be done.
5959  *	Does not block later packets from starting.
5960  */
5961 void synchronize_net(void)
5962 {
5963 	might_sleep();
5964 	if (rtnl_is_locked())
5965 		synchronize_rcu_expedited();
5966 	else
5967 		synchronize_rcu();
5968 }
5969 EXPORT_SYMBOL(synchronize_net);
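
/* Illustrative sketch, guarded out of the build: a typical use of
 * synchronize_net() when retiring an object that the packet receive path may
 * still be reading under RCU.  struct example_entry and example_retire() are
 * hypothetical, and list_del_rcu() is assumed to come from <linux/rculist.h>.
 */
#if 0
struct example_entry {
	struct list_head list;
	int data;
};

static void example_retire(struct example_entry *e)
{
	list_del_rcu(&e->list);	/* unpublish from an RCU-protected list */
	synchronize_net();	/* wait for readers in the receive path */
	kfree(e);		/* now nothing can still reference it */
}
#endif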
5970 
5971 /**
5972  *	unregister_netdevice_queue - remove device from the kernel
5973  *	@dev: device
5974  *	@head: list
5975  *
5976  *	This function shuts down a device interface and removes it
5977  *	from the kernel tables.
5978  *	If @head is not NULL, the device is queued to be unregistered later.
5979  *
5980  *	Callers must hold the rtnl semaphore.  You may want
5981  *	unregister_netdev() instead of this.
5982  */
5983 
5984 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5985 {
5986 	ASSERT_RTNL();
5987 
5988 	if (head) {
5989 		list_move_tail(&dev->unreg_list, head);
5990 	} else {
5991 		rollback_registered(dev);
5992 		/* Finish processing unregister after unlock */
5993 		net_set_todo(dev);
5994 	}
5995 }
5996 EXPORT_SYMBOL(unregister_netdevice_queue);
5997 
5998 /**
5999  *	unregister_netdevice_many - unregister many devices
6000  *	@head: list of devices
6001  */
6002 void unregister_netdevice_many(struct list_head *head)
6003 {
6004 	struct net_device *dev;
6005 
6006 	if (!list_empty(head)) {
6007 		rollback_registered_many(head);
6008 		list_for_each_entry(dev, head, unreg_list)
6009 			net_set_todo(dev);
6010 	}
6011 }
6012 EXPORT_SYMBOL(unregister_netdevice_many);
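
/* Illustrative sketch, guarded out of the build: batching several
 * unregistrations on one list so they share the expensive teardown work.
 * example_unregister_group() is a hypothetical name; each device must still
 * be released with free_netdev() once the RTNL has been dropped.
 */
#if 0
static void example_unregister_group(struct net_device *devs[], int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();

	for (i = 0; i < n; i++)
		free_netdev(devs[i]);
}
#endif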
6013 
6014 /**
6015  *	unregister_netdev - remove device from the kernel
6016  *	@dev: device
6017  *
6018  *	This function shuts down a device interface and removes it
6019  *	from the kernel tables.
6020  *
6021  *	This is just a wrapper for unregister_netdevice that takes
6022  *	the rtnl semaphore.  In general you want to use this and not
6023  *	unregister_netdevice.
6024  */
6025 void unregister_netdev(struct net_device *dev)
6026 {
6027 	rtnl_lock();
6028 	unregister_netdevice(dev);
6029 	rtnl_unlock();
6030 }
6031 EXPORT_SYMBOL(unregister_netdev);
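
/* Illustrative sketch, guarded out of the build: the usual driver teardown
 * pairing for a device registered with register_netdev().  example_remove()
 * is a hypothetical name.
 */
#if 0
static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* takes and drops the RTNL itself */
	free_netdev(dev);		/* drop the final reference */
}
#endif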
6032 
6033 /**
6034  *	dev_change_net_namespace - move device to a different network namespace
6035  *	@dev: device
6036  *	@net: network namespace
6037  *	@pat: If not NULL name pattern to try if the current device name
6038  *	      is already taken in the destination network namespace.
6039  *
6040  *	This function shuts down a device interface and moves it
6041  *	to a new network namespace. On success 0 is returned, on
6042  *	failure a negative errno code is returned.
6043  *
6044  *	Callers must hold the rtnl semaphore.
6045  */
6046 
6047 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6048 {
6049 	int err;
6050 
6051 	ASSERT_RTNL();
6052 
6053 	/* Don't allow namespace local devices to be moved. */
6054 	err = -EINVAL;
6055 	if (dev->features & NETIF_F_NETNS_LOCAL)
6056 		goto out;
6057 
6058 	/* Ensure the device has been registered */
6059 	err = -EINVAL;
6060 	if (dev->reg_state != NETREG_REGISTERED)
6061 		goto out;
6062 
6063 	/* Get out if there is nothing to do */
6064 	err = 0;
6065 	if (net_eq(dev_net(dev), net))
6066 		goto out;
6067 
6068 	/* Pick the destination device name, and ensure
6069 	 * we can use it in the destination network namespace.
6070 	 */
6071 	err = -EEXIST;
6072 	if (__dev_get_by_name(net, dev->name)) {
6073 		/* We get here if we can't use the current device name */
6074 		if (!pat)
6075 			goto out;
6076 		if (dev_get_valid_name(dev, pat) < 0)
6077 			goto out;
6078 	}
6079 
6080 	/*
6081 	 * And now a mini version of register_netdevice and unregister_netdevice.
6082 	 */
6083 
6084 	/* If device is running close it first. */
6085 	dev_close(dev);
6086 
6087 	/* And unlink it from device chain */
6088 	err = -ENODEV;
6089 	unlist_netdevice(dev);
6090 
6091 	synchronize_net();
6092 
6093 	/* Shutdown queueing discipline. */
6094 	dev_shutdown(dev);
6095 
6096 	/* Notify protocols that we are about to destroy
6097 	   this device; they should clean up all of their state.
6098 
6099 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6100 	   This is wanted because this way 8021q and macvlan know
6101 	   the device is just moving and can keep their slaves up.
6102 	*/
6103 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6104 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6105 
6106 	/*
6107 	 *	Flush the unicast and multicast chains
6108 	 */
6109 	dev_uc_flush(dev);
6110 	dev_mc_flush(dev);
6111 
6112 	/* Actually switch the network namespace */
6113 	dev_net_set(dev, net);
6114 
6115 	/* If there is an ifindex conflict assign a new one */
6116 	if (__dev_get_by_index(net, dev->ifindex)) {
6117 		int iflink = (dev->iflink == dev->ifindex);
6118 		dev->ifindex = dev_new_index(net);
6119 		if (iflink)
6120 			dev->iflink = dev->ifindex;
6121 	}
6122 
6123 	/* Fixup kobjects */
6124 	err = device_rename(&dev->dev, dev->name);
6125 	WARN_ON(err);
6126 
6127 	/* Add the device back in the hashes */
6128 	list_netdevice(dev);
6129 
6130 	/* Notify protocols that a new device appeared. */
6131 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6132 
6133 	/*
6134 	 *	Prevent userspace races by waiting until the network
6135 	 *	device is fully setup before sending notifications.
6136 	 */
6137 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6138 
6139 	synchronize_net();
6140 	err = 0;
6141 out:
6142 	return err;
6143 }
6144 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
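
/* Illustrative sketch, guarded out of the build: moving a device into another
 * namespace under the RTNL, with a fallback name pattern in case the current
 * name is already taken there.  example_move() is a hypothetical name.
 */
#if 0
static int example_move(struct net_device *dev, struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "moved%d");
	rtnl_unlock();
	return err;
}
#endif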
6145 
6146 static int dev_cpu_callback(struct notifier_block *nfb,
6147 			    unsigned long action,
6148 			    void *ocpu)
6149 {
6150 	struct sk_buff **list_skb;
6151 	struct sk_buff *skb;
6152 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6153 	struct softnet_data *sd, *oldsd;
6154 
6155 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6156 		return NOTIFY_OK;
6157 
6158 	local_irq_disable();
6159 	cpu = smp_processor_id();
6160 	sd = &per_cpu(softnet_data, cpu);
6161 	oldsd = &per_cpu(softnet_data, oldcpu);
6162 
6163 	/* Find end of our completion_queue. */
6164 	list_skb = &sd->completion_queue;
6165 	while (*list_skb)
6166 		list_skb = &(*list_skb)->next;
6167 	/* Append completion queue from offline CPU. */
6168 	*list_skb = oldsd->completion_queue;
6169 	oldsd->completion_queue = NULL;
6170 
6171 	/* Append output queue from offline CPU. */
6172 	if (oldsd->output_queue) {
6173 		*sd->output_queue_tailp = oldsd->output_queue;
6174 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6175 		oldsd->output_queue = NULL;
6176 		oldsd->output_queue_tailp = &oldsd->output_queue;
6177 	}
6178 
6179 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6180 	local_irq_enable();
6181 
6182 	/* Process offline CPU's input_pkt_queue */
6183 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6184 		netif_rx(skb);
6185 		input_queue_head_incr(oldsd);
6186 	}
6187 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6188 		netif_rx(skb);
6189 		input_queue_head_incr(oldsd);
6190 	}
6191 
6192 	return NOTIFY_OK;
6193 }
6194 
6195 
6196 /**
6197  *	netdev_increment_features - increment feature set by one
6198  *	@all: current feature set
6199  *	@one: new feature set
6200  *	@mask: mask feature set
6201  *
6202  *	Computes a new feature set after adding a device with feature set
6203  *	@one to the master device with current feature set @all.  Will not
6204  *	enable anything that is off in @mask. Returns the new feature set.
6205  */
6206 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6207 {
6208 	if (mask & NETIF_F_GEN_CSUM)
6209 		mask |= NETIF_F_ALL_CSUM;
6210 	mask |= NETIF_F_VLAN_CHALLENGED;
6211 
6212 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6213 	all &= one | ~NETIF_F_ALL_FOR_ALL;
6214 
6215 	/* If device needs checksumming, downgrade to it. */
6216 	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6217 		all &= ~NETIF_F_NO_CSUM;
6218 
6219 	/* If one device supports hw checksumming, set for all. */
6220 	if (all & NETIF_F_GEN_CSUM)
6221 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6222 
6223 	return all;
6224 }
6225 EXPORT_SYMBOL(netdev_increment_features);
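
/* Illustrative sketch, guarded out of the build, modelled on how a
 * master/slave driver might recompute its feature set: use the master's own
 * features as the mask and fold in each slave with
 * netdev_increment_features().  example_recompute() and the slaves[] array
 * are hypothetical.
 */
#if 0
static u32 example_recompute(u32 master_features,
			     struct net_device *slaves[], int n)
{
	u32 mask = master_features;	/* never enable what the master lacks */
	u32 features = master_features & ~NETIF_F_ONE_FOR_ALL;
	int i;

	for (i = 0; i < n; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     mask);
	return features;
}
#endif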
6226 
6227 static struct hlist_head *netdev_create_hash(void)
6228 {
6229 	int i;
6230 	struct hlist_head *hash;
6231 
6232 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6233 	if (hash != NULL)
6234 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6235 			INIT_HLIST_HEAD(&hash[i]);
6236 
6237 	return hash;
6238 }
6239 
6240 /* Initialize per network namespace state */
6241 static int __net_init netdev_init(struct net *net)
6242 {
6243 	INIT_LIST_HEAD(&net->dev_base_head);
6244 
6245 	net->dev_name_head = netdev_create_hash();
6246 	if (net->dev_name_head == NULL)
6247 		goto err_name;
6248 
6249 	net->dev_index_head = netdev_create_hash();
6250 	if (net->dev_index_head == NULL)
6251 		goto err_idx;
6252 
6253 	return 0;
6254 
6255 err_idx:
6256 	kfree(net->dev_name_head);
6257 err_name:
6258 	return -ENOMEM;
6259 }
6260 
6261 /**
6262  *	netdev_drivername - network driver for the device
6263  *	@dev: network device
6264  *	@buffer: buffer for resulting name
6265  *	@len: size of buffer
6266  *
6267  *	Determine network driver for device.
6268  */
6269 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6270 {
6271 	const struct device_driver *driver;
6272 	const struct device *parent;
6273 
6274 	if (len <= 0 || !buffer)
6275 		return buffer;
6276 	buffer[0] = 0;
6277 
6278 	parent = dev->dev.parent;
6279 
6280 	if (!parent)
6281 		return buffer;
6282 
6283 	driver = parent->driver;
6284 	if (driver && driver->name)
6285 		strlcpy(buffer, driver->name, len);
6286 	return buffer;
6287 }
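
/* Illustrative sketch, guarded out of the build: looking up the driver name
 * for diagnostics.  example_report() is a hypothetical name.
 */
#if 0
static void example_report(struct net_device *dev)
{
	char drv[64];

	netdev_drivername(dev, drv, sizeof(drv));
	netdev_warn(dev, "driver %s is misbehaving\n", drv);
}
#endif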
6288 
6289 static int __netdev_printk(const char *level, const struct net_device *dev,
6290 			   struct va_format *vaf)
6291 {
6292 	int r;
6293 
6294 	if (dev && dev->dev.parent)
6295 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6296 			       netdev_name(dev), vaf);
6297 	else if (dev)
6298 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6299 	else
6300 		r = printk("%s(NULL net_device): %pV", level, vaf);
6301 
6302 	return r;
6303 }
6304 
6305 int netdev_printk(const char *level, const struct net_device *dev,
6306 		  const char *format, ...)
6307 {
6308 	struct va_format vaf;
6309 	va_list args;
6310 	int r;
6311 
6312 	va_start(args, format);
6313 
6314 	vaf.fmt = format;
6315 	vaf.va = &args;
6316 
6317 	r = __netdev_printk(level, dev, &vaf);
6318 	va_end(args);
6319 
6320 	return r;
6321 }
6322 EXPORT_SYMBOL(netdev_printk);
6323 
6324 #define define_netdev_printk_level(func, level)			\
6325 int func(const struct net_device *dev, const char *fmt, ...)	\
6326 {								\
6327 	int r;							\
6328 	struct va_format vaf;					\
6329 	va_list args;						\
6330 								\
6331 	va_start(args, fmt);					\
6332 								\
6333 	vaf.fmt = fmt;						\
6334 	vaf.va = &args;						\
6335 								\
6336 	r = __netdev_printk(level, dev, &vaf);			\
6337 	va_end(args);						\
6338 								\
6339 	return r;						\
6340 }								\
6341 EXPORT_SYMBOL(func);
6342 
6343 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6344 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6345 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6346 define_netdev_printk_level(netdev_err, KERN_ERR);
6347 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6348 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6349 define_netdev_printk_level(netdev_info, KERN_INFO);
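
/* Illustrative sketch, guarded out of the build: the per-level helpers
 * generated above prefix messages with the parent device (when present) and
 * the interface name.  example_log_status() is a hypothetical name.
 */
#if 0
static void example_log_status(struct net_device *dev, int err)
{
	if (err)
		netdev_err(dev, "configuration failed: %d\n", err);
	else
		netdev_info(dev, "link is up\n");
}
#endif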
6350 
6351 static void __net_exit netdev_exit(struct net *net)
6352 {
6353 	kfree(net->dev_name_head);
6354 	kfree(net->dev_index_head);
6355 }
6356 
6357 static struct pernet_operations __net_initdata netdev_net_ops = {
6358 	.init = netdev_init,
6359 	.exit = netdev_exit,
6360 };
6361 
6362 static void __net_exit default_device_exit(struct net *net)
6363 {
6364 	struct net_device *dev, *aux;
6365 	/*
6366 	 * Push all migratable network devices back to the
6367 	 * initial network namespace
6368 	 */
6369 	rtnl_lock();
6370 	for_each_netdev_safe(net, dev, aux) {
6371 		int err;
6372 		char fb_name[IFNAMSIZ];
6373 
6374 		/* Ignore unmovable devices (e.g. loopback) */
6375 		if (dev->features & NETIF_F_NETNS_LOCAL)
6376 			continue;
6377 
6378 		/* Leave virtual devices for the generic cleanup */
6379 		if (dev->rtnl_link_ops)
6380 			continue;
6381 
6382 		/* Push remaining network devices to init_net */
6383 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6384 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6385 		if (err) {
6386 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6387 				__func__, dev->name, err);
6388 			BUG();
6389 		}
6390 	}
6391 	rtnl_unlock();
6392 }
6393 
6394 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6395 {
6396 	/* At exit all network devices must be removed from a network
6397 	 * namespace.  Do this in the reverse order of registration.
6398 	 * Do this across as many network namespaces as possible to
6399 	 * improve batching efficiency.
6400 	 */
6401 	struct net_device *dev;
6402 	struct net *net;
6403 	LIST_HEAD(dev_kill_list);
6404 
6405 	rtnl_lock();
6406 	list_for_each_entry(net, net_list, exit_list) {
6407 		for_each_netdev_reverse(net, dev) {
6408 			if (dev->rtnl_link_ops)
6409 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6410 			else
6411 				unregister_netdevice_queue(dev, &dev_kill_list);
6412 		}
6413 	}
6414 	unregister_netdevice_many(&dev_kill_list);
6415 	list_del(&dev_kill_list);
6416 	rtnl_unlock();
6417 }
6418 
6419 static struct pernet_operations __net_initdata default_device_ops = {
6420 	.exit = default_device_exit,
6421 	.exit_batch = default_device_exit_batch,
6422 };
6423 
6424 /*
6425  *	Initialize the DEV module. At boot time this walks the device list and
6426  *	unhooks any devices that fail to initialise (normally hardware not
6427  *	present) and leaves us with a valid list of present and active devices.
6428  *
6429  */
6430 
6431 /*
6432  *       This is called single threaded during boot, so no need
6433  *       to take the rtnl semaphore.
6434  */
6435 static int __init net_dev_init(void)
6436 {
6437 	int i, rc = -ENOMEM;
6438 
6439 	BUG_ON(!dev_boot_phase);
6440 
6441 	if (dev_proc_init())
6442 		goto out;
6443 
6444 	if (netdev_kobject_init())
6445 		goto out;
6446 
6447 	INIT_LIST_HEAD(&ptype_all);
6448 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6449 		INIT_LIST_HEAD(&ptype_base[i]);
6450 
6451 	if (register_pernet_subsys(&netdev_net_ops))
6452 		goto out;
6453 
6454 	/*
6455 	 *	Initialise the packet receive queues.
6456 	 */
6457 
6458 	for_each_possible_cpu(i) {
6459 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6460 
6461 		memset(sd, 0, sizeof(*sd));
6462 		skb_queue_head_init(&sd->input_pkt_queue);
6463 		skb_queue_head_init(&sd->process_queue);
6464 		sd->completion_queue = NULL;
6465 		INIT_LIST_HEAD(&sd->poll_list);
6466 		sd->output_queue = NULL;
6467 		sd->output_queue_tailp = &sd->output_queue;
6468 #ifdef CONFIG_RPS
6469 		sd->csd.func = rps_trigger_softirq;
6470 		sd->csd.info = sd;
6471 		sd->csd.flags = 0;
6472 		sd->cpu = i;
6473 #endif
6474 
6475 		sd->backlog.poll = process_backlog;
6476 		sd->backlog.weight = weight_p;
6477 		sd->backlog.gro_list = NULL;
6478 		sd->backlog.gro_count = 0;
6479 	}
6480 
6481 	dev_boot_phase = 0;
6482 
6483 	/* The loopback device is special: if any other network device
6484 	 * is present in a network namespace, the loopback device must
6485 	 * be present too.  Since we now dynamically allocate and free
6486 	 * the loopback device, ensure this invariant is maintained by
6487 	 * keeping the loopback device as the first device on the
6488 	 * list of network devices, so that it is the first device
6489 	 * that appears and the last network device that
6490 	 * disappears.
6491 	 */
6492 	if (register_pernet_device(&loopback_net_ops))
6493 		goto out;
6494 
6495 	if (register_pernet_device(&default_device_ops))
6496 		goto out;
6497 
6498 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6499 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6500 
6501 	hotcpu_notifier(dev_cpu_callback, 0);
6502 	dst_init();
6503 	dev_mcast_init();
6504 	rc = 0;
6505 out:
6506 	return rc;
6507 }
6508 
6509 subsys_initcall(net_dev_init);
6510 
6511 static int __init initialize_hashrnd(void)
6512 {
6513 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6514 	return 0;
6515 }
6516 
6517 late_initcall_sync(initialize_hashrnd);
6518 
6519