xref: /linux/net/core/dev.c (revision 363737d66427c18edb321a06933ac999d9ce5d7f)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/net_tstamp.h>
136 #include <linux/static_key.h>
137 #include <net/flow_keys.h>
138 
139 #include "net-sysfs.h"
140 
141 /* Instead of increasing this, you should create a hash table. */
142 #define MAX_GRO_SKBS 8
143 
144 /* This should be increased if a protocol with a bigger head is added. */
145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146 
147 /*
148  *	The list of packet types we will receive (as opposed to discard)
149  *	and the routines to invoke.
150  *
151  *	Why 16. Because with 16 the only overlap we get on a hash of the
152  *	low nibble of the protocol value is RARP/SNAP/X.25.
153  *
154  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
155  *             sure which should go first, but I bet it won't make much
156  *             difference if we are running VLANs.  The good news is that
157  *             this protocol won't be in the list unless compiled in, so
158  *             the average user (w/out VLANs) will not be adversely affected.
159  *             --BLG
160  *
161  *		0800	IP
162  *		8100    802.1Q VLAN
163  *		0001	802.3
164  *		0002	AX.25
165  *		0004	802.2
166  *		8035	RARP
167  *		0005	SNAP
168  *		0805	X.25
169  *		0806	ARP
170  *		8137	IPX
171  *		0009	Localtalk
172  *		86DD	IPv6
173  */
174 
175 #define PTYPE_HASH_SIZE	(16)
176 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
177 
178 static DEFINE_SPINLOCK(ptype_lock);
179 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
180 static struct list_head ptype_all __read_mostly;	/* Taps */
181 
182 /*
183  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
184  * semaphore.
185  *
186  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
187  *
188  * Writers must hold the rtnl semaphore while they loop through the
189  * dev_base_head list, and hold dev_base_lock for writing when they do the
190  * actual updates.  This allows pure readers to access the list even
191  * while a writer is preparing to update it.
192  *
193  * To put it another way, dev_base_lock is held for writing only to
194  * protect against pure readers; the rtnl semaphore provides the
195  * protection against other writers.
196  *
197  * See, for example usages, register_netdevice() and
198  * unregister_netdevice(), which must be called with the rtnl
199  * semaphore held.
200  */
201 DEFINE_RWLOCK(dev_base_lock);
202 EXPORT_SYMBOL(dev_base_lock);
203 
204 static inline void dev_base_seq_inc(struct net *net)
205 {
206 	while (++net->dev_base_seq == 0);
207 }
208 
209 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
210 {
211 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
212 
213 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
214 }
215 
216 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
217 {
218 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
219 }
220 
221 static inline void rps_lock(struct softnet_data *sd)
222 {
223 #ifdef CONFIG_RPS
224 	spin_lock(&sd->input_pkt_queue.lock);
225 #endif
226 }
227 
228 static inline void rps_unlock(struct softnet_data *sd)
229 {
230 #ifdef CONFIG_RPS
231 	spin_unlock(&sd->input_pkt_queue.lock);
232 #endif
233 }
234 
235 /* Device list insertion */
236 static int list_netdevice(struct net_device *dev)
237 {
238 	struct net *net = dev_net(dev);
239 
240 	ASSERT_RTNL();
241 
242 	write_lock_bh(&dev_base_lock);
243 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
244 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
245 	hlist_add_head_rcu(&dev->index_hlist,
246 			   dev_index_hash(net, dev->ifindex));
247 	write_unlock_bh(&dev_base_lock);
248 
249 	dev_base_seq_inc(net);
250 
251 	return 0;
252 }
253 
254 /* Device list removal
255  * caller must respect a RCU grace period before freeing/reusing dev
256  */
257 static void unlist_netdevice(struct net_device *dev)
258 {
259 	ASSERT_RTNL();
260 
261 	/* Unlink dev from the device chain */
262 	write_lock_bh(&dev_base_lock);
263 	list_del_rcu(&dev->dev_list);
264 	hlist_del_rcu(&dev->name_hlist);
265 	hlist_del_rcu(&dev->index_hlist);
266 	write_unlock_bh(&dev_base_lock);
267 
268 	dev_base_seq_inc(dev_net(dev));
269 }
270 
271 /*
272  *	Our notifier list
273  */
274 
275 static RAW_NOTIFIER_HEAD(netdev_chain);
276 
277 /*
278  *	Device drivers call our routines to queue packets here. We empty the
279  *	queue in the local softnet handler.
280  */
281 
282 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
283 EXPORT_PER_CPU_SYMBOL(softnet_data);
284 
285 #ifdef CONFIG_LOCKDEP
286 /*
287  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
288  * according to dev->type
289  */
290 static const unsigned short netdev_lock_type[] =
291 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
292 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
293 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
294 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
295 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
296 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
297 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
298 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
299 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
300 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
301 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
302 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
303 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
304 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
305 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
306 
307 static const char *const netdev_lock_name[] =
308 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
309 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
310 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
311 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
312 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
313 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
314 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
315 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
316 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
317 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
318 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
319 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
320 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
321 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
322 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
323 
324 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
325 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
326 
327 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
328 {
329 	int i;
330 
331 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
332 		if (netdev_lock_type[i] == dev_type)
333 			return i;
334 	/* the last key is used by default */
335 	return ARRAY_SIZE(netdev_lock_type) - 1;
336 }
337 
338 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
339 						 unsigned short dev_type)
340 {
341 	int i;
342 
343 	i = netdev_lock_pos(dev_type);
344 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
345 				   netdev_lock_name[i]);
346 }
347 
348 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
349 {
350 	int i;
351 
352 	i = netdev_lock_pos(dev->type);
353 	lockdep_set_class_and_name(&dev->addr_list_lock,
354 				   &netdev_addr_lock_key[i],
355 				   netdev_lock_name[i]);
356 }
357 #else
358 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
359 						 unsigned short dev_type)
360 {
361 }
362 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
363 {
364 }
365 #endif
366 
367 /*******************************************************************************
368 
369 		Protocol management and registration routines
370 
371 *******************************************************************************/
372 
373 /*
374  *	Add a protocol ID to the list. Now that the input handler is
375  *	smarter we can dispense with all the messy stuff that used to be
376  *	here.
377  *
378  *	BEWARE!!! Protocol handlers, mangling input packets,
379  *	MUST BE last in hash buckets and checking protocol handlers
380  *	MUST start from promiscuous ptype_all chain in net_bh.
381  *	It is true now, do not change it.
382  *	Explanation follows: if protocol handler, mangling packet, will
383  *	be the first on list, it is not able to sense, that packet
384  *	is cloned and should be copied-on-write, so that it will
385  *	change it and subsequent readers will get broken packet.
386  *							--ANK (980803)
387  */
388 
389 static inline struct list_head *ptype_head(const struct packet_type *pt)
390 {
391 	if (pt->type == htons(ETH_P_ALL))
392 		return &ptype_all;
393 	else
394 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
395 }
396 
397 /**
398  *	dev_add_pack - add packet handler
399  *	@pt: packet type declaration
400  *
401  *	Add a protocol handler to the networking stack. The passed &packet_type
402  *	is linked into kernel lists and may not be freed until it has been
403  *	removed from the kernel lists.
404  *
405  *	This call does not sleep and therefore cannot guarantee
406  *	that all CPUs currently in the middle of receiving packets
407  *	will see the new packet type (until the next received packet).
408  */
409 
410 void dev_add_pack(struct packet_type *pt)
411 {
412 	struct list_head *head = ptype_head(pt);
413 
414 	spin_lock(&ptype_lock);
415 	list_add_rcu(&pt->list, head);
416 	spin_unlock(&ptype_lock);
417 }
418 EXPORT_SYMBOL(dev_add_pack);
419 
420 /**
421  *	__dev_remove_pack	 - remove packet handler
422  *	@pt: packet type declaration
423  *
424  *	Remove a protocol handler that was previously added to the kernel
425  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
426  *	from the kernel lists and can be freed or reused once this function
427  *	returns.
428  *
429  *      The packet type might still be in use by receivers
430  *	and must not be freed until after all the CPUs have gone
431  *	through a quiescent state.
432  */
433 void __dev_remove_pack(struct packet_type *pt)
434 {
435 	struct list_head *head = ptype_head(pt);
436 	struct packet_type *pt1;
437 
438 	spin_lock(&ptype_lock);
439 
440 	list_for_each_entry(pt1, head, list) {
441 		if (pt == pt1) {
442 			list_del_rcu(&pt->list);
443 			goto out;
444 		}
445 	}
446 
447 	pr_warn("dev_remove_pack: %p not found\n", pt);
448 out:
449 	spin_unlock(&ptype_lock);
450 }
451 EXPORT_SYMBOL(__dev_remove_pack);
452 
453 /**
454  *	dev_remove_pack	 - remove packet handler
455  *	@pt: packet type declaration
456  *
457  *	Remove a protocol handler that was previously added to the kernel
458  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
459  *	from the kernel lists and can be freed or reused once this function
460  *	returns.
461  *
462  *	This call sleeps to guarantee that no CPU is looking at the packet
463  *	type after return.
464  */
465 void dev_remove_pack(struct packet_type *pt)
466 {
467 	__dev_remove_pack(pt);
468 
469 	synchronize_net();
470 }
471 EXPORT_SYMBOL(dev_remove_pack);
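
/*
 * Illustrative sketch (not part of this file): a minimal ETH_P_ALL tap
 * built on dev_add_pack()/dev_remove_pack().  The handler and variable
 * names below are hypothetical.
 *
 *	static int sample_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *				  struct packet_type *pt,
 *				  struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type sample_tap __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_ALL),
 *		.func	= sample_tap_rcv,
 *	};
 *
 *	dev_add_pack(&sample_tap);	(module init)
 *	dev_remove_pack(&sample_tap);	(module exit)
 *
 * Each tap gets its own reference to the skb via deliver_skb(), so the
 * handler must free or otherwise consume it.
 */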
472 
473 /******************************************************************************
474 
475 		      Device Boot-time Settings Routines
476 
477 *******************************************************************************/
478 
479 /* Boot time configuration table */
480 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
481 
482 /**
483  *	netdev_boot_setup_add	- add new setup entry
484  *	@name: name of the device
485  *	@map: configured settings for the device
486  *
487  *	Adds a new setup entry to the dev_boot_setup list.  The function
488  *	returns 0 on error and 1 on success.  This is a generic routine
489  *	for all netdevices.
490  */
491 static int netdev_boot_setup_add(char *name, struct ifmap *map)
492 {
493 	struct netdev_boot_setup *s;
494 	int i;
495 
496 	s = dev_boot_setup;
497 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
498 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
499 			memset(s[i].name, 0, sizeof(s[i].name));
500 			strlcpy(s[i].name, name, IFNAMSIZ);
501 			memcpy(&s[i].map, map, sizeof(s[i].map));
502 			break;
503 		}
504 	}
505 
506 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
507 }
508 
509 /**
510  *	netdev_boot_setup_check	- check boot time settings
511  *	@dev: the netdevice
512  *
513  *	Any settings found are applied to the device so that they can
514  *	be used later during device probing.
515  *	Returns 0 if no settings are found, 1 if they are.
516  *	Returns 0 if no settings found, 1 if they are.
517  */
518 int netdev_boot_setup_check(struct net_device *dev)
519 {
520 	struct netdev_boot_setup *s = dev_boot_setup;
521 	int i;
522 
523 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
524 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
525 		    !strcmp(dev->name, s[i].name)) {
526 			dev->irq 	= s[i].map.irq;
527 			dev->base_addr 	= s[i].map.base_addr;
528 			dev->mem_start 	= s[i].map.mem_start;
529 			dev->mem_end 	= s[i].map.mem_end;
530 			return 1;
531 		}
532 	}
533 	return 0;
534 }
535 EXPORT_SYMBOL(netdev_boot_setup_check);
536 
537 
538 /**
539  *	netdev_boot_base	- get address from boot time settings
540  *	@prefix: prefix for network device
541  *	@unit: id for network device
542  *
543  * 	Check the boot time settings for the base address of the device.
544  *	Returns 1 if the device is already registered (to indicate that
545  *	it should not be probed), the configured base address if one is
546  *	found, and 0 if no settings are found.
547  */
548 unsigned long netdev_boot_base(const char *prefix, int unit)
549 {
550 	const struct netdev_boot_setup *s = dev_boot_setup;
551 	char name[IFNAMSIZ];
552 	int i;
553 
554 	sprintf(name, "%s%d", prefix, unit);
555 
556 	/*
557 	 * If device already registered then return base of 1
558 	 * to indicate not to probe for this interface
559 	 */
560 	if (__dev_get_by_name(&init_net, name))
561 		return 1;
562 
563 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
564 		if (!strcmp(name, s[i].name))
565 			return s[i].map.base_addr;
566 	return 0;
567 }
568 
569 /*
570  * Saves at boot time configured settings for any netdevice.
571  */
572 int __init netdev_boot_setup(char *str)
573 {
574 	int ints[5];
575 	struct ifmap map;
576 
577 	str = get_options(str, ARRAY_SIZE(ints), ints);
578 	if (!str || !*str)
579 		return 0;
580 
581 	/* Save settings */
582 	memset(&map, 0, sizeof(map));
583 	if (ints[0] > 0)
584 		map.irq = ints[1];
585 	if (ints[0] > 1)
586 		map.base_addr = ints[2];
587 	if (ints[0] > 2)
588 		map.mem_start = ints[3];
589 	if (ints[0] > 3)
590 		map.mem_end = ints[4];
591 
592 	/* Add new entry to the list */
593 	return netdev_boot_setup_add(str, &map);
594 }
595 
596 __setup("netdev=", netdev_boot_setup);
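
/*
 * Example (documentation only, device name hypothetical): a command line of
 *
 *	netdev=5,0x340,0,0,eth0
 *
 * is parsed by get_options() above into irq=5, base_addr=0x340 and zero
 * mem_start/mem_end, and stored under the name "eth0" for later use by
 * netdev_boot_setup_check() and netdev_boot_base().
 */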
597 
598 /*******************************************************************************
599 
600 			    Device Interface Subroutines
601 
602 *******************************************************************************/
603 
604 /**
605  *	__dev_get_by_name	- find a device by its name
606  *	@net: the applicable net namespace
607  *	@name: name to find
608  *
609  *	Find an interface by name. Must be called under RTNL semaphore
610  *	or @dev_base_lock. If the name is found a pointer to the device
611  *	is returned. If the name is not found then %NULL is returned. The
612  *	reference counters are not incremented so the caller must be
613  *	careful with locks.
614  */
615 
616 struct net_device *__dev_get_by_name(struct net *net, const char *name)
617 {
618 	struct hlist_node *p;
619 	struct net_device *dev;
620 	struct hlist_head *head = dev_name_hash(net, name);
621 
622 	hlist_for_each_entry(dev, p, head, name_hlist)
623 		if (!strncmp(dev->name, name, IFNAMSIZ))
624 			return dev;
625 
626 	return NULL;
627 }
628 EXPORT_SYMBOL(__dev_get_by_name);
629 
630 /**
631  *	dev_get_by_name_rcu	- find a device by its name
632  *	@net: the applicable net namespace
633  *	@name: name to find
634  *
635  *	Find an interface by name.
636  *	If the name is found a pointer to the device is returned.
637  * 	If the name is not found then %NULL is returned.
638  *	The reference counters are not incremented so the caller must be
639  *	careful with locks. The caller must hold RCU lock.
640  */
641 
642 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
643 {
644 	struct hlist_node *p;
645 	struct net_device *dev;
646 	struct hlist_head *head = dev_name_hash(net, name);
647 
648 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
649 		if (!strncmp(dev->name, name, IFNAMSIZ))
650 			return dev;
651 
652 	return NULL;
653 }
654 EXPORT_SYMBOL(dev_get_by_name_rcu);
655 
656 /**
657  *	dev_get_by_name		- find a device by its name
658  *	@net: the applicable net namespace
659  *	@name: name to find
660  *
661  *	Find an interface by name. This can be called from any
662  *	context and does its own locking. The returned handle has
663  *	the usage count incremented and the caller must use dev_put() to
664  *	release it when it is no longer needed. %NULL is returned if no
665  *	matching device is found.
666  */
667 
668 struct net_device *dev_get_by_name(struct net *net, const char *name)
669 {
670 	struct net_device *dev;
671 
672 	rcu_read_lock();
673 	dev = dev_get_by_name_rcu(net, name);
674 	if (dev)
675 		dev_hold(dev);
676 	rcu_read_unlock();
677 	return dev;
678 }
679 EXPORT_SYMBOL(dev_get_by_name);
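
/*
 * Illustrative sketch (hypothetical caller): the refcounted lookup pattern.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (!dev)
 *		return -ENODEV;
 *	...
 *	dev_put(dev);
 *
 * Inside rcu_read_lock() the _rcu variant above avoids the hold/put pair,
 * but the pointer is then only valid within the read-side critical section.
 */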
680 
681 /**
682  *	__dev_get_by_index - find a device by its ifindex
683  *	@net: the applicable net namespace
684  *	@ifindex: index of device
685  *
686  *	Search for an interface by index. Returns %NULL if the device
687  *	is not found or a pointer to the device. The device has not
688  *	had its reference counter increased so the caller must be careful
689  *	about locking. The caller must hold either the RTNL semaphore
690  *	or @dev_base_lock.
691  */
692 
693 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
694 {
695 	struct hlist_node *p;
696 	struct net_device *dev;
697 	struct hlist_head *head = dev_index_hash(net, ifindex);
698 
699 	hlist_for_each_entry(dev, p, head, index_hlist)
700 		if (dev->ifindex == ifindex)
701 			return dev;
702 
703 	return NULL;
704 }
705 EXPORT_SYMBOL(__dev_get_by_index);
706 
707 /**
708  *	dev_get_by_index_rcu - find a device by its ifindex
709  *	@net: the applicable net namespace
710  *	@ifindex: index of device
711  *
712  *	Search for an interface by index. Returns %NULL if the device
713  *	is not found or a pointer to the device. The device has not
714  *	had its reference counter increased so the caller must be careful
715  *	about locking. The caller must hold RCU lock.
716  */
717 
718 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
719 {
720 	struct hlist_node *p;
721 	struct net_device *dev;
722 	struct hlist_head *head = dev_index_hash(net, ifindex);
723 
724 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
725 		if (dev->ifindex == ifindex)
726 			return dev;
727 
728 	return NULL;
729 }
730 EXPORT_SYMBOL(dev_get_by_index_rcu);
731 
732 
733 /**
734  *	dev_get_by_index - find a device by its ifindex
735  *	@net: the applicable net namespace
736  *	@ifindex: index of device
737  *
738  *	Search for an interface by index. Returns NULL if the device
739  *	is not found or a pointer to the device. The device returned has
740  *	had a reference added and the pointer is safe until the user calls
741  *	dev_put to indicate they have finished with it.
742  */
743 
744 struct net_device *dev_get_by_index(struct net *net, int ifindex)
745 {
746 	struct net_device *dev;
747 
748 	rcu_read_lock();
749 	dev = dev_get_by_index_rcu(net, ifindex);
750 	if (dev)
751 		dev_hold(dev);
752 	rcu_read_unlock();
753 	return dev;
754 }
755 EXPORT_SYMBOL(dev_get_by_index);
756 
757 /**
758  *	dev_getbyhwaddr_rcu - find a device by its hardware address
759  *	@net: the applicable net namespace
760  *	@type: media type of device
761  *	@ha: hardware address
762  *
763  *	Search for an interface by MAC address. Returns NULL if the device
764  *	is not found or a pointer to the device.
765  *	The caller must hold RCU or RTNL.
766  *	The returned device has not had its ref count increased
767  *	and the caller must therefore be careful about locking
768  *
769  */
770 
771 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
772 				       const char *ha)
773 {
774 	struct net_device *dev;
775 
776 	for_each_netdev_rcu(net, dev)
777 		if (dev->type == type &&
778 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
779 			return dev;
780 
781 	return NULL;
782 }
783 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
784 
785 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
786 {
787 	struct net_device *dev;
788 
789 	ASSERT_RTNL();
790 	for_each_netdev(net, dev)
791 		if (dev->type == type)
792 			return dev;
793 
794 	return NULL;
795 }
796 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
797 
798 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
799 {
800 	struct net_device *dev, *ret = NULL;
801 
802 	rcu_read_lock();
803 	for_each_netdev_rcu(net, dev)
804 		if (dev->type == type) {
805 			dev_hold(dev);
806 			ret = dev;
807 			break;
808 		}
809 	rcu_read_unlock();
810 	return ret;
811 }
812 EXPORT_SYMBOL(dev_getfirstbyhwtype);
813 
814 /**
815  *	dev_get_by_flags_rcu - find any device with given flags
816  *	@net: the applicable net namespace
817  *	@if_flags: IFF_* values
818  *	@mask: bitmask of bits in if_flags to check
819  *
820  *	Search for any interface with the given flags. Returns NULL if a device
821  *	is not found or a pointer to the device. Must be called inside
822  *	rcu_read_lock(), and result refcount is unchanged.
823  */
824 
825 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
826 				    unsigned short mask)
827 {
828 	struct net_device *dev, *ret;
829 
830 	ret = NULL;
831 	for_each_netdev_rcu(net, dev) {
832 		if (((dev->flags ^ if_flags) & mask) == 0) {
833 			ret = dev;
834 			break;
835 		}
836 	}
837 	return ret;
838 }
839 EXPORT_SYMBOL(dev_get_by_flags_rcu);
840 
841 /**
842  *	dev_valid_name - check if name is okay for network device
843  *	@name: name string
844  *
845  *	Network device names need to be valid file names
846  *	to allow sysfs to work.  We also disallow any kind of
847  *	whitespace.
848  */
849 bool dev_valid_name(const char *name)
850 {
851 	if (*name == '\0')
852 		return false;
853 	if (strlen(name) >= IFNAMSIZ)
854 		return false;
855 	if (!strcmp(name, ".") || !strcmp(name, ".."))
856 		return false;
857 
858 	while (*name) {
859 		if (*name == '/' || isspace(*name))
860 			return false;
861 		name++;
862 	}
863 	return true;
864 }
865 EXPORT_SYMBOL(dev_valid_name);
866 
867 /**
868  *	__dev_alloc_name - allocate a name for a device
869  *	@net: network namespace to allocate the device name in
870  *	@name: name format string
871  *	@buf:  scratch buffer and result name string
872  *
873  *	Passed a format string - eg "lt%d" - it will try to find a suitable
874  *	id. It scans the list of devices to build up a free map, then chooses
875  *	the first empty slot. The caller must hold the dev_base or rtnl lock
876  *	while allocating the name and adding the device in order to avoid
877  *	duplicates.
878  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
879  *	Returns the number of the unit assigned or a negative errno code.
880  */
881 
882 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
883 {
884 	int i = 0;
885 	const char *p;
886 	const int max_netdevices = 8*PAGE_SIZE;
887 	unsigned long *inuse;
888 	struct net_device *d;
889 
890 	p = strnchr(name, IFNAMSIZ-1, '%');
891 	if (p) {
892 		/*
893 		 * Verify the string as this thing may have come from
894 		 * the user.  There must be either one "%d" and no other "%"
895 		 * characters.
896 		 */
897 		if (p[1] != 'd' || strchr(p + 2, '%'))
898 			return -EINVAL;
899 
900 		/* Use one page as a bit array of possible slots */
901 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
902 		if (!inuse)
903 			return -ENOMEM;
904 
905 		for_each_netdev(net, d) {
906 			if (!sscanf(d->name, name, &i))
907 				continue;
908 			if (i < 0 || i >= max_netdevices)
909 				continue;
910 
911 			/*  avoid cases where sscanf is not exact inverse of printf */
912 			snprintf(buf, IFNAMSIZ, name, i);
913 			if (!strncmp(buf, d->name, IFNAMSIZ))
914 				set_bit(i, inuse);
915 		}
916 
917 		i = find_first_zero_bit(inuse, max_netdevices);
918 		free_page((unsigned long) inuse);
919 	}
920 
921 	if (buf != name)
922 		snprintf(buf, IFNAMSIZ, name, i);
923 	if (!__dev_get_by_name(net, buf))
924 		return i;
925 
926 	/* It is possible to run out of possible slots
927 	 * when the name is long and there isn't enough space left
928 	 * for the digits, or if all bits are used.
929 	 */
930 	return -ENFILE;
931 }
932 
933 /**
934  *	dev_alloc_name - allocate a name for a device
935  *	@dev: device
936  *	@name: name format string
937  *
938  *	Passed a format string - eg "lt%d" - it will try to find a suitable
939  *	id. It scans the list of devices to build up a free map, then chooses
940  *	the first empty slot. The caller must hold the dev_base or rtnl lock
941  *	while allocating the name and adding the device in order to avoid
942  *	duplicates.
943  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
944  *	Returns the number of the unit assigned or a negative errno code.
945  */
946 
947 int dev_alloc_name(struct net_device *dev, const char *name)
948 {
949 	char buf[IFNAMSIZ];
950 	struct net *net;
951 	int ret;
952 
953 	BUG_ON(!dev_net(dev));
954 	net = dev_net(dev);
955 	ret = __dev_alloc_name(net, name, buf);
956 	if (ret >= 0)
957 		strlcpy(dev->name, buf, IFNAMSIZ);
958 	return ret;
959 }
960 EXPORT_SYMBOL(dev_alloc_name);
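
/*
 * Illustrative sketch (hypothetical caller): picking the next free "eth%d"
 * slot before registering a device.
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto out;
 *
 * On success dev->name holds e.g. "eth2" and err is the unit number chosen.
 */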
961 
962 static int dev_get_valid_name(struct net_device *dev, const char *name)
963 {
964 	struct net *net;
965 
966 	BUG_ON(!dev_net(dev));
967 	net = dev_net(dev);
968 
969 	if (!dev_valid_name(name))
970 		return -EINVAL;
971 
972 	if (strchr(name, '%'))
973 		return dev_alloc_name(dev, name);
974 	else if (__dev_get_by_name(net, name))
975 		return -EEXIST;
976 	else if (dev->name != name)
977 		strlcpy(dev->name, name, IFNAMSIZ);
978 
979 	return 0;
980 }
981 
982 /**
983  *	dev_change_name - change name of a device
984  *	@dev: device
985  *	@newname: name (or format string) must be at least IFNAMSIZ
986  *
987  *	Change the name of a device.  A format string such as "eth%d"
988  *	can be passed for wildcarding.
989  */
990 int dev_change_name(struct net_device *dev, const char *newname)
991 {
992 	char oldname[IFNAMSIZ];
993 	int err = 0;
994 	int ret;
995 	struct net *net;
996 
997 	ASSERT_RTNL();
998 	BUG_ON(!dev_net(dev));
999 
1000 	net = dev_net(dev);
1001 	if (dev->flags & IFF_UP)
1002 		return -EBUSY;
1003 
1004 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1005 		return 0;
1006 
1007 	memcpy(oldname, dev->name, IFNAMSIZ);
1008 
1009 	err = dev_get_valid_name(dev, newname);
1010 	if (err < 0)
1011 		return err;
1012 
1013 rollback:
1014 	ret = device_rename(&dev->dev, dev->name);
1015 	if (ret) {
1016 		memcpy(dev->name, oldname, IFNAMSIZ);
1017 		return ret;
1018 	}
1019 
1020 	write_lock_bh(&dev_base_lock);
1021 	hlist_del_rcu(&dev->name_hlist);
1022 	write_unlock_bh(&dev_base_lock);
1023 
1024 	synchronize_rcu();
1025 
1026 	write_lock_bh(&dev_base_lock);
1027 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1028 	write_unlock_bh(&dev_base_lock);
1029 
1030 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1031 	ret = notifier_to_errno(ret);
1032 
1033 	if (ret) {
1034 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1035 		if (err >= 0) {
1036 			err = ret;
1037 			memcpy(dev->name, oldname, IFNAMSIZ);
1038 			goto rollback;
1039 		} else {
1040 			pr_err("%s: name change rollback failed: %d\n",
1041 			       dev->name, ret);
1042 		}
1043 	}
1044 
1045 	return err;
1046 }
1047 
1048 /**
1049  *	dev_set_alias - change ifalias of a device
1050  *	@dev: device
1051  *	@alias: name up to IFALIASZ
1052  *	@len: limit of bytes to copy from @alias
1053  *
1054  *	Set the ifalias for a device.
1055  */
1056 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057 {
1058 	ASSERT_RTNL();
1059 
1060 	if (len >= IFALIASZ)
1061 		return -EINVAL;
1062 
1063 	if (!len) {
1064 		if (dev->ifalias) {
1065 			kfree(dev->ifalias);
1066 			dev->ifalias = NULL;
1067 		}
1068 		return 0;
1069 	}
1070 
1071 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1072 	if (!dev->ifalias)
1073 		return -ENOMEM;
1074 
1075 	strlcpy(dev->ifalias, alias, len+1);
1076 	return len;
1077 }
1078 
1079 
1080 /**
1081  *	netdev_features_change - device changes features
1082  *	@dev: device to cause notification
1083  *
1084  *	Called to indicate a device has changed features.
1085  */
1086 void netdev_features_change(struct net_device *dev)
1087 {
1088 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1089 }
1090 EXPORT_SYMBOL(netdev_features_change);
1091 
1092 /**
1093  *	netdev_state_change - device changes state
1094  *	@dev: device to cause notification
1095  *
1096  *	Called to indicate a device has changed state. This function calls
1097  *	the notifier chains for netdev_chain and sends a NEWLINK message
1098  *	to the routing socket.
1099  */
1100 void netdev_state_change(struct net_device *dev)
1101 {
1102 	if (dev->flags & IFF_UP) {
1103 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1104 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1105 	}
1106 }
1107 EXPORT_SYMBOL(netdev_state_change);
1108 
1109 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1110 {
1111 	return call_netdevice_notifiers(event, dev);
1112 }
1113 EXPORT_SYMBOL(netdev_bonding_change);
1114 
1115 /**
1116  *	dev_load 	- load a network module
1117  *	@net: the applicable net namespace
1118  *	@name: name of interface
1119  *
1120  *	If a network interface is not present and the process has suitable
1121  *	privileges this function loads the module. If module loading is not
1122  *	available in this kernel then it becomes a nop.
1123  */
1124 
1125 void dev_load(struct net *net, const char *name)
1126 {
1127 	struct net_device *dev;
1128 	int no_module;
1129 
1130 	rcu_read_lock();
1131 	dev = dev_get_by_name_rcu(net, name);
1132 	rcu_read_unlock();
1133 
1134 	no_module = !dev;
1135 	if (no_module && capable(CAP_NET_ADMIN))
1136 		no_module = request_module("netdev-%s", name);
1137 	if (no_module && capable(CAP_SYS_MODULE)) {
1138 		if (!request_module("%s", name))
1139 			pr_err("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1140 			       name);
1141 	}
1142 }
1143 EXPORT_SYMBOL(dev_load);
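
/*
 * Note for driver authors (illustrative, module name hypothetical): for the
 * preferred CAP_NET_ADMIN path above to autoload a module that provides
 * interface "foo0", the module should declare
 *
 *	MODULE_ALIAS("netdev-foo0");
 *
 * so that request_module("netdev-%s", name) resolves to it.
 */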
1144 
1145 static int __dev_open(struct net_device *dev)
1146 {
1147 	const struct net_device_ops *ops = dev->netdev_ops;
1148 	int ret;
1149 
1150 	ASSERT_RTNL();
1151 
1152 	if (!netif_device_present(dev))
1153 		return -ENODEV;
1154 
1155 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1156 	ret = notifier_to_errno(ret);
1157 	if (ret)
1158 		return ret;
1159 
1160 	set_bit(__LINK_STATE_START, &dev->state);
1161 
1162 	if (ops->ndo_validate_addr)
1163 		ret = ops->ndo_validate_addr(dev);
1164 
1165 	if (!ret && ops->ndo_open)
1166 		ret = ops->ndo_open(dev);
1167 
1168 	if (ret)
1169 		clear_bit(__LINK_STATE_START, &dev->state);
1170 	else {
1171 		dev->flags |= IFF_UP;
1172 		net_dmaengine_get();
1173 		dev_set_rx_mode(dev);
1174 		dev_activate(dev);
1175 	}
1176 
1177 	return ret;
1178 }
1179 
1180 /**
1181  *	dev_open	- prepare an interface for use.
1182  *	@dev:	device to open
1183  *
1184  *	Takes a device from down to up state. The device's private open
1185  *	function is invoked and then the multicast lists are loaded. Finally
1186  *	the device is moved into the up state and a %NETDEV_UP message is
1187  *	sent to the netdev notifier chain.
1188  *
1189  *	Calling this function on an active interface is a nop. On failure
1190  *	a negative errno code is returned.
1191  */
1192 int dev_open(struct net_device *dev)
1193 {
1194 	int ret;
1195 
1196 	if (dev->flags & IFF_UP)
1197 		return 0;
1198 
1199 	ret = __dev_open(dev);
1200 	if (ret < 0)
1201 		return ret;
1202 
1203 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1204 	call_netdevice_notifiers(NETDEV_UP, dev);
1205 
1206 	return ret;
1207 }
1208 EXPORT_SYMBOL(dev_open);
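
/*
 * Illustrative sketch (hypothetical caller): bringing an interface up from
 * kernel code.  dev_open() must be called with the RTNL lock held, as
 * __dev_open() asserts above.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */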
1209 
1210 static int __dev_close_many(struct list_head *head)
1211 {
1212 	struct net_device *dev;
1213 
1214 	ASSERT_RTNL();
1215 	might_sleep();
1216 
1217 	list_for_each_entry(dev, head, unreg_list) {
1218 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1219 
1220 		clear_bit(__LINK_STATE_START, &dev->state);
1221 
1222 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1223 		 * can be even on different cpu. So just clear netif_running().
1224 		 *
1225 		 * dev->stop() will invoke napi_disable() on all of its
1226 		 * napi_struct instances on this device.
1227 		 */
1228 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1229 	}
1230 
1231 	dev_deactivate_many(head);
1232 
1233 	list_for_each_entry(dev, head, unreg_list) {
1234 		const struct net_device_ops *ops = dev->netdev_ops;
1235 
1236 		/*
1237 		 *	Call the device specific close. This cannot fail.
1238 		 *	Only if device is UP
1239 		 *
1240 		 *	We allow it to be called even after a DETACH hot-plug
1241 		 *	event.
1242 		 */
1243 		if (ops->ndo_stop)
1244 			ops->ndo_stop(dev);
1245 
1246 		dev->flags &= ~IFF_UP;
1247 		net_dmaengine_put();
1248 	}
1249 
1250 	return 0;
1251 }
1252 
1253 static int __dev_close(struct net_device *dev)
1254 {
1255 	int retval;
1256 	LIST_HEAD(single);
1257 
1258 	list_add(&dev->unreg_list, &single);
1259 	retval = __dev_close_many(&single);
1260 	list_del(&single);
1261 	return retval;
1262 }
1263 
1264 static int dev_close_many(struct list_head *head)
1265 {
1266 	struct net_device *dev, *tmp;
1267 	LIST_HEAD(tmp_list);
1268 
1269 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1270 		if (!(dev->flags & IFF_UP))
1271 			list_move(&dev->unreg_list, &tmp_list);
1272 
1273 	__dev_close_many(head);
1274 
1275 	list_for_each_entry(dev, head, unreg_list) {
1276 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1277 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1278 	}
1279 
1280 	/* rollback_registered_many needs the complete original list */
1281 	list_splice(&tmp_list, head);
1282 	return 0;
1283 }
1284 
1285 /**
1286  *	dev_close - shutdown an interface.
1287  *	@dev: device to shutdown
1288  *
1289  *	This function moves an active device into down state. A
1290  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1291  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1292  *	chain.
1293  */
1294 int dev_close(struct net_device *dev)
1295 {
1296 	if (dev->flags & IFF_UP) {
1297 		LIST_HEAD(single);
1298 
1299 		list_add(&dev->unreg_list, &single);
1300 		dev_close_many(&single);
1301 		list_del(&single);
1302 	}
1303 	return 0;
1304 }
1305 EXPORT_SYMBOL(dev_close);
1306 
1307 
1308 /**
1309  *	dev_disable_lro - disable Large Receive Offload on a device
1310  *	@dev: device
1311  *
1312  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1313  *	called under RTNL.  This is needed if received packets may be
1314  *	forwarded to another interface.
1315  */
1316 void dev_disable_lro(struct net_device *dev)
1317 {
1318 	/*
1319 	 * If we're trying to disable lro on a vlan device
1320 	 * use the underlying physical device instead
1321 	 */
1322 	if (is_vlan_dev(dev))
1323 		dev = vlan_dev_real_dev(dev);
1324 
1325 	dev->wanted_features &= ~NETIF_F_LRO;
1326 	netdev_update_features(dev);
1327 
1328 	if (unlikely(dev->features & NETIF_F_LRO))
1329 		netdev_WARN(dev, "failed to disable LRO!\n");
1330 }
1331 EXPORT_SYMBOL(dev_disable_lro);
1332 
1333 
1334 static int dev_boot_phase = 1;
1335 
1336 /**
1337  *	register_netdevice_notifier - register a network notifier block
1338  *	@nb: notifier
1339  *
1340  *	Register a notifier to be called when network device events occur.
1341  *	The notifier passed is linked into the kernel structures and must
1342  *	not be reused until it has been unregistered. A negative errno code
1343  *	is returned on a failure.
1344  *	is returned on failure.
1345  * 	When registered, all registration and up events are replayed
1346  *	to the new notifier to allow the notifier to have a race-free
1347  *	view of the network device list.
1348  */
1349 
1350 int register_netdevice_notifier(struct notifier_block *nb)
1351 {
1352 	struct net_device *dev;
1353 	struct net_device *last;
1354 	struct net *net;
1355 	int err;
1356 
1357 	rtnl_lock();
1358 	err = raw_notifier_chain_register(&netdev_chain, nb);
1359 	if (err)
1360 		goto unlock;
1361 	if (dev_boot_phase)
1362 		goto unlock;
1363 	for_each_net(net) {
1364 		for_each_netdev(net, dev) {
1365 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1366 			err = notifier_to_errno(err);
1367 			if (err)
1368 				goto rollback;
1369 
1370 			if (!(dev->flags & IFF_UP))
1371 				continue;
1372 
1373 			nb->notifier_call(nb, NETDEV_UP, dev);
1374 		}
1375 	}
1376 
1377 unlock:
1378 	rtnl_unlock();
1379 	return err;
1380 
1381 rollback:
1382 	last = dev;
1383 	for_each_net(net) {
1384 		for_each_netdev(net, dev) {
1385 			if (dev == last)
1386 				goto outroll;
1387 
1388 			if (dev->flags & IFF_UP) {
1389 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1390 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1391 			}
1392 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1393 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1394 		}
1395 	}
1396 
1397 outroll:
1398 	raw_notifier_chain_unregister(&netdev_chain, nb);
1399 	goto unlock;
1400 }
1401 EXPORT_SYMBOL(register_netdevice_notifier);
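
/*
 * Illustrative sketch (hypothetical module): reacting to device events.
 * In this kernel the notifier payload is the struct net_device itself.
 *
 *	static int sample_netdev_event(struct notifier_block *nb,
 *				       unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block sample_netdev_notifier = {
 *		.notifier_call = sample_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&sample_netdev_notifier);
 *	...
 *	unregister_netdevice_notifier(&sample_netdev_notifier);
 */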
1402 
1403 /**
1404  *	unregister_netdevice_notifier - unregister a network notifier block
1405  *	@nb: notifier
1406  *
1407  *	Unregister a notifier previously registered by
1408  *	register_netdevice_notifier(). The notifier is unlinked from the
1409  *	kernel structures and may then be reused. A negative errno code
1410  *	is returned on failure.
1411  *
1412  * 	After unregistering, unregister and down device events are synthesized
1413  *	for all devices on the device list to the removed notifier to remove
1414  *	the need for special case cleanup code.
1415  */
1416 
1417 int unregister_netdevice_notifier(struct notifier_block *nb)
1418 {
1419 	struct net_device *dev;
1420 	struct net *net;
1421 	int err;
1422 
1423 	rtnl_lock();
1424 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1425 	if (err)
1426 		goto unlock;
1427 
1428 	for_each_net(net) {
1429 		for_each_netdev(net, dev) {
1430 			if (dev->flags & IFF_UP) {
1431 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1432 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1433 			}
1434 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1435 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1436 		}
1437 	}
1438 unlock:
1439 	rtnl_unlock();
1440 	return err;
1441 }
1442 EXPORT_SYMBOL(unregister_netdevice_notifier);
1443 
1444 /**
1445  *	call_netdevice_notifiers - call all network notifier blocks
1446  *      @val: value passed unmodified to notifier function
1447  *      @dev: net_device pointer passed unmodified to notifier function
1448  *
1449  *	Call all network notifier blocks.  Parameters and return value
1450  *	are as for raw_notifier_call_chain().
1451  */
1452 
1453 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1454 {
1455 	ASSERT_RTNL();
1456 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1457 }
1458 EXPORT_SYMBOL(call_netdevice_notifiers);
1459 
1460 static struct static_key netstamp_needed __read_mostly;
1461 #ifdef HAVE_JUMP_LABEL
1462 /* We are not allowed to call static_key_slow_dec() from irq context
1463  * If net_disable_timestamp() is called from irq context, defer the
1464  * static_key_slow_dec() calls.
1465  */
1466 static atomic_t netstamp_needed_deferred;
1467 #endif
1468 
1469 void net_enable_timestamp(void)
1470 {
1471 #ifdef HAVE_JUMP_LABEL
1472 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1473 
1474 	if (deferred) {
1475 		while (--deferred)
1476 			static_key_slow_dec(&netstamp_needed);
1477 		return;
1478 	}
1479 #endif
1480 	WARN_ON(in_interrupt());
1481 	static_key_slow_inc(&netstamp_needed);
1482 }
1483 EXPORT_SYMBOL(net_enable_timestamp);
1484 
1485 void net_disable_timestamp(void)
1486 {
1487 #ifdef HAVE_JUMP_LABEL
1488 	if (in_interrupt()) {
1489 		atomic_inc(&netstamp_needed_deferred);
1490 		return;
1491 	}
1492 #endif
1493 	static_key_slow_dec(&netstamp_needed);
1494 }
1495 EXPORT_SYMBOL(net_disable_timestamp);
1496 
1497 static inline void net_timestamp_set(struct sk_buff *skb)
1498 {
1499 	skb->tstamp.tv64 = 0;
1500 	if (static_key_false(&netstamp_needed))
1501 		__net_timestamp(skb);
1502 }
1503 
1504 #define net_timestamp_check(COND, SKB)			\
1505 	if (static_key_false(&netstamp_needed)) {		\
1506 		if ((COND) && !(SKB)->tstamp.tv64)	\
1507 			__net_timestamp(SKB);		\
1508 	}						\
1509 
1510 static int net_hwtstamp_validate(struct ifreq *ifr)
1511 {
1512 	struct hwtstamp_config cfg;
1513 	enum hwtstamp_tx_types tx_type;
1514 	enum hwtstamp_rx_filters rx_filter;
1515 	int tx_type_valid = 0;
1516 	int rx_filter_valid = 0;
1517 
1518 	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1519 		return -EFAULT;
1520 
1521 	if (cfg.flags) /* reserved for future extensions */
1522 		return -EINVAL;
1523 
1524 	tx_type = cfg.tx_type;
1525 	rx_filter = cfg.rx_filter;
1526 
1527 	switch (tx_type) {
1528 	case HWTSTAMP_TX_OFF:
1529 	case HWTSTAMP_TX_ON:
1530 	case HWTSTAMP_TX_ONESTEP_SYNC:
1531 		tx_type_valid = 1;
1532 		break;
1533 	}
1534 
1535 	switch (rx_filter) {
1536 	case HWTSTAMP_FILTER_NONE:
1537 	case HWTSTAMP_FILTER_ALL:
1538 	case HWTSTAMP_FILTER_SOME:
1539 	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1540 	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1541 	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1542 	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1543 	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1544 	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1545 	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1546 	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1547 	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1548 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
1549 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
1550 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1551 		rx_filter_valid = 1;
1552 		break;
1553 	}
1554 
1555 	if (!tx_type_valid || !rx_filter_valid)
1556 		return -ERANGE;
1557 
1558 	return 0;
1559 }
1560 
1561 static inline bool is_skb_forwardable(struct net_device *dev,
1562 				      struct sk_buff *skb)
1563 {
1564 	unsigned int len;
1565 
1566 	if (!(dev->flags & IFF_UP))
1567 		return false;
1568 
1569 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1570 	if (skb->len <= len)
1571 		return true;
1572 
1573 	/* if TSO is enabled, we don't care about the length as the packet
1574 	 * could be forwarded without being segmented before
1575 	 */
1576 	if (skb_is_gso(skb))
1577 		return true;
1578 
1579 	return false;
1580 }
1581 
1582 /**
1583  * dev_forward_skb - loopback an skb to another netif
1584  *
1585  * @dev: destination network device
1586  * @skb: buffer to forward
1587  *
1588  * return values:
1589  *	NET_RX_SUCCESS	(no congestion)
1590  *	NET_RX_DROP     (packet was dropped, but freed)
1591  *
1592  * dev_forward_skb can be used for injecting an skb from the
1593  * start_xmit function of one device into the receive queue
1594  * of another device.
1595  *
1596  * The receiving device may be in another namespace, so
1597  * we have to clear all information in the skb that could
1598  * impact namespace isolation.
1599  */
1600 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1601 {
1602 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1603 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1604 			atomic_long_inc(&dev->rx_dropped);
1605 			kfree_skb(skb);
1606 			return NET_RX_DROP;
1607 		}
1608 	}
1609 
1610 	skb_orphan(skb);
1611 	nf_reset(skb);
1612 
1613 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1614 		atomic_long_inc(&dev->rx_dropped);
1615 		kfree_skb(skb);
1616 		return NET_RX_DROP;
1617 	}
1618 	skb->skb_iif = 0;
1619 	skb->dev = dev;
1620 	skb_dst_drop(skb);
1621 	skb->tstamp.tv64 = 0;
1622 	skb->pkt_type = PACKET_HOST;
1623 	skb->protocol = eth_type_trans(skb, dev);
1624 	skb->mark = 0;
1625 	secpath_reset(skb);
1626 	nf_reset(skb);
1627 	return netif_rx(skb);
1628 }
1629 EXPORT_SYMBOL_GPL(dev_forward_skb);
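
/*
 * Illustrative sketch (hypothetical driver): a veth-like pair device can
 * implement its transmit path by handing each skb to its peer's receive
 * queue.  "get_peer" stands in for a driver-private lookup.
 *
 *	static netdev_tx_t pair_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = get_peer(dev);
 *
 *		dev_forward_skb(peer, skb);
 *		return NETDEV_TX_OK;
 *	}
 *
 * dev_forward_skb() consumes the skb whether it is delivered or dropped.
 */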
1630 
1631 static inline int deliver_skb(struct sk_buff *skb,
1632 			      struct packet_type *pt_prev,
1633 			      struct net_device *orig_dev)
1634 {
1635 	atomic_inc(&skb->users);
1636 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1637 }
1638 
1639 /*
1640  *	Support routine. Sends outgoing frames to any network
1641  *	taps currently in use.
1642  */
1643 
1644 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1645 {
1646 	struct packet_type *ptype;
1647 	struct sk_buff *skb2 = NULL;
1648 	struct packet_type *pt_prev = NULL;
1649 
1650 	rcu_read_lock();
1651 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1652 		/* Never send packets back to the socket
1653 		 * they originated from - MvS (miquels@drinkel.ow.org)
1654 		 */
1655 		if ((ptype->dev == dev || !ptype->dev) &&
1656 		    (ptype->af_packet_priv == NULL ||
1657 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1658 			if (pt_prev) {
1659 				deliver_skb(skb2, pt_prev, skb->dev);
1660 				pt_prev = ptype;
1661 				continue;
1662 			}
1663 
1664 			skb2 = skb_clone(skb, GFP_ATOMIC);
1665 			if (!skb2)
1666 				break;
1667 
1668 			net_timestamp_set(skb2);
1669 
1670 			/* The network header should already have been set
1671 			   correctly by the sender; the check below is just
1672 			   protection against buggy protocols.
1673 			 */
1674 			skb_reset_mac_header(skb2);
1675 
1676 			if (skb_network_header(skb2) < skb2->data ||
1677 			    skb2->network_header > skb2->tail) {
1678 				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1679 						     ntohs(skb2->protocol),
1680 						     dev->name);
1681 				skb_reset_network_header(skb2);
1682 			}
1683 
1684 			skb2->transport_header = skb2->network_header;
1685 			skb2->pkt_type = PACKET_OUTGOING;
1686 			pt_prev = ptype;
1687 		}
1688 	}
1689 	if (pt_prev)
1690 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1691 	rcu_read_unlock();
1692 }
1693 
1694 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1695  * @dev: Network device
1696  * @txq: number of queues available
1697  *
1698  * If real_num_tx_queues is changed the tc mappings may no longer be
1699  * valid. To resolve this, verify that each tc mapping remains valid
1700  * and, if not, clear the mapping. With no priorities mapping to that
1701  * offset/count pair it will no longer be used. In the worst case, if
1702  * TC0 is invalid, nothing can be done, so priority mappings are
1703  * disabled entirely. It is expected that drivers will fix this
1704  * mapping, if they can, before calling netif_set_real_num_tx_queues.
1705  */
1706 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1707 {
1708 	int i;
1709 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1710 
1711 	/* If TC0 is invalidated disable TC mapping */
1712 	if (tc->offset + tc->count > txq) {
1713 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1714 		dev->num_tc = 0;
1715 		return;
1716 	}
1717 
1718 	/* Invalidated prio to tc mappings set to TC0 */
1719 	for (i = 1; i < TC_BITMASK + 1; i++) {
1720 		int q = netdev_get_prio_tc_map(dev, i);
1721 
1722 		tc = &dev->tc_to_txq[q];
1723 		if (tc->offset + tc->count > txq) {
1724 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1725 				i, q);
1726 			netdev_set_prio_tc_map(dev, i, 0);
1727 		}
1728 	}
1729 }
1730 
1731 /*
1732  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1733  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1734  */
1735 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1736 {
1737 	int rc;
1738 
1739 	if (txq < 1 || txq > dev->num_tx_queues)
1740 		return -EINVAL;
1741 
1742 	if (dev->reg_state == NETREG_REGISTERED ||
1743 	    dev->reg_state == NETREG_UNREGISTERING) {
1744 		ASSERT_RTNL();
1745 
1746 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1747 						  txq);
1748 		if (rc)
1749 			return rc;
1750 
1751 		if (dev->num_tc)
1752 			netif_setup_tc(dev, txq);
1753 
1754 		if (txq < dev->real_num_tx_queues)
1755 			qdisc_reset_all_tx_gt(dev, txq);
1756 	}
1757 
1758 	dev->real_num_tx_queues = txq;
1759 	return 0;
1760 }
1761 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1762 
1763 #ifdef CONFIG_RPS
1764 /**
1765  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1766  *	@dev: Network device
1767  *	@rxq: Actual number of RX queues
1768  *
1769  *	This must be called either with the rtnl_lock held or before
1770  *	registration of the net device.  Returns 0 on success, or a
1771  *	negative error code.  If called before registration, it always
1772  *	succeeds.
1773  */
1774 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1775 {
1776 	int rc;
1777 
1778 	if (rxq < 1 || rxq > dev->num_rx_queues)
1779 		return -EINVAL;
1780 
1781 	if (dev->reg_state == NETREG_REGISTERED) {
1782 		ASSERT_RTNL();
1783 
1784 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1785 						  rxq);
1786 		if (rc)
1787 			return rc;
1788 	}
1789 
1790 	dev->real_num_rx_queues = rxq;
1791 	return 0;
1792 }
1793 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1794 #endif
1795 
1796 static inline void __netif_reschedule(struct Qdisc *q)
1797 {
1798 	struct softnet_data *sd;
1799 	unsigned long flags;
1800 
1801 	local_irq_save(flags);
1802 	sd = &__get_cpu_var(softnet_data);
1803 	q->next_sched = NULL;
1804 	*sd->output_queue_tailp = q;
1805 	sd->output_queue_tailp = &q->next_sched;
1806 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1807 	local_irq_restore(flags);
1808 }
1809 
1810 void __netif_schedule(struct Qdisc *q)
1811 {
1812 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1813 		__netif_reschedule(q);
1814 }
1815 EXPORT_SYMBOL(__netif_schedule);
1816 
1817 void dev_kfree_skb_irq(struct sk_buff *skb)
1818 {
1819 	if (atomic_dec_and_test(&skb->users)) {
1820 		struct softnet_data *sd;
1821 		unsigned long flags;
1822 
1823 		local_irq_save(flags);
1824 		sd = &__get_cpu_var(softnet_data);
1825 		skb->next = sd->completion_queue;
1826 		sd->completion_queue = skb;
1827 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1828 		local_irq_restore(flags);
1829 	}
1830 }
1831 EXPORT_SYMBOL(dev_kfree_skb_irq);
1832 
1833 void dev_kfree_skb_any(struct sk_buff *skb)
1834 {
1835 	if (in_irq() || irqs_disabled())
1836 		dev_kfree_skb_irq(skb);
1837 	else
1838 		dev_kfree_skb(skb);
1839 }
1840 EXPORT_SYMBOL(dev_kfree_skb_any);
1841 
1842 
1843 /**
1844  * netif_device_detach - mark device as removed
1845  * @dev: network device
1846  *
1847  * Mark device as removed from system and therefore no longer available.
1848  */
1849 void netif_device_detach(struct net_device *dev)
1850 {
1851 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1852 	    netif_running(dev)) {
1853 		netif_tx_stop_all_queues(dev);
1854 	}
1855 }
1856 EXPORT_SYMBOL(netif_device_detach);
1857 
1858 /**
1859  * netif_device_attach - mark device as attached
1860  * @dev: network device
1861  *
1862  * Mark device as attached from system and restart if needed.
1863  */
1864 void netif_device_attach(struct net_device *dev)
1865 {
1866 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1867 	    netif_running(dev)) {
1868 		netif_tx_wake_all_queues(dev);
1869 		__netdev_watchdog_up(dev);
1870 	}
1871 }
1872 EXPORT_SYMBOL(netif_device_attach);
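/*
 * Example (a minimal sketch, not taken from any in-tree driver): drivers
 * commonly pair these helpers in their power-management callbacks so the
 * stack stops handing them packets while the hardware is down.
 * mydrv_suspend/mydrv_resume and mydrv_down/mydrv_up are hypothetical:
 *
 *	static int mydrv_suspend(struct device *d)
 *	{
 *		struct net_device *netdev = dev_get_drvdata(d);
 *
 *		netif_device_detach(netdev);
 *		if (netif_running(netdev))
 *			mydrv_down(netdev);
 *		return 0;
 *	}
 *
 *	static int mydrv_resume(struct device *d)
 *	{
 *		struct net_device *netdev = dev_get_drvdata(d);
 *
 *		if (netif_running(netdev))
 *			mydrv_up(netdev);
 *		netif_device_attach(netdev);
 *		return 0;
 *	}
 */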
1873 
1874 static void skb_warn_bad_offload(const struct sk_buff *skb)
1875 {
1876 	static const netdev_features_t null_features = 0;
1877 	struct net_device *dev = skb->dev;
1878 	const char *driver = "";
1879 
1880 	if (dev && dev->dev.parent)
1881 		driver = dev_driver_string(dev->dev.parent);
1882 
1883 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1884 	     "gso_type=%d ip_summed=%d\n",
1885 	     driver, dev ? &dev->features : &null_features,
1886 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
1887 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1888 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
1889 }
1890 
1891 /*
1892  * Invalidate hardware checksum when packet is to be mangled, and
1893  * complete checksum manually on outgoing path.
1894  */
1895 int skb_checksum_help(struct sk_buff *skb)
1896 {
1897 	__wsum csum;
1898 	int ret = 0, offset;
1899 
1900 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1901 		goto out_set_summed;
1902 
1903 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1904 		skb_warn_bad_offload(skb);
1905 		return -EINVAL;
1906 	}
1907 
1908 	offset = skb_checksum_start_offset(skb);
1909 	BUG_ON(offset >= skb_headlen(skb));
1910 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1911 
1912 	offset += skb->csum_offset;
1913 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1914 
1915 	if (skb_cloned(skb) &&
1916 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1917 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1918 		if (ret)
1919 			goto out;
1920 	}
1921 
1922 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1923 out_set_summed:
1924 	skb->ip_summed = CHECKSUM_NONE;
1925 out:
1926 	return ret;
1927 }
1928 EXPORT_SYMBOL(skb_checksum_help);
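/*
 * Example (a minimal sketch, not taken from any in-tree driver): a driver
 * whose hardware cannot checksum a particular packet can fall back to
 * software checksumming in its ndo_start_xmit before programming the
 * descriptor.  mydrv_hw_can_csum() is hypothetical:
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !mydrv_hw_can_csum(skb)) {
 *		if (skb_checksum_help(skb))
 *			goto drop;
 *	}
 */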
1929 
1930 /**
1931  *	skb_gso_segment - Perform segmentation on skb.
1932  *	@skb: buffer to segment
1933  *	@features: features for the output path (see dev->features)
1934  *
1935  *	This function segments the given skb and returns a list of segments.
1936  *
1937  *	It may return NULL if the skb requires no segmentation.  This is
1938  *	only possible when GSO is used for verifying header integrity.
1939  */
1940 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1941 	netdev_features_t features)
1942 {
1943 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1944 	struct packet_type *ptype;
1945 	__be16 type = skb->protocol;
1946 	int vlan_depth = ETH_HLEN;
1947 	int err;
1948 
1949 	while (type == htons(ETH_P_8021Q)) {
1950 		struct vlan_hdr *vh;
1951 
1952 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1953 			return ERR_PTR(-EINVAL);
1954 
1955 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1956 		type = vh->h_vlan_encapsulated_proto;
1957 		vlan_depth += VLAN_HLEN;
1958 	}
1959 
1960 	skb_reset_mac_header(skb);
1961 	skb->mac_len = skb->network_header - skb->mac_header;
1962 	__skb_pull(skb, skb->mac_len);
1963 
1964 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1965 		skb_warn_bad_offload(skb);
1966 
1967 		if (skb_header_cloned(skb) &&
1968 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1969 			return ERR_PTR(err);
1970 	}
1971 
1972 	rcu_read_lock();
1973 	list_for_each_entry_rcu(ptype,
1974 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1975 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1976 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1977 				err = ptype->gso_send_check(skb);
1978 				segs = ERR_PTR(err);
1979 				if (err || skb_gso_ok(skb, features))
1980 					break;
1981 				__skb_push(skb, (skb->data -
1982 						 skb_network_header(skb)));
1983 			}
1984 			segs = ptype->gso_segment(skb, features);
1985 			break;
1986 		}
1987 	}
1988 	rcu_read_unlock();
1989 
1990 	__skb_push(skb, skb->data - skb_mac_header(skb));
1991 
1992 	return segs;
1993 }
1994 EXPORT_SYMBOL(skb_gso_segment);
1995 
1996 /* Take action when hardware reception checksum errors are detected. */
1997 #ifdef CONFIG_BUG
1998 void netdev_rx_csum_fault(struct net_device *dev)
1999 {
2000 	if (net_ratelimit()) {
2001 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2002 		dump_stack();
2003 	}
2004 }
2005 EXPORT_SYMBOL(netdev_rx_csum_fault);
2006 #endif
2007 
2008 /* Actually, we should eliminate this check as soon as we know that:
2009  * 1. An IOMMU is present and allows mapping all of memory.
2010  * 2. No high memory really exists on this machine.
2011  */
2012 
2013 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2014 {
2015 #ifdef CONFIG_HIGHMEM
2016 	int i;
2017 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2018 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2019 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2020 			if (PageHighMem(skb_frag_page(frag)))
2021 				return 1;
2022 		}
2023 	}
2024 
2025 	if (PCI_DMA_BUS_IS_PHYS) {
2026 		struct device *pdev = dev->dev.parent;
2027 
2028 		if (!pdev)
2029 			return 0;
2030 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2031 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2032 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2033 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2034 				return 1;
2035 		}
2036 	}
2037 #endif
2038 	return 0;
2039 }
2040 
2041 struct dev_gso_cb {
2042 	void (*destructor)(struct sk_buff *skb);
2043 };
2044 
2045 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2046 
2047 static void dev_gso_skb_destructor(struct sk_buff *skb)
2048 {
2049 	struct dev_gso_cb *cb;
2050 
2051 	do {
2052 		struct sk_buff *nskb = skb->next;
2053 
2054 		skb->next = nskb->next;
2055 		nskb->next = NULL;
2056 		kfree_skb(nskb);
2057 	} while (skb->next);
2058 
2059 	cb = DEV_GSO_CB(skb);
2060 	if (cb->destructor)
2061 		cb->destructor(skb);
2062 }
2063 
2064 /**
2065  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2066  *	@skb: buffer to segment
2067  *	@features: device features as applicable to this skb
2068  *
2069  *	This function segments the given skb and stores the list of segments
2070  *	in skb->next.
2071  */
2072 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2073 {
2074 	struct sk_buff *segs;
2075 
2076 	segs = skb_gso_segment(skb, features);
2077 
2078 	/* Verifying header integrity only. */
2079 	if (!segs)
2080 		return 0;
2081 
2082 	if (IS_ERR(segs))
2083 		return PTR_ERR(segs);
2084 
2085 	skb->next = segs;
2086 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2087 	skb->destructor = dev_gso_skb_destructor;
2088 
2089 	return 0;
2090 }
2091 
2092 /*
2093  * Try to orphan skb early, right before transmission by the device.
2094  * We cannot orphan the skb if a tx timestamp is requested or the sk reference
2095  * is needed at the driver level for other reasons, e.g. see net/can/raw.c
2096  */
2097 static inline void skb_orphan_try(struct sk_buff *skb)
2098 {
2099 	struct sock *sk = skb->sk;
2100 
2101 	if (sk && !skb_shinfo(skb)->tx_flags) {
2102 		/* skb_tx_hash() won't be able to get the sk.
2103 		 * We copy sk_hash into skb->rxhash
2104 		 */
2105 		if (!skb->rxhash)
2106 			skb->rxhash = sk->sk_hash;
2107 		skb_orphan(skb);
2108 	}
2109 }
2110 
2111 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2112 {
2113 	return ((features & NETIF_F_GEN_CSUM) ||
2114 		((features & NETIF_F_V4_CSUM) &&
2115 		 protocol == htons(ETH_P_IP)) ||
2116 		((features & NETIF_F_V6_CSUM) &&
2117 		 protocol == htons(ETH_P_IPV6)) ||
2118 		((features & NETIF_F_FCOE_CRC) &&
2119 		 protocol == htons(ETH_P_FCOE)));
2120 }
2121 
2122 static netdev_features_t harmonize_features(struct sk_buff *skb,
2123 	__be16 protocol, netdev_features_t features)
2124 {
2125 	if (!can_checksum_protocol(features, protocol)) {
2126 		features &= ~NETIF_F_ALL_CSUM;
2127 		features &= ~NETIF_F_SG;
2128 	} else if (illegal_highdma(skb->dev, skb)) {
2129 		features &= ~NETIF_F_SG;
2130 	}
2131 
2132 	return features;
2133 }
2134 
2135 netdev_features_t netif_skb_features(struct sk_buff *skb)
2136 {
2137 	__be16 protocol = skb->protocol;
2138 	netdev_features_t features = skb->dev->features;
2139 
2140 	if (protocol == htons(ETH_P_8021Q)) {
2141 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2142 		protocol = veh->h_vlan_encapsulated_proto;
2143 	} else if (!vlan_tx_tag_present(skb)) {
2144 		return harmonize_features(skb, protocol, features);
2145 	}
2146 
2147 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2148 
2149 	if (protocol != htons(ETH_P_8021Q)) {
2150 		return harmonize_features(skb, protocol, features);
2151 	} else {
2152 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2153 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2154 		return harmonize_features(skb, protocol, features);
2155 	}
2156 }
2157 EXPORT_SYMBOL(netif_skb_features);
2158 
2159 /*
2160  * Returns true if either:
2161  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2162  *	2. skb is fragmented and the device does not support SG, or if
2163  *	   at least one of the fragments is in highmem and the device does
2164  *	   not support DMA from it.
2165  */
2166 static inline int skb_needs_linearize(struct sk_buff *skb,
2167 				      int features)
2168 {
2169 	return skb_is_nonlinear(skb) &&
2170 			((skb_has_frag_list(skb) &&
2171 				!(features & NETIF_F_FRAGLIST)) ||
2172 			(skb_shinfo(skb)->nr_frags &&
2173 				!(features & NETIF_F_SG)));
2174 }
2175 
2176 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2177 			struct netdev_queue *txq)
2178 {
2179 	const struct net_device_ops *ops = dev->netdev_ops;
2180 	int rc = NETDEV_TX_OK;
2181 	unsigned int skb_len;
2182 
2183 	if (likely(!skb->next)) {
2184 		netdev_features_t features;
2185 
2186 		/*
2187 		 * If device doesn't need skb->dst, release it right now while
2188 		 * it's hot in this cpu's cache
2189 		 */
2190 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2191 			skb_dst_drop(skb);
2192 
2193 		if (!list_empty(&ptype_all))
2194 			dev_queue_xmit_nit(skb, dev);
2195 
2196 		skb_orphan_try(skb);
2197 
2198 		features = netif_skb_features(skb);
2199 
2200 		if (vlan_tx_tag_present(skb) &&
2201 		    !(features & NETIF_F_HW_VLAN_TX)) {
2202 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2203 			if (unlikely(!skb))
2204 				goto out;
2205 
2206 			skb->vlan_tci = 0;
2207 		}
2208 
2209 		if (netif_needs_gso(skb, features)) {
2210 			if (unlikely(dev_gso_segment(skb, features)))
2211 				goto out_kfree_skb;
2212 			if (skb->next)
2213 				goto gso;
2214 		} else {
2215 			if (skb_needs_linearize(skb, features) &&
2216 			    __skb_linearize(skb))
2217 				goto out_kfree_skb;
2218 
2219 			/* If packet is not checksummed and device does not
2220 			 * support checksumming for this protocol, complete
2221 			 * checksumming here.
2222 			 */
2223 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2224 				skb_set_transport_header(skb,
2225 					skb_checksum_start_offset(skb));
2226 				if (!(features & NETIF_F_ALL_CSUM) &&
2227 				     skb_checksum_help(skb))
2228 					goto out_kfree_skb;
2229 			}
2230 		}
2231 
2232 		skb_len = skb->len;
2233 		rc = ops->ndo_start_xmit(skb, dev);
2234 		trace_net_dev_xmit(skb, rc, dev, skb_len);
2235 		if (rc == NETDEV_TX_OK)
2236 			txq_trans_update(txq);
2237 		return rc;
2238 	}
2239 
2240 gso:
2241 	do {
2242 		struct sk_buff *nskb = skb->next;
2243 
2244 		skb->next = nskb->next;
2245 		nskb->next = NULL;
2246 
2247 		/*
2248 		 * If device doesn't need nskb->dst, release it right now while
2249 		 * it's hot in this cpu's cache
2250 		 */
2251 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2252 			skb_dst_drop(nskb);
2253 
2254 		skb_len = nskb->len;
2255 		rc = ops->ndo_start_xmit(nskb, dev);
2256 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2257 		if (unlikely(rc != NETDEV_TX_OK)) {
2258 			if (rc & ~NETDEV_TX_MASK)
2259 				goto out_kfree_gso_skb;
2260 			nskb->next = skb->next;
2261 			skb->next = nskb;
2262 			return rc;
2263 		}
2264 		txq_trans_update(txq);
2265 		if (unlikely(netif_xmit_stopped(txq) && skb->next))
2266 			return NETDEV_TX_BUSY;
2267 	} while (skb->next);
2268 
2269 out_kfree_gso_skb:
2270 	if (likely(skb->next == NULL))
2271 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2272 out_kfree_skb:
2273 	kfree_skb(skb);
2274 out:
2275 	return rc;
2276 }
2277 
2278 static u32 hashrnd __read_mostly;
2279 
2280 /*
2281  * Returns a Tx hash based on the given packet descriptor and a Tx queue
2282  * count to be used as a distribution range.
2283  */
2284 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2285 		  unsigned int num_tx_queues)
2286 {
2287 	u32 hash;
2288 	u16 qoffset = 0;
2289 	u16 qcount = num_tx_queues;
2290 
2291 	if (skb_rx_queue_recorded(skb)) {
2292 		hash = skb_get_rx_queue(skb);
2293 		while (unlikely(hash >= num_tx_queues))
2294 			hash -= num_tx_queues;
2295 		return hash;
2296 	}
2297 
2298 	if (dev->num_tc) {
2299 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2300 		qoffset = dev->tc_to_txq[tc].offset;
2301 		qcount = dev->tc_to_txq[tc].count;
2302 	}
2303 
2304 	if (skb->sk && skb->sk->sk_hash)
2305 		hash = skb->sk->sk_hash;
2306 	else
2307 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2308 	hash = jhash_1word(hash, hashrnd);
2309 
2310 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2311 }
2312 EXPORT_SYMBOL(__skb_tx_hash);
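/*
 * Note on the arithmetic above: ((u64)hash * qcount) >> 32 maps a 32-bit
 * hash uniformly onto [0, qcount) without a modulo.  For example, with
 * qcount = 8 and hash = 0x80000000 (the middle of the 32-bit range),
 * (0x80000000ULL * 8) >> 32 = 4, the middle queue; hash = 0xffffffff
 * yields 7, the last queue.  qoffset then shifts the result into the
 * traffic class's queue range.
 */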
2313 
2314 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2315 {
2316 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2317 		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2318 				     dev->name, queue_index,
2319 				     dev->real_num_tx_queues);
2320 		return 0;
2321 	}
2322 	return queue_index;
2323 }
2324 
2325 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2326 {
2327 #ifdef CONFIG_XPS
2328 	struct xps_dev_maps *dev_maps;
2329 	struct xps_map *map;
2330 	int queue_index = -1;
2331 
2332 	rcu_read_lock();
2333 	dev_maps = rcu_dereference(dev->xps_maps);
2334 	if (dev_maps) {
2335 		map = rcu_dereference(
2336 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2337 		if (map) {
2338 			if (map->len == 1)
2339 				queue_index = map->queues[0];
2340 			else {
2341 				u32 hash;
2342 				if (skb->sk && skb->sk->sk_hash)
2343 					hash = skb->sk->sk_hash;
2344 				else
2345 					hash = (__force u16) skb->protocol ^
2346 					    skb->rxhash;
2347 				hash = jhash_1word(hash, hashrnd);
2348 				queue_index = map->queues[
2349 				    ((u64)hash * map->len) >> 32];
2350 			}
2351 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2352 				queue_index = -1;
2353 		}
2354 	}
2355 	rcu_read_unlock();
2356 
2357 	return queue_index;
2358 #else
2359 	return -1;
2360 #endif
2361 }
2362 
2363 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2364 					struct sk_buff *skb)
2365 {
2366 	int queue_index;
2367 	const struct net_device_ops *ops = dev->netdev_ops;
2368 
2369 	if (dev->real_num_tx_queues == 1)
2370 		queue_index = 0;
2371 	else if (ops->ndo_select_queue) {
2372 		queue_index = ops->ndo_select_queue(dev, skb);
2373 		queue_index = dev_cap_txqueue(dev, queue_index);
2374 	} else {
2375 		struct sock *sk = skb->sk;
2376 		queue_index = sk_tx_queue_get(sk);
2377 
2378 		if (queue_index < 0 || skb->ooo_okay ||
2379 		    queue_index >= dev->real_num_tx_queues) {
2380 			int old_index = queue_index;
2381 
2382 			queue_index = get_xps_queue(dev, skb);
2383 			if (queue_index < 0)
2384 				queue_index = skb_tx_hash(dev, skb);
2385 
2386 			if (queue_index != old_index && sk) {
2387 				struct dst_entry *dst =
2388 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2389 
2390 				if (dst && skb_dst(skb) == dst)
2391 					sk_tx_queue_set(sk, queue_index);
2392 			}
2393 		}
2394 	}
2395 
2396 	skb_set_queue_mapping(skb, queue_index);
2397 	return netdev_get_tx_queue(dev, queue_index);
2398 }
2399 
2400 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2401 				 struct net_device *dev,
2402 				 struct netdev_queue *txq)
2403 {
2404 	spinlock_t *root_lock = qdisc_lock(q);
2405 	bool contended;
2406 	int rc;
2407 
2408 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2409 	qdisc_calculate_pkt_len(skb, q);
2410 	/*
2411 	 * Heuristic to force contended enqueues to serialize on a
2412 	 * separate lock before trying to get qdisc main lock.
2413 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2414 	 * and dequeue packets faster.
2415 	 */
2416 	contended = qdisc_is_running(q);
2417 	if (unlikely(contended))
2418 		spin_lock(&q->busylock);
2419 
2420 	spin_lock(root_lock);
2421 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2422 		kfree_skb(skb);
2423 		rc = NET_XMIT_DROP;
2424 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2425 		   qdisc_run_begin(q)) {
2426 		/*
2427 		 * This is a work-conserving queue; there are no old skbs
2428 		 * waiting to be sent out; and the qdisc is not running -
2429 		 * xmit the skb directly.
2430 		 */
2431 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2432 			skb_dst_force(skb);
2433 
2434 		qdisc_bstats_update(q, skb);
2435 
2436 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2437 			if (unlikely(contended)) {
2438 				spin_unlock(&q->busylock);
2439 				contended = false;
2440 			}
2441 			__qdisc_run(q);
2442 		} else
2443 			qdisc_run_end(q);
2444 
2445 		rc = NET_XMIT_SUCCESS;
2446 	} else {
2447 		skb_dst_force(skb);
2448 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2449 		if (qdisc_run_begin(q)) {
2450 			if (unlikely(contended)) {
2451 				spin_unlock(&q->busylock);
2452 				contended = false;
2453 			}
2454 			__qdisc_run(q);
2455 		}
2456 	}
2457 	spin_unlock(root_lock);
2458 	if (unlikely(contended))
2459 		spin_unlock(&q->busylock);
2460 	return rc;
2461 }
2462 
2463 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2464 static void skb_update_prio(struct sk_buff *skb)
2465 {
2466 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2467 
2468 	if ((!skb->priority) && (skb->sk) && map)
2469 		skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx];
2470 }
2471 #else
2472 #define skb_update_prio(skb)
2473 #endif
2474 
2475 static DEFINE_PER_CPU(int, xmit_recursion);
2476 #define RECURSION_LIMIT 10
2477 
2478 /**
2479  *	dev_queue_xmit - transmit a buffer
2480  *	@skb: buffer to transmit
2481  *
2482  *	Queue a buffer for transmission to a network device. The caller must
2483  *	have set the device and priority and built the buffer before calling
2484  *	this function. The function can be called from an interrupt.
2485  *
2486  *	A negative errno code is returned on a failure. A success does not
2487  *	guarantee the frame will be transmitted as it may be dropped due
2488  *	to congestion or traffic shaping.
2489  *
2490  * -----------------------------------------------------------------------------------
2491  *      I notice this method can also return errors from the queue disciplines,
2492  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2493  *      be positive.
2494  *
2495  *      Regardless of the return value, the skb is consumed, so it is currently
2496  *      difficult to retry a send to this method.  (You can bump the ref count
2497  *      before sending to hold a reference for retry if you are careful.)
2498  *
2499  *      When calling this method, interrupts MUST be enabled.  This is because
2500  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2501  *          --BLG
2502  */
2503 int dev_queue_xmit(struct sk_buff *skb)
2504 {
2505 	struct net_device *dev = skb->dev;
2506 	struct netdev_queue *txq;
2507 	struct Qdisc *q;
2508 	int rc = -ENOMEM;
2509 
2510 	/* Disable soft irqs for various locks below. Also
2511 	 * stops preemption for RCU.
2512 	 */
2513 	rcu_read_lock_bh();
2514 
2515 	skb_update_prio(skb);
2516 
2517 	txq = dev_pick_tx(dev, skb);
2518 	q = rcu_dereference_bh(txq->qdisc);
2519 
2520 #ifdef CONFIG_NET_CLS_ACT
2521 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2522 #endif
2523 	trace_net_dev_queue(skb);
2524 	if (q->enqueue) {
2525 		rc = __dev_xmit_skb(skb, q, dev, txq);
2526 		goto out;
2527 	}
2528 
2529 	/* The device has no queue. Common case for software devices:
2530 	   loopback, all the sorts of tunnels...
2531 
2532 	   Really, it is unlikely that netif_tx_lock protection is necessary
2533 	   here.  (E.g. loopback and IP tunnels are clean, ignoring statistics
2534 	   counters.)
2535 	   However, it is possible that they rely on the protection
2536 	   taken here.
2537 
2538 	   Check this and take the lock; it is not prone to deadlocks.
2539 	   Or take the noqueue qdisc path, which is even simpler 8)
2540 	 */
2541 	if (dev->flags & IFF_UP) {
2542 		int cpu = smp_processor_id(); /* ok because BHs are off */
2543 
2544 		if (txq->xmit_lock_owner != cpu) {
2545 
2546 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2547 				goto recursion_alert;
2548 
2549 			HARD_TX_LOCK(dev, txq, cpu);
2550 
2551 			if (!netif_xmit_stopped(txq)) {
2552 				__this_cpu_inc(xmit_recursion);
2553 				rc = dev_hard_start_xmit(skb, dev, txq);
2554 				__this_cpu_dec(xmit_recursion);
2555 				if (dev_xmit_complete(rc)) {
2556 					HARD_TX_UNLOCK(dev, txq);
2557 					goto out;
2558 				}
2559 			}
2560 			HARD_TX_UNLOCK(dev, txq);
2561 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2562 					     dev->name);
2563 		} else {
2564 			/* Recursion is detected! It is possible,
2565 			 * unfortunately
2566 			 */
2567 recursion_alert:
2568 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2569 					     dev->name);
2570 		}
2571 	}
2572 
2573 	rc = -ENETDOWN;
2574 	rcu_read_unlock_bh();
2575 
2576 	kfree_skb(skb);
2577 	return rc;
2578 out:
2579 	rcu_read_unlock_bh();
2580 	return rc;
2581 }
2582 EXPORT_SYMBOL(dev_queue_xmit);
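/*
 * Example (a minimal sketch, not taken from any in-tree caller): a protocol
 * or tunnel that has built a complete frame transmits it like this, with
 * interrupts enabled as required by the comment above.  The skb is consumed
 * either way; treating anything other than NET_XMIT_SUCCESS as a drop is a
 * simplification, and the stats accounting shown is hypothetical:
 *
 *	skb->dev = dev;
 *	skb->protocol = htons(ETH_P_IP);
 *	ret = dev_queue_xmit(skb);
 *	if (ret != NET_XMIT_SUCCESS)
 *		dev->stats.tx_dropped++;
 */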
2583 
2584 
2585 /*=======================================================================
2586 			Receiver routines
2587   =======================================================================*/
2588 
2589 int netdev_max_backlog __read_mostly = 1000;
2590 int netdev_tstamp_prequeue __read_mostly = 1;
2591 int netdev_budget __read_mostly = 300;
2592 int weight_p __read_mostly = 64;            /* old backlog weight */
2593 
2594 /* Called with irq disabled */
2595 static inline void ____napi_schedule(struct softnet_data *sd,
2596 				     struct napi_struct *napi)
2597 {
2598 	list_add_tail(&napi->poll_list, &sd->poll_list);
2599 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2600 }
2601 
2602 /*
2603  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2604  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2605  * on success; zero indicates no valid hash.  Also, sets l4_rxhash in skb
2606  * if hash is a canonical 4-tuple hash over transport ports.
2607  */
2608 void __skb_get_rxhash(struct sk_buff *skb)
2609 {
2610 	struct flow_keys keys;
2611 	u32 hash;
2612 
2613 	if (!skb_flow_dissect(skb, &keys))
2614 		return;
2615 
2616 	if (keys.ports) {
2617 		if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2618 			swap(keys.port16[0], keys.port16[1]);
2619 		skb->l4_rxhash = 1;
2620 	}
2621 
2622 	/* get a consistent hash (same value on both flow directions) */
2623 	if ((__force u32)keys.dst < (__force u32)keys.src)
2624 		swap(keys.dst, keys.src);
2625 
2626 	hash = jhash_3words((__force u32)keys.dst,
2627 			    (__force u32)keys.src,
2628 			    (__force u32)keys.ports, hashrnd);
2629 	if (!hash)
2630 		hash = 1;
2631 
2632 	skb->rxhash = hash;
2633 }
2634 EXPORT_SYMBOL(__skb_get_rxhash);
2635 
2636 #ifdef CONFIG_RPS
2637 
2638 /* One global table that all flow-based protocols share. */
2639 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2640 EXPORT_SYMBOL(rps_sock_flow_table);
2641 
2642 struct static_key rps_needed __read_mostly;
2643 
2644 static struct rps_dev_flow *
2645 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2646 	    struct rps_dev_flow *rflow, u16 next_cpu)
2647 {
2648 	if (next_cpu != RPS_NO_CPU) {
2649 #ifdef CONFIG_RFS_ACCEL
2650 		struct netdev_rx_queue *rxqueue;
2651 		struct rps_dev_flow_table *flow_table;
2652 		struct rps_dev_flow *old_rflow;
2653 		u32 flow_id;
2654 		u16 rxq_index;
2655 		int rc;
2656 
2657 		/* Should we steer this flow to a different hardware queue? */
2658 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2659 		    !(dev->features & NETIF_F_NTUPLE))
2660 			goto out;
2661 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2662 		if (rxq_index == skb_get_rx_queue(skb))
2663 			goto out;
2664 
2665 		rxqueue = dev->_rx + rxq_index;
2666 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2667 		if (!flow_table)
2668 			goto out;
2669 		flow_id = skb->rxhash & flow_table->mask;
2670 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2671 							rxq_index, flow_id);
2672 		if (rc < 0)
2673 			goto out;
2674 		old_rflow = rflow;
2675 		rflow = &flow_table->flows[flow_id];
2676 		rflow->filter = rc;
2677 		if (old_rflow->filter == rflow->filter)
2678 			old_rflow->filter = RPS_NO_FILTER;
2679 	out:
2680 #endif
2681 		rflow->last_qtail =
2682 			per_cpu(softnet_data, next_cpu).input_queue_head;
2683 	}
2684 
2685 	rflow->cpu = next_cpu;
2686 	return rflow;
2687 }
2688 
2689 /*
2690  * get_rps_cpu is called from netif_receive_skb and returns the target
2691  * CPU from the RPS map of the receiving queue for a given skb.
2692  * rcu_read_lock must be held on entry.
2693  */
2694 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2695 		       struct rps_dev_flow **rflowp)
2696 {
2697 	struct netdev_rx_queue *rxqueue;
2698 	struct rps_map *map;
2699 	struct rps_dev_flow_table *flow_table;
2700 	struct rps_sock_flow_table *sock_flow_table;
2701 	int cpu = -1;
2702 	u16 tcpu;
2703 
2704 	if (skb_rx_queue_recorded(skb)) {
2705 		u16 index = skb_get_rx_queue(skb);
2706 		if (unlikely(index >= dev->real_num_rx_queues)) {
2707 			WARN_ONCE(dev->real_num_rx_queues > 1,
2708 				  "%s received packet on queue %u, but number "
2709 				  "of RX queues is %u\n",
2710 				  dev->name, index, dev->real_num_rx_queues);
2711 			goto done;
2712 		}
2713 		rxqueue = dev->_rx + index;
2714 	} else
2715 		rxqueue = dev->_rx;
2716 
2717 	map = rcu_dereference(rxqueue->rps_map);
2718 	if (map) {
2719 		if (map->len == 1 &&
2720 		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2721 			tcpu = map->cpus[0];
2722 			if (cpu_online(tcpu))
2723 				cpu = tcpu;
2724 			goto done;
2725 		}
2726 	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2727 		goto done;
2728 	}
2729 
2730 	skb_reset_network_header(skb);
2731 	if (!skb_get_rxhash(skb))
2732 		goto done;
2733 
2734 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2735 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2736 	if (flow_table && sock_flow_table) {
2737 		u16 next_cpu;
2738 		struct rps_dev_flow *rflow;
2739 
2740 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2741 		tcpu = rflow->cpu;
2742 
2743 		next_cpu = sock_flow_table->ents[skb->rxhash &
2744 		    sock_flow_table->mask];
2745 
2746 		/*
2747 		 * If the desired CPU (where last recvmsg was done) is
2748 		 * different from current CPU (one in the rx-queue flow
2749 		 * table entry), switch if one of the following holds:
2750 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2751 		 *   - Current CPU is offline.
2752 		 *   - The current CPU's queue tail has advanced beyond the
2753 		 *     last packet that was enqueued using this table entry.
2754 		 *     This guarantees that all previous packets for the flow
2755 		 *     have been dequeued, thus preserving in order delivery.
2756 		 */
2757 		if (unlikely(tcpu != next_cpu) &&
2758 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2759 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2760 		      rflow->last_qtail)) >= 0))
2761 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2762 
2763 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2764 			*rflowp = rflow;
2765 			cpu = tcpu;
2766 			goto done;
2767 		}
2768 	}
2769 
2770 	if (map) {
2771 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2772 
2773 		if (cpu_online(tcpu)) {
2774 			cpu = tcpu;
2775 			goto done;
2776 		}
2777 	}
2778 
2779 done:
2780 	return cpu;
2781 }
2782 
2783 #ifdef CONFIG_RFS_ACCEL
2784 
2785 /**
2786  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2787  * @dev: Device on which the filter was set
2788  * @rxq_index: RX queue index
2789  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2790  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2791  *
2792  * Drivers that implement ndo_rx_flow_steer() should periodically call
2793  * this function for each installed filter and remove the filters for
2794  * which it returns %true.
2795  */
2796 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2797 			 u32 flow_id, u16 filter_id)
2798 {
2799 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2800 	struct rps_dev_flow_table *flow_table;
2801 	struct rps_dev_flow *rflow;
2802 	bool expire = true;
2803 	int cpu;
2804 
2805 	rcu_read_lock();
2806 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2807 	if (flow_table && flow_id <= flow_table->mask) {
2808 		rflow = &flow_table->flows[flow_id];
2809 		cpu = ACCESS_ONCE(rflow->cpu);
2810 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2811 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2812 			   rflow->last_qtail) <
2813 		     (int)(10 * flow_table->mask)))
2814 			expire = false;
2815 	}
2816 	rcu_read_unlock();
2817 	return expire;
2818 }
2819 EXPORT_SYMBOL(rps_may_expire_flow);
2820 
2821 #endif /* CONFIG_RFS_ACCEL */
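/*
 * Example (a minimal sketch, not taken from any in-tree driver): a driver
 * implementing ndo_rx_flow_steer() keeps a table of installed filters and
 * periodically scans it, removing filters the stack no longer needs.
 * The priv structure, its filter table and mydrv_del_filter() are
 * hypothetical:
 *
 *	for (i = 0; i < priv->n_filters; i++) {
 *		struct mydrv_filter *f = &priv->filters[i];
 *
 *		if (f->in_use &&
 *		    rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, i)) {
 *			mydrv_del_filter(priv, i);
 *			f->in_use = false;
 *		}
 *	}
 */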
2822 
2823 /* Called from hardirq (IPI) context */
2824 static void rps_trigger_softirq(void *data)
2825 {
2826 	struct softnet_data *sd = data;
2827 
2828 	____napi_schedule(sd, &sd->backlog);
2829 	sd->received_rps++;
2830 }
2831 
2832 #endif /* CONFIG_RPS */
2833 
2834 /*
2835  * Check if this softnet_data structure belongs to another cpu.
2836  * If yes, queue it to our IPI list and return 1;
2837  * if no, return 0.
2838  */
2839 static int rps_ipi_queued(struct softnet_data *sd)
2840 {
2841 #ifdef CONFIG_RPS
2842 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2843 
2844 	if (sd != mysd) {
2845 		sd->rps_ipi_next = mysd->rps_ipi_list;
2846 		mysd->rps_ipi_list = sd;
2847 
2848 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2849 		return 1;
2850 	}
2851 #endif /* CONFIG_RPS */
2852 	return 0;
2853 }
2854 
2855 /*
2856  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2857  * queue (may be a remote CPU queue).
2858  */
2859 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2860 			      unsigned int *qtail)
2861 {
2862 	struct softnet_data *sd;
2863 	unsigned long flags;
2864 
2865 	sd = &per_cpu(softnet_data, cpu);
2866 
2867 	local_irq_save(flags);
2868 
2869 	rps_lock(sd);
2870 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2871 		if (skb_queue_len(&sd->input_pkt_queue)) {
2872 enqueue:
2873 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2874 			input_queue_tail_incr_save(sd, qtail);
2875 			rps_unlock(sd);
2876 			local_irq_restore(flags);
2877 			return NET_RX_SUCCESS;
2878 		}
2879 
2880 		/* Schedule NAPI for the backlog device.
2881 		 * We can use a non-atomic operation since we own the queue lock.
2882 		 */
2883 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2884 			if (!rps_ipi_queued(sd))
2885 				____napi_schedule(sd, &sd->backlog);
2886 		}
2887 		goto enqueue;
2888 	}
2889 
2890 	sd->dropped++;
2891 	rps_unlock(sd);
2892 
2893 	local_irq_restore(flags);
2894 
2895 	atomic_long_inc(&skb->dev->rx_dropped);
2896 	kfree_skb(skb);
2897 	return NET_RX_DROP;
2898 }
2899 
2900 /**
2901  *	netif_rx	-	post buffer to the network code
2902  *	@skb: buffer to post
2903  *
2904  *	This function receives a packet from a device driver and queues it for
2905  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2906  *	may be dropped during processing for congestion control or by the
2907  *	protocol layers.
2908  *
2909  *	return values:
2910  *	NET_RX_SUCCESS	(no congestion)
2911  *	NET_RX_DROP     (packet was dropped)
2912  *
2913  */
2914 
2915 int netif_rx(struct sk_buff *skb)
2916 {
2917 	int ret;
2918 
2919 	/* if netpoll wants it, pretend we never saw it */
2920 	if (netpoll_rx(skb))
2921 		return NET_RX_DROP;
2922 
2923 	net_timestamp_check(netdev_tstamp_prequeue, skb);
2924 
2925 	trace_netif_rx(skb);
2926 #ifdef CONFIG_RPS
2927 	if (static_key_false(&rps_needed)) {
2928 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2929 		int cpu;
2930 
2931 		preempt_disable();
2932 		rcu_read_lock();
2933 
2934 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2935 		if (cpu < 0)
2936 			cpu = smp_processor_id();
2937 
2938 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2939 
2940 		rcu_read_unlock();
2941 		preempt_enable();
2942 	} else
2943 #endif
2944 	{
2945 		unsigned int qtail;
2946 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2947 		put_cpu();
2948 	}
2949 	return ret;
2950 }
2951 EXPORT_SYMBOL(netif_rx);
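/*
 * Example (a minimal sketch, not taken from any in-tree driver): a simple
 * non-NAPI driver hands a received frame to the stack from its interrupt
 * handler.  len and hw_buf are hypothetical:
 *
 *	skb = netdev_alloc_skb_ip_align(dev, len);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	memcpy(skb_put(skb, len), hw_buf, len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */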
2952 
2953 int netif_rx_ni(struct sk_buff *skb)
2954 {
2955 	int err;
2956 
2957 	preempt_disable();
2958 	err = netif_rx(skb);
2959 	if (local_softirq_pending())
2960 		do_softirq();
2961 	preempt_enable();
2962 
2963 	return err;
2964 }
2965 EXPORT_SYMBOL(netif_rx_ni);
2966 
2967 static void net_tx_action(struct softirq_action *h)
2968 {
2969 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2970 
2971 	if (sd->completion_queue) {
2972 		struct sk_buff *clist;
2973 
2974 		local_irq_disable();
2975 		clist = sd->completion_queue;
2976 		sd->completion_queue = NULL;
2977 		local_irq_enable();
2978 
2979 		while (clist) {
2980 			struct sk_buff *skb = clist;
2981 			clist = clist->next;
2982 
2983 			WARN_ON(atomic_read(&skb->users));
2984 			trace_kfree_skb(skb, net_tx_action);
2985 			__kfree_skb(skb);
2986 		}
2987 	}
2988 
2989 	if (sd->output_queue) {
2990 		struct Qdisc *head;
2991 
2992 		local_irq_disable();
2993 		head = sd->output_queue;
2994 		sd->output_queue = NULL;
2995 		sd->output_queue_tailp = &sd->output_queue;
2996 		local_irq_enable();
2997 
2998 		while (head) {
2999 			struct Qdisc *q = head;
3000 			spinlock_t *root_lock;
3001 
3002 			head = head->next_sched;
3003 
3004 			root_lock = qdisc_lock(q);
3005 			if (spin_trylock(root_lock)) {
3006 				smp_mb__before_clear_bit();
3007 				clear_bit(__QDISC_STATE_SCHED,
3008 					  &q->state);
3009 				qdisc_run(q);
3010 				spin_unlock(root_lock);
3011 			} else {
3012 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3013 					      &q->state)) {
3014 					__netif_reschedule(q);
3015 				} else {
3016 					smp_mb__before_clear_bit();
3017 					clear_bit(__QDISC_STATE_SCHED,
3018 						  &q->state);
3019 				}
3020 			}
3021 		}
3022 	}
3023 }
3024 
3025 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3026     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3027 /* This hook is defined here for ATM LANE */
3028 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3029 			     unsigned char *addr) __read_mostly;
3030 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3031 #endif
3032 
3033 #ifdef CONFIG_NET_CLS_ACT
3034 /* TODO: Maybe we should just force sch_ingress to be compiled in
3035  * when CONFIG_NET_CLS_ACT is?  Otherwise we pay for some useless
3036  * instructions (a compare and two extra stores) whenever sch_ingress
3037  * is not built but CONFIG_NET_CLS_ACT is.
3038  * NOTE: This doesn't stop any functionality; if you don't have
3039  * the ingress scheduler, you just can't add policies on ingress.
3040  *
3041  */
3042 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3043 {
3044 	struct net_device *dev = skb->dev;
3045 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3046 	int result = TC_ACT_OK;
3047 	struct Qdisc *q;
3048 
3049 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3050 		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3051 				     skb->skb_iif, dev->ifindex);
3052 		return TC_ACT_SHOT;
3053 	}
3054 
3055 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3056 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3057 
3058 	q = rxq->qdisc;
3059 	if (q != &noop_qdisc) {
3060 		spin_lock(qdisc_lock(q));
3061 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3062 			result = qdisc_enqueue_root(skb, q);
3063 		spin_unlock(qdisc_lock(q));
3064 	}
3065 
3066 	return result;
3067 }
3068 
3069 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3070 					 struct packet_type **pt_prev,
3071 					 int *ret, struct net_device *orig_dev)
3072 {
3073 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3074 
3075 	if (!rxq || rxq->qdisc == &noop_qdisc)
3076 		goto out;
3077 
3078 	if (*pt_prev) {
3079 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3080 		*pt_prev = NULL;
3081 	}
3082 
3083 	switch (ing_filter(skb, rxq)) {
3084 	case TC_ACT_SHOT:
3085 	case TC_ACT_STOLEN:
3086 		kfree_skb(skb);
3087 		return NULL;
3088 	}
3089 
3090 out:
3091 	skb->tc_verd = 0;
3092 	return skb;
3093 }
3094 #endif
3095 
3096 /**
3097  *	netdev_rx_handler_register - register receive handler
3098  *	@dev: device to register a handler for
3099  *	@rx_handler: receive handler to register
3100  *	@rx_handler_data: data pointer that is used by rx handler
3101  *
3102  *	Register a receive handler for a device. This handler will then be
3103  *	called from __netif_receive_skb. A negative errno code is returned
3104  *	on a failure.
3105  *
3106  *	The caller must hold the rtnl_mutex.
3107  *
3108  *	For a general description of rx_handler, see enum rx_handler_result.
3109  */
3110 int netdev_rx_handler_register(struct net_device *dev,
3111 			       rx_handler_func_t *rx_handler,
3112 			       void *rx_handler_data)
3113 {
3114 	ASSERT_RTNL();
3115 
3116 	if (dev->rx_handler)
3117 		return -EBUSY;
3118 
3119 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3120 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3121 
3122 	return 0;
3123 }
3124 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
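/*
 * Example (a minimal sketch, not taken from any in-tree user): a
 * bridging-style module claims a port device by registering an rx_handler
 * while holding rtnl (e.g. from its ndo_add_slave implementation).
 * mymod_handle_frame, mymod_wants, mymod_receive and the port structure
 * are hypothetical:
 *
 *	static rx_handler_result_t mymod_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct mymod_port *port =
 *			rcu_dereference((*pskb)->dev->rx_handler_data);
 *
 *		if (!mymod_wants(port, *pskb))
 *			return RX_HANDLER_PASS;
 *		mymod_receive(port, *pskb);
 *		return RX_HANDLER_CONSUMED;
 *	}
 *
 *	err = netdev_rx_handler_register(port_dev, mymod_handle_frame, port);
 */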
3125 
3126 /**
3127  *	netdev_rx_handler_unregister - unregister receive handler
3128  *	@dev: device to unregister a handler from
3129  *
3130  *	Unregister a receive handler from a device.
3131  *
3132  *	The caller must hold the rtnl_mutex.
3133  */
3134 void netdev_rx_handler_unregister(struct net_device *dev)
3135 {
3136 
3137 	ASSERT_RTNL();
3138 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3139 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3140 }
3141 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3142 
3143 static int __netif_receive_skb(struct sk_buff *skb)
3144 {
3145 	struct packet_type *ptype, *pt_prev;
3146 	rx_handler_func_t *rx_handler;
3147 	struct net_device *orig_dev;
3148 	struct net_device *null_or_dev;
3149 	bool deliver_exact = false;
3150 	int ret = NET_RX_DROP;
3151 	__be16 type;
3152 
3153 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3154 
3155 	trace_netif_receive_skb(skb);
3156 
3157 	/* if we've gotten here through NAPI, check netpoll */
3158 	if (netpoll_receive_skb(skb))
3159 		return NET_RX_DROP;
3160 
3161 	if (!skb->skb_iif)
3162 		skb->skb_iif = skb->dev->ifindex;
3163 	orig_dev = skb->dev;
3164 
3165 	skb_reset_network_header(skb);
3166 	skb_reset_transport_header(skb);
3167 	skb_reset_mac_len(skb);
3168 
3169 	pt_prev = NULL;
3170 
3171 	rcu_read_lock();
3172 
3173 another_round:
3174 
3175 	__this_cpu_inc(softnet_data.processed);
3176 
3177 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3178 		skb = vlan_untag(skb);
3179 		if (unlikely(!skb))
3180 			goto out;
3181 	}
3182 
3183 #ifdef CONFIG_NET_CLS_ACT
3184 	if (skb->tc_verd & TC_NCLS) {
3185 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3186 		goto ncls;
3187 	}
3188 #endif
3189 
3190 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3191 		if (!ptype->dev || ptype->dev == skb->dev) {
3192 			if (pt_prev)
3193 				ret = deliver_skb(skb, pt_prev, orig_dev);
3194 			pt_prev = ptype;
3195 		}
3196 	}
3197 
3198 #ifdef CONFIG_NET_CLS_ACT
3199 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3200 	if (!skb)
3201 		goto out;
3202 ncls:
3203 #endif
3204 
3205 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3206 	if (vlan_tx_tag_present(skb)) {
3207 		if (pt_prev) {
3208 			ret = deliver_skb(skb, pt_prev, orig_dev);
3209 			pt_prev = NULL;
3210 		}
3211 		if (vlan_do_receive(&skb, !rx_handler))
3212 			goto another_round;
3213 		else if (unlikely(!skb))
3214 			goto out;
3215 	}
3216 
3217 	if (rx_handler) {
3218 		if (pt_prev) {
3219 			ret = deliver_skb(skb, pt_prev, orig_dev);
3220 			pt_prev = NULL;
3221 		}
3222 		switch (rx_handler(&skb)) {
3223 		case RX_HANDLER_CONSUMED:
3224 			goto out;
3225 		case RX_HANDLER_ANOTHER:
3226 			goto another_round;
3227 		case RX_HANDLER_EXACT:
3228 			deliver_exact = true;
3229 		case RX_HANDLER_PASS:
3230 			break;
3231 		default:
3232 			BUG();
3233 		}
3234 	}
3235 
3236 	/* deliver only exact match when indicated */
3237 	null_or_dev = deliver_exact ? skb->dev : NULL;
3238 
3239 	type = skb->protocol;
3240 	list_for_each_entry_rcu(ptype,
3241 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3242 		if (ptype->type == type &&
3243 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3244 		     ptype->dev == orig_dev)) {
3245 			if (pt_prev)
3246 				ret = deliver_skb(skb, pt_prev, orig_dev);
3247 			pt_prev = ptype;
3248 		}
3249 	}
3250 
3251 	if (pt_prev) {
3252 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3253 	} else {
3254 		atomic_long_inc(&skb->dev->rx_dropped);
3255 		kfree_skb(skb);
3256 		/* Jamal, now you will not be able to escape explaining
3257 		 * to me how you were going to use this. :-)
3258 		 */
3259 		ret = NET_RX_DROP;
3260 	}
3261 
3262 out:
3263 	rcu_read_unlock();
3264 	return ret;
3265 }
3266 
3267 /**
3268  *	netif_receive_skb - process receive buffer from network
3269  *	@skb: buffer to process
3270  *
3271  *	netif_receive_skb() is the main receive data processing function.
3272  *	It always succeeds. The buffer may be dropped during processing
3273  *	for congestion control or by the protocol layers.
3274  *
3275  *	This function may only be called from softirq context and interrupts
3276  *	should be enabled.
3277  *
3278  *	Return values (usually ignored):
3279  *	NET_RX_SUCCESS: no congestion
3280  *	NET_RX_DROP: packet was dropped
3281  */
3282 int netif_receive_skb(struct sk_buff *skb)
3283 {
3284 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3285 
3286 	if (skb_defer_rx_timestamp(skb))
3287 		return NET_RX_SUCCESS;
3288 
3289 #ifdef CONFIG_RPS
3290 	if (static_key_false(&rps_needed)) {
3291 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3292 		int cpu, ret;
3293 
3294 		rcu_read_lock();
3295 
3296 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3297 
3298 		if (cpu >= 0) {
3299 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3300 			rcu_read_unlock();
3301 			return ret;
3302 		}
3303 		rcu_read_unlock();
3304 	}
3305 #endif
3306 	return __netif_receive_skb(skb);
3307 }
3308 EXPORT_SYMBOL(netif_receive_skb);
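/*
 * Example (a minimal sketch, not taken from any in-tree driver): a NAPI
 * driver's poll routine delivers completed RX buffers this way (or via
 * napi_gro_receive() when it wants GRO).  mydrv_next_rx_skb() and the
 * ring variable are hypothetical:
 *
 *	while (work_done < budget) {
 *		struct sk_buff *skb = mydrv_next_rx_skb(ring);
 *
 *		if (!skb)
 *			break;
 *		skb->protocol = eth_type_trans(skb, netdev);
 *		netif_receive_skb(skb);
 *		work_done++;
 *	}
 */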
3309 
3310 /* Network device is going away, flush any packets still pending
3311  * Called with irqs disabled.
3312  */
3313 static void flush_backlog(void *arg)
3314 {
3315 	struct net_device *dev = arg;
3316 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3317 	struct sk_buff *skb, *tmp;
3318 
3319 	rps_lock(sd);
3320 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3321 		if (skb->dev == dev) {
3322 			__skb_unlink(skb, &sd->input_pkt_queue);
3323 			kfree_skb(skb);
3324 			input_queue_head_incr(sd);
3325 		}
3326 	}
3327 	rps_unlock(sd);
3328 
3329 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3330 		if (skb->dev == dev) {
3331 			__skb_unlink(skb, &sd->process_queue);
3332 			kfree_skb(skb);
3333 			input_queue_head_incr(sd);
3334 		}
3335 	}
3336 }
3337 
3338 static int napi_gro_complete(struct sk_buff *skb)
3339 {
3340 	struct packet_type *ptype;
3341 	__be16 type = skb->protocol;
3342 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3343 	int err = -ENOENT;
3344 
3345 	if (NAPI_GRO_CB(skb)->count == 1) {
3346 		skb_shinfo(skb)->gso_size = 0;
3347 		goto out;
3348 	}
3349 
3350 	rcu_read_lock();
3351 	list_for_each_entry_rcu(ptype, head, list) {
3352 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3353 			continue;
3354 
3355 		err = ptype->gro_complete(skb);
3356 		break;
3357 	}
3358 	rcu_read_unlock();
3359 
3360 	if (err) {
3361 		WARN_ON(&ptype->list == head);
3362 		kfree_skb(skb);
3363 		return NET_RX_SUCCESS;
3364 	}
3365 
3366 out:
3367 	return netif_receive_skb(skb);
3368 }
3369 
3370 inline void napi_gro_flush(struct napi_struct *napi)
3371 {
3372 	struct sk_buff *skb, *next;
3373 
3374 	for (skb = napi->gro_list; skb; skb = next) {
3375 		next = skb->next;
3376 		skb->next = NULL;
3377 		napi_gro_complete(skb);
3378 	}
3379 
3380 	napi->gro_count = 0;
3381 	napi->gro_list = NULL;
3382 }
3383 EXPORT_SYMBOL(napi_gro_flush);
3384 
3385 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3386 {
3387 	struct sk_buff **pp = NULL;
3388 	struct packet_type *ptype;
3389 	__be16 type = skb->protocol;
3390 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3391 	int same_flow;
3392 	int mac_len;
3393 	enum gro_result ret;
3394 
3395 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3396 		goto normal;
3397 
3398 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3399 		goto normal;
3400 
3401 	rcu_read_lock();
3402 	list_for_each_entry_rcu(ptype, head, list) {
3403 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3404 			continue;
3405 
3406 		skb_set_network_header(skb, skb_gro_offset(skb));
3407 		mac_len = skb->network_header - skb->mac_header;
3408 		skb->mac_len = mac_len;
3409 		NAPI_GRO_CB(skb)->same_flow = 0;
3410 		NAPI_GRO_CB(skb)->flush = 0;
3411 		NAPI_GRO_CB(skb)->free = 0;
3412 
3413 		pp = ptype->gro_receive(&napi->gro_list, skb);
3414 		break;
3415 	}
3416 	rcu_read_unlock();
3417 
3418 	if (&ptype->list == head)
3419 		goto normal;
3420 
3421 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3422 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3423 
3424 	if (pp) {
3425 		struct sk_buff *nskb = *pp;
3426 
3427 		*pp = nskb->next;
3428 		nskb->next = NULL;
3429 		napi_gro_complete(nskb);
3430 		napi->gro_count--;
3431 	}
3432 
3433 	if (same_flow)
3434 		goto ok;
3435 
3436 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3437 		goto normal;
3438 
3439 	napi->gro_count++;
3440 	NAPI_GRO_CB(skb)->count = 1;
3441 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3442 	skb->next = napi->gro_list;
3443 	napi->gro_list = skb;
3444 	ret = GRO_HELD;
3445 
3446 pull:
3447 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3448 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3449 
3450 		BUG_ON(skb->end - skb->tail < grow);
3451 
3452 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3453 
3454 		skb->tail += grow;
3455 		skb->data_len -= grow;
3456 
3457 		skb_shinfo(skb)->frags[0].page_offset += grow;
3458 		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3459 
3460 		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3461 			skb_frag_unref(skb, 0);
3462 			memmove(skb_shinfo(skb)->frags,
3463 				skb_shinfo(skb)->frags + 1,
3464 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3465 		}
3466 	}
3467 
3468 ok:
3469 	return ret;
3470 
3471 normal:
3472 	ret = GRO_NORMAL;
3473 	goto pull;
3474 }
3475 EXPORT_SYMBOL(dev_gro_receive);
3476 
3477 static inline gro_result_t
3478 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3479 {
3480 	struct sk_buff *p;
3481 	unsigned int maclen = skb->dev->hard_header_len;
3482 
3483 	for (p = napi->gro_list; p; p = p->next) {
3484 		unsigned long diffs;
3485 
3486 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3487 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3488 		if (maclen == ETH_HLEN)
3489 			diffs |= compare_ether_header(skb_mac_header(p),
3490 						      skb_gro_mac_header(skb));
3491 		else if (!diffs)
3492 			diffs = memcmp(skb_mac_header(p),
3493 				       skb_gro_mac_header(skb),
3494 				       maclen);
3495 		NAPI_GRO_CB(p)->same_flow = !diffs;
3496 		NAPI_GRO_CB(p)->flush = 0;
3497 	}
3498 
3499 	return dev_gro_receive(napi, skb);
3500 }
3501 
3502 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3503 {
3504 	switch (ret) {
3505 	case GRO_NORMAL:
3506 		if (netif_receive_skb(skb))
3507 			ret = GRO_DROP;
3508 		break;
3509 
3510 	case GRO_DROP:
3511 		kfree_skb(skb);
3512 		break;
3513 
3514 	case GRO_MERGED_FREE:
3515 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3516 			kmem_cache_free(skbuff_head_cache, skb);
3517 		else
3518 			__kfree_skb(skb);
3519 		break;
3520 
3521 	case GRO_HELD:
3522 	case GRO_MERGED:
3523 		break;
3524 	}
3525 
3526 	return ret;
3527 }
3528 EXPORT_SYMBOL(napi_skb_finish);
3529 
3530 void skb_gro_reset_offset(struct sk_buff *skb)
3531 {
3532 	NAPI_GRO_CB(skb)->data_offset = 0;
3533 	NAPI_GRO_CB(skb)->frag0 = NULL;
3534 	NAPI_GRO_CB(skb)->frag0_len = 0;
3535 
3536 	if (skb->mac_header == skb->tail &&
3537 	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3538 		NAPI_GRO_CB(skb)->frag0 =
3539 			skb_frag_address(&skb_shinfo(skb)->frags[0]);
3540 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3541 	}
3542 }
3543 EXPORT_SYMBOL(skb_gro_reset_offset);
3544 
3545 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3546 {
3547 	skb_gro_reset_offset(skb);
3548 
3549 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3550 }
3551 EXPORT_SYMBOL(napi_gro_receive);
3552 
3553 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3554 {
3555 	__skb_pull(skb, skb_headlen(skb));
3556 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
3557 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3558 	skb->vlan_tci = 0;
3559 	skb->dev = napi->dev;
3560 	skb->skb_iif = 0;
3561 
3562 	napi->skb = skb;
3563 }
3564 
3565 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3566 {
3567 	struct sk_buff *skb = napi->skb;
3568 
3569 	if (!skb) {
3570 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3571 		if (skb)
3572 			napi->skb = skb;
3573 	}
3574 	return skb;
3575 }
3576 EXPORT_SYMBOL(napi_get_frags);
3577 
3578 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3579 			       gro_result_t ret)
3580 {
3581 	switch (ret) {
3582 	case GRO_NORMAL:
3583 	case GRO_HELD:
3584 		skb->protocol = eth_type_trans(skb, skb->dev);
3585 
3586 		if (ret == GRO_HELD)
3587 			skb_gro_pull(skb, -ETH_HLEN);
3588 		else if (netif_receive_skb(skb))
3589 			ret = GRO_DROP;
3590 		break;
3591 
3592 	case GRO_DROP:
3593 	case GRO_MERGED_FREE:
3594 		napi_reuse_skb(napi, skb);
3595 		break;
3596 
3597 	case GRO_MERGED:
3598 		break;
3599 	}
3600 
3601 	return ret;
3602 }
3603 EXPORT_SYMBOL(napi_frags_finish);
3604 
3605 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3606 {
3607 	struct sk_buff *skb = napi->skb;
3608 	struct ethhdr *eth;
3609 	unsigned int hlen;
3610 	unsigned int off;
3611 
3612 	napi->skb = NULL;
3613 
3614 	skb_reset_mac_header(skb);
3615 	skb_gro_reset_offset(skb);
3616 
3617 	off = skb_gro_offset(skb);
3618 	hlen = off + sizeof(*eth);
3619 	eth = skb_gro_header_fast(skb, off);
3620 	if (skb_gro_header_hard(skb, hlen)) {
3621 		eth = skb_gro_header_slow(skb, hlen, off);
3622 		if (unlikely(!eth)) {
3623 			napi_reuse_skb(napi, skb);
3624 			skb = NULL;
3625 			goto out;
3626 		}
3627 	}
3628 
3629 	skb_gro_pull(skb, sizeof(*eth));
3630 
3631 	/*
3632 	 * This works because the only protocols we care about don't require
3633 	 * special handling.  We'll fix it up properly at the end.
3634 	 */
3635 	skb->protocol = eth->h_proto;
3636 
3637 out:
3638 	return skb;
3639 }
3640 
3641 gro_result_t napi_gro_frags(struct napi_struct *napi)
3642 {
3643 	struct sk_buff *skb = napi_frags_skb(napi);
3644 
3645 	if (!skb)
3646 		return GRO_DROP;
3647 
3648 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3649 }
3650 EXPORT_SYMBOL(napi_gro_frags);
3651 
3652 /*
3653  * net_rps_action sends any pending IPIs for rps.
3654  * Note: called with local irq disabled, but exits with local irq enabled.
3655  */
3656 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3657 {
3658 #ifdef CONFIG_RPS
3659 	struct softnet_data *remsd = sd->rps_ipi_list;
3660 
3661 	if (remsd) {
3662 		sd->rps_ipi_list = NULL;
3663 
3664 		local_irq_enable();
3665 
3666 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3667 		while (remsd) {
3668 			struct softnet_data *next = remsd->rps_ipi_next;
3669 
3670 			if (cpu_online(remsd->cpu))
3671 				__smp_call_function_single(remsd->cpu,
3672 							   &remsd->csd, 0);
3673 			remsd = next;
3674 		}
3675 	} else
3676 #endif
3677 		local_irq_enable();
3678 }
3679 
3680 static int process_backlog(struct napi_struct *napi, int quota)
3681 {
3682 	int work = 0;
3683 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3684 
3685 #ifdef CONFIG_RPS
3686 	/* Check if we have pending IPIs; it's better to send them now
3687 	 * than to wait for net_rx_action() to end.
3688 	 */
3689 	if (sd->rps_ipi_list) {
3690 		local_irq_disable();
3691 		net_rps_action_and_irq_enable(sd);
3692 	}
3693 #endif
3694 	napi->weight = weight_p;
3695 	local_irq_disable();
3696 	while (work < quota) {
3697 		struct sk_buff *skb;
3698 		unsigned int qlen;
3699 
3700 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3701 			local_irq_enable();
3702 			__netif_receive_skb(skb);
3703 			local_irq_disable();
3704 			input_queue_head_incr(sd);
3705 			if (++work >= quota) {
3706 				local_irq_enable();
3707 				return work;
3708 			}
3709 		}
3710 
3711 		rps_lock(sd);
3712 		qlen = skb_queue_len(&sd->input_pkt_queue);
3713 		if (qlen)
3714 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3715 						   &sd->process_queue);
3716 
3717 		if (qlen < quota - work) {
3718 			/*
3719 			 * Inline a custom version of __napi_complete().
3720 			 * Only the current cpu owns and manipulates this napi,
3721 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3722 			 * so we can use a plain write instead of clear_bit()
3723 			 * and we don't need an smp_mb() memory barrier.
3724 			 */
3725 			list_del(&napi->poll_list);
3726 			napi->state = 0;
3727 
3728 			quota = work + qlen;
3729 		}
3730 		rps_unlock(sd);
3731 	}
3732 	local_irq_enable();
3733 
3734 	return work;
3735 }
3736 
3737 /**
3738  * __napi_schedule - schedule for receive
3739  * @n: entry to schedule
3740  *
3741  * The entry's receive function will be scheduled to run
3742  */
3743 void __napi_schedule(struct napi_struct *n)
3744 {
3745 	unsigned long flags;
3746 
3747 	local_irq_save(flags);
3748 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3749 	local_irq_restore(flags);
3750 }
3751 EXPORT_SYMBOL(__napi_schedule);
3752 
3753 void __napi_complete(struct napi_struct *n)
3754 {
3755 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3756 	BUG_ON(n->gro_list);
3757 
3758 	list_del(&n->poll_list);
3759 	smp_mb__before_clear_bit();
3760 	clear_bit(NAPI_STATE_SCHED, &n->state);
3761 }
3762 EXPORT_SYMBOL(__napi_complete);
3763 
3764 void napi_complete(struct napi_struct *n)
3765 {
3766 	unsigned long flags;
3767 
3768 	/*
3769 	 * don't let napi dequeue from the cpu poll list
3770 	 * just in case its running on a different cpu
3771 	 */
3772 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3773 		return;
3774 
3775 	napi_gro_flush(n);
3776 	local_irq_save(flags);
3777 	__napi_complete(n);
3778 	local_irq_restore(flags);
3779 }
3780 EXPORT_SYMBOL(napi_complete);
3781 
3782 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3783 		    int (*poll)(struct napi_struct *, int), int weight)
3784 {
3785 	INIT_LIST_HEAD(&napi->poll_list);
3786 	napi->gro_count = 0;
3787 	napi->gro_list = NULL;
3788 	napi->skb = NULL;
3789 	napi->poll = poll;
3790 	napi->weight = weight;
3791 	list_add(&napi->dev_list, &dev->napi_list);
3792 	napi->dev = dev;
3793 #ifdef CONFIG_NETPOLL
3794 	spin_lock_init(&napi->poll_lock);
3795 	napi->poll_owner = -1;
3796 #endif
3797 	set_bit(NAPI_STATE_SCHED, &napi->state);
3798 }
3799 EXPORT_SYMBOL(netif_napi_add);
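/*
 * Illustrative sketch (not part of the original source): the usual driver
 * pattern around netif_napi_add()/napi_schedule()/napi_complete().  The
 * example_* names and the priv structure are hypothetical.
 *
 *	static int example_poll(struct napi_struct *napi, int budget)
 *	{
 *		int done = example_clean_rx(napi, budget);	// hypothetical rx helper
 *
 *		if (done < budget) {
 *			napi_complete(napi);
 *			example_enable_rx_irq(napi->dev);	// hypothetical
 *		}
 *		return done;
 *	}
 *
 *	// probe:       netif_napi_add(dev, &priv->napi, example_poll, 64);
 *	//              followed by napi_enable(&priv->napi) when the device is opened
 *	// IRQ handler: disable further rx interrupts, then napi_schedule(&priv->napi);
 */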
3800 
3801 void netif_napi_del(struct napi_struct *napi)
3802 {
3803 	struct sk_buff *skb, *next;
3804 
3805 	list_del_init(&napi->dev_list);
3806 	napi_free_frags(napi);
3807 
3808 	for (skb = napi->gro_list; skb; skb = next) {
3809 		next = skb->next;
3810 		skb->next = NULL;
3811 		kfree_skb(skb);
3812 	}
3813 
3814 	napi->gro_list = NULL;
3815 	napi->gro_count = 0;
3816 }
3817 EXPORT_SYMBOL(netif_napi_del);
3818 
3819 static void net_rx_action(struct softirq_action *h)
3820 {
3821 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3822 	unsigned long time_limit = jiffies + 2;
3823 	int budget = netdev_budget;
3824 	void *have;
3825 
3826 	local_irq_disable();
3827 
3828 	while (!list_empty(&sd->poll_list)) {
3829 		struct napi_struct *n;
3830 		int work, weight;
3831 
3832 		/* If the softirq window is exhausted then punt.
3833 		 * Allow this to run for 2 jiffies, which allows
3834 		 * an average latency of 1.5/HZ.
3835 		 */
3836 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3837 			goto softnet_break;
3838 
3839 		local_irq_enable();
3840 
3841 		/* Even though interrupts have been re-enabled, this
3842 		 * access is safe because interrupts can only add new
3843 		 * entries to the tail of this list, and only ->poll()
3844 		 * calls can remove this head entry from the list.
3845 		 */
3846 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3847 
3848 		have = netpoll_poll_lock(n);
3849 
3850 		weight = n->weight;
3851 
3852 		/* This NAPI_STATE_SCHED test is for avoiding a race
3853 		 * with netpoll's poll_napi().  Only the entity which
3854 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3855 		 * actually make the ->poll() call.  Therefore we avoid
3856 		 * accidentally calling ->poll() when NAPI is not scheduled.
3857 		 */
3858 		work = 0;
3859 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3860 			work = n->poll(n, weight);
3861 			trace_napi_poll(n);
3862 		}
3863 
3864 		WARN_ON_ONCE(work > weight);
3865 
3866 		budget -= work;
3867 
3868 		local_irq_disable();
3869 
3870 		/* Drivers must not modify the NAPI state if they
3871 		 * consume the entire weight.  In such cases this code
3872 		 * still "owns" the NAPI instance and therefore can
3873 		 * move the instance around on the list at-will.
3874 		 */
3875 		if (unlikely(work == weight)) {
3876 			if (unlikely(napi_disable_pending(n))) {
3877 				local_irq_enable();
3878 				napi_complete(n);
3879 				local_irq_disable();
3880 			} else
3881 				list_move_tail(&n->poll_list, &sd->poll_list);
3882 		}
3883 
3884 		netpoll_poll_unlock(have);
3885 	}
3886 out:
3887 	net_rps_action_and_irq_enable(sd);
3888 
3889 #ifdef CONFIG_NET_DMA
3890 	/*
3891 	 * There may not be any more sk_buffs coming right now, so push
3892 	 * any pending DMA copies to hardware
3893 	 */
3894 	dma_issue_pending_all();
3895 #endif
3896 
3897 	return;
3898 
3899 softnet_break:
3900 	sd->time_squeeze++;
3901 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3902 	goto out;
3903 }
3904 
3905 static gifconf_func_t *gifconf_list[NPROTO];
3906 
3907 /**
3908  *	register_gifconf	-	register a SIOCGIF handler
3909  *	@family: Address family
3910  *	@gifconf: Function handler
3911  *
3912  *	Register protocol dependent address dumping routines. The handler
3913  *	that is passed must not be freed or reused until it has been replaced
3914  *	by another handler.
3915  */
3916 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3917 {
3918 	if (family >= NPROTO)
3919 		return -EINVAL;
3920 	gifconf_list[family] = gifconf;
3921 	return 0;
3922 }
3923 EXPORT_SYMBOL(register_gifconf);
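/*
 * Illustrative sketch (not part of the original source): an address family
 * registers its SIOCGIFCONF helper once at protocol init time.  The handler
 * name below is hypothetical; its prototype follows gifconf_func_t.
 *
 *	static int example_gifconf(struct net_device *dev, char __user *buf,
 *				   int len);
 *
 *	register_gifconf(PF_INET, example_gifconf);
 *
 * When @buf is NULL the handler only reports how many bytes it would need
 * (see dev_ifconf() below).
 */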
3924 
3925 
3926 /*
3927  *	Map an interface index to its name (SIOCGIFNAME)
3928  */
3929 
3930 /*
3931  *	We need this ioctl for efficient implementation of the
3932  *	if_indextoname() function required by the IPv6 API.  Without
3933  *	it, we would have to search all the interfaces to find a
3934  *	match.  --pb
3935  */
3936 
3937 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3938 {
3939 	struct net_device *dev;
3940 	struct ifreq ifr;
3941 
3942 	/*
3943 	 *	Fetch the caller's info block.
3944 	 */
3945 
3946 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3947 		return -EFAULT;
3948 
3949 	rcu_read_lock();
3950 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3951 	if (!dev) {
3952 		rcu_read_unlock();
3953 		return -ENODEV;
3954 	}
3955 
3956 	strcpy(ifr.ifr_name, dev->name);
3957 	rcu_read_unlock();
3958 
3959 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3960 		return -EFAULT;
3961 	return 0;
3962 }
3963 
3964 /*
3965  *	Perform a SIOCGIFCONF call. This structure will change
3966  *	size eventually, and there is nothing I can do about it.
3967  *	Thus we will need a 'compatibility mode'.
3968  */
3969 
3970 static int dev_ifconf(struct net *net, char __user *arg)
3971 {
3972 	struct ifconf ifc;
3973 	struct net_device *dev;
3974 	char __user *pos;
3975 	int len;
3976 	int total;
3977 	int i;
3978 
3979 	/*
3980 	 *	Fetch the caller's info block.
3981 	 */
3982 
3983 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3984 		return -EFAULT;
3985 
3986 	pos = ifc.ifc_buf;
3987 	len = ifc.ifc_len;
3988 
3989 	/*
3990 	 *	Loop over the interfaces, and write an info block for each.
3991 	 */
3992 
3993 	total = 0;
3994 	for_each_netdev(net, dev) {
3995 		for (i = 0; i < NPROTO; i++) {
3996 			if (gifconf_list[i]) {
3997 				int done;
3998 				if (!pos)
3999 					done = gifconf_list[i](dev, NULL, 0);
4000 				else
4001 					done = gifconf_list[i](dev, pos + total,
4002 							       len - total);
4003 				if (done < 0)
4004 					return -EFAULT;
4005 				total += done;
4006 			}
4007 		}
4008 	}
4009 
4010 	/*
4011 	 *	All done.  Write the updated control block back to the caller.
4012 	 */
4013 	ifc.ifc_len = total;
4014 
4015 	/*
4016 	 * 	Both BSD and Solaris return 0 here, so we do too.
4017 	 */
4018 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4019 }
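/*
 * Illustrative sketch (not part of the original source): the user-space
 * counterpart of dev_ifconf() above, driven through SIOCGIFCONF.
 *
 *	struct ifreq reqs[16];
 *	struct ifconf ifc;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	ifc.ifc_len = sizeof(reqs);
 *	ifc.ifc_req = reqs;
 *	if (fd >= 0 && ioctl(fd, SIOCGIFCONF, &ifc) == 0) {
 *		int n = ifc.ifc_len / sizeof(struct ifreq);
 *		// reqs[0..n-1].ifr_name now hold the configured interface names
 *	}
 */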
4020 
4021 #ifdef CONFIG_PROC_FS
4022 
4023 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4024 
4025 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4026 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4027 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4028 
4029 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4030 {
4031 	struct net *net = seq_file_net(seq);
4032 	struct net_device *dev;
4033 	struct hlist_node *p;
4034 	struct hlist_head *h;
4035 	unsigned int count = 0, offset = get_offset(*pos);
4036 
4037 	h = &net->dev_name_head[get_bucket(*pos)];
4038 	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4039 		if (++count == offset)
4040 			return dev;
4041 	}
4042 
4043 	return NULL;
4044 }
4045 
4046 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4047 {
4048 	struct net_device *dev;
4049 	unsigned int bucket;
4050 
4051 	do {
4052 		dev = dev_from_same_bucket(seq, pos);
4053 		if (dev)
4054 			return dev;
4055 
4056 		bucket = get_bucket(*pos) + 1;
4057 		*pos = set_bucket_offset(bucket, 1);
4058 	} while (bucket < NETDEV_HASHENTRIES);
4059 
4060 	return NULL;
4061 }
4062 
4063 /*
4064  *	This is invoked by the /proc filesystem handler to display a device
4065  *	in detail.
4066  */
4067 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4068 	__acquires(RCU)
4069 {
4070 	rcu_read_lock();
4071 	if (!*pos)
4072 		return SEQ_START_TOKEN;
4073 
4074 	if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4075 		return NULL;
4076 
4077 	return dev_from_bucket(seq, pos);
4078 }
4079 
4080 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4081 {
4082 	++*pos;
4083 	return dev_from_bucket(seq, pos);
4084 }
4085 
4086 void dev_seq_stop(struct seq_file *seq, void *v)
4087 	__releases(RCU)
4088 {
4089 	rcu_read_unlock();
4090 }
4091 
4092 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4093 {
4094 	struct rtnl_link_stats64 temp;
4095 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4096 
4097 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4098 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4099 		   dev->name, stats->rx_bytes, stats->rx_packets,
4100 		   stats->rx_errors,
4101 		   stats->rx_dropped + stats->rx_missed_errors,
4102 		   stats->rx_fifo_errors,
4103 		   stats->rx_length_errors + stats->rx_over_errors +
4104 		    stats->rx_crc_errors + stats->rx_frame_errors,
4105 		   stats->rx_compressed, stats->multicast,
4106 		   stats->tx_bytes, stats->tx_packets,
4107 		   stats->tx_errors, stats->tx_dropped,
4108 		   stats->tx_fifo_errors, stats->collisions,
4109 		   stats->tx_carrier_errors +
4110 		    stats->tx_aborted_errors +
4111 		    stats->tx_window_errors +
4112 		    stats->tx_heartbeat_errors,
4113 		   stats->tx_compressed);
4114 }
4115 
4116 /*
4117  *	Called from the PROCfs module. This now uses the new arbitrary sized
4118  *	/proc/net interface to create /proc/net/dev
4119  */
4120 static int dev_seq_show(struct seq_file *seq, void *v)
4121 {
4122 	if (v == SEQ_START_TOKEN)
4123 		seq_puts(seq, "Inter-|   Receive                            "
4124 			      "                    |  Transmit\n"
4125 			      " face |bytes    packets errs drop fifo frame "
4126 			      "compressed multicast|bytes    packets errs "
4127 			      "drop fifo colls carrier compressed\n");
4128 	else
4129 		dev_seq_printf_stats(seq, v);
4130 	return 0;
4131 }
4132 
4133 static struct softnet_data *softnet_get_online(loff_t *pos)
4134 {
4135 	struct softnet_data *sd = NULL;
4136 
4137 	while (*pos < nr_cpu_ids)
4138 		if (cpu_online(*pos)) {
4139 			sd = &per_cpu(softnet_data, *pos);
4140 			break;
4141 		} else
4142 			++*pos;
4143 	return sd;
4144 }
4145 
4146 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4147 {
4148 	return softnet_get_online(pos);
4149 }
4150 
4151 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4152 {
4153 	++*pos;
4154 	return softnet_get_online(pos);
4155 }
4156 
4157 static void softnet_seq_stop(struct seq_file *seq, void *v)
4158 {
4159 }
4160 
4161 static int softnet_seq_show(struct seq_file *seq, void *v)
4162 {
4163 	struct softnet_data *sd = v;
4164 
4165 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4166 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4167 		   0, 0, 0, 0, /* was fastroute */
4168 		   sd->cpu_collision, sd->received_rps);
4169 	return 0;
4170 }
4171 
4172 static const struct seq_operations dev_seq_ops = {
4173 	.start = dev_seq_start,
4174 	.next  = dev_seq_next,
4175 	.stop  = dev_seq_stop,
4176 	.show  = dev_seq_show,
4177 };
4178 
4179 static int dev_seq_open(struct inode *inode, struct file *file)
4180 {
4181 	return seq_open_net(inode, file, &dev_seq_ops,
4182 			    sizeof(struct seq_net_private));
4183 }
4184 
4185 static const struct file_operations dev_seq_fops = {
4186 	.owner	 = THIS_MODULE,
4187 	.open    = dev_seq_open,
4188 	.read    = seq_read,
4189 	.llseek  = seq_lseek,
4190 	.release = seq_release_net,
4191 };
4192 
4193 static const struct seq_operations softnet_seq_ops = {
4194 	.start = softnet_seq_start,
4195 	.next  = softnet_seq_next,
4196 	.stop  = softnet_seq_stop,
4197 	.show  = softnet_seq_show,
4198 };
4199 
4200 static int softnet_seq_open(struct inode *inode, struct file *file)
4201 {
4202 	return seq_open(file, &softnet_seq_ops);
4203 }
4204 
4205 static const struct file_operations softnet_seq_fops = {
4206 	.owner	 = THIS_MODULE,
4207 	.open    = softnet_seq_open,
4208 	.read    = seq_read,
4209 	.llseek  = seq_lseek,
4210 	.release = seq_release,
4211 };
4212 
4213 static void *ptype_get_idx(loff_t pos)
4214 {
4215 	struct packet_type *pt = NULL;
4216 	loff_t i = 0;
4217 	int t;
4218 
4219 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4220 		if (i == pos)
4221 			return pt;
4222 		++i;
4223 	}
4224 
4225 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4226 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4227 			if (i == pos)
4228 				return pt;
4229 			++i;
4230 		}
4231 	}
4232 	return NULL;
4233 }
4234 
4235 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4236 	__acquires(RCU)
4237 {
4238 	rcu_read_lock();
4239 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4240 }
4241 
4242 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4243 {
4244 	struct packet_type *pt;
4245 	struct list_head *nxt;
4246 	int hash;
4247 
4248 	++*pos;
4249 	if (v == SEQ_START_TOKEN)
4250 		return ptype_get_idx(0);
4251 
4252 	pt = v;
4253 	nxt = pt->list.next;
4254 	if (pt->type == htons(ETH_P_ALL)) {
4255 		if (nxt != &ptype_all)
4256 			goto found;
4257 		hash = 0;
4258 		nxt = ptype_base[0].next;
4259 	} else
4260 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4261 
4262 	while (nxt == &ptype_base[hash]) {
4263 		if (++hash >= PTYPE_HASH_SIZE)
4264 			return NULL;
4265 		nxt = ptype_base[hash].next;
4266 	}
4267 found:
4268 	return list_entry(nxt, struct packet_type, list);
4269 }
4270 
4271 static void ptype_seq_stop(struct seq_file *seq, void *v)
4272 	__releases(RCU)
4273 {
4274 	rcu_read_unlock();
4275 }
4276 
4277 static int ptype_seq_show(struct seq_file *seq, void *v)
4278 {
4279 	struct packet_type *pt = v;
4280 
4281 	if (v == SEQ_START_TOKEN)
4282 		seq_puts(seq, "Type Device      Function\n");
4283 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4284 		if (pt->type == htons(ETH_P_ALL))
4285 			seq_puts(seq, "ALL ");
4286 		else
4287 			seq_printf(seq, "%04x", ntohs(pt->type));
4288 
4289 		seq_printf(seq, " %-8s %pF\n",
4290 			   pt->dev ? pt->dev->name : "", pt->func);
4291 	}
4292 
4293 	return 0;
4294 }
4295 
4296 static const struct seq_operations ptype_seq_ops = {
4297 	.start = ptype_seq_start,
4298 	.next  = ptype_seq_next,
4299 	.stop  = ptype_seq_stop,
4300 	.show  = ptype_seq_show,
4301 };
4302 
4303 static int ptype_seq_open(struct inode *inode, struct file *file)
4304 {
4305 	return seq_open_net(inode, file, &ptype_seq_ops,
4306 			sizeof(struct seq_net_private));
4307 }
4308 
4309 static const struct file_operations ptype_seq_fops = {
4310 	.owner	 = THIS_MODULE,
4311 	.open    = ptype_seq_open,
4312 	.read    = seq_read,
4313 	.llseek  = seq_lseek,
4314 	.release = seq_release_net,
4315 };
4316 
4317 
4318 static int __net_init dev_proc_net_init(struct net *net)
4319 {
4320 	int rc = -ENOMEM;
4321 
4322 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4323 		goto out;
4324 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4325 		goto out_dev;
4326 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4327 		goto out_softnet;
4328 
4329 	if (wext_proc_init(net))
4330 		goto out_ptype;
4331 	rc = 0;
4332 out:
4333 	return rc;
4334 out_ptype:
4335 	proc_net_remove(net, "ptype");
4336 out_softnet:
4337 	proc_net_remove(net, "softnet_stat");
4338 out_dev:
4339 	proc_net_remove(net, "dev");
4340 	goto out;
4341 }
4342 
4343 static void __net_exit dev_proc_net_exit(struct net *net)
4344 {
4345 	wext_proc_exit(net);
4346 
4347 	proc_net_remove(net, "ptype");
4348 	proc_net_remove(net, "softnet_stat");
4349 	proc_net_remove(net, "dev");
4350 }
4351 
4352 static struct pernet_operations __net_initdata dev_proc_ops = {
4353 	.init = dev_proc_net_init,
4354 	.exit = dev_proc_net_exit,
4355 };
4356 
4357 static int __init dev_proc_init(void)
4358 {
4359 	return register_pernet_subsys(&dev_proc_ops);
4360 }
4361 #else
4362 #define dev_proc_init() 0
4363 #endif	/* CONFIG_PROC_FS */
4364 
4365 
4366 /**
4367  *	netdev_set_master	-	set up master pointer
4368  *	@slave: slave device
4369  *	@master: new master device
4370  *
4371  *	Changes the master device of the slave. Pass %NULL to break the
4372  *	bonding. The caller must hold the RTNL semaphore. On a failure
4373  *	a negative errno code is returned. On success the reference counts
4374  *	are adjusted and the function returns zero.
4375  */
4376 int netdev_set_master(struct net_device *slave, struct net_device *master)
4377 {
4378 	struct net_device *old = slave->master;
4379 
4380 	ASSERT_RTNL();
4381 
4382 	if (master) {
4383 		if (old)
4384 			return -EBUSY;
4385 		dev_hold(master);
4386 	}
4387 
4388 	slave->master = master;
4389 
4390 	if (old)
4391 		dev_put(old);
4392 	return 0;
4393 }
4394 EXPORT_SYMBOL(netdev_set_master);
4395 
4396 /**
4397  *	netdev_set_bond_master	-	set up bonding master/slave pair
4398  *	@slave: slave device
4399  *	@master: new master device
4400  *
4401  *	Changes the master device of the slave. Pass %NULL to break the
4402  *	bonding. The caller must hold the RTNL semaphore. On a failure
4403  *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4404  *	to the routing socket and the function returns zero.
4405  */
4406 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4407 {
4408 	int err;
4409 
4410 	ASSERT_RTNL();
4411 
4412 	err = netdev_set_master(slave, master);
4413 	if (err)
4414 		return err;
4415 	if (master)
4416 		slave->flags |= IFF_SLAVE;
4417 	else
4418 		slave->flags &= ~IFF_SLAVE;
4419 
4420 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4421 	return 0;
4422 }
4423 EXPORT_SYMBOL(netdev_set_bond_master);
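/*
 * Illustrative sketch (not part of the original source): a bonding-style
 * driver pairing and later releasing a slave under RTNL.  Error handling
 * is elided and the device pointers are hypothetical.
 *
 *	rtnl_lock();
 *	err = netdev_set_bond_master(slave_dev, bond_dev);	// enslave
 *	...
 *	err = netdev_set_bond_master(slave_dev, NULL);		// release
 *	rtnl_unlock();
 */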
4424 
4425 static void dev_change_rx_flags(struct net_device *dev, int flags)
4426 {
4427 	const struct net_device_ops *ops = dev->netdev_ops;
4428 
4429 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4430 		ops->ndo_change_rx_flags(dev, flags);
4431 }
4432 
4433 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4434 {
4435 	unsigned int old_flags = dev->flags;
4436 	uid_t uid;
4437 	gid_t gid;
4438 
4439 	ASSERT_RTNL();
4440 
4441 	dev->flags |= IFF_PROMISC;
4442 	dev->promiscuity += inc;
4443 	if (dev->promiscuity == 0) {
4444 		/*
4445 		 * Avoid overflow.
4446 		 * If inc causes overflow, leave promiscuity untouched and return an error.
4447 		 */
4448 		if (inc < 0)
4449 			dev->flags &= ~IFF_PROMISC;
4450 		else {
4451 			dev->promiscuity -= inc;
4452 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4453 				dev->name);
4454 			return -EOVERFLOW;
4455 		}
4456 	}
4457 	if (dev->flags != old_flags) {
4458 		pr_info("device %s %s promiscuous mode\n",
4459 			dev->name,
4460 			dev->flags & IFF_PROMISC ? "entered" : "left");
4461 		if (audit_enabled) {
4462 			current_uid_gid(&uid, &gid);
4463 			audit_log(current->audit_context, GFP_ATOMIC,
4464 				AUDIT_ANOM_PROMISCUOUS,
4465 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4466 				dev->name, (dev->flags & IFF_PROMISC),
4467 				(old_flags & IFF_PROMISC),
4468 				audit_get_loginuid(current),
4469 				uid, gid,
4470 				audit_get_sessionid(current));
4471 		}
4472 
4473 		dev_change_rx_flags(dev, IFF_PROMISC);
4474 	}
4475 	return 0;
4476 }
4477 
4478 /**
4479  *	dev_set_promiscuity	- update promiscuity count on a device
4480  *	@dev: device
4481  *	@inc: modifier
4482  *
4483  *	Add or remove promiscuity from a device. While the count in the device
4484  *	remains above zero the interface remains promiscuous. Once it hits zero
4485  *	the device reverts back to normal filtering operation. A negative inc
4486  *	value is used to drop promiscuity on the device.
4487  *	Return 0 if successful or a negative errno code on error.
4488  */
4489 int dev_set_promiscuity(struct net_device *dev, int inc)
4490 {
4491 	unsigned int old_flags = dev->flags;
4492 	int err;
4493 
4494 	err = __dev_set_promiscuity(dev, inc);
4495 	if (err < 0)
4496 		return err;
4497 	if (dev->flags != old_flags)
4498 		dev_set_rx_mode(dev);
4499 	return err;
4500 }
4501 EXPORT_SYMBOL(dev_set_promiscuity);
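/*
 * Illustrative sketch (not part of the original source): a packet-capture
 * style user of the promiscuity counter.  Each +1 must eventually be paired
 * with a -1, and the call requires the RTNL lock (see the ASSERT_RTNL in
 * __dev_set_promiscuity() above).
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	// start seeing all frames
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);		// drop our reference
 *	rtnl_unlock();
 */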
4502 
4503 /**
4504  *	dev_set_allmulti	- update allmulti count on a device
4505  *	@dev: device
4506  *	@inc: modifier
4507  *
4508  *	Add or remove reception of all multicast frames to a device. While the
4509  *	count in the device remains above zero the interface remains listening
4510  *	to all multicast frames. Once it hits zero the device reverts back to normal
4511  *	filtering operation. A negative @inc value is used to drop the counter
4512  *	when releasing a resource needing all multicasts.
4513  *	Return 0 if successful or a negative errno code on error.
4514  */
4515 
4516 int dev_set_allmulti(struct net_device *dev, int inc)
4517 {
4518 	unsigned int old_flags = dev->flags;
4519 
4520 	ASSERT_RTNL();
4521 
4522 	dev->flags |= IFF_ALLMULTI;
4523 	dev->allmulti += inc;
4524 	if (dev->allmulti == 0) {
4525 		/*
4526 		 * Avoid overflow.
4527 		 * If inc causes overflow, untouch allmulti and return error.
4528 		 * If inc causes overflow, leave allmulti untouched and return an error.
4529 		if (inc < 0)
4530 			dev->flags &= ~IFF_ALLMULTI;
4531 		else {
4532 			dev->allmulti -= inc;
4533 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4534 				dev->name);
4535 			return -EOVERFLOW;
4536 		}
4537 	}
4538 	if (dev->flags ^ old_flags) {
4539 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4540 		dev_set_rx_mode(dev);
4541 	}
4542 	return 0;
4543 }
4544 EXPORT_SYMBOL(dev_set_allmulti);
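/*
 * Illustrative sketch (not part of the original source): a stacked device
 * (a VLAN-style driver, say) propagating its own IFF_ALLMULTI state to the
 * lower device when its rx flags change.  example_get_lower() is hypothetical.
 *
 *	static void example_change_rx_flags(struct net_device *dev, int change)
 *	{
 *		struct net_device *lower_dev = example_get_lower(dev);	// hypothetical
 *
 *		if (change & IFF_ALLMULTI)
 *			dev_set_allmulti(lower_dev,
 *					 dev->flags & IFF_ALLMULTI ? 1 : -1);
 *	}
 */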
4545 
4546 /*
4547  *	Upload unicast and multicast address lists to device and
4548  *	configure RX filtering. When the device doesn't support unicast
4549  *	filtering it is put in promiscuous mode while unicast addresses
4550  *	are present.
4551  */
4552 void __dev_set_rx_mode(struct net_device *dev)
4553 {
4554 	const struct net_device_ops *ops = dev->netdev_ops;
4555 
4556 	/* dev_open will call this function so the list will stay sane. */
4557 	if (!(dev->flags&IFF_UP))
4558 		return;
4559 
4560 	if (!netif_device_present(dev))
4561 		return;
4562 
4563 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4564 		/* Unicast address changes may only happen under the rtnl,
4565 		 * therefore calling __dev_set_promiscuity here is safe.
4566 		 */
4567 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4568 			__dev_set_promiscuity(dev, 1);
4569 			dev->uc_promisc = true;
4570 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4571 			__dev_set_promiscuity(dev, -1);
4572 			dev->uc_promisc = false;
4573 		}
4574 	}
4575 
4576 	if (ops->ndo_set_rx_mode)
4577 		ops->ndo_set_rx_mode(dev);
4578 }
4579 
4580 void dev_set_rx_mode(struct net_device *dev)
4581 {
4582 	netif_addr_lock_bh(dev);
4583 	__dev_set_rx_mode(dev);
4584 	netif_addr_unlock_bh(dev);
4585 }
4586 
4587 /**
4588  *	dev_get_flags - get flags reported to userspace
4589  *	@dev: device
4590  *
4591  *	Get the combination of flag bits exported through APIs to userspace.
4592  */
4593 unsigned int dev_get_flags(const struct net_device *dev)
4594 {
4595 	unsigned int flags;
4596 
4597 	flags = (dev->flags & ~(IFF_PROMISC |
4598 				IFF_ALLMULTI |
4599 				IFF_RUNNING |
4600 				IFF_LOWER_UP |
4601 				IFF_DORMANT)) |
4602 		(dev->gflags & (IFF_PROMISC |
4603 				IFF_ALLMULTI));
4604 
4605 	if (netif_running(dev)) {
4606 		if (netif_oper_up(dev))
4607 			flags |= IFF_RUNNING;
4608 		if (netif_carrier_ok(dev))
4609 			flags |= IFF_LOWER_UP;
4610 		if (netif_dormant(dev))
4611 			flags |= IFF_DORMANT;
4612 	}
4613 
4614 	return flags;
4615 }
4616 EXPORT_SYMBOL(dev_get_flags);
4617 
4618 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4619 {
4620 	unsigned int old_flags = dev->flags;
4621 	int ret;
4622 
4623 	ASSERT_RTNL();
4624 
4625 	/*
4626 	 *	Set the flags on our device.
4627 	 */
4628 
4629 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4630 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4631 			       IFF_AUTOMEDIA)) |
4632 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4633 				    IFF_ALLMULTI));
4634 
4635 	/*
4636 	 *	Load in the correct multicast list now the flags have changed.
4637 	 */
4638 
4639 	if ((old_flags ^ flags) & IFF_MULTICAST)
4640 		dev_change_rx_flags(dev, IFF_MULTICAST);
4641 
4642 	dev_set_rx_mode(dev);
4643 
4644 	/*
4645 	 *	Have we downed the interface? We handle IFF_UP ourselves
4646 	 *	according to user attempts to set it, rather than blindly
4647 	 *	setting it.
4648 	 */
4649 
4650 	ret = 0;
4651 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4652 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4653 
4654 		if (!ret)
4655 			dev_set_rx_mode(dev);
4656 	}
4657 
4658 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4659 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4660 
4661 		dev->gflags ^= IFF_PROMISC;
4662 		dev_set_promiscuity(dev, inc);
4663 	}
4664 
4665 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4666 	   is important. Some (broken) drivers set IFF_PROMISC when
4667 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4668 	 */
4669 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4670 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4671 
4672 		dev->gflags ^= IFF_ALLMULTI;
4673 		dev_set_allmulti(dev, inc);
4674 	}
4675 
4676 	return ret;
4677 }
4678 
4679 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4680 {
4681 	unsigned int changes = dev->flags ^ old_flags;
4682 
4683 	if (changes & IFF_UP) {
4684 		if (dev->flags & IFF_UP)
4685 			call_netdevice_notifiers(NETDEV_UP, dev);
4686 		else
4687 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4688 	}
4689 
4690 	if (dev->flags & IFF_UP &&
4691 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4692 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4693 }
4694 
4695 /**
4696  *	dev_change_flags - change device settings
4697  *	@dev: device
4698  *	@flags: device state flags
4699  *
4700  *	Change settings on device based state flags. The flags are
4701  *	in the userspace exported format.
4702  */
4703 int dev_change_flags(struct net_device *dev, unsigned int flags)
4704 {
4705 	int ret;
4706 	unsigned int changes, old_flags = dev->flags;
4707 
4708 	ret = __dev_change_flags(dev, flags);
4709 	if (ret < 0)
4710 		return ret;
4711 
4712 	changes = old_flags ^ dev->flags;
4713 	if (changes)
4714 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4715 
4716 	__dev_notify_flags(dev, old_flags);
4717 	return ret;
4718 }
4719 EXPORT_SYMBOL(dev_change_flags);
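/*
 * Illustrative sketch (not part of the original source): bringing an
 * interface administratively up from kernel code, under RTNL.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 *
 * Clearing IFF_UP the same way takes the interface down again.
 */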
4720 
4721 /**
4722  *	dev_set_mtu - Change maximum transfer unit
4723  *	@dev: device
4724  *	@new_mtu: new transfer unit
4725  *
4726  *	Change the maximum transfer size of the network device.
4727  */
4728 int dev_set_mtu(struct net_device *dev, int new_mtu)
4729 {
4730 	const struct net_device_ops *ops = dev->netdev_ops;
4731 	int err;
4732 
4733 	if (new_mtu == dev->mtu)
4734 		return 0;
4735 
4736 	/*	MTU must be positive.	 */
4737 	if (new_mtu < 0)
4738 		return -EINVAL;
4739 
4740 	if (!netif_device_present(dev))
4741 		return -ENODEV;
4742 
4743 	err = 0;
4744 	if (ops->ndo_change_mtu)
4745 		err = ops->ndo_change_mtu(dev, new_mtu);
4746 	else
4747 		dev->mtu = new_mtu;
4748 
4749 	if (!err && dev->flags & IFF_UP)
4750 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4751 	return err;
4752 }
4753 EXPORT_SYMBOL(dev_set_mtu);
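/*
 * Illustrative sketch (not part of the original source): changing the MTU
 * from kernel code, typically with RTNL held.  The 9000-byte value is just
 * an example.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 *
 * On success, NETDEV_CHANGEMTU is delivered to the notifier chain if the
 * device is up.
 */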
4754 
4755 /**
4756  *	dev_set_group - Change group this device belongs to
4757  *	@dev: device
4758  *	@new_group: group this device should belong to
4759  */
4760 void dev_set_group(struct net_device *dev, int new_group)
4761 {
4762 	dev->group = new_group;
4763 }
4764 EXPORT_SYMBOL(dev_set_group);
4765 
4766 /**
4767  *	dev_set_mac_address - Change Media Access Control Address
4768  *	@dev: device
4769  *	@sa: new address
4770  *
4771  *	Change the hardware (MAC) address of the device
4772  */
4773 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4774 {
4775 	const struct net_device_ops *ops = dev->netdev_ops;
4776 	int err;
4777 
4778 	if (!ops->ndo_set_mac_address)
4779 		return -EOPNOTSUPP;
4780 	if (sa->sa_family != dev->type)
4781 		return -EINVAL;
4782 	if (!netif_device_present(dev))
4783 		return -ENODEV;
4784 	err = ops->ndo_set_mac_address(dev, sa);
4785 	if (!err)
4786 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4787 	return err;
4788 }
4789 EXPORT_SYMBOL(dev_set_mac_address);
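/*
 * Illustrative sketch (not part of the original source): setting a MAC
 * address from kernel code, typically with RTNL held.  The address bytes are
 * made up; sa_family must match dev->type (ARPHRD_ETHER for Ethernet).
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, "\x02\x00\x00\x00\x00\x01", ETH_ALEN);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */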
4790 
4791 /*
4792  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4793  */
4794 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4795 {
4796 	int err;
4797 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4798 
4799 	if (!dev)
4800 		return -ENODEV;
4801 
4802 	switch (cmd) {
4803 	case SIOCGIFFLAGS:	/* Get interface flags */
4804 		ifr->ifr_flags = (short) dev_get_flags(dev);
4805 		return 0;
4806 
4807 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4808 				   (currently unused) */
4809 		ifr->ifr_metric = 0;
4810 		return 0;
4811 
4812 	case SIOCGIFMTU:	/* Get the MTU of a device */
4813 		ifr->ifr_mtu = dev->mtu;
4814 		return 0;
4815 
4816 	case SIOCGIFHWADDR:
4817 		if (!dev->addr_len)
4818 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4819 		else
4820 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4821 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4822 		ifr->ifr_hwaddr.sa_family = dev->type;
4823 		return 0;
4824 
4825 	case SIOCGIFSLAVE:
4826 		err = -EINVAL;
4827 		break;
4828 
4829 	case SIOCGIFMAP:
4830 		ifr->ifr_map.mem_start = dev->mem_start;
4831 		ifr->ifr_map.mem_end   = dev->mem_end;
4832 		ifr->ifr_map.base_addr = dev->base_addr;
4833 		ifr->ifr_map.irq       = dev->irq;
4834 		ifr->ifr_map.dma       = dev->dma;
4835 		ifr->ifr_map.port      = dev->if_port;
4836 		return 0;
4837 
4838 	case SIOCGIFINDEX:
4839 		ifr->ifr_ifindex = dev->ifindex;
4840 		return 0;
4841 
4842 	case SIOCGIFTXQLEN:
4843 		ifr->ifr_qlen = dev->tx_queue_len;
4844 		return 0;
4845 
4846 	default:
4847 		/* dev_ioctl() should ensure this case
4848 		 * is never reached
4849 		 */
4850 		WARN_ON(1);
4851 		err = -ENOTTY;
4852 		break;
4853 
4854 	}
4855 	return err;
4856 }
4857 
4858 /*
4859  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4860  */
4861 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4862 {
4863 	int err;
4864 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4865 	const struct net_device_ops *ops;
4866 
4867 	if (!dev)
4868 		return -ENODEV;
4869 
4870 	ops = dev->netdev_ops;
4871 
4872 	switch (cmd) {
4873 	case SIOCSIFFLAGS:	/* Set interface flags */
4874 		return dev_change_flags(dev, ifr->ifr_flags);
4875 
4876 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4877 				   (currently unused) */
4878 		return -EOPNOTSUPP;
4879 
4880 	case SIOCSIFMTU:	/* Set the MTU of a device */
4881 		return dev_set_mtu(dev, ifr->ifr_mtu);
4882 
4883 	case SIOCSIFHWADDR:
4884 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4885 
4886 	case SIOCSIFHWBROADCAST:
4887 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4888 			return -EINVAL;
4889 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4890 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4891 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4892 		return 0;
4893 
4894 	case SIOCSIFMAP:
4895 		if (ops->ndo_set_config) {
4896 			if (!netif_device_present(dev))
4897 				return -ENODEV;
4898 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4899 		}
4900 		return -EOPNOTSUPP;
4901 
4902 	case SIOCADDMULTI:
4903 		if (!ops->ndo_set_rx_mode ||
4904 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4905 			return -EINVAL;
4906 		if (!netif_device_present(dev))
4907 			return -ENODEV;
4908 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4909 
4910 	case SIOCDELMULTI:
4911 		if (!ops->ndo_set_rx_mode ||
4912 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4913 			return -EINVAL;
4914 		if (!netif_device_present(dev))
4915 			return -ENODEV;
4916 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4917 
4918 	case SIOCSIFTXQLEN:
4919 		if (ifr->ifr_qlen < 0)
4920 			return -EINVAL;
4921 		dev->tx_queue_len = ifr->ifr_qlen;
4922 		return 0;
4923 
4924 	case SIOCSIFNAME:
4925 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4926 		return dev_change_name(dev, ifr->ifr_newname);
4927 
4928 	case SIOCSHWTSTAMP:
4929 		err = net_hwtstamp_validate(ifr);
4930 		if (err)
4931 			return err;
4932 		/* fall through */
4933 
4934 	/*
4935 	 *	Unknown or private ioctl
4936 	 */
4937 	default:
4938 		if ((cmd >= SIOCDEVPRIVATE &&
4939 		    cmd <= SIOCDEVPRIVATE + 15) ||
4940 		    cmd == SIOCBONDENSLAVE ||
4941 		    cmd == SIOCBONDRELEASE ||
4942 		    cmd == SIOCBONDSETHWADDR ||
4943 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4944 		    cmd == SIOCBONDINFOQUERY ||
4945 		    cmd == SIOCBONDCHANGEACTIVE ||
4946 		    cmd == SIOCGMIIPHY ||
4947 		    cmd == SIOCGMIIREG ||
4948 		    cmd == SIOCSMIIREG ||
4949 		    cmd == SIOCBRADDIF ||
4950 		    cmd == SIOCBRDELIF ||
4951 		    cmd == SIOCSHWTSTAMP ||
4952 		    cmd == SIOCWANDEV) {
4953 			err = -EOPNOTSUPP;
4954 			if (ops->ndo_do_ioctl) {
4955 				if (netif_device_present(dev))
4956 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4957 				else
4958 					err = -ENODEV;
4959 			}
4960 		} else
4961 			err = -EINVAL;
4962 
4963 	}
4964 	return err;
4965 }
4966 
4967 /*
4968  *	This function handles all "interface"-type I/O control requests. The actual
4969  *	'doing' part of this is dev_ifsioc above.
4970  */
4971 
4972 /**
4973  *	dev_ioctl	-	network device ioctl
4974  *	@net: the applicable net namespace
4975  *	@cmd: command to issue
4976  *	@arg: pointer to a struct ifreq in user space
4977  *
4978  *	Issue ioctl functions to devices. This is normally called by the
4979  *	user space syscall interfaces but can sometimes be useful for
4980  *	other purposes. The return value is the return from the syscall if
4981  *	positive or a negative errno code on error.
4982  */
4983 
4984 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4985 {
4986 	struct ifreq ifr;
4987 	int ret;
4988 	char *colon;
4989 
4990 	/* One special case: SIOCGIFCONF takes an ifconf argument
4991 	   and requires a shared lock, because it sleeps while writing
4992 	   to user space.
4993 	 */
4994 
4995 	if (cmd == SIOCGIFCONF) {
4996 		rtnl_lock();
4997 		ret = dev_ifconf(net, (char __user *) arg);
4998 		rtnl_unlock();
4999 		return ret;
5000 	}
5001 	if (cmd == SIOCGIFNAME)
5002 		return dev_ifname(net, (struct ifreq __user *)arg);
5003 
5004 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5005 		return -EFAULT;
5006 
5007 	ifr.ifr_name[IFNAMSIZ-1] = 0;
5008 
5009 	colon = strchr(ifr.ifr_name, ':');
5010 	if (colon)
5011 		*colon = 0;
5012 
5013 	/*
5014 	 *	See which interface the caller is talking about.
5015 	 */
5016 
5017 	switch (cmd) {
5018 	/*
5019 	 *	These ioctl calls:
5020 	 *	- can be done by all.
5021 	 *	- atomic and do not require locking.
5022 	 *	- return a value
5023 	 */
5024 	case SIOCGIFFLAGS:
5025 	case SIOCGIFMETRIC:
5026 	case SIOCGIFMTU:
5027 	case SIOCGIFHWADDR:
5028 	case SIOCGIFSLAVE:
5029 	case SIOCGIFMAP:
5030 	case SIOCGIFINDEX:
5031 	case SIOCGIFTXQLEN:
5032 		dev_load(net, ifr.ifr_name);
5033 		rcu_read_lock();
5034 		ret = dev_ifsioc_locked(net, &ifr, cmd);
5035 		rcu_read_unlock();
5036 		if (!ret) {
5037 			if (colon)
5038 				*colon = ':';
5039 			if (copy_to_user(arg, &ifr,
5040 					 sizeof(struct ifreq)))
5041 				ret = -EFAULT;
5042 		}
5043 		return ret;
5044 
5045 	case SIOCETHTOOL:
5046 		dev_load(net, ifr.ifr_name);
5047 		rtnl_lock();
5048 		ret = dev_ethtool(net, &ifr);
5049 		rtnl_unlock();
5050 		if (!ret) {
5051 			if (colon)
5052 				*colon = ':';
5053 			if (copy_to_user(arg, &ifr,
5054 					 sizeof(struct ifreq)))
5055 				ret = -EFAULT;
5056 		}
5057 		return ret;
5058 
5059 	/*
5060 	 *	These ioctl calls:
5061 	 *	- require superuser power.
5062 	 *	- require strict serialization.
5063 	 *	- return a value
5064 	 */
5065 	case SIOCGMIIPHY:
5066 	case SIOCGMIIREG:
5067 	case SIOCSIFNAME:
5068 		if (!capable(CAP_NET_ADMIN))
5069 			return -EPERM;
5070 		dev_load(net, ifr.ifr_name);
5071 		rtnl_lock();
5072 		ret = dev_ifsioc(net, &ifr, cmd);
5073 		rtnl_unlock();
5074 		if (!ret) {
5075 			if (colon)
5076 				*colon = ':';
5077 			if (copy_to_user(arg, &ifr,
5078 					 sizeof(struct ifreq)))
5079 				ret = -EFAULT;
5080 		}
5081 		return ret;
5082 
5083 	/*
5084 	 *	These ioctl calls:
5085 	 *	- require superuser power.
5086 	 *	- require strict serialization.
5087 	 *	- do not return a value
5088 	 */
5089 	case SIOCSIFFLAGS:
5090 	case SIOCSIFMETRIC:
5091 	case SIOCSIFMTU:
5092 	case SIOCSIFMAP:
5093 	case SIOCSIFHWADDR:
5094 	case SIOCSIFSLAVE:
5095 	case SIOCADDMULTI:
5096 	case SIOCDELMULTI:
5097 	case SIOCSIFHWBROADCAST:
5098 	case SIOCSIFTXQLEN:
5099 	case SIOCSMIIREG:
5100 	case SIOCBONDENSLAVE:
5101 	case SIOCBONDRELEASE:
5102 	case SIOCBONDSETHWADDR:
5103 	case SIOCBONDCHANGEACTIVE:
5104 	case SIOCBRADDIF:
5105 	case SIOCBRDELIF:
5106 	case SIOCSHWTSTAMP:
5107 		if (!capable(CAP_NET_ADMIN))
5108 			return -EPERM;
5109 		/* fall through */
5110 	case SIOCBONDSLAVEINFOQUERY:
5111 	case SIOCBONDINFOQUERY:
5112 		dev_load(net, ifr.ifr_name);
5113 		rtnl_lock();
5114 		ret = dev_ifsioc(net, &ifr, cmd);
5115 		rtnl_unlock();
5116 		return ret;
5117 
5118 	case SIOCGIFMEM:
5119 		/* Get the per device memory space. We can add this but
5120 		 * currently do not support it */
5121 	case SIOCSIFMEM:
5122 		/* Set the per device memory buffer space.
5123 		 * Not applicable in our case */
5124 	case SIOCSIFLINK:
5125 		return -ENOTTY;
5126 
5127 	/*
5128 	 *	Unknown or private ioctl.
5129 	 */
5130 	default:
5131 		if (cmd == SIOCWANDEV ||
5132 		    (cmd >= SIOCDEVPRIVATE &&
5133 		     cmd <= SIOCDEVPRIVATE + 15)) {
5134 			dev_load(net, ifr.ifr_name);
5135 			rtnl_lock();
5136 			ret = dev_ifsioc(net, &ifr, cmd);
5137 			rtnl_unlock();
5138 			if (!ret && copy_to_user(arg, &ifr,
5139 						 sizeof(struct ifreq)))
5140 				ret = -EFAULT;
5141 			return ret;
5142 		}
5143 		/* Take care of Wireless Extensions */
5144 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5145 			return wext_handle_ioctl(net, &ifr, cmd, arg);
5146 		return -ENOTTY;
5147 	}
5148 }
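/*
 * Illustrative sketch (not part of the original source): the user-space view
 * of the dispatcher above, toggling IFF_UP via SIOCGIFFLAGS/SIOCSIFFLAGS.
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	if (fd >= 0 && ioctl(fd, SIOCGIFFLAGS, &ifr) == 0) {
 *		ifr.ifr_flags |= IFF_UP;
 *		ioctl(fd, SIOCSIFFLAGS, &ifr);	// requires CAP_NET_ADMIN
 *	}
 */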
5149 
5150 
5151 /**
5152  *	dev_new_index	-	allocate an ifindex
5153  *	@net: the applicable net namespace
5154  *
5155  *	Returns a suitable unique value for a new device interface
5156  *	number.  The caller must hold the rtnl semaphore or the
5157  *	dev_base_lock to be sure it remains unique.
5158  */
5159 static int dev_new_index(struct net *net)
5160 {
5161 	static int ifindex;
5162 	for (;;) {
5163 		if (++ifindex <= 0)
5164 			ifindex = 1;
5165 		if (!__dev_get_by_index(net, ifindex))
5166 			return ifindex;
5167 	}
5168 }
5169 
5170 /* Delayed registration/unregistration */
5171 static LIST_HEAD(net_todo_list);
5172 
5173 static void net_set_todo(struct net_device *dev)
5174 {
5175 	list_add_tail(&dev->todo_list, &net_todo_list);
5176 }
5177 
5178 static void rollback_registered_many(struct list_head *head)
5179 {
5180 	struct net_device *dev, *tmp;
5181 
5182 	BUG_ON(dev_boot_phase);
5183 	ASSERT_RTNL();
5184 
5185 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5186 		/* Some devices call this without ever having been
5187 		 * registered, as part of initialization unwind. Remove
5188 		 * those devices and proceed with the remaining ones.
5189 		 */
5190 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5191 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5192 				 dev->name, dev);
5193 
5194 			WARN_ON(1);
5195 			list_del(&dev->unreg_list);
5196 			continue;
5197 		}
5198 		dev->dismantle = true;
5199 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5200 	}
5201 
5202 	/* If device is running, close it first. */
5203 	dev_close_many(head);
5204 
5205 	list_for_each_entry(dev, head, unreg_list) {
5206 		/* And unlink it from device chain. */
5207 		unlist_netdevice(dev);
5208 
5209 		dev->reg_state = NETREG_UNREGISTERING;
5210 	}
5211 
5212 	synchronize_net();
5213 
5214 	list_for_each_entry(dev, head, unreg_list) {
5215 		/* Shutdown queueing discipline. */
5216 		dev_shutdown(dev);
5217 
5218 
5219 		/* Notify protocols that we are about to destroy
5220 		   this device. They should clean up all of their state.
5221 		*/
5222 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5223 
5224 		if (!dev->rtnl_link_ops ||
5225 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5226 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5227 
5228 		/*
5229 		 *	Flush the unicast and multicast chains
5230 		 */
5231 		dev_uc_flush(dev);
5232 		dev_mc_flush(dev);
5233 
5234 		if (dev->netdev_ops->ndo_uninit)
5235 			dev->netdev_ops->ndo_uninit(dev);
5236 
5237 		/* Notifier chain MUST detach us from master device. */
5238 		WARN_ON(dev->master);
5239 
5240 		/* Remove entries from kobject tree */
5241 		netdev_unregister_kobject(dev);
5242 	}
5243 
5244 	/* Process any work delayed until the end of the batch */
5245 	dev = list_first_entry(head, struct net_device, unreg_list);
5246 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5247 
5248 	synchronize_net();
5249 
5250 	list_for_each_entry(dev, head, unreg_list)
5251 		dev_put(dev);
5252 }
5253 
5254 static void rollback_registered(struct net_device *dev)
5255 {
5256 	LIST_HEAD(single);
5257 
5258 	list_add(&dev->unreg_list, &single);
5259 	rollback_registered_many(&single);
5260 	list_del(&single);
5261 }
5262 
5263 static netdev_features_t netdev_fix_features(struct net_device *dev,
5264 	netdev_features_t features)
5265 {
5266 	/* Fix illegal checksum combinations */
5267 	if ((features & NETIF_F_HW_CSUM) &&
5268 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5269 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5270 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5271 	}
5272 
5273 	/* Fix illegal SG+CSUM combinations. */
5274 	if ((features & NETIF_F_SG) &&
5275 	    !(features & NETIF_F_ALL_CSUM)) {
5276 		netdev_dbg(dev,
5277 			"Dropping NETIF_F_SG since no checksum feature.\n");
5278 		features &= ~NETIF_F_SG;
5279 	}
5280 
5281 	/* TSO requires that SG is present as well. */
5282 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5283 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5284 		features &= ~NETIF_F_ALL_TSO;
5285 	}
5286 
5287 	/* TSO ECN requires that TSO is present as well. */
5288 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5289 		features &= ~NETIF_F_TSO_ECN;
5290 
5291 	/* Software GSO depends on SG. */
5292 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5293 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5294 		features &= ~NETIF_F_GSO;
5295 	}
5296 
5297 	/* UFO needs SG and checksumming */
5298 	if (features & NETIF_F_UFO) {
5299 		/* maybe split UFO into V4 and V6? */
5300 		if (!((features & NETIF_F_GEN_CSUM) ||
5301 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5302 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5303 			netdev_dbg(dev,
5304 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5305 			features &= ~NETIF_F_UFO;
5306 		}
5307 
5308 		if (!(features & NETIF_F_SG)) {
5309 			netdev_dbg(dev,
5310 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5311 			features &= ~NETIF_F_UFO;
5312 		}
5313 	}
5314 
5315 	return features;
5316 }
5317 
5318 int __netdev_update_features(struct net_device *dev)
5319 {
5320 	netdev_features_t features;
5321 	int err = 0;
5322 
5323 	ASSERT_RTNL();
5324 
5325 	features = netdev_get_wanted_features(dev);
5326 
5327 	if (dev->netdev_ops->ndo_fix_features)
5328 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5329 
5330 	/* driver might be less strict about feature dependencies */
5331 	features = netdev_fix_features(dev, features);
5332 
5333 	if (dev->features == features)
5334 		return 0;
5335 
5336 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5337 		&dev->features, &features);
5338 
5339 	if (dev->netdev_ops->ndo_set_features)
5340 		err = dev->netdev_ops->ndo_set_features(dev, features);
5341 
5342 	if (unlikely(err < 0)) {
5343 		netdev_err(dev,
5344 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5345 			err, &features, &dev->features);
5346 		return -1;
5347 	}
5348 
5349 	if (!err)
5350 		dev->features = features;
5351 
5352 	return 1;
5353 }
5354 
5355 /**
5356  *	netdev_update_features - recalculate device features
5357  *	@dev: the device to check
5358  *
5359  *	Recalculate dev->features set and send notifications if it
5360  *	has changed. Should be called after driver- or hardware-dependent
5361  *	conditions that influence the features might have changed.
5362  */
5363 void netdev_update_features(struct net_device *dev)
5364 {
5365 	if (__netdev_update_features(dev))
5366 		netdev_features_change(dev);
5367 }
5368 EXPORT_SYMBOL(netdev_update_features);
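/*
 * Illustrative sketch (not part of the original source): a driver whose
 * offloads depend on runtime state (here a hypothetical jumbo-frame limit)
 * restricts them in ndo_fix_features and re-evaluates via
 * netdev_update_features() with RTNL held.
 *
 *	static netdev_features_t example_fix_features(struct net_device *dev,
 *						      netdev_features_t features)
 *	{
 *		if (dev->mtu > 1500)		// hypothetical hardware limit
 *			features &= ~NETIF_F_ALL_TSO;
 *		return features;
 *	}
 *
 *	// after changing the MTU, with RTNL held:
 *	//	netdev_update_features(dev);
 */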
5369 
5370 /**
5371  *	netdev_change_features - recalculate device features
5372  *	@dev: the device to check
5373  *
5374  *	Recalculate dev->features set and send notifications even
5375  *	if they have not changed. Should be called instead of
5376  *	netdev_update_features() if also dev->vlan_features might
5377  *	netdev_update_features() if dev->vlan_features might also have
5378  *	changed, to allow the changes to be propagated to stacked
5379  */
5380 void netdev_change_features(struct net_device *dev)
5381 {
5382 	__netdev_update_features(dev);
5383 	netdev_features_change(dev);
5384 }
5385 EXPORT_SYMBOL(netdev_change_features);
5386 
5387 /**
5388  *	netif_stacked_transfer_operstate -	transfer operstate
5389  *	@rootdev: the root or lower level device to transfer state from
5390  *	@dev: the device to transfer operstate to
5391  *
5392  *	Transfer operational state from root to device. This is normally
5393  *	called when a stacking relationship exists between the root
5394  *	device and the device (a leaf device).
5395  */
5396 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5397 					struct net_device *dev)
5398 {
5399 	if (rootdev->operstate == IF_OPER_DORMANT)
5400 		netif_dormant_on(dev);
5401 	else
5402 		netif_dormant_off(dev);
5403 
5404 	if (netif_carrier_ok(rootdev)) {
5405 		if (!netif_carrier_ok(dev))
5406 			netif_carrier_on(dev);
5407 	} else {
5408 		if (netif_carrier_ok(dev))
5409 			netif_carrier_off(dev);
5410 	}
5411 }
5412 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5413 
5414 #ifdef CONFIG_RPS
5415 static int netif_alloc_rx_queues(struct net_device *dev)
5416 {
5417 	unsigned int i, count = dev->num_rx_queues;
5418 	struct netdev_rx_queue *rx;
5419 
5420 	BUG_ON(count < 1);
5421 
5422 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5423 	if (!rx) {
5424 		pr_err("netdev: Unable to allocate %u rx queues\n", count);
5425 		return -ENOMEM;
5426 	}
5427 	dev->_rx = rx;
5428 
5429 	for (i = 0; i < count; i++)
5430 		rx[i].dev = dev;
5431 	return 0;
5432 }
5433 #endif
5434 
5435 static void netdev_init_one_queue(struct net_device *dev,
5436 				  struct netdev_queue *queue, void *_unused)
5437 {
5438 	/* Initialize queue lock */
5439 	spin_lock_init(&queue->_xmit_lock);
5440 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5441 	queue->xmit_lock_owner = -1;
5442 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5443 	queue->dev = dev;
5444 #ifdef CONFIG_BQL
5445 	dql_init(&queue->dql, HZ);
5446 #endif
5447 }
5448 
5449 static int netif_alloc_netdev_queues(struct net_device *dev)
5450 {
5451 	unsigned int count = dev->num_tx_queues;
5452 	struct netdev_queue *tx;
5453 
5454 	BUG_ON(count < 1);
5455 
5456 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5457 	if (!tx) {
5458 		pr_err("netdev: Unable to allocate %u tx queues\n", count);
5459 		return -ENOMEM;
5460 	}
5461 	dev->_tx = tx;
5462 
5463 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5464 	spin_lock_init(&dev->tx_global_lock);
5465 
5466 	return 0;
5467 }
5468 
5469 /**
5470  *	register_netdevice	- register a network device
5471  *	@dev: device to register
5472  *
5473  *	Take a completed network device structure and add it to the kernel
5474  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5475  *	chain. 0 is returned on success. A negative errno code is returned
5476  *	on a failure to set up the device, or if the name is a duplicate.
5477  *
5478  *	Callers must hold the rtnl semaphore. You may want
5479  *	register_netdev() instead of this.
5480  *
5481  *	BUGS:
5482  *	The locking appears insufficient to guarantee two parallel registers
5483  *	will not get the same name.
5484  */
5485 
5486 int register_netdevice(struct net_device *dev)
5487 {
5488 	int ret;
5489 	struct net *net = dev_net(dev);
5490 
5491 	BUG_ON(dev_boot_phase);
5492 	ASSERT_RTNL();
5493 
5494 	might_sleep();
5495 
5496 	/* When net_device structures are persistent, this will be fatal. */
5497 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5498 	BUG_ON(!net);
5499 
5500 	spin_lock_init(&dev->addr_list_lock);
5501 	netdev_set_addr_lockdep_class(dev);
5502 
5503 	dev->iflink = -1;
5504 
5505 	ret = dev_get_valid_name(dev, dev->name);
5506 	if (ret < 0)
5507 		goto out;
5508 
5509 	/* Init, if this function is available */
5510 	if (dev->netdev_ops->ndo_init) {
5511 		ret = dev->netdev_ops->ndo_init(dev);
5512 		if (ret) {
5513 			if (ret > 0)
5514 				ret = -EIO;
5515 			goto out;
5516 		}
5517 	}
5518 
5519 	dev->ifindex = dev_new_index(net);
5520 	if (dev->iflink == -1)
5521 		dev->iflink = dev->ifindex;
5522 
5523 	/* Transfer changeable features to wanted_features and enable
5524 	 * software offloads (GSO and GRO).
5525 	 */
5526 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5527 	dev->features |= NETIF_F_SOFT_FEATURES;
5528 	dev->wanted_features = dev->features & dev->hw_features;
5529 
5530 	/* Turn on no cache copy if HW is doing checksum */
5531 	if (!(dev->flags & IFF_LOOPBACK)) {
5532 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5533 		if (dev->features & NETIF_F_ALL_CSUM) {
5534 			dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5535 			dev->features |= NETIF_F_NOCACHE_COPY;
5536 		}
5537 	}
5538 
5539 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5540 	 */
5541 	dev->vlan_features |= NETIF_F_HIGHDMA;
5542 
5543 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5544 	ret = notifier_to_errno(ret);
5545 	if (ret)
5546 		goto err_uninit;
5547 
5548 	ret = netdev_register_kobject(dev);
5549 	if (ret)
5550 		goto err_uninit;
5551 	dev->reg_state = NETREG_REGISTERED;
5552 
5553 	__netdev_update_features(dev);
5554 
5555 	/*
5556 	 *	Default initial state at registration is that the
5557 	 *	device is present.
5558 	 */
5559 
5560 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5561 
5562 	dev_init_scheduler(dev);
5563 	dev_hold(dev);
5564 	list_netdevice(dev);
5565 
5566 	/* Notify protocols that a new device appeared. */
5567 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5568 	ret = notifier_to_errno(ret);
5569 	if (ret) {
5570 		rollback_registered(dev);
5571 		dev->reg_state = NETREG_UNREGISTERED;
5572 	}
5573 	/*
5574 	 *	Prevent userspace races by waiting until the network
5575 	 *	device is fully set up before sending notifications.
5576 	 */
5577 	if (!dev->rtnl_link_ops ||
5578 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5579 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5580 
5581 out:
5582 	return ret;
5583 
5584 err_uninit:
5585 	if (dev->netdev_ops->ndo_uninit)
5586 		dev->netdev_ops->ndo_uninit(dev);
5587 	goto out;
5588 }
5589 EXPORT_SYMBOL(register_netdevice);
5590 
5591 /**
5592  *	init_dummy_netdev	- init a dummy network device for NAPI
5593  *	@dev: device to init
5594  *
5595  *	This takes a network device structure and initializes the minimum
5596  *	set of fields so it can be used to schedule NAPI polls without
5597  *	registering a full blown interface. This is to be used by drivers
5598  *	that need to tie several hardware interfaces to a single NAPI
5599  *	poll scheduler due to HW limitations.
5600  */
5601 int init_dummy_netdev(struct net_device *dev)
5602 {
5603 	/* Clear everything. Note we don't initialize spinlocks
5604 	 * as they aren't supposed to be taken by any of the
5605 	 * NAPI code and this dummy netdev is supposed to be
5606 	 * only ever used for NAPI polls
5607 	 */
5608 	memset(dev, 0, sizeof(struct net_device));
5609 
5610 	/* make sure we BUG if trying to hit standard
5611 	 * register/unregister code path
5612 	 */
5613 	dev->reg_state = NETREG_DUMMY;
5614 
5615 	/* NAPI wants this */
5616 	INIT_LIST_HEAD(&dev->napi_list);
5617 
5618 	/* a dummy interface is started by default */
5619 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5620 	set_bit(__LINK_STATE_START, &dev->state);
5621 
5622 	/* Note: we don't allocate pcpu_refcnt for dummy devices,
5623 	 * because users of this 'device' don't need to change
5624 	 * its refcount.
5625 	 */
5626 
5627 	return 0;
5628 }
5629 EXPORT_SYMBOL_GPL(init_dummy_netdev);
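/*
 * Illustrative sketch (not part of the original source): a driver with one
 * interrupt serving several hardware ports can hang its NAPI context off a
 * dummy netdev.  The priv structure and example_poll() are hypothetical.
 *
 *	init_dummy_netdev(&priv->dummy_dev);
 *	netif_napi_add(&priv->dummy_dev, &priv->napi, example_poll, 64);
 *	napi_enable(&priv->napi);
 */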
5630 
5631 
5632 /**
5633  *	register_netdev	- register a network device
5634  *	@dev: device to register
5635  *
5636  *	Take a completed network device structure and add it to the kernel
5637  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5638  *	chain. 0 is returned on success. A negative errno code is returned
5639  *	on a failure to set up the device, or if the name is a duplicate.
5640  *
5641  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5642  *	and expands the device name if you passed a format string to
5643  *	alloc_netdev.
5644  */
5645 int register_netdev(struct net_device *dev)
5646 {
5647 	int err;
5648 
5649 	rtnl_lock();
5650 	err = register_netdevice(dev);
5651 	rtnl_unlock();
5652 	return err;
5653 }
5654 EXPORT_SYMBOL(register_netdev);
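/*
 * Illustrative sketch (not part of the original source): the common probe
 * sequence around register_netdev().  Names other than the core API are
 * hypothetical.
 *
 *	struct net_device *dev;
 *	int err;
 *
 *	dev = alloc_etherdev(sizeof(struct example_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &example_netdev_ops;
 *
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 *	// teardown later: unregister_netdev(dev); free_netdev(dev);
 */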
5655 
5656 int netdev_refcnt_read(const struct net_device *dev)
5657 {
5658 	int i, refcnt = 0;
5659 
5660 	for_each_possible_cpu(i)
5661 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5662 	return refcnt;
5663 }
5664 EXPORT_SYMBOL(netdev_refcnt_read);
5665 
5666 /*
5667  * netdev_wait_allrefs - wait until all references are gone.
5668  *
5669  * This is called when unregistering network devices.
5670  *
5671  * Any protocol or device that holds a reference should register
5672  * for netdevice notification, and clean up and put back the
5673  * reference if they receive an UNREGISTER event.
5674  * We can get stuck here if buggy protocols don't correctly
5675  * call dev_put.
5676  */
5677 static void netdev_wait_allrefs(struct net_device *dev)
5678 {
5679 	unsigned long rebroadcast_time, warning_time;
5680 	int refcnt;
5681 
5682 	linkwatch_forget_dev(dev);
5683 
5684 	rebroadcast_time = warning_time = jiffies;
5685 	refcnt = netdev_refcnt_read(dev);
5686 
5687 	while (refcnt != 0) {
5688 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5689 			rtnl_lock();
5690 
5691 			/* Rebroadcast unregister notification */
5692 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5693 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5694 			 * should have already handled it the first time */
5695 
5696 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5697 				     &dev->state)) {
5698 				/* We must not have linkwatch events
5699 				 * pending on unregister. If this
5700 				 * happens, we simply run the queue
5701 				 * unscheduled, resulting in a noop
5702 				 * for this device.
5703 				 */
5704 				linkwatch_run_queue();
5705 			}
5706 
5707 			__rtnl_unlock();
5708 
5709 			rebroadcast_time = jiffies;
5710 		}
5711 
5712 		msleep(250);
5713 
5714 		refcnt = netdev_refcnt_read(dev);
5715 
5716 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5717 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5718 				 dev->name, refcnt);
5719 			warning_time = jiffies;
5720 		}
5721 	}
5722 }
5723 
5724 /* The sequence is:
5725  *
5726  *	rtnl_lock();
5727  *	...
5728  *	register_netdevice(x1);
5729  *	register_netdevice(x2);
5730  *	...
5731  *	unregister_netdevice(y1);
5732  *	unregister_netdevice(y2);
5733  *      ...
5734  *	rtnl_unlock();
5735  *	free_netdev(y1);
5736  *	free_netdev(y2);
5737  *
5738  * We are invoked by rtnl_unlock().
5739  * This allows us to deal with problems:
5740  * 1) We can delete sysfs objects which invoke hotplug
5741  *    without deadlocking with linkwatch via keventd.
5742  * 2) Since we run with the RTNL semaphore not held, we can sleep
5743  *    safely in order to wait for the netdev refcnt to drop to zero.
5744  *
5745  * We must not return until all unregister events added during
5746  * the interval the lock was held have been completed.
5747  */
5748 void netdev_run_todo(void)
5749 {
5750 	struct list_head list;
5751 
5752 	/* Snapshot list, allow later requests */
5753 	list_replace_init(&net_todo_list, &list);
5754 
5755 	__rtnl_unlock();
5756 
5757 	/* Wait for rcu callbacks to finish before attempting to drain
5758 	 * the device list.  This usually avoids a 250ms wait.
5759 	 */
5760 	if (!list_empty(&list))
5761 		rcu_barrier();
5762 
5763 	while (!list_empty(&list)) {
5764 		struct net_device *dev
5765 			= list_first_entry(&list, struct net_device, todo_list);
5766 		list_del(&dev->todo_list);
5767 
5768 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5769 			pr_err("network todo '%s' but state %d\n",
5770 			       dev->name, dev->reg_state);
5771 			dump_stack();
5772 			continue;
5773 		}
5774 
5775 		dev->reg_state = NETREG_UNREGISTERED;
5776 
5777 		on_each_cpu(flush_backlog, dev, 1);
5778 
5779 		netdev_wait_allrefs(dev);
5780 
5781 		/* paranoia */
5782 		BUG_ON(netdev_refcnt_read(dev));
5783 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
5784 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5785 		WARN_ON(dev->dn_ptr);
5786 
5787 		if (dev->destructor)
5788 			dev->destructor(dev);
5789 
5790 		/* Free network device */
5791 		kobject_put(&dev->dev.kobj);
5792 	}
5793 }
5794 
5795 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5796  * fields in the same order, with only the type differing.
5797  */
5798 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5799 			     const struct net_device_stats *netdev_stats)
5800 {
5801 #if BITS_PER_LONG == 64
5802 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5803 	memcpy(stats64, netdev_stats, sizeof(*stats64));
5804 #else
5805 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5806 	const unsigned long *src = (const unsigned long *)netdev_stats;
5807 	u64 *dst = (u64 *)stats64;
5808 
5809 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5810 		     sizeof(*stats64) / sizeof(u64));
5811 	for (i = 0; i < n; i++)
5812 		dst[i] = src[i];
5813 #endif
5814 }
5815 EXPORT_SYMBOL(netdev_stats_to_stats64);
5816 
5817 /**
5818  *	dev_get_stats	- get network device statistics
5819  *	@dev: device to get statistics from
5820  *	@storage: place to store stats
5821  *
5822  *	Get network statistics from device. Return @storage.
5823  *	The device driver may provide its own method by setting
5824  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5825  *	otherwise the internal statistics structure is used.
5826  */
5827 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5828 					struct rtnl_link_stats64 *storage)
5829 {
5830 	const struct net_device_ops *ops = dev->netdev_ops;
5831 
5832 	if (ops->ndo_get_stats64) {
5833 		memset(storage, 0, sizeof(*storage));
5834 		ops->ndo_get_stats64(dev, storage);
5835 	} else if (ops->ndo_get_stats) {
5836 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5837 	} else {
5838 		netdev_stats_to_stats64(storage, &dev->stats);
5839 	}
5840 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5841 	return storage;
5842 }
5843 EXPORT_SYMBOL(dev_get_stats);
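
/* Illustrative sketch (not part of this file): callers provide their own
 * storage and may use the returned pointer directly, e.g.
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: rx_packets=%llu tx_packets=%llu\n", dev->name,
 *		(unsigned long long)stats.rx_packets,
 *		(unsigned long long)stats.tx_packets);
 */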
5844 
5845 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5846 {
5847 	struct netdev_queue *queue = dev_ingress_queue(dev);
5848 
5849 #ifdef CONFIG_NET_CLS_ACT
5850 	if (queue)
5851 		return queue;
5852 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5853 	if (!queue)
5854 		return NULL;
5855 	netdev_init_one_queue(dev, queue, NULL);
5856 	queue->qdisc = &noop_qdisc;
5857 	queue->qdisc_sleeping = &noop_qdisc;
5858 	rcu_assign_pointer(dev->ingress_queue, queue);
5859 #endif
5860 	return queue;
5861 }
5862 
5863 /**
5864  *	alloc_netdev_mqs - allocate network device
5865  *	@sizeof_priv:	size of private data to allocate space for
5866  *	@name:		device name format string
5867  *	@setup:		callback to initialize device
5868  *	@txqs:		the number of TX subqueues to allocate
5869  *	@rxqs:		the number of RX subqueues to allocate
5870  *
5871  *	Allocates a struct net_device with private data area for driver use
5872  *	and performs basic initialization.  Also allocates subqueue structs
5873  *	for each queue on the device.
5874  */
5875 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5876 		void (*setup)(struct net_device *),
5877 		unsigned int txqs, unsigned int rxqs)
5878 {
5879 	struct net_device *dev;
5880 	size_t alloc_size;
5881 	struct net_device *p;
5882 
5883 	BUG_ON(strlen(name) >= sizeof(dev->name));
5884 
5885 	if (txqs < 1) {
5886 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5887 		return NULL;
5888 	}
5889 
5890 #ifdef CONFIG_RPS
5891 	if (rxqs < 1) {
5892 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5893 		return NULL;
5894 	}
5895 #endif
5896 
5897 	alloc_size = sizeof(struct net_device);
5898 	if (sizeof_priv) {
5899 		/* ensure 32-byte alignment of private area */
5900 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5901 		alloc_size += sizeof_priv;
5902 	}
5903 	/* ensure 32-byte alignment of whole construct */
5904 	alloc_size += NETDEV_ALIGN - 1;
5905 
5906 	p = kzalloc(alloc_size, GFP_KERNEL);
5907 	if (!p) {
5908 		pr_err("alloc_netdev: Unable to allocate device\n");
5909 		return NULL;
5910 	}
5911 
5912 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5913 	dev->padded = (char *)dev - (char *)p;
5914 
5915 	dev->pcpu_refcnt = alloc_percpu(int);
5916 	if (!dev->pcpu_refcnt)
5917 		goto free_p;
5918 
5919 	if (dev_addr_init(dev))
5920 		goto free_pcpu;
5921 
5922 	dev_mc_init(dev);
5923 	dev_uc_init(dev);
5924 
5925 	dev_net_set(dev, &init_net);
5926 
5927 	dev->gso_max_size = GSO_MAX_SIZE;
5928 
5929 	INIT_LIST_HEAD(&dev->napi_list);
5930 	INIT_LIST_HEAD(&dev->unreg_list);
5931 	INIT_LIST_HEAD(&dev->link_watch_list);
5932 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5933 	setup(dev);
5934 
5935 	dev->num_tx_queues = txqs;
5936 	dev->real_num_tx_queues = txqs;
5937 	if (netif_alloc_netdev_queues(dev))
5938 		goto free_all;
5939 
5940 #ifdef CONFIG_RPS
5941 	dev->num_rx_queues = rxqs;
5942 	dev->real_num_rx_queues = rxqs;
5943 	if (netif_alloc_rx_queues(dev))
5944 		goto free_all;
5945 #endif
5946 
5947 	strcpy(dev->name, name);
5948 	dev->group = INIT_NETDEV_GROUP;
5949 	return dev;
5950 
5951 free_all:
5952 	free_netdev(dev);
5953 	return NULL;
5954 
5955 free_pcpu:
5956 	free_percpu(dev->pcpu_refcnt);
5957 	kfree(dev->_tx);
5958 #ifdef CONFIG_RPS
5959 	kfree(dev->_rx);
5960 #endif
5961 
5962 free_p:
5963 	kfree(p);
5964 	return NULL;
5965 }
5966 EXPORT_SYMBOL(alloc_netdev_mqs);
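
/* Illustrative sketch (not part of this file): a multiqueue Ethernet driver
 * would normally go through the alloc_etherdev_mqs() wrapper, which supplies
 * ether_setup() and forwards the queue counts here.  MY_NUM_TXQS/MY_NUM_RXQS
 * and struct my_priv are hypothetical.
 *
 *	dev = alloc_etherdev_mqs(sizeof(struct my_priv),
 *				 MY_NUM_TXQS, MY_NUM_RXQS);
 *	if (!dev)
 *		return -ENOMEM;
 */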
5967 
5968 /**
5969  *	free_netdev - free network device
5970  *	@dev: device
5971  *
5972  *	This function does the last stage of destroying an allocated device
5973  * 	interface. The reference to the device object is released.
5974  *	If this is the last reference then it will be freed.
5975  */
5976 void free_netdev(struct net_device *dev)
5977 {
5978 	struct napi_struct *p, *n;
5979 
5980 	release_net(dev_net(dev));
5981 
5982 	kfree(dev->_tx);
5983 #ifdef CONFIG_RPS
5984 	kfree(dev->_rx);
5985 #endif
5986 
5987 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5988 
5989 	/* Flush device addresses */
5990 	dev_addr_flush(dev);
5991 
5992 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5993 		netif_napi_del(p);
5994 
5995 	free_percpu(dev->pcpu_refcnt);
5996 	dev->pcpu_refcnt = NULL;
5997 
5998 	/*  Compatibility with error handling in drivers */
5999 	if (dev->reg_state == NETREG_UNINITIALIZED) {
6000 		kfree((char *)dev - dev->padded);
6001 		return;
6002 	}
6003 
6004 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6005 	dev->reg_state = NETREG_RELEASED;
6006 
6007 	/* will free via device release */
6008 	put_device(&dev->dev);
6009 }
6010 EXPORT_SYMBOL(free_netdev);
6011 
6012 /**
6013  *	synchronize_net -  Synchronize with packet receive processing
6014  *
6015  *	Wait for packets currently being received to be done.
6016  *	Does not block later packets from starting.
6017  */
6018 void synchronize_net(void)
6019 {
6020 	might_sleep();
6021 	if (rtnl_is_locked())
6022 		synchronize_rcu_expedited();
6023 	else
6024 		synchronize_rcu();
6025 }
6026 EXPORT_SYMBOL(synchronize_net);
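
/* Illustrative sketch (not part of this file): the usual pattern is to
 * unpublish an RCU-protected pointer that the receive path dereferences,
 * wait for in-flight receivers, and only then free the old state.  my_hook
 * and old_state are hypothetical.
 *
 *	rcu_assign_pointer(my_hook, NULL);
 *	synchronize_net();
 *	kfree(old_state);
 */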
6027 
6028 /**
6029  *	unregister_netdevice_queue - remove device from the kernel
6030  *	@dev: device
6031  *	@head: list
6032  *
6033  *	This function shuts down a device interface and removes it
6034  *	from the kernel tables.
6035  *	If @head is not NULL, the device is queued to be unregistered later.
6036  *
6037  *	Callers must hold the rtnl semaphore.  You may want
6038  *	unregister_netdev() instead of this.
6039  */
6040 
6041 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6042 {
6043 	ASSERT_RTNL();
6044 
6045 	if (head) {
6046 		list_move_tail(&dev->unreg_list, head);
6047 	} else {
6048 		rollback_registered(dev);
6049 		/* Finish processing unregister after unlock */
6050 		net_set_todo(dev);
6051 	}
6052 }
6053 EXPORT_SYMBOL(unregister_netdevice_queue);
6054 
6055 /**
6056  *	unregister_netdevice_many - unregister many devices
6057  *	@head: list of devices
6058  */
6059 void unregister_netdevice_many(struct list_head *head)
6060 {
6061 	struct net_device *dev;
6062 
6063 	if (!list_empty(head)) {
6064 		rollback_registered_many(head);
6065 		list_for_each_entry(dev, head, unreg_list)
6066 			net_set_todo(dev);
6067 	}
6068 }
6069 EXPORT_SYMBOL(unregister_netdevice_many);
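
/* Illustrative sketch (not part of this file): queueing several devices and
 * unregistering them in one go batches the notifier calls and RCU grace
 * periods; compare default_device_exit_batch() below.  dev1/dev2 are
 * hypothetical.
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */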
6070 
6071 /**
6072  *	unregister_netdev - remove device from the kernel
6073  *	@dev: device
6074  *
6075  *	This function shuts down a device interface and removes it
6076  *	from the kernel tables.
6077  *
6078  *	This is just a wrapper for unregister_netdevice that takes
6079  *	the rtnl semaphore.  In general you want to use this and not
6080  *	unregister_netdevice.
6081  */
6082 void unregister_netdev(struct net_device *dev)
6083 {
6084 	rtnl_lock();
6085 	unregister_netdevice(dev);
6086 	rtnl_unlock();
6087 }
6088 EXPORT_SYMBOL(unregister_netdev);
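
/* Illustrative sketch (not part of this file): a module exit path typically
 * unregisters first and frees afterwards, since the structure is still
 * needed while the unregister path waits for outstanding references:
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */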
6089 
6090 /**
6091  *	dev_change_net_namespace - move device to a different network namespace
6092  *	@dev: device
6093  *	@net: network namespace
6094  *	@pat: If not NULL name pattern to try if the current device name
6095  *	      is already taken in the destination network namespace.
6096  *
6097  *	This function shuts down a device interface and moves it
6098  *	to a new network namespace. On success 0 is returned, on
6099  *	a failure a negative errno code is returned.
6100  *
6101  *	Callers must hold the rtnl semaphore.
6102  */
6103 
6104 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6105 {
6106 	int err;
6107 
6108 	ASSERT_RTNL();
6109 
6110 	/* Don't allow namespace local devices to be moved. */
6111 	err = -EINVAL;
6112 	if (dev->features & NETIF_F_NETNS_LOCAL)
6113 		goto out;
6114 
6115 	/* Ensure the device has been registered */
6116 	err = -EINVAL;
6117 	if (dev->reg_state != NETREG_REGISTERED)
6118 		goto out;
6119 
6120 	/* Get out if there is nothing to do */
6121 	err = 0;
6122 	if (net_eq(dev_net(dev), net))
6123 		goto out;
6124 
6125 	/* Pick the destination device name, and ensure
6126 	 * we can use it in the destination network namespace.
6127 	 */
6128 	err = -EEXIST;
6129 	if (__dev_get_by_name(net, dev->name)) {
6130 		/* We get here if we can't use the current device name */
6131 		if (!pat)
6132 			goto out;
6133 		if (dev_get_valid_name(dev, pat) < 0)
6134 			goto out;
6135 	}
6136 
6137 	/*
6138 	 * And now a mini version of register_netdevice and unregister_netdevice.
6139 	 */
6140 
6141 	/* If device is running close it first. */
6142 	dev_close(dev);
6143 
6144 	/* And unlink it from device chain */
6145 	err = -ENODEV;
6146 	unlist_netdevice(dev);
6147 
6148 	synchronize_net();
6149 
6150 	/* Shutdown queueing discipline. */
6151 	dev_shutdown(dev);
6152 
6153 	/* Notify protocols that we are about to destroy
6154 	   this device. They should clean up all of their state.
6155 
6156 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6157 	   This is intentional: it lets 8021q and macvlan know that
6158 	   the device is just moving, so they can keep their slaves up.
6159 	*/
6160 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6161 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6162 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6163 
6164 	/*
6165 	 *	Flush the unicast and multicast chains
6166 	 */
6167 	dev_uc_flush(dev);
6168 	dev_mc_flush(dev);
6169 
6170 	/* Actually switch the network namespace */
6171 	dev_net_set(dev, net);
6172 
6173 	/* If there is an ifindex conflict assign a new one */
6174 	if (__dev_get_by_index(net, dev->ifindex)) {
6175 		int iflink = (dev->iflink == dev->ifindex);
6176 		dev->ifindex = dev_new_index(net);
6177 		if (iflink)
6178 			dev->iflink = dev->ifindex;
6179 	}
6180 
6181 	/* Fixup kobjects */
6182 	err = device_rename(&dev->dev, dev->name);
6183 	WARN_ON(err);
6184 
6185 	/* Add the device back in the hashes */
6186 	list_netdevice(dev);
6187 
6188 	/* Notify protocols, that a new device appeared. */
6189 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6190 
6191 	/*
6192 	 *	Prevent userspace races by waiting until the network
6193 	 *	device is fully setup before sending notifications.
6194 	 */
6195 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6196 
6197 	synchronize_net();
6198 	err = 0;
6199 out:
6200 	return err;
6201 }
6202 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6203 
6204 static int dev_cpu_callback(struct notifier_block *nfb,
6205 			    unsigned long action,
6206 			    void *ocpu)
6207 {
6208 	struct sk_buff **list_skb;
6209 	struct sk_buff *skb;
6210 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6211 	struct softnet_data *sd, *oldsd;
6212 
6213 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6214 		return NOTIFY_OK;
6215 
6216 	local_irq_disable();
6217 	cpu = smp_processor_id();
6218 	sd = &per_cpu(softnet_data, cpu);
6219 	oldsd = &per_cpu(softnet_data, oldcpu);
6220 
6221 	/* Find end of our completion_queue. */
6222 	list_skb = &sd->completion_queue;
6223 	while (*list_skb)
6224 		list_skb = &(*list_skb)->next;
6225 	/* Append completion queue from offline CPU. */
6226 	*list_skb = oldsd->completion_queue;
6227 	oldsd->completion_queue = NULL;
6228 
6229 	/* Append output queue from offline CPU. */
6230 	if (oldsd->output_queue) {
6231 		*sd->output_queue_tailp = oldsd->output_queue;
6232 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6233 		oldsd->output_queue = NULL;
6234 		oldsd->output_queue_tailp = &oldsd->output_queue;
6235 	}
6236 	/* Append NAPI poll list from offline CPU. */
6237 	if (!list_empty(&oldsd->poll_list)) {
6238 		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6239 		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6240 	}
6241 
6242 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6243 	local_irq_enable();
6244 
6245 	/* Process offline CPU's input_pkt_queue */
6246 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6247 		netif_rx(skb);
6248 		input_queue_head_incr(oldsd);
6249 	}
6250 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6251 		netif_rx(skb);
6252 		input_queue_head_incr(oldsd);
6253 	}
6254 
6255 	return NOTIFY_OK;
6256 }
6257 
6258 
6259 /**
6260  *	netdev_increment_features - increment feature set by one
6261  *	@all: current feature set
6262  *	@one: new feature set
6263  *	@mask: mask feature set
6264  *
6265  *	Computes a new feature set after adding a device with feature set
6266  *	@one to the master device with current feature set @all.  Will not
6267  *	enable anything that is off in @mask. Returns the new feature set.
6268  */
6269 netdev_features_t netdev_increment_features(netdev_features_t all,
6270 	netdev_features_t one, netdev_features_t mask)
6271 {
6272 	if (mask & NETIF_F_GEN_CSUM)
6273 		mask |= NETIF_F_ALL_CSUM;
6274 	mask |= NETIF_F_VLAN_CHALLENGED;
6275 
6276 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6277 	all &= one | ~NETIF_F_ALL_FOR_ALL;
6278 
6279 	/* If one device supports hw checksumming, set for all. */
6280 	if (all & NETIF_F_GEN_CSUM)
6281 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6282 
6283 	return all;
6284 }
6285 EXPORT_SYMBOL(netdev_increment_features);
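
/* Illustrative sketch (not part of this file): an aggregating driver in the
 * bonding/bridge style folds each slave's features into the master's,
 * roughly as below.  master, slave and MY_FEATURE_MASK are hypothetical.
 *
 *	netdev_features_t features = MY_FEATURE_MASK;
 *
 *	list_for_each_entry(slave, &master->slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     MY_FEATURE_MASK);
 *	master->dev->features = features;
 */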
6286 
6287 static struct hlist_head *netdev_create_hash(void)
6288 {
6289 	int i;
6290 	struct hlist_head *hash;
6291 
6292 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6293 	if (hash != NULL)
6294 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6295 			INIT_HLIST_HEAD(&hash[i]);
6296 
6297 	return hash;
6298 }
6299 
6300 /* Initialize per network namespace state */
6301 static int __net_init netdev_init(struct net *net)
6302 {
6303 	INIT_LIST_HEAD(&net->dev_base_head);
6304 
6305 	net->dev_name_head = netdev_create_hash();
6306 	if (net->dev_name_head == NULL)
6307 		goto err_name;
6308 
6309 	net->dev_index_head = netdev_create_hash();
6310 	if (net->dev_index_head == NULL)
6311 		goto err_idx;
6312 
6313 	return 0;
6314 
6315 err_idx:
6316 	kfree(net->dev_name_head);
6317 err_name:
6318 	return -ENOMEM;
6319 }
6320 
6321 /**
6322  *	netdev_drivername - network driver for the device
6323  *	@dev: network device
6324  *
6325  *	Determine network driver for device.
6326  */
6327 const char *netdev_drivername(const struct net_device *dev)
6328 {
6329 	const struct device_driver *driver;
6330 	const struct device *parent;
6331 	const char *empty = "";
6332 
6333 	parent = dev->dev.parent;
6334 	if (!parent)
6335 		return empty;
6336 
6337 	driver = parent->driver;
6338 	if (driver && driver->name)
6339 		return driver->name;
6340 	return empty;
6341 }
6342 
6343 int __netdev_printk(const char *level, const struct net_device *dev,
6344 			   struct va_format *vaf)
6345 {
6346 	int r;
6347 
6348 	if (dev && dev->dev.parent)
6349 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6350 			       netdev_name(dev), vaf);
6351 	else if (dev)
6352 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6353 	else
6354 		r = printk("%s(NULL net_device): %pV", level, vaf);
6355 
6356 	return r;
6357 }
6358 EXPORT_SYMBOL(__netdev_printk);
6359 
6360 int netdev_printk(const char *level, const struct net_device *dev,
6361 		  const char *format, ...)
6362 {
6363 	struct va_format vaf;
6364 	va_list args;
6365 	int r;
6366 
6367 	va_start(args, format);
6368 
6369 	vaf.fmt = format;
6370 	vaf.va = &args;
6371 
6372 	r = __netdev_printk(level, dev, &vaf);
6373 	va_end(args);
6374 
6375 	return r;
6376 }
6377 EXPORT_SYMBOL(netdev_printk);
6378 
6379 #define define_netdev_printk_level(func, level)			\
6380 int func(const struct net_device *dev, const char *fmt, ...)	\
6381 {								\
6382 	int r;							\
6383 	struct va_format vaf;					\
6384 	va_list args;						\
6385 								\
6386 	va_start(args, fmt);					\
6387 								\
6388 	vaf.fmt = fmt;						\
6389 	vaf.va = &args;						\
6390 								\
6391 	r = __netdev_printk(level, dev, &vaf);			\
6392 	va_end(args);						\
6393 								\
6394 	return r;						\
6395 }								\
6396 EXPORT_SYMBOL(func);
6397 
6398 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6399 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6400 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6401 define_netdev_printk_level(netdev_err, KERN_ERR);
6402 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6403 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6404 define_netdev_printk_level(netdev_info, KERN_INFO);
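
/* Illustrative sketch (not part of this file): drivers use these wrappers
 * instead of raw printk() so every message is prefixed with the driver and
 * device name, e.g.
 *
 *	netdev_info(dev, "link up, %u Mbps\n", speed);
 *	netdev_err(dev, "DMA mapping failed\n");
 *
 * where speed is a hypothetical local variable.
 */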
6405 
6406 static void __net_exit netdev_exit(struct net *net)
6407 {
6408 	kfree(net->dev_name_head);
6409 	kfree(net->dev_index_head);
6410 }
6411 
6412 static struct pernet_operations __net_initdata netdev_net_ops = {
6413 	.init = netdev_init,
6414 	.exit = netdev_exit,
6415 };
6416 
6417 static void __net_exit default_device_exit(struct net *net)
6418 {
6419 	struct net_device *dev, *aux;
6420 	/*
6421 	 * Push all migratable network devices back to the
6422 	 * initial network namespace
6423 	 */
6424 	rtnl_lock();
6425 	for_each_netdev_safe(net, dev, aux) {
6426 		int err;
6427 		char fb_name[IFNAMSIZ];
6428 
6429 		/* Ignore unmoveable devices (i.e. loopback) */
6430 		/* Ignore unmovable devices (e.g. loopback) */
6431 			continue;
6432 
6433 		/* Leave virtual devices for the generic cleanup */
6434 		if (dev->rtnl_link_ops)
6435 			continue;
6436 
6437 		/* Push remaining network devices to init_net */
6438 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6439 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6440 		if (err) {
6441 			pr_emerg("%s: failed to move %s to init_net: %d\n",
6442 				 __func__, dev->name, err);
6443 			BUG();
6444 		}
6445 	}
6446 	rtnl_unlock();
6447 }
6448 
6449 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6450 {
6451 	/* At exit all network devices must be removed from a network
6452 	 * namespace.  Do this in the reverse order of registration.
6453 	 * Do this across as many network namespaces as possible to
6454 	 * improve batching efficiency.
6455 	 */
6456 	struct net_device *dev;
6457 	struct net *net;
6458 	LIST_HEAD(dev_kill_list);
6459 
6460 	rtnl_lock();
6461 	list_for_each_entry(net, net_list, exit_list) {
6462 		for_each_netdev_reverse(net, dev) {
6463 			if (dev->rtnl_link_ops)
6464 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6465 			else
6466 				unregister_netdevice_queue(dev, &dev_kill_list);
6467 		}
6468 	}
6469 	unregister_netdevice_many(&dev_kill_list);
6470 	list_del(&dev_kill_list);
6471 	rtnl_unlock();
6472 }
6473 
6474 static struct pernet_operations __net_initdata default_device_ops = {
6475 	.exit = default_device_exit,
6476 	.exit_batch = default_device_exit_batch,
6477 };
6478 
6479 /*
6480  *	Initialize the DEV module. At boot time this walks the device list and
6481  *	unhooks any devices that fail to initialise (normally hardware not
6482  *	present) and leaves us with a valid list of present and active devices.
6483  *
6484  */
6485 
6486 /*
6487  *       This is called single threaded during boot, so no need
6488  *       to take the rtnl semaphore.
6489  */
6490 static int __init net_dev_init(void)
6491 {
6492 	int i, rc = -ENOMEM;
6493 
6494 	BUG_ON(!dev_boot_phase);
6495 
6496 	if (dev_proc_init())
6497 		goto out;
6498 
6499 	if (netdev_kobject_init())
6500 		goto out;
6501 
6502 	INIT_LIST_HEAD(&ptype_all);
6503 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6504 		INIT_LIST_HEAD(&ptype_base[i]);
6505 
6506 	if (register_pernet_subsys(&netdev_net_ops))
6507 		goto out;
6508 
6509 	/*
6510 	 *	Initialise the packet receive queues.
6511 	 */
6512 
6513 	for_each_possible_cpu(i) {
6514 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6515 
6516 		memset(sd, 0, sizeof(*sd));
6517 		skb_queue_head_init(&sd->input_pkt_queue);
6518 		skb_queue_head_init(&sd->process_queue);
6519 		sd->completion_queue = NULL;
6520 		INIT_LIST_HEAD(&sd->poll_list);
6521 		sd->output_queue = NULL;
6522 		sd->output_queue_tailp = &sd->output_queue;
6523 #ifdef CONFIG_RPS
6524 		sd->csd.func = rps_trigger_softirq;
6525 		sd->csd.info = sd;
6526 		sd->csd.flags = 0;
6527 		sd->cpu = i;
6528 #endif
6529 
6530 		sd->backlog.poll = process_backlog;
6531 		sd->backlog.weight = weight_p;
6532 		sd->backlog.gro_list = NULL;
6533 		sd->backlog.gro_count = 0;
6534 	}
6535 
6536 	dev_boot_phase = 0;
6537 
6538 	/* The loopback device is special: if any other network device
6539 	 * is present in a network namespace, the loopback device must
6540 	 * be present too.  Since we now dynamically allocate and free
6541 	 * the loopback device, ensure this invariant is maintained by
6542 	 * keeping the loopback device as the first device on the
6543 	 * list of network devices.  This guarantees that the loopback
6544 	 * device is the first device to appear and the last network
6545 	 * device to disappear.
6546 	 */
6547 	if (register_pernet_device(&loopback_net_ops))
6548 		goto out;
6549 
6550 	if (register_pernet_device(&default_device_ops))
6551 		goto out;
6552 
6553 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6554 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6555 
6556 	hotcpu_notifier(dev_cpu_callback, 0);
6557 	dst_init();
6558 	dev_mcast_init();
6559 	rc = 0;
6560 out:
6561 	return rc;
6562 }
6563 
6564 subsys_initcall(net_dev_init);
6565 
6566 static int __init initialize_hashrnd(void)
6567 {
6568 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6569 	return 0;
6570 }
6571 
6572 late_initcall_sync(initialize_hashrnd);
6573 
6574