1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <net/mpls.h>
122 #include <linux/ipv6.h>
123 #include <linux/in.h>
124 #include <linux/jhash.h>
125 #include <linux/random.h>
126 #include <trace/events/napi.h>
127 #include <trace/events/net.h>
128 #include <trace/events/skb.h>
129 #include <linux/pci.h>
130 #include <linux/inetdevice.h>
131 #include <linux/cpu_rmap.h>
132 #include <linux/static_key.h>
133 #include <linux/hashtable.h>
134 #include <linux/vmalloc.h>
135 #include <linux/if_macvlan.h>
136 #include <linux/errqueue.h>
137 #include <linux/hrtimer.h>
138 
139 #include "net-sysfs.h"
140 
141 /* Instead of increasing this, you should create a hash table. */
142 #define MAX_GRO_SKBS 8
143 
144 /* This should be increased if a protocol with a bigger head is added. */
145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146 
147 static DEFINE_SPINLOCK(ptype_lock);
148 static DEFINE_SPINLOCK(offload_lock);
149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
150 struct list_head ptype_all __read_mostly;	/* Taps */
151 static struct list_head offload_base __read_mostly;
152 
153 static int netif_rx_internal(struct sk_buff *skb);
154 static int call_netdevice_notifiers_info(unsigned long val,
155 					 struct net_device *dev,
156 					 struct netdev_notifier_info *info);
157 
158 /*
159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
160  * semaphore.
161  *
162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
163  *
164  * Writers must hold the rtnl semaphore while they loop through the
165  * dev_base_head list, and hold dev_base_lock for writing when they do the
166  * actual updates.  This allows pure readers to access the list even
167  * while a writer is preparing to update it.
168  *
169  * To put it another way, dev_base_lock is held for writing only to
170  * protect against pure readers; the rtnl semaphore provides the
171  * protection against other writers.
172  *
173  * For example usages, see register_netdevice() and
174  * unregister_netdevice(), which must be called with the rtnl
175  * semaphore held.
176  */
177 DEFINE_RWLOCK(dev_base_lock);
178 EXPORT_SYMBOL(dev_base_lock);
179 
180 /* protects napi_hash addition/deletion and napi_gen_id */
181 static DEFINE_SPINLOCK(napi_hash_lock);
182 
183 static unsigned int napi_gen_id;
184 static DEFINE_HASHTABLE(napi_hash, 8);
185 
186 static seqcount_t devnet_rename_seq;
187 
188 static inline void dev_base_seq_inc(struct net *net)
189 {
190 	while (++net->dev_base_seq == 0);
191 }
192 
193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
194 {
195 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
196 
197 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
198 }
199 
200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
201 {
202 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
203 }
204 
205 static inline void rps_lock(struct softnet_data *sd)
206 {
207 #ifdef CONFIG_RPS
208 	spin_lock(&sd->input_pkt_queue.lock);
209 #endif
210 }
211 
212 static inline void rps_unlock(struct softnet_data *sd)
213 {
214 #ifdef CONFIG_RPS
215 	spin_unlock(&sd->input_pkt_queue.lock);
216 #endif
217 }
218 
219 /* Device list insertion */
220 static void list_netdevice(struct net_device *dev)
221 {
222 	struct net *net = dev_net(dev);
223 
224 	ASSERT_RTNL();
225 
226 	write_lock_bh(&dev_base_lock);
227 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
228 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
229 	hlist_add_head_rcu(&dev->index_hlist,
230 			   dev_index_hash(net, dev->ifindex));
231 	write_unlock_bh(&dev_base_lock);
232 
233 	dev_base_seq_inc(net);
234 }
235 
236 /* Device list removal
237  * caller must respect an RCU grace period before freeing/reusing dev
238  */
239 static void unlist_netdevice(struct net_device *dev)
240 {
241 	ASSERT_RTNL();
242 
243 	/* Unlink dev from the device chain */
244 	write_lock_bh(&dev_base_lock);
245 	list_del_rcu(&dev->dev_list);
246 	hlist_del_rcu(&dev->name_hlist);
247 	hlist_del_rcu(&dev->index_hlist);
248 	write_unlock_bh(&dev_base_lock);
249 
250 	dev_base_seq_inc(dev_net(dev));
251 }
252 
253 /*
254  *	Our notifier list
255  */
256 
257 static RAW_NOTIFIER_HEAD(netdev_chain);
258 
259 /*
260  *	Device drivers call our routines to queue packets here. We empty the
261  *	queue in the local softnet handler.
262  */
263 
264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
265 EXPORT_PER_CPU_SYMBOL(softnet_data);
266 
267 #ifdef CONFIG_LOCKDEP
268 /*
269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
270  * according to dev->type
271  */
272 static const unsigned short netdev_lock_type[] =
273 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
274 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
275 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
276 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
277 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
278 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
279 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
280 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
281 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
282 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
283 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
284 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
285 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
286 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
287 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
288 
289 static const char *const netdev_lock_name[] =
290 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
291 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
292 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
293 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
294 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
295 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
296 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
297 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
298 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
299 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
300 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
301 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
302 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
303 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
304 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
305 
306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
308 
309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
310 {
311 	int i;
312 
313 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
314 		if (netdev_lock_type[i] == dev_type)
315 			return i;
316 	/* the last key is used by default */
317 	return ARRAY_SIZE(netdev_lock_type) - 1;
318 }
319 
320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
321 						 unsigned short dev_type)
322 {
323 	int i;
324 
325 	i = netdev_lock_pos(dev_type);
326 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
327 				   netdev_lock_name[i]);
328 }
329 
330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
331 {
332 	int i;
333 
334 	i = netdev_lock_pos(dev->type);
335 	lockdep_set_class_and_name(&dev->addr_list_lock,
336 				   &netdev_addr_lock_key[i],
337 				   netdev_lock_name[i]);
338 }
339 #else
340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
341 						 unsigned short dev_type)
342 {
343 }
344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
345 {
346 }
347 #endif
348 
349 /*******************************************************************************
350 
351 		Protocol management and registration routines
352 
353 *******************************************************************************/
354 
355 /*
356  *	Add a protocol ID to the list. Now that the input handler is
357  *	smarter we can dispense with all the messy stuff that used to be
358  *	here.
359  *
360  *	BEWARE!!! Protocol handlers, mangling input packets,
361  *	MUST BE last in hash buckets and checking protocol handlers
362  *	MUST start from promiscuous ptype_all chain in net_bh.
363  *	It is true now, do not change it.
364  *	Explanation follows: if protocol handler, mangling packet, will
365  *	be the first on list, it is not able to sense, that packet
366  *	is cloned and should be copied-on-write, so that it will
367  *	change it and subsequent readers will get broken packet.
368  *							--ANK (980803)
369  */
370 
371 static inline struct list_head *ptype_head(const struct packet_type *pt)
372 {
373 	if (pt->type == htons(ETH_P_ALL))
374 		return &ptype_all;
375 	else
376 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
377 }
378 
379 /**
380  *	dev_add_pack - add packet handler
381  *	@pt: packet type declaration
382  *
383  *	Add a protocol handler to the networking stack. The passed &packet_type
384  *	is linked into kernel lists and may not be freed until it has been
385  *	removed from the kernel lists.
386  *
387  *	This call does not sleep, so it cannot guarantee that all
388  *	CPUs that are in the middle of receiving packets will see
389  *	the new packet type (until the next received packet).
390  */
391 
392 void dev_add_pack(struct packet_type *pt)
393 {
394 	struct list_head *head = ptype_head(pt);
395 
396 	spin_lock(&ptype_lock);
397 	list_add_rcu(&pt->list, head);
398 	spin_unlock(&ptype_lock);
399 }
400 EXPORT_SYMBOL(dev_add_pack);
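
/* Illustrative usage sketch (not part of this file): a protocol module
 * typically registers a receive handler roughly as below.  The handler
 * name my_proto_rcv() and the choice of ETH_P_IP are assumptions made
 * for the example only.
 *
 *	static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
 *				struct packet_type *pt,
 *				struct net_device *orig_dev)
 *	{
 *		// ... examine skb, then consume it ...
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_packet_type __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.func = my_proto_rcv,
 *	};
 *
 *	dev_add_pack(&my_packet_type);		// module init
 *	dev_remove_pack(&my_packet_type);	// module exit
 */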
401 
402 /**
403  *	__dev_remove_pack	 - remove packet handler
404  *	@pt: packet type declaration
405  *
406  *	Remove a protocol handler that was previously added to the kernel
407  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
408  *	from the kernel lists and can be freed or reused once this function
409  *	returns.
410  *
411  *	The packet type might still be in use by receivers
412  *	and must not be freed until after all CPUs have gone
413  *	through a quiescent state.
414  */
415 void __dev_remove_pack(struct packet_type *pt)
416 {
417 	struct list_head *head = ptype_head(pt);
418 	struct packet_type *pt1;
419 
420 	spin_lock(&ptype_lock);
421 
422 	list_for_each_entry(pt1, head, list) {
423 		if (pt == pt1) {
424 			list_del_rcu(&pt->list);
425 			goto out;
426 		}
427 	}
428 
429 	pr_warn("dev_remove_pack: %p not found\n", pt);
430 out:
431 	spin_unlock(&ptype_lock);
432 }
433 EXPORT_SYMBOL(__dev_remove_pack);
434 
435 /**
436  *	dev_remove_pack	 - remove packet handler
437  *	@pt: packet type declaration
438  *
439  *	Remove a protocol handler that was previously added to the kernel
440  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
441  *	from the kernel lists and can be freed or reused once this function
442  *	returns.
443  *
444  *	This call sleeps to guarantee that no CPU is looking at the packet
445  *	type after return.
446  */
447 void dev_remove_pack(struct packet_type *pt)
448 {
449 	__dev_remove_pack(pt);
450 
451 	synchronize_net();
452 }
453 EXPORT_SYMBOL(dev_remove_pack);
454 
455 
456 /**
457  *	dev_add_offload - register offload handlers
458  *	@po: protocol offload declaration
459  *
460  *	Add protocol offload handlers to the networking stack. The passed
461  *	&proto_offload is linked into kernel lists and may not be freed until
462  *	it has been removed from the kernel lists.
463  *
464  *	This call does not sleep, so it cannot guarantee that all
465  *	CPUs that are in the middle of receiving packets will see
466  *	the new offload handlers (until the next received packet).
467  */
468 void dev_add_offload(struct packet_offload *po)
469 {
470 	struct list_head *head = &offload_base;
471 
472 	spin_lock(&offload_lock);
473 	list_add_rcu(&po->list, head);
474 	spin_unlock(&offload_lock);
475 }
476 EXPORT_SYMBOL(dev_add_offload);
477 
478 /**
479  *	__dev_remove_offload	 - remove offload handler
480  *	@po: packet offload declaration
481  *
482  *	Remove a protocol offload handler that was previously added to the
483  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
484  *	is removed from the kernel lists and can be freed or reused once this
485  *	function returns.
486  *
487  *	The packet type might still be in use by receivers
488  *	and must not be freed until after all CPUs have gone
489  *	through a quiescent state.
490  */
491 static void __dev_remove_offload(struct packet_offload *po)
492 {
493 	struct list_head *head = &offload_base;
494 	struct packet_offload *po1;
495 
496 	spin_lock(&offload_lock);
497 
498 	list_for_each_entry(po1, head, list) {
499 		if (po == po1) {
500 			list_del_rcu(&po->list);
501 			goto out;
502 		}
503 	}
504 
505 	pr_warn("dev_remove_offload: %p not found\n", po);
506 out:
507 	spin_unlock(&offload_lock);
508 }
509 
510 /**
511  *	dev_remove_offload	 - remove packet offload handler
512  *	@po: packet offload declaration
513  *
514  *	Remove a packet offload handler that was previously added to the kernel
515  *	offload handlers by dev_add_offload(). The passed &offload_type is
516  *	removed from the kernel lists and can be freed or reused once this
517  *	function returns.
518  *
519  *	This call sleeps to guarantee that no CPU is looking at the packet
520  *	type after return.
521  */
522 void dev_remove_offload(struct packet_offload *po)
523 {
524 	__dev_remove_offload(po);
525 
526 	synchronize_net();
527 }
528 EXPORT_SYMBOL(dev_remove_offload);
529 
530 /******************************************************************************
531 
532 		      Device Boot-time Settings Routines
533 
534 *******************************************************************************/
535 
536 /* Boot time configuration table */
537 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
538 
539 /**
540  *	netdev_boot_setup_add	- add new setup entry
541  *	@name: name of the device
542  *	@map: configured settings for the device
543  *
544  *	Adds new setup entry to the dev_boot_setup list.  The function
545  *	returns 0 on error and 1 on success.  This is a generic routine for
546  *	all netdevices.
547  */
548 static int netdev_boot_setup_add(char *name, struct ifmap *map)
549 {
550 	struct netdev_boot_setup *s;
551 	int i;
552 
553 	s = dev_boot_setup;
554 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
555 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
556 			memset(s[i].name, 0, sizeof(s[i].name));
557 			strlcpy(s[i].name, name, IFNAMSIZ);
558 			memcpy(&s[i].map, map, sizeof(s[i].map));
559 			break;
560 		}
561 	}
562 
563 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
564 }
565 
566 /**
567  *	netdev_boot_setup_check	- check boot time settings
568  *	@dev: the netdevice
569  *
570  * 	Check boot time settings for the device.
571  *	Any settings found are applied to the device for use later
572  *	during device probing.
573  *	Returns 0 if no settings are found, 1 if they are.
574  */
575 int netdev_boot_setup_check(struct net_device *dev)
576 {
577 	struct netdev_boot_setup *s = dev_boot_setup;
578 	int i;
579 
580 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
581 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
582 		    !strcmp(dev->name, s[i].name)) {
583 			dev->irq 	= s[i].map.irq;
584 			dev->base_addr 	= s[i].map.base_addr;
585 			dev->mem_start 	= s[i].map.mem_start;
586 			dev->mem_end 	= s[i].map.mem_end;
587 			return 1;
588 		}
589 	}
590 	return 0;
591 }
592 EXPORT_SYMBOL(netdev_boot_setup_check);
593 
594 
595 /**
596  *	netdev_boot_base	- get address from boot time settings
597  *	@prefix: prefix for network device
598  *	@unit: id for network device
599  *
600  * 	Check boot time settings for the base address of the device.
601  *	The settings found are used for the device later in the
602  *	device probing.
603  *	Returns 0 if no settings are found.
604  */
605 unsigned long netdev_boot_base(const char *prefix, int unit)
606 {
607 	const struct netdev_boot_setup *s = dev_boot_setup;
608 	char name[IFNAMSIZ];
609 	int i;
610 
611 	sprintf(name, "%s%d", prefix, unit);
612 
613 	/*
614 	 * If the device is already registered then return a base of 1
615 	 * to indicate not to probe for this interface.
616 	 */
617 	if (__dev_get_by_name(&init_net, name))
618 		return 1;
619 
620 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
621 		if (!strcmp(name, s[i].name))
622 			return s[i].map.base_addr;
623 	return 0;
624 }
625 
626 /*
627  * Saves at boot time configured settings for any netdevice.
628  */
629 int __init netdev_boot_setup(char *str)
630 {
631 	int ints[5];
632 	struct ifmap map;
633 
634 	str = get_options(str, ARRAY_SIZE(ints), ints);
635 	if (!str || !*str)
636 		return 0;
637 
638 	/* Save settings */
639 	memset(&map, 0, sizeof(map));
640 	if (ints[0] > 0)
641 		map.irq = ints[1];
642 	if (ints[0] > 1)
643 		map.base_addr = ints[2];
644 	if (ints[0] > 2)
645 		map.mem_start = ints[3];
646 	if (ints[0] > 3)
647 		map.mem_end = ints[4];
648 
649 	/* Add new entry to the list */
650 	return netdev_boot_setup_add(str, &map);
651 }
652 
653 __setup("netdev=", netdev_boot_setup);
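
/* A hedged example of the boot parameter parsed above: get_options()
 * reads up to four integers followed by a device name, so an entry of
 * the form below would request IRQ 10, I/O base 0x300 and the name
 * "eth0" (the values are illustrative only):
 *
 *	netdev=10,0x300,0,0,eth0
 */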
654 
655 /*******************************************************************************
656 
657 			    Device Interface Subroutines
658 
659 *******************************************************************************/
660 
661 /**
662  *	__dev_get_by_name	- find a device by its name
663  *	@net: the applicable net namespace
664  *	@name: name to find
665  *
666  *	Find an interface by name. Must be called under RTNL semaphore
667  *	or @dev_base_lock. If the name is found a pointer to the device
668  *	is returned. If the name is not found then %NULL is returned. The
669  *	reference counters are not incremented so the caller must be
670  *	careful with locks.
671  */
672 
673 struct net_device *__dev_get_by_name(struct net *net, const char *name)
674 {
675 	struct net_device *dev;
676 	struct hlist_head *head = dev_name_hash(net, name);
677 
678 	hlist_for_each_entry(dev, head, name_hlist)
679 		if (!strncmp(dev->name, name, IFNAMSIZ))
680 			return dev;
681 
682 	return NULL;
683 }
684 EXPORT_SYMBOL(__dev_get_by_name);
685 
686 /**
687  *	dev_get_by_name_rcu	- find a device by its name
688  *	@net: the applicable net namespace
689  *	@name: name to find
690  *
691  *	Find an interface by name.
692  *	If the name is found a pointer to the device is returned.
693  * 	If the name is not found then %NULL is returned.
694  *	The reference counters are not incremented so the caller must be
695  *	careful with locks. The caller must hold RCU lock.
696  */
697 
698 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
699 {
700 	struct net_device *dev;
701 	struct hlist_head *head = dev_name_hash(net, name);
702 
703 	hlist_for_each_entry_rcu(dev, head, name_hlist)
704 		if (!strncmp(dev->name, name, IFNAMSIZ))
705 			return dev;
706 
707 	return NULL;
708 }
709 EXPORT_SYMBOL(dev_get_by_name_rcu);
710 
711 /**
712  *	dev_get_by_name		- find a device by its name
713  *	@net: the applicable net namespace
714  *	@name: name to find
715  *
716  *	Find an interface by name. This can be called from any
717  *	context and does its own locking. The returned handle has
718  *	the usage count incremented and the caller must use dev_put() to
719  *	release it when it is no longer needed. %NULL is returned if no
720  *	matching device is found.
721  */
722 
723 struct net_device *dev_get_by_name(struct net *net, const char *name)
724 {
725 	struct net_device *dev;
726 
727 	rcu_read_lock();
728 	dev = dev_get_by_name_rcu(net, name);
729 	if (dev)
730 		dev_hold(dev);
731 	rcu_read_unlock();
732 	return dev;
733 }
734 EXPORT_SYMBOL(dev_get_by_name);
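
/* Minimal usage sketch (assumption: process context and an existing
 * interface named "eth0"; both are illustrative).  Because a reference
 * is taken, the result must be released with dev_put():
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		pr_info("found %s, ifindex %d\n", dev->name, dev->ifindex);
 *		dev_put(dev);
 *	}
 */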
735 
736 /**
737  *	__dev_get_by_index - find a device by its ifindex
738  *	@net: the applicable net namespace
739  *	@ifindex: index of device
740  *
741  *	Search for an interface by index. Returns a pointer to the device,
742  *	or %NULL if the device is not found. The device has not
743  *	had its reference counter increased so the caller must be careful
744  *	about locking. The caller must hold either the RTNL semaphore
745  *	or @dev_base_lock.
746  */
747 
748 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
749 {
750 	struct net_device *dev;
751 	struct hlist_head *head = dev_index_hash(net, ifindex);
752 
753 	hlist_for_each_entry(dev, head, index_hlist)
754 		if (dev->ifindex == ifindex)
755 			return dev;
756 
757 	return NULL;
758 }
759 EXPORT_SYMBOL(__dev_get_by_index);
760 
761 /**
762  *	dev_get_by_index_rcu - find a device by its ifindex
763  *	@net: the applicable net namespace
764  *	@ifindex: index of device
765  *
766  *	Search for an interface by index. Returns a pointer to the device,
767  *	or %NULL if the device is not found. The device has not
768  *	had its reference counter increased so the caller must be careful
769  *	about locking. The caller must hold RCU lock.
770  */
771 
772 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
773 {
774 	struct net_device *dev;
775 	struct hlist_head *head = dev_index_hash(net, ifindex);
776 
777 	hlist_for_each_entry_rcu(dev, head, index_hlist)
778 		if (dev->ifindex == ifindex)
779 			return dev;
780 
781 	return NULL;
782 }
783 EXPORT_SYMBOL(dev_get_by_index_rcu);
784 
785 
786 /**
787  *	dev_get_by_index - find a device by its ifindex
788  *	@net: the applicable net namespace
789  *	@ifindex: index of device
790  *
791  *	Search for an interface by index. Returns a pointer to the device,
792  *	or NULL if the device is not found. The device returned has
793  *	had a reference added and the pointer is safe until the user calls
794  *	dev_put to indicate they have finished with it.
795  */
796 
797 struct net_device *dev_get_by_index(struct net *net, int ifindex)
798 {
799 	struct net_device *dev;
800 
801 	rcu_read_lock();
802 	dev = dev_get_by_index_rcu(net, ifindex);
803 	if (dev)
804 		dev_hold(dev);
805 	rcu_read_unlock();
806 	return dev;
807 }
808 EXPORT_SYMBOL(dev_get_by_index);
809 
810 /**
811  *	netdev_get_name - get a netdevice name, knowing its ifindex.
812  *	@net: network namespace
813  *	@name: a pointer to the buffer where the name will be stored.
814  *	@ifindex: the ifindex of the interface to get the name from.
815  *
816  *	The use of raw_seqcount_begin() and cond_resched() before
817  *	retrying is required as we want to give the writers a chance
818  *	to complete when CONFIG_PREEMPT is not set.
819  */
820 int netdev_get_name(struct net *net, char *name, int ifindex)
821 {
822 	struct net_device *dev;
823 	unsigned int seq;
824 
825 retry:
826 	seq = raw_seqcount_begin(&devnet_rename_seq);
827 	rcu_read_lock();
828 	dev = dev_get_by_index_rcu(net, ifindex);
829 	if (!dev) {
830 		rcu_read_unlock();
831 		return -ENODEV;
832 	}
833 
834 	strcpy(name, dev->name);
835 	rcu_read_unlock();
836 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
837 		cond_resched();
838 		goto retry;
839 	}
840 
841 	return 0;
842 }
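
/* Usage sketch; the ifindex value and the net pointer are illustrative
 * assumptions.  The name buffer must be at least IFNAMSIZ bytes:
 *
 *	char name[IFNAMSIZ];
 *
 *	if (!netdev_get_name(net, name, 3))
 *		pr_info("ifindex 3 is %s\n", name);
 */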
843 
844 /**
845  *	dev_getbyhwaddr_rcu - find a device by its hardware address
846  *	@net: the applicable net namespace
847  *	@type: media type of device
848  *	@ha: hardware address
849  *
850  *	Search for an interface by MAC address. Returns a pointer to the
851  *	device, or NULL if the device is not found.
852  *	The caller must hold RCU or RTNL.
853  *	The returned device has not had its ref count increased
854  *	and the caller must therefore be careful about locking
855  *
856  */
857 
858 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
859 				       const char *ha)
860 {
861 	struct net_device *dev;
862 
863 	for_each_netdev_rcu(net, dev)
864 		if (dev->type == type &&
865 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
866 			return dev;
867 
868 	return NULL;
869 }
870 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
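
/* Usage sketch; the MAC address bytes are illustrative.  The caller must
 * hold RCU (or RTNL) and must not use the result outside that section,
 * since no reference is taken:
 *
 *	static const char mac[ETH_ALEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(&init_net, ARPHRD_ETHER, mac);
 *	if (dev)
 *		pr_info("%s matches\n", dev->name);
 *	rcu_read_unlock();
 */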
871 
872 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
873 {
874 	struct net_device *dev;
875 
876 	ASSERT_RTNL();
877 	for_each_netdev(net, dev)
878 		if (dev->type == type)
879 			return dev;
880 
881 	return NULL;
882 }
883 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
884 
885 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
886 {
887 	struct net_device *dev, *ret = NULL;
888 
889 	rcu_read_lock();
890 	for_each_netdev_rcu(net, dev)
891 		if (dev->type == type) {
892 			dev_hold(dev);
893 			ret = dev;
894 			break;
895 		}
896 	rcu_read_unlock();
897 	return ret;
898 }
899 EXPORT_SYMBOL(dev_getfirstbyhwtype);
900 
901 /**
902  *	__dev_get_by_flags - find any device with given flags
903  *	@net: the applicable net namespace
904  *	@if_flags: IFF_* values
905  *	@mask: bitmask of bits in if_flags to check
906  *
907  *	Search for any interface with the given flags. Returns a pointer to
908  *	the device, or NULL if no matching device is found. Must be called inside
909  *	rtnl_lock(), and result refcount is unchanged.
910  */
911 
912 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
913 				      unsigned short mask)
914 {
915 	struct net_device *dev, *ret;
916 
917 	ASSERT_RTNL();
918 
919 	ret = NULL;
920 	for_each_netdev(net, dev) {
921 		if (((dev->flags ^ if_flags) & mask) == 0) {
922 			ret = dev;
923 			break;
924 		}
925 	}
926 	return ret;
927 }
928 EXPORT_SYMBOL(__dev_get_by_flags);
929 
930 /**
931  *	dev_valid_name - check if name is okay for network device
932  *	@name: name string
933  *
934  *	Network device names need to be valid file names
935  *	to allow sysfs to work.  We also disallow any kind of
936  *	whitespace.
937  */
938 bool dev_valid_name(const char *name)
939 {
940 	if (*name == '\0')
941 		return false;
942 	if (strlen(name) >= IFNAMSIZ)
943 		return false;
944 	if (!strcmp(name, ".") || !strcmp(name, ".."))
945 		return false;
946 
947 	while (*name) {
948 		if (*name == '/' || isspace(*name))
949 			return false;
950 		name++;
951 	}
952 	return true;
953 }
954 EXPORT_SYMBOL(dev_valid_name);
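
/* A few illustrative inputs for the checks above: "eth0" and "wlan-1" are
 * accepted; "" (empty), "." and "..", anything containing '/' or whitespace
 * such as "my dev", and any name of IFNAMSIZ characters or more are all
 * rejected.
 */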
955 
956 /**
957  *	__dev_alloc_name - allocate a name for a device
958  *	@net: network namespace to allocate the device name in
959  *	@name: name format string
960  *	@buf:  scratch buffer and result name string
961  *
962  *	Passed a format string - eg "lt%d" - it will try to find a suitable
963  *	id. It scans the list of devices to build up a free map, then chooses
964  *	the first empty slot. The caller must hold the dev_base or rtnl lock
965  *	while allocating the name and adding the device in order to avoid
966  *	duplicates.
967  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
968  *	Returns the number of the unit assigned or a negative errno code.
969  */
970 
971 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
972 {
973 	int i = 0;
974 	const char *p;
975 	const int max_netdevices = 8*PAGE_SIZE;
976 	unsigned long *inuse;
977 	struct net_device *d;
978 
979 	p = strnchr(name, IFNAMSIZ-1, '%');
980 	if (p) {
981 		/*
982 		 * Verify the string as this thing may have come from
983 		 * the user.  There must be either one "%d" and no other "%"
984 		 * characters.
985 		 */
986 		if (p[1] != 'd' || strchr(p + 2, '%'))
987 			return -EINVAL;
988 
989 		/* Use one page as a bit array of possible slots */
990 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
991 		if (!inuse)
992 			return -ENOMEM;
993 
994 		for_each_netdev(net, d) {
995 			if (!sscanf(d->name, name, &i))
996 				continue;
997 			if (i < 0 || i >= max_netdevices)
998 				continue;
999 
1000 			/*  avoid cases where sscanf is not exact inverse of printf */
1001 			snprintf(buf, IFNAMSIZ, name, i);
1002 			if (!strncmp(buf, d->name, IFNAMSIZ))
1003 				set_bit(i, inuse);
1004 		}
1005 
1006 		i = find_first_zero_bit(inuse, max_netdevices);
1007 		free_page((unsigned long) inuse);
1008 	}
1009 
1010 	if (buf != name)
1011 		snprintf(buf, IFNAMSIZ, name, i);
1012 	if (!__dev_get_by_name(net, buf))
1013 		return i;
1014 
1015 	/* It is possible to run out of possible slots
1016 	 * when the name is long and there isn't enough space left
1017 	 * for the digits, or if all bits are used.
1018 	 */
1019 	return -ENFILE;
1020 }
1021 
1022 /**
1023  *	dev_alloc_name - allocate a name for a device
1024  *	@dev: device
1025  *	@name: name format string
1026  *
1027  *	Passed a format string - eg "lt%d" - it will try to find a suitable
1028  *	id. It scans the list of devices to build up a free map, then chooses
1029  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1030  *	while allocating the name and adding the device in order to avoid
1031  *	duplicates.
1032  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1033  *	Returns the number of the unit assigned or a negative errno code.
1034  */
1035 
1036 int dev_alloc_name(struct net_device *dev, const char *name)
1037 {
1038 	char buf[IFNAMSIZ];
1039 	struct net *net;
1040 	int ret;
1041 
1042 	BUG_ON(!dev_net(dev));
1043 	net = dev_net(dev);
1044 	ret = __dev_alloc_name(net, name, buf);
1045 	if (ret >= 0)
1046 		strlcpy(dev->name, buf, IFNAMSIZ);
1047 	return ret;
1048 }
1049 EXPORT_SYMBOL(dev_alloc_name);
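
/* Usage sketch, with an assumed driver-allocated device: passing a format
 * such as "dummy%d" picks the lowest free unit number, so the first call
 * in an empty namespace would typically yield "dummy0":
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		goto out_free;	// no free unit or invalid format
 *	// dev->name now holds e.g. "dummy0"; err is the unit number
 */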
1050 
1051 static int dev_alloc_name_ns(struct net *net,
1052 			     struct net_device *dev,
1053 			     const char *name)
1054 {
1055 	char buf[IFNAMSIZ];
1056 	int ret;
1057 
1058 	ret = __dev_alloc_name(net, name, buf);
1059 	if (ret >= 0)
1060 		strlcpy(dev->name, buf, IFNAMSIZ);
1061 	return ret;
1062 }
1063 
1064 static int dev_get_valid_name(struct net *net,
1065 			      struct net_device *dev,
1066 			      const char *name)
1067 {
1068 	BUG_ON(!net);
1069 
1070 	if (!dev_valid_name(name))
1071 		return -EINVAL;
1072 
1073 	if (strchr(name, '%'))
1074 		return dev_alloc_name_ns(net, dev, name);
1075 	else if (__dev_get_by_name(net, name))
1076 		return -EEXIST;
1077 	else if (dev->name != name)
1078 		strlcpy(dev->name, name, IFNAMSIZ);
1079 
1080 	return 0;
1081 }
1082 
1083 /**
1084  *	dev_change_name - change name of a device
1085  *	@dev: device
1086  *	@newname: name (or format string) must be at least IFNAMSIZ
1087  *
1088  *	Change the name of a device. A format string such as "eth%d"
1089  *	can be passed for wildcarding.
1090  */
1091 int dev_change_name(struct net_device *dev, const char *newname)
1092 {
1093 	unsigned char old_assign_type;
1094 	char oldname[IFNAMSIZ];
1095 	int err = 0;
1096 	int ret;
1097 	struct net *net;
1098 
1099 	ASSERT_RTNL();
1100 	BUG_ON(!dev_net(dev));
1101 
1102 	net = dev_net(dev);
1103 	if (dev->flags & IFF_UP)
1104 		return -EBUSY;
1105 
1106 	write_seqcount_begin(&devnet_rename_seq);
1107 
1108 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1109 		write_seqcount_end(&devnet_rename_seq);
1110 		return 0;
1111 	}
1112 
1113 	memcpy(oldname, dev->name, IFNAMSIZ);
1114 
1115 	err = dev_get_valid_name(net, dev, newname);
1116 	if (err < 0) {
1117 		write_seqcount_end(&devnet_rename_seq);
1118 		return err;
1119 	}
1120 
1121 	if (oldname[0] && !strchr(oldname, '%'))
1122 		netdev_info(dev, "renamed from %s\n", oldname);
1123 
1124 	old_assign_type = dev->name_assign_type;
1125 	dev->name_assign_type = NET_NAME_RENAMED;
1126 
1127 rollback:
1128 	ret = device_rename(&dev->dev, dev->name);
1129 	if (ret) {
1130 		memcpy(dev->name, oldname, IFNAMSIZ);
1131 		dev->name_assign_type = old_assign_type;
1132 		write_seqcount_end(&devnet_rename_seq);
1133 		return ret;
1134 	}
1135 
1136 	write_seqcount_end(&devnet_rename_seq);
1137 
1138 	netdev_adjacent_rename_links(dev, oldname);
1139 
1140 	write_lock_bh(&dev_base_lock);
1141 	hlist_del_rcu(&dev->name_hlist);
1142 	write_unlock_bh(&dev_base_lock);
1143 
1144 	synchronize_rcu();
1145 
1146 	write_lock_bh(&dev_base_lock);
1147 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1148 	write_unlock_bh(&dev_base_lock);
1149 
1150 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1151 	ret = notifier_to_errno(ret);
1152 
1153 	if (ret) {
1154 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1155 		if (err >= 0) {
1156 			err = ret;
1157 			write_seqcount_begin(&devnet_rename_seq);
1158 			memcpy(dev->name, oldname, IFNAMSIZ);
1159 			memcpy(oldname, newname, IFNAMSIZ);
1160 			dev->name_assign_type = old_assign_type;
1161 			old_assign_type = NET_NAME_RENAMED;
1162 			goto rollback;
1163 		} else {
1164 			pr_err("%s: name change rollback failed: %d\n",
1165 			       dev->name, ret);
1166 		}
1167 	}
1168 
1169 	return err;
1170 }
1171 
1172 /**
1173  *	dev_set_alias - change ifalias of a device
1174  *	@dev: device
1175  *	@alias: name up to IFALIASZ
1176  *	@len: limit of bytes to copy from @alias
1177  *
1178  *	Set the ifalias for a device.
1179  */
1180 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1181 {
1182 	char *new_ifalias;
1183 
1184 	ASSERT_RTNL();
1185 
1186 	if (len >= IFALIASZ)
1187 		return -EINVAL;
1188 
1189 	if (!len) {
1190 		kfree(dev->ifalias);
1191 		dev->ifalias = NULL;
1192 		return 0;
1193 	}
1194 
1195 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1196 	if (!new_ifalias)
1197 		return -ENOMEM;
1198 	dev->ifalias = new_ifalias;
1199 
1200 	strlcpy(dev->ifalias, alias, len+1);
1201 	return len;
1202 }
1203 
1204 
1205 /**
1206  *	netdev_features_change - device changes features
1207  *	@dev: device to cause notification
1208  *
1209  *	Called to indicate a device has changed features.
1210  */
1211 void netdev_features_change(struct net_device *dev)
1212 {
1213 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1214 }
1215 EXPORT_SYMBOL(netdev_features_change);
1216 
1217 /**
1218  *	netdev_state_change - device changes state
1219  *	@dev: device to cause notification
1220  *
1221  *	Called to indicate a device has changed state. This function calls
1222  *	the notifier chains for netdev_chain and sends a NEWLINK message
1223  *	to the routing socket.
1224  */
1225 void netdev_state_change(struct net_device *dev)
1226 {
1227 	if (dev->flags & IFF_UP) {
1228 		struct netdev_notifier_change_info change_info;
1229 
1230 		change_info.flags_changed = 0;
1231 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1232 					      &change_info.info);
1233 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1234 	}
1235 }
1236 EXPORT_SYMBOL(netdev_state_change);
1237 
1238 /**
1239  * 	netdev_notify_peers - notify network peers about existence of @dev
1240  * 	@dev: network device
1241  *
1242  * Generate traffic such that interested network peers are aware of
1243  * @dev, such as by generating a gratuitous ARP. This may be used when
1244  * a device wants to inform the rest of the network about some sort of
1245  * reconfiguration such as a failover event or virtual machine
1246  * migration.
1247  */
1248 void netdev_notify_peers(struct net_device *dev)
1249 {
1250 	rtnl_lock();
1251 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1252 	rtnl_unlock();
1253 }
1254 EXPORT_SYMBOL(netdev_notify_peers);
1255 
1256 static int __dev_open(struct net_device *dev)
1257 {
1258 	const struct net_device_ops *ops = dev->netdev_ops;
1259 	int ret;
1260 
1261 	ASSERT_RTNL();
1262 
1263 	if (!netif_device_present(dev))
1264 		return -ENODEV;
1265 
1266 	/* Block netpoll from trying to do any rx path servicing.
1267 	 * If we don't do this there is a chance ndo_poll_controller
1268 	 * or ndo_poll may be running while we open the device
1269 	 */
1270 	netpoll_poll_disable(dev);
1271 
1272 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1273 	ret = notifier_to_errno(ret);
1274 	if (ret)
1275 		return ret;
1276 
1277 	set_bit(__LINK_STATE_START, &dev->state);
1278 
1279 	if (ops->ndo_validate_addr)
1280 		ret = ops->ndo_validate_addr(dev);
1281 
1282 	if (!ret && ops->ndo_open)
1283 		ret = ops->ndo_open(dev);
1284 
1285 	netpoll_poll_enable(dev);
1286 
1287 	if (ret)
1288 		clear_bit(__LINK_STATE_START, &dev->state);
1289 	else {
1290 		dev->flags |= IFF_UP;
1291 		dev_set_rx_mode(dev);
1292 		dev_activate(dev);
1293 		add_device_randomness(dev->dev_addr, dev->addr_len);
1294 	}
1295 
1296 	return ret;
1297 }
1298 
1299 /**
1300  *	dev_open	- prepare an interface for use.
1301  *	@dev:	device to open
1302  *
1303  *	Takes a device from down to up state. The device's private open
1304  *	function is invoked and then the multicast lists are loaded. Finally
1305  *	the device is moved into the up state and a %NETDEV_UP message is
1306  *	sent to the netdev notifier chain.
1307  *
1308  *	Calling this function on an active interface is a nop. On a failure
1309  *	a negative errno code is returned.
1310  */
1311 int dev_open(struct net_device *dev)
1312 {
1313 	int ret;
1314 
1315 	if (dev->flags & IFF_UP)
1316 		return 0;
1317 
1318 	ret = __dev_open(dev);
1319 	if (ret < 0)
1320 		return ret;
1321 
1322 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1323 	call_netdevice_notifiers(NETDEV_UP, dev);
1324 
1325 	return ret;
1326 }
1327 EXPORT_SYMBOL(dev_open);
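
/* Sketch of bringing an interface up from kernel code; dev_open() must run
 * under RTNL (see ASSERT_RTNL() in __dev_open()).  The device pointer is
 * assumed to be valid and held by the caller:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 *	if (err)
 *		pr_warn("%s: failed to bring up: %d\n", dev->name, err);
 */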
1328 
1329 static int __dev_close_many(struct list_head *head)
1330 {
1331 	struct net_device *dev;
1332 
1333 	ASSERT_RTNL();
1334 	might_sleep();
1335 
1336 	list_for_each_entry(dev, head, close_list) {
1337 		/* Temporarily disable netpoll until the interface is down */
1338 		netpoll_poll_disable(dev);
1339 
1340 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1341 
1342 		clear_bit(__LINK_STATE_START, &dev->state);
1343 
1344 		/* Synchronize to the scheduled poll. We cannot touch the poll list;
1345 		 * it can even be on a different CPU. So just clear netif_running().
1346 		 *
1347 		 * dev->stop() will invoke napi_disable() on all of its
1348 		 * napi_struct instances on this device.
1349 		 */
1350 		smp_mb__after_atomic(); /* Commit netif_running(). */
1351 	}
1352 
1353 	dev_deactivate_many(head);
1354 
1355 	list_for_each_entry(dev, head, close_list) {
1356 		const struct net_device_ops *ops = dev->netdev_ops;
1357 
1358 		/*
1359 		 *	Call the device specific close. This cannot fail.
1360 		 *	Only if device is UP
1361 		 *
1362 		 *	We allow it to be called even after a DETACH hot-plug
1363 		 *	event.
1364 		 */
1365 		if (ops->ndo_stop)
1366 			ops->ndo_stop(dev);
1367 
1368 		dev->flags &= ~IFF_UP;
1369 		netpoll_poll_enable(dev);
1370 	}
1371 
1372 	return 0;
1373 }
1374 
1375 static int __dev_close(struct net_device *dev)
1376 {
1377 	int retval;
1378 	LIST_HEAD(single);
1379 
1380 	list_add(&dev->close_list, &single);
1381 	retval = __dev_close_many(&single);
1382 	list_del(&single);
1383 
1384 	return retval;
1385 }
1386 
1387 static int dev_close_many(struct list_head *head)
1388 {
1389 	struct net_device *dev, *tmp;
1390 
1391 	/* Remove the devices that don't need to be closed */
1392 	list_for_each_entry_safe(dev, tmp, head, close_list)
1393 		if (!(dev->flags & IFF_UP))
1394 			list_del_init(&dev->close_list);
1395 
1396 	__dev_close_many(head);
1397 
1398 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1399 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1400 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1401 		list_del_init(&dev->close_list);
1402 	}
1403 
1404 	return 0;
1405 }
1406 
1407 /**
1408  *	dev_close - shutdown an interface.
1409  *	@dev: device to shutdown
1410  *
1411  *	This function moves an active device into down state. A
1412  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1413  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1414  *	chain.
1415  */
1416 int dev_close(struct net_device *dev)
1417 {
1418 	if (dev->flags & IFF_UP) {
1419 		LIST_HEAD(single);
1420 
1421 		list_add(&dev->close_list, &single);
1422 		dev_close_many(&single);
1423 		list_del(&single);
1424 	}
1425 	return 0;
1426 }
1427 EXPORT_SYMBOL(dev_close);
1428 
1429 
1430 /**
1431  *	dev_disable_lro - disable Large Receive Offload on a device
1432  *	@dev: device
1433  *
1434  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1435  *	called under RTNL.  This is needed if received packets may be
1436  *	forwarded to another interface.
1437  */
1438 void dev_disable_lro(struct net_device *dev)
1439 {
1440 	struct net_device *lower_dev;
1441 	struct list_head *iter;
1442 
1443 	dev->wanted_features &= ~NETIF_F_LRO;
1444 	netdev_update_features(dev);
1445 
1446 	if (unlikely(dev->features & NETIF_F_LRO))
1447 		netdev_WARN(dev, "failed to disable LRO!\n");
1448 
1449 	netdev_for_each_lower_dev(dev, lower_dev, iter)
1450 		dev_disable_lro(lower_dev);
1451 }
1452 EXPORT_SYMBOL(dev_disable_lro);
1453 
1454 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1455 				   struct net_device *dev)
1456 {
1457 	struct netdev_notifier_info info;
1458 
1459 	netdev_notifier_info_init(&info, dev);
1460 	return nb->notifier_call(nb, val, &info);
1461 }
1462 
1463 static int dev_boot_phase = 1;
1464 
1465 /**
1466  *	register_netdevice_notifier - register a network notifier block
1467  *	@nb: notifier
1468  *
1469  *	Register a notifier to be called when network device events occur.
1470  *	The notifier passed is linked into the kernel structures and must
1471  *	not be reused until it has been unregistered. A negative errno code
1472  *	is returned on a failure.
1473  *
1474  * 	When registered, all registration and up events are replayed
1475  *	to the new notifier to allow it to have a race-free
1476  *	view of the network device list.
1477  */
1478 
1479 int register_netdevice_notifier(struct notifier_block *nb)
1480 {
1481 	struct net_device *dev;
1482 	struct net_device *last;
1483 	struct net *net;
1484 	int err;
1485 
1486 	rtnl_lock();
1487 	err = raw_notifier_chain_register(&netdev_chain, nb);
1488 	if (err)
1489 		goto unlock;
1490 	if (dev_boot_phase)
1491 		goto unlock;
1492 	for_each_net(net) {
1493 		for_each_netdev(net, dev) {
1494 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1495 			err = notifier_to_errno(err);
1496 			if (err)
1497 				goto rollback;
1498 
1499 			if (!(dev->flags & IFF_UP))
1500 				continue;
1501 
1502 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1503 		}
1504 	}
1505 
1506 unlock:
1507 	rtnl_unlock();
1508 	return err;
1509 
1510 rollback:
1511 	last = dev;
1512 	for_each_net(net) {
1513 		for_each_netdev(net, dev) {
1514 			if (dev == last)
1515 				goto outroll;
1516 
1517 			if (dev->flags & IFF_UP) {
1518 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1519 							dev);
1520 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1521 			}
1522 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1523 		}
1524 	}
1525 
1526 outroll:
1527 	raw_notifier_chain_unregister(&netdev_chain, nb);
1528 	goto unlock;
1529 }
1530 EXPORT_SYMBOL(register_netdevice_notifier);
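
/* Hedged example of a notifier a subsystem might register; the callback
 * name and the events handled are illustrative.  The device is recovered
 * from the info pointer with netdev_notifier_info_to_dev():
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 */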
1531 
1532 /**
1533  *	unregister_netdevice_notifier - unregister a network notifier block
1534  *	@nb: notifier
1535  *
1536  *	Unregister a notifier previously registered by
1537  *	register_netdevice_notifier(). The notifier is unlinked from the
1538  *	kernel structures and may then be reused. A negative errno code
1539  *	is returned on a failure.
1540  *
1541  * 	After unregistering, unregister and down device events are synthesized
1542  *	for all devices on the device list and sent to the removed notifier,
1543  *	removing the need for special-case cleanup code.
1544  */
1545 
1546 int unregister_netdevice_notifier(struct notifier_block *nb)
1547 {
1548 	struct net_device *dev;
1549 	struct net *net;
1550 	int err;
1551 
1552 	rtnl_lock();
1553 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1554 	if (err)
1555 		goto unlock;
1556 
1557 	for_each_net(net) {
1558 		for_each_netdev(net, dev) {
1559 			if (dev->flags & IFF_UP) {
1560 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1561 							dev);
1562 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1563 			}
1564 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1565 		}
1566 	}
1567 unlock:
1568 	rtnl_unlock();
1569 	return err;
1570 }
1571 EXPORT_SYMBOL(unregister_netdevice_notifier);
1572 
1573 /**
1574  *	call_netdevice_notifiers_info - call all network notifier blocks
1575  *	@val: value passed unmodified to notifier function
1576  *	@dev: net_device pointer passed unmodified to notifier function
1577  *	@info: notifier information data
1578  *
1579  *	Call all network notifier blocks.  Parameters and return value
1580  *	are as for raw_notifier_call_chain().
1581  */
1582 
1583 static int call_netdevice_notifiers_info(unsigned long val,
1584 					 struct net_device *dev,
1585 					 struct netdev_notifier_info *info)
1586 {
1587 	ASSERT_RTNL();
1588 	netdev_notifier_info_init(info, dev);
1589 	return raw_notifier_call_chain(&netdev_chain, val, info);
1590 }
1591 
1592 /**
1593  *	call_netdevice_notifiers - call all network notifier blocks
1594  *      @val: value passed unmodified to notifier function
1595  *      @dev: net_device pointer passed unmodified to notifier function
1596  *
1597  *	Call all network notifier blocks.  Parameters and return value
1598  *	are as for raw_notifier_call_chain().
1599  */
1600 
1601 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1602 {
1603 	struct netdev_notifier_info info;
1604 
1605 	return call_netdevice_notifiers_info(val, dev, &info);
1606 }
1607 EXPORT_SYMBOL(call_netdevice_notifiers);
1608 
1609 static struct static_key netstamp_needed __read_mostly;
1610 #ifdef HAVE_JUMP_LABEL
1611 /* We are not allowed to call static_key_slow_dec() from irq context
1612  * If net_disable_timestamp() is called from irq context, defer the
1613  * static_key_slow_dec() calls.
1614  */
1615 static atomic_t netstamp_needed_deferred;
1616 #endif
1617 
1618 void net_enable_timestamp(void)
1619 {
1620 #ifdef HAVE_JUMP_LABEL
1621 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1622 
1623 	if (deferred) {
1624 		while (--deferred)
1625 			static_key_slow_dec(&netstamp_needed);
1626 		return;
1627 	}
1628 #endif
1629 	static_key_slow_inc(&netstamp_needed);
1630 }
1631 EXPORT_SYMBOL(net_enable_timestamp);
1632 
1633 void net_disable_timestamp(void)
1634 {
1635 #ifdef HAVE_JUMP_LABEL
1636 	if (in_interrupt()) {
1637 		atomic_inc(&netstamp_needed_deferred);
1638 		return;
1639 	}
1640 #endif
1641 	static_key_slow_dec(&netstamp_needed);
1642 }
1643 EXPORT_SYMBOL(net_disable_timestamp);
1644 
1645 static inline void net_timestamp_set(struct sk_buff *skb)
1646 {
1647 	skb->tstamp.tv64 = 0;
1648 	if (static_key_false(&netstamp_needed))
1649 		__net_timestamp(skb);
1650 }
1651 
1652 #define net_timestamp_check(COND, SKB)			\
1653 	if (static_key_false(&netstamp_needed)) {		\
1654 		if ((COND) && !(SKB)->tstamp.tv64)	\
1655 			__net_timestamp(SKB);		\
1656 	}						\
1657 
1658 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1659 {
1660 	unsigned int len;
1661 
1662 	if (!(dev->flags & IFF_UP))
1663 		return false;
1664 
1665 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1666 	if (skb->len <= len)
1667 		return true;
1668 
1669 	/* if TSO is enabled, we don't care about the length as the packet
1670 	 * could be forwarded without having been segmented first
1671 	 */
1672 	if (skb_is_gso(skb))
1673 		return true;
1674 
1675 	return false;
1676 }
1677 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1678 
1679 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1680 {
1681 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1682 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1683 			atomic_long_inc(&dev->rx_dropped);
1684 			kfree_skb(skb);
1685 			return NET_RX_DROP;
1686 		}
1687 	}
1688 
1689 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1690 		atomic_long_inc(&dev->rx_dropped);
1691 		kfree_skb(skb);
1692 		return NET_RX_DROP;
1693 	}
1694 
1695 	skb_scrub_packet(skb, true);
1696 	skb->protocol = eth_type_trans(skb, dev);
1697 	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1698 
1699 	return 0;
1700 }
1701 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1702 
1703 /**
1704  * dev_forward_skb - loopback an skb to another netif
1705  *
1706  * @dev: destination network device
1707  * @skb: buffer to forward
1708  *
1709  * return values:
1710  *	NET_RX_SUCCESS	(no congestion)
1711  *	NET_RX_DROP     (packet was dropped, but freed)
1712  *
1713  * dev_forward_skb can be used for injecting an skb from the
1714  * start_xmit function of one device into the receive queue
1715  * of another device.
1716  *
1717  * The receiving device may be in another namespace, so
1718  * we have to clear all information in the skb that could
1719  * impact namespace isolation.
1720  */
1721 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1722 {
1723 	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1724 }
1725 EXPORT_SYMBOL_GPL(dev_forward_skb);
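
/* Usage sketch for a virtual device pair (e.g. a veth-like driver); the
 * peer lookup helper is an assumption for the example.  The skb is handed
 * from one device's transmit path into the peer's receive path:
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);	// hypothetical helper
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */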
1726 
1727 static inline int deliver_skb(struct sk_buff *skb,
1728 			      struct packet_type *pt_prev,
1729 			      struct net_device *orig_dev)
1730 {
1731 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1732 		return -ENOMEM;
1733 	atomic_inc(&skb->users);
1734 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1735 }
1736 
1737 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1738 {
1739 	if (!ptype->af_packet_priv || !skb->sk)
1740 		return false;
1741 
1742 	if (ptype->id_match)
1743 		return ptype->id_match(ptype, skb->sk);
1744 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1745 		return true;
1746 
1747 	return false;
1748 }
1749 
1750 /*
1751  *	Support routine. Sends outgoing frames to any network
1752  *	taps currently in use.
1753  */
1754 
1755 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1756 {
1757 	struct packet_type *ptype;
1758 	struct sk_buff *skb2 = NULL;
1759 	struct packet_type *pt_prev = NULL;
1760 
1761 	rcu_read_lock();
1762 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1763 		/* Never send packets back to the socket
1764 		 * they originated from - MvS (miquels@drinkel.ow.org)
1765 		 */
1766 		if ((ptype->dev == dev || !ptype->dev) &&
1767 		    (!skb_loop_sk(ptype, skb))) {
1768 			if (pt_prev) {
1769 				deliver_skb(skb2, pt_prev, skb->dev);
1770 				pt_prev = ptype;
1771 				continue;
1772 			}
1773 
1774 			skb2 = skb_clone(skb, GFP_ATOMIC);
1775 			if (!skb2)
1776 				break;
1777 
1778 			net_timestamp_set(skb2);
1779 
1780 			/* The network header should be correctly
1781 			   set by the sender, so the check below is
1782 			   just protection against buggy protocols.
1783 			 */
1784 			skb_reset_mac_header(skb2);
1785 
1786 			if (skb_network_header(skb2) < skb2->data ||
1787 			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1788 				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1789 						     ntohs(skb2->protocol),
1790 						     dev->name);
1791 				skb_reset_network_header(skb2);
1792 			}
1793 
1794 			skb2->transport_header = skb2->network_header;
1795 			skb2->pkt_type = PACKET_OUTGOING;
1796 			pt_prev = ptype;
1797 		}
1798 	}
1799 	if (pt_prev)
1800 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1801 	rcu_read_unlock();
1802 }
1803 
1804 /**
1805  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1806  * @dev: Network device
1807  * @txq: number of queues available
1808  *
1809  * If real_num_tx_queues is changed the tc mappings may no longer be
1810  * valid. To resolve this, verify that each tc mapping remains valid and,
1811  * if not, reset the mapping to TC0. With no priorities mapping to an
1812  * offset/count pair it will no longer be used. In the worst case, if TC0
1813  * itself is invalid nothing can be done, so priority mappings are disabled.
1814  * It is expected that drivers will fix this mapping if they can before
1815  * calling netif_set_real_num_tx_queues.
1816  */
1817 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1818 {
1819 	int i;
1820 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1821 
1822 	/* If TC0 is invalidated disable TC mapping */
1823 	if (tc->offset + tc->count > txq) {
1824 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1825 		dev->num_tc = 0;
1826 		return;
1827 	}
1828 
1829 	/* Invalidated prio to tc mappings set to TC0 */
1830 	for (i = 1; i < TC_BITMASK + 1; i++) {
1831 		int q = netdev_get_prio_tc_map(dev, i);
1832 
1833 		tc = &dev->tc_to_txq[q];
1834 		if (tc->offset + tc->count > txq) {
1835 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1836 				i, q);
1837 			netdev_set_prio_tc_map(dev, i, 0);
1838 		}
1839 	}
1840 }
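
/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): how a multiqueue driver might establish the prio->tc->txq
 * mappings that netif_setup_tc() later re-validates when
 * real_num_tx_queues shrinks.  Eight TX queues are split into two
 * traffic classes of four queues each; priorities 0-3 map to TC0 and
 * the remaining priorities to TC1.  Assumes the caller holds the rtnl
 * lock (or the device is not yet registered); foo_setup_tc() is a
 * hypothetical driver helper.
 */
#if 0
static int foo_setup_tc(struct net_device *dev)
{
	int prio;

	netdev_set_num_tc(dev, 2);
	netdev_set_tc_queue(dev, 0, 4, 0);	/* TC0: queues 0-3 */
	netdev_set_tc_queue(dev, 1, 4, 4);	/* TC1: queues 4-7 */

	for (prio = 0; prio <= TC_BITMASK; prio++)
		netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);

	return netif_set_real_num_tx_queues(dev, 8);
}
#endif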
1841 
1842 #ifdef CONFIG_XPS
1843 static DEFINE_MUTEX(xps_map_mutex);
1844 #define xmap_dereference(P)		\
1845 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1846 
1847 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1848 					int cpu, u16 index)
1849 {
1850 	struct xps_map *map = NULL;
1851 	int pos;
1852 
1853 	if (dev_maps)
1854 		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1855 
1856 	for (pos = 0; map && pos < map->len; pos++) {
1857 		if (map->queues[pos] == index) {
1858 			if (map->len > 1) {
1859 				map->queues[pos] = map->queues[--map->len];
1860 			} else {
1861 				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1862 				kfree_rcu(map, rcu);
1863 				map = NULL;
1864 			}
1865 			break;
1866 		}
1867 	}
1868 
1869 	return map;
1870 }
1871 
1872 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1873 {
1874 	struct xps_dev_maps *dev_maps;
1875 	int cpu, i;
1876 	bool active = false;
1877 
1878 	mutex_lock(&xps_map_mutex);
1879 	dev_maps = xmap_dereference(dev->xps_maps);
1880 
1881 	if (!dev_maps)
1882 		goto out_no_maps;
1883 
1884 	for_each_possible_cpu(cpu) {
1885 		for (i = index; i < dev->num_tx_queues; i++) {
1886 			if (!remove_xps_queue(dev_maps, cpu, i))
1887 				break;
1888 		}
1889 		if (i == dev->num_tx_queues)
1890 			active = true;
1891 	}
1892 
1893 	if (!active) {
1894 		RCU_INIT_POINTER(dev->xps_maps, NULL);
1895 		kfree_rcu(dev_maps, rcu);
1896 	}
1897 
1898 	for (i = index; i < dev->num_tx_queues; i++)
1899 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1900 					     NUMA_NO_NODE);
1901 
1902 out_no_maps:
1903 	mutex_unlock(&xps_map_mutex);
1904 }
1905 
1906 static struct xps_map *expand_xps_map(struct xps_map *map,
1907 				      int cpu, u16 index)
1908 {
1909 	struct xps_map *new_map;
1910 	int alloc_len = XPS_MIN_MAP_ALLOC;
1911 	int i, pos;
1912 
1913 	for (pos = 0; map && pos < map->len; pos++) {
1914 		if (map->queues[pos] != index)
1915 			continue;
1916 		return map;
1917 	}
1918 
1919 	/* Need to add queue to this CPU's existing map */
1920 	if (map) {
1921 		if (pos < map->alloc_len)
1922 			return map;
1923 
1924 		alloc_len = map->alloc_len * 2;
1925 	}
1926 
1927 	/* Need to allocate new map to store queue on this CPU's map */
1928 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1929 			       cpu_to_node(cpu));
1930 	if (!new_map)
1931 		return NULL;
1932 
1933 	for (i = 0; i < pos; i++)
1934 		new_map->queues[i] = map->queues[i];
1935 	new_map->alloc_len = alloc_len;
1936 	new_map->len = pos;
1937 
1938 	return new_map;
1939 }
1940 
1941 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1942 			u16 index)
1943 {
1944 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1945 	struct xps_map *map, *new_map;
1946 	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1947 	int cpu, numa_node_id = -2;
1948 	bool active = false;
1949 
1950 	mutex_lock(&xps_map_mutex);
1951 
1952 	dev_maps = xmap_dereference(dev->xps_maps);
1953 
1954 	/* allocate memory for queue storage */
1955 	for_each_online_cpu(cpu) {
1956 		if (!cpumask_test_cpu(cpu, mask))
1957 			continue;
1958 
1959 		if (!new_dev_maps)
1960 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1961 		if (!new_dev_maps) {
1962 			mutex_unlock(&xps_map_mutex);
1963 			return -ENOMEM;
1964 		}
1965 
1966 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1967 				 NULL;
1968 
1969 		map = expand_xps_map(map, cpu, index);
1970 		if (!map)
1971 			goto error;
1972 
1973 		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1974 	}
1975 
1976 	if (!new_dev_maps)
1977 		goto out_no_new_maps;
1978 
1979 	for_each_possible_cpu(cpu) {
1980 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1981 			/* add queue to CPU maps */
1982 			int pos = 0;
1983 
1984 			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1985 			while ((pos < map->len) && (map->queues[pos] != index))
1986 				pos++;
1987 
1988 			if (pos == map->len)
1989 				map->queues[map->len++] = index;
1990 #ifdef CONFIG_NUMA
1991 			if (numa_node_id == -2)
1992 				numa_node_id = cpu_to_node(cpu);
1993 			else if (numa_node_id != cpu_to_node(cpu))
1994 				numa_node_id = -1;
1995 #endif
1996 		} else if (dev_maps) {
1997 			/* fill in the new device map from the old device map */
1998 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
1999 			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2000 		}
2001 
2002 	}
2003 
2004 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2005 
2006 	/* Cleanup old maps */
2007 	if (dev_maps) {
2008 		for_each_possible_cpu(cpu) {
2009 			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2010 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2011 			if (map && map != new_map)
2012 				kfree_rcu(map, rcu);
2013 		}
2014 
2015 		kfree_rcu(dev_maps, rcu);
2016 	}
2017 
2018 	dev_maps = new_dev_maps;
2019 	active = true;
2020 
2021 out_no_new_maps:
2022 	/* update Tx queue numa node */
2023 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2024 				     (numa_node_id >= 0) ? numa_node_id :
2025 				     NUMA_NO_NODE);
2026 
2027 	if (!dev_maps)
2028 		goto out_no_maps;
2029 
2030 	/* removes queue from unused CPUs */
2031 	for_each_possible_cpu(cpu) {
2032 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2033 			continue;
2034 
2035 		if (remove_xps_queue(dev_maps, cpu, index))
2036 			active = true;
2037 	}
2038 
2039 	/* free map if not active */
2040 	if (!active) {
2041 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2042 		kfree_rcu(dev_maps, rcu);
2043 	}
2044 
2045 out_no_maps:
2046 	mutex_unlock(&xps_map_mutex);
2047 
2048 	return 0;
2049 error:
2050 	/* remove any maps that we added */
2051 	for_each_possible_cpu(cpu) {
2052 		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2053 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2054 				 NULL;
2055 		if (new_map && new_map != map)
2056 			kfree(new_map);
2057 	}
2058 
2059 	mutex_unlock(&xps_map_mutex);
2060 
2061 	kfree(new_dev_maps);
2062 	return -ENOMEM;
2063 }
2064 EXPORT_SYMBOL(netif_set_xps_queue);
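
/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): a driver pinning each TX queue to one CPU with
 * netif_set_xps_queue().  Assumes the device has at least as many TX
 * queues as online CPUs; error handling is omitted for brevity.
 */
#if 0
static void foo_setup_xps(struct net_device *dev)
{
	int cpu;

	for_each_online_cpu(cpu)
		netif_set_xps_queue(dev, cpumask_of(cpu), cpu);
}
#endif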
2065 
2066 #endif
2067 /*
2068  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2069  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2070  */
2071 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2072 {
2073 	int rc;
2074 
2075 	if (txq < 1 || txq > dev->num_tx_queues)
2076 		return -EINVAL;
2077 
2078 	if (dev->reg_state == NETREG_REGISTERED ||
2079 	    dev->reg_state == NETREG_UNREGISTERING) {
2080 		ASSERT_RTNL();
2081 
2082 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2083 						  txq);
2084 		if (rc)
2085 			return rc;
2086 
2087 		if (dev->num_tc)
2088 			netif_setup_tc(dev, txq);
2089 
2090 		if (txq < dev->real_num_tx_queues) {
2091 			qdisc_reset_all_tx_gt(dev, txq);
2092 #ifdef CONFIG_XPS
2093 			netif_reset_xps_queues_gt(dev, txq);
2094 #endif
2095 		}
2096 	}
2097 
2098 	dev->real_num_tx_queues = txq;
2099 	return 0;
2100 }
2101 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2102 
2103 #ifdef CONFIG_SYSFS
2104 /**
2105  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2106  *	@dev: Network device
2107  *	@rxq: Actual number of RX queues
2108  *
2109  *	This must be called either with the rtnl_lock held or before
2110  *	registration of the net device.  Returns 0 on success, or a
2111  *	negative error code.  If called before registration, it always
2112  *	succeeds.
2113  */
2114 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2115 {
2116 	int rc;
2117 
2118 	if (rxq < 1 || rxq > dev->num_rx_queues)
2119 		return -EINVAL;
2120 
2121 	if (dev->reg_state == NETREG_REGISTERED) {
2122 		ASSERT_RTNL();
2123 
2124 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2125 						  rxq);
2126 		if (rc)
2127 			return rc;
2128 	}
2129 
2130 	dev->real_num_rx_queues = rxq;
2131 	return 0;
2132 }
2133 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2134 #endif
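
/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): an ethtool .set_channels() style handler resizing the active
 * queue sets.  The ethtool core calls this with the rtnl lock held,
 * which satisfies the locking requirements documented above.
 * foo_set_channels() and the use of combined channels only are
 * assumptions for illustration.
 */
#if 0
static int foo_set_channels(struct net_device *dev,
			    struct ethtool_channels *ch)
{
	int err;

	err = netif_set_real_num_tx_queues(dev, ch->combined_count);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, ch->combined_count);
}
#endif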
2135 
2136 /**
2137  * netif_get_num_default_rss_queues - default number of RSS queues
2138  *
2139  * This routine returns the default upper limit on the number of RSS
2140  * queues that multiqueue devices should use.
2141  */
2142 int netif_get_num_default_rss_queues(void)
2143 {
2144 	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2145 }
2146 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
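
/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): a driver capping its RSS queue count at probe time.  The
 * adapter structure and max_hw_queues field are hypothetical.
 */
#if 0
	adapter->num_rx_queues = min_t(int, netif_get_num_default_rss_queues(),
				       adapter->max_hw_queues);
#endif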
2147 
2148 static inline void __netif_reschedule(struct Qdisc *q)
2149 {
2150 	struct softnet_data *sd;
2151 	unsigned long flags;
2152 
2153 	local_irq_save(flags);
2154 	sd = this_cpu_ptr(&softnet_data);
2155 	q->next_sched = NULL;
2156 	*sd->output_queue_tailp = q;
2157 	sd->output_queue_tailp = &q->next_sched;
2158 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2159 	local_irq_restore(flags);
2160 }
2161 
2162 void __netif_schedule(struct Qdisc *q)
2163 {
2164 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2165 		__netif_reschedule(q);
2166 }
2167 EXPORT_SYMBOL(__netif_schedule);
2168 
2169 struct dev_kfree_skb_cb {
2170 	enum skb_free_reason reason;
2171 };
2172 
2173 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2174 {
2175 	return (struct dev_kfree_skb_cb *)skb->cb;
2176 }
2177 
2178 void netif_schedule_queue(struct netdev_queue *txq)
2179 {
2180 	rcu_read_lock();
2181 	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2182 		struct Qdisc *q = rcu_dereference(txq->qdisc);
2183 
2184 		__netif_schedule(q);
2185 	}
2186 	rcu_read_unlock();
2187 }
2188 EXPORT_SYMBOL(netif_schedule_queue);
2189 
2190 /**
2191  *	netif_wake_subqueue - allow sending packets on subqueue
2192  *	@dev: network device
2193  *	@queue_index: sub queue index
2194  *
2195  * Resume individual transmit queue of a device with multiple transmit queues.
2196  */
2197 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2198 {
2199 	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2200 
2201 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2202 		struct Qdisc *q;
2203 
2204 		rcu_read_lock();
2205 		q = rcu_dereference(txq->qdisc);
2206 		__netif_schedule(q);
2207 		rcu_read_unlock();
2208 	}
2209 }
2210 EXPORT_SYMBOL(netif_wake_subqueue);
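
/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): a per-queue TX completion handler restarting a queue it
 * previously stopped with netif_stop_subqueue() when descriptors ran
 * out.  struct foo_tx_ring, foo_tx_ring_space() and
 * FOO_TX_WAKE_THRESHOLD are hypothetical.
 */
#if 0
static void foo_clean_tx_ring(struct net_device *dev, struct foo_tx_ring *ring)
{
	/* ... reclaim completed descriptors here ... */

	if (__netif_subqueue_stopped(dev, ring->queue_index) &&
	    foo_tx_ring_space(ring) > FOO_TX_WAKE_THRESHOLD)
		netif_wake_subqueue(dev, ring->queue_index);
}
#endif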
2211 
2212 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2213 {
2214 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2215 		struct Qdisc *q;
2216 
2217 		rcu_read_lock();
2218 		q = rcu_dereference(dev_queue->qdisc);
2219 		__netif_schedule(q);
2220 		rcu_read_unlock();
2221 	}
2222 }
2223 EXPORT_SYMBOL(netif_tx_wake_queue);
2224 
2225 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2226 {
2227 	unsigned long flags;
2228 
2229 	if (likely(atomic_read(&skb->users) == 1)) {
2230 		smp_rmb();
2231 		atomic_set(&skb->users, 0);
2232 	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2233 		return;
2234 	}
2235 	get_kfree_skb_cb(skb)->reason = reason;
2236 	local_irq_save(flags);
2237 	skb->next = __this_cpu_read(softnet_data.completion_queue);
2238 	__this_cpu_write(softnet_data.completion_queue, skb);
2239 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2240 	local_irq_restore(flags);
2241 }
2242 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2243 
2244 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2245 {
2246 	if (in_irq() || irqs_disabled())
2247 		__dev_kfree_skb_irq(skb, reason);
2248 	else
2249 		dev_kfree_skb(skb);
2250 }
2251 EXPORT_SYMBOL(__dev_kfree_skb_any);
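
/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): freeing TX skbs from a cleanup path that may run in hard IRQ
 * or process context, hence the _any variants.  dev_consume_skb_any()
 * and dev_kfree_skb_any() are the usual wrappers around
 * __dev_kfree_skb_any().
 */
#if 0
	if (tx_ok)
		dev_consume_skb_any(skb);	/* transmitted; not a drop */
	else
		dev_kfree_skb_any(skb);		/* dropped; shows up in drop tracing */
#endif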
2252 
2253 
2254 /**
2255  * netif_device_detach - mark device as removed
2256  * @dev: network device
2257  *
2258  * Mark the device as removed from the system and therefore no longer available.
2259  */
2260 void netif_device_detach(struct net_device *dev)
2261 {
2262 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2263 	    netif_running(dev)) {
2264 		netif_tx_stop_all_queues(dev);
2265 	}
2266 }
2267 EXPORT_SYMBOL(netif_device_detach);
2268 
2269 /**
2270  * netif_device_attach - mark device as attached
2271  * @dev: network device
2272  *
2273  * Mark the device as attached to the system and restart it if needed.
2274  */
2275 void netif_device_attach(struct net_device *dev)
2276 {
2277 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2278 	    netif_running(dev)) {
2279 		netif_tx_wake_all_queues(dev);
2280 		__netdev_watchdog_up(dev);
2281 	}
2282 }
2283 EXPORT_SYMBOL(netif_device_attach);
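
/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): the common suspend/resume pairing of netif_device_detach()
 * and netif_device_attach().  foo_hw_suspend() and foo_hw_resume() are
 * hypothetical hardware helpers.
 */
#if 0
static int foo_suspend(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_detach(dev);
	return foo_hw_suspend(dev);
}

static int foo_resume(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);
	int err = foo_hw_resume(dev);

	if (!err)
		netif_device_attach(dev);
	return err;
}
#endif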
2284 
2285 static void skb_warn_bad_offload(const struct sk_buff *skb)
2286 {
2287 	static const netdev_features_t null_features = 0;
2288 	struct net_device *dev = skb->dev;
2289 	const char *driver = "";
2290 
2291 	if (!net_ratelimit())
2292 		return;
2293 
2294 	if (dev && dev->dev.parent)
2295 		driver = dev_driver_string(dev->dev.parent);
2296 
2297 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2298 	     "gso_type=%d ip_summed=%d\n",
2299 	     driver, dev ? &dev->features : &null_features,
2300 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2301 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2302 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2303 }
2304 
2305 /*
2306  * Invalidate hardware checksum when packet is to be mangled, and
2307  * complete checksum manually on outgoing path.
2308  */
2309 int skb_checksum_help(struct sk_buff *skb)
2310 {
2311 	__wsum csum;
2312 	int ret = 0, offset;
2313 
2314 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2315 		goto out_set_summed;
2316 
2317 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2318 		skb_warn_bad_offload(skb);
2319 		return -EINVAL;
2320 	}
2321 
2322 	/* Before computing a checksum, we should make sure no frag could
2323 	 * be modified by an external entity; otherwise the checksum could be wrong.
2324 	 */
2325 	if (skb_has_shared_frag(skb)) {
2326 		ret = __skb_linearize(skb);
2327 		if (ret)
2328 			goto out;
2329 	}
2330 
2331 	offset = skb_checksum_start_offset(skb);
2332 	BUG_ON(offset >= skb_headlen(skb));
2333 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2334 
2335 	offset += skb->csum_offset;
2336 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2337 
2338 	if (skb_cloned(skb) &&
2339 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2340 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2341 		if (ret)
2342 			goto out;
2343 	}
2344 
2345 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2346 out_set_summed:
2347 	skb->ip_summed = CHECKSUM_NONE;
2348 out:
2349 	return ret;
2350 }
2351 EXPORT_SYMBOL(skb_checksum_help);
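
/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): a driver xmit path falling back to skb_checksum_help() when
 * its hardware cannot offload the checksum for a particular packet.
 * foo_hw_can_csum() is a hypothetical capability check.
 */
#if 0
	if (skb->ip_summed == CHECKSUM_PARTIAL && !foo_hw_can_csum(skb)) {
		if (skb_checksum_help(skb))
			goto drop_skb;	/* could not complete the checksum */
	}
#endif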
2352 
2353 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2354 {
2355 	__be16 type = skb->protocol;
2356 
2357 	/* Tunnel gso handlers can set protocol to ethernet. */
2358 	if (type == htons(ETH_P_TEB)) {
2359 		struct ethhdr *eth;
2360 
2361 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2362 			return 0;
2363 
2364 		eth = (struct ethhdr *)skb_mac_header(skb);
2365 		type = eth->h_proto;
2366 	}
2367 
2368 	return __vlan_get_protocol(skb, type, depth);
2369 }
2370 
2371 /**
2372  *	skb_mac_gso_segment - mac layer segmentation handler.
2373  *	@skb: buffer to segment
2374  *	@features: features for the output path (see dev->features)
2375  */
2376 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2377 				    netdev_features_t features)
2378 {
2379 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2380 	struct packet_offload *ptype;
2381 	int vlan_depth = skb->mac_len;
2382 	__be16 type = skb_network_protocol(skb, &vlan_depth);
2383 
2384 	if (unlikely(!type))
2385 		return ERR_PTR(-EINVAL);
2386 
2387 	__skb_pull(skb, vlan_depth);
2388 
2389 	rcu_read_lock();
2390 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2391 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2392 			segs = ptype->callbacks.gso_segment(skb, features);
2393 			break;
2394 		}
2395 	}
2396 	rcu_read_unlock();
2397 
2398 	__skb_push(skb, skb->data - skb_mac_header(skb));
2399 
2400 	return segs;
2401 }
2402 EXPORT_SYMBOL(skb_mac_gso_segment);
2403 
2404 
2405 /* openvswitch calls this on rx path, so we need a different check.
2406  */
2407 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2408 {
2409 	if (tx_path)
2410 		return skb->ip_summed != CHECKSUM_PARTIAL;
2411 	else
2412 		return skb->ip_summed == CHECKSUM_NONE;
2413 }
2414 
2415 /**
2416  *	__skb_gso_segment - Perform segmentation on skb.
2417  *	@skb: buffer to segment
2418  *	@features: features for the output path (see dev->features)
2419  *	@tx_path: whether it is called in TX path
2420  *
2421  *	This function segments the given skb and returns a list of segments.
2422  *
2423  *	It may return NULL if the skb requires no segmentation.  This is
2424  *	only possible when GSO is used for verifying header integrity.
2425  */
2426 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2427 				  netdev_features_t features, bool tx_path)
2428 {
2429 	if (unlikely(skb_needs_check(skb, tx_path))) {
2430 		int err;
2431 
2432 		skb_warn_bad_offload(skb);
2433 
2434 		err = skb_cow_head(skb, 0);
2435 		if (err < 0)
2436 			return ERR_PTR(err);
2437 	}
2438 
2439 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2440 	SKB_GSO_CB(skb)->encap_level = 0;
2441 
2442 	skb_reset_mac_header(skb);
2443 	skb_reset_mac_len(skb);
2444 
2445 	return skb_mac_gso_segment(skb, features);
2446 }
2447 EXPORT_SYMBOL(__skb_gso_segment);
2448 
2449 /* Take action when hardware reception checksum errors are detected. */
2450 #ifdef CONFIG_BUG
2451 void netdev_rx_csum_fault(struct net_device *dev)
2452 {
2453 	if (net_ratelimit()) {
2454 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2455 		dump_stack();
2456 	}
2457 }
2458 EXPORT_SYMBOL(netdev_rx_csum_fault);
2459 #endif
2460 
2461 /* Actually, we should eliminate this check as soon as we know that:
2462  * 1. An IOMMU is present and allows mapping all of memory.
2463  * 2. No high memory really exists on this machine.
2464  */
2465 
2466 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2467 {
2468 #ifdef CONFIG_HIGHMEM
2469 	int i;
2470 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2471 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2472 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2473 			if (PageHighMem(skb_frag_page(frag)))
2474 				return 1;
2475 		}
2476 	}
2477 
2478 	if (PCI_DMA_BUS_IS_PHYS) {
2479 		struct device *pdev = dev->dev.parent;
2480 
2481 		if (!pdev)
2482 			return 0;
2483 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2484 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2485 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2486 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2487 				return 1;
2488 		}
2489 	}
2490 #endif
2491 	return 0;
2492 }
2493 
2494 /* If this is an MPLS offload request, verify we are testing hardware MPLS
2495  * features instead of the standard features for the netdev.
2496  */
2497 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2498 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2499 					   netdev_features_t features,
2500 					   __be16 type)
2501 {
2502 	if (eth_p_mpls(type))
2503 		features &= skb->dev->mpls_features;
2504 
2505 	return features;
2506 }
2507 #else
2508 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2509 					   netdev_features_t features,
2510 					   __be16 type)
2511 {
2512 	return features;
2513 }
2514 #endif
2515 
2516 static netdev_features_t harmonize_features(struct sk_buff *skb,
2517 	netdev_features_t features)
2518 {
2519 	int tmp;
2520 	__be16 type;
2521 
2522 	type = skb_network_protocol(skb, &tmp);
2523 	features = net_mpls_features(skb, features, type);
2524 
2525 	if (skb->ip_summed != CHECKSUM_NONE &&
2526 	    !can_checksum_protocol(features, type)) {
2527 		features &= ~NETIF_F_ALL_CSUM;
2528 	} else if (illegal_highdma(skb->dev, skb)) {
2529 		features &= ~NETIF_F_SG;
2530 	}
2531 
2532 	return features;
2533 }
2534 
2535 netdev_features_t netif_skb_features(struct sk_buff *skb)
2536 {
2537 	struct net_device *dev = skb->dev;
2538 	netdev_features_t features = dev->features;
2539 	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2540 	__be16 protocol = skb->protocol;
2541 
2542 	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2543 		features &= ~NETIF_F_GSO_MASK;
2544 
2545 	/* If this is an encapsulation offload request, verify we are
2546 	 * testing hardware encapsulation features instead of the
2547 	 * standard features for the netdev.
2548 	 */
2549 	if (skb->encapsulation)
2550 		features &= dev->hw_enc_features;
2551 
2552 	if (!vlan_tx_tag_present(skb)) {
2553 		if (unlikely(protocol == htons(ETH_P_8021Q) ||
2554 			     protocol == htons(ETH_P_8021AD))) {
2555 			struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2556 			protocol = veh->h_vlan_encapsulated_proto;
2557 		} else {
2558 			goto finalize;
2559 		}
2560 	}
2561 
2562 	features = netdev_intersect_features(features,
2563 					     dev->vlan_features |
2564 					     NETIF_F_HW_VLAN_CTAG_TX |
2565 					     NETIF_F_HW_VLAN_STAG_TX);
2566 
2567 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2568 		features = netdev_intersect_features(features,
2569 						     NETIF_F_SG |
2570 						     NETIF_F_HIGHDMA |
2571 						     NETIF_F_FRAGLIST |
2572 						     NETIF_F_GEN_CSUM |
2573 						     NETIF_F_HW_VLAN_CTAG_TX |
2574 						     NETIF_F_HW_VLAN_STAG_TX);
2575 
2576 finalize:
2577 	if (dev->netdev_ops->ndo_features_check)
2578 		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2579 								features);
2580 
2581 	return harmonize_features(skb, features);
2582 }
2583 EXPORT_SYMBOL(netif_skb_features);
2584 
2585 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2586 		    struct netdev_queue *txq, bool more)
2587 {
2588 	unsigned int len;
2589 	int rc;
2590 
2591 	if (!list_empty(&ptype_all))
2592 		dev_queue_xmit_nit(skb, dev);
2593 
2594 	len = skb->len;
2595 	trace_net_dev_start_xmit(skb, dev);
2596 	rc = netdev_start_xmit(skb, dev, txq, more);
2597 	trace_net_dev_xmit(skb, rc, dev, len);
2598 
2599 	return rc;
2600 }
2601 
2602 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2603 				    struct netdev_queue *txq, int *ret)
2604 {
2605 	struct sk_buff *skb = first;
2606 	int rc = NETDEV_TX_OK;
2607 
2608 	while (skb) {
2609 		struct sk_buff *next = skb->next;
2610 
2611 		skb->next = NULL;
2612 		rc = xmit_one(skb, dev, txq, next != NULL);
2613 		if (unlikely(!dev_xmit_complete(rc))) {
2614 			skb->next = next;
2615 			goto out;
2616 		}
2617 
2618 		skb = next;
2619 		if (netif_xmit_stopped(txq) && skb) {
2620 			rc = NETDEV_TX_BUSY;
2621 			break;
2622 		}
2623 	}
2624 
2625 out:
2626 	*ret = rc;
2627 	return skb;
2628 }
2629 
2630 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2631 					  netdev_features_t features)
2632 {
2633 	if (vlan_tx_tag_present(skb) &&
2634 	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2635 		skb = __vlan_hwaccel_push_inside(skb);
2636 	return skb;
2637 }
2638 
2639 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2640 {
2641 	netdev_features_t features;
2642 
2643 	if (skb->next)
2644 		return skb;
2645 
2646 	features = netif_skb_features(skb);
2647 	skb = validate_xmit_vlan(skb, features);
2648 	if (unlikely(!skb))
2649 		goto out_null;
2650 
2651 	if (netif_needs_gso(dev, skb, features)) {
2652 		struct sk_buff *segs;
2653 
2654 		segs = skb_gso_segment(skb, features);
2655 		if (IS_ERR(segs)) {
2656 			goto out_kfree_skb;
2657 		} else if (segs) {
2658 			consume_skb(skb);
2659 			skb = segs;
2660 		}
2661 	} else {
2662 		if (skb_needs_linearize(skb, features) &&
2663 		    __skb_linearize(skb))
2664 			goto out_kfree_skb;
2665 
2666 		/* If packet is not checksummed and device does not
2667 		 * support checksumming for this protocol, complete
2668 		 * checksumming here.
2669 		 */
2670 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2671 			if (skb->encapsulation)
2672 				skb_set_inner_transport_header(skb,
2673 							       skb_checksum_start_offset(skb));
2674 			else
2675 				skb_set_transport_header(skb,
2676 							 skb_checksum_start_offset(skb));
2677 			if (!(features & NETIF_F_ALL_CSUM) &&
2678 			    skb_checksum_help(skb))
2679 				goto out_kfree_skb;
2680 		}
2681 	}
2682 
2683 	return skb;
2684 
2685 out_kfree_skb:
2686 	kfree_skb(skb);
2687 out_null:
2688 	return NULL;
2689 }
2690 
2691 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2692 {
2693 	struct sk_buff *next, *head = NULL, *tail;
2694 
2695 	for (; skb != NULL; skb = next) {
2696 		next = skb->next;
2697 		skb->next = NULL;
2698 
2699 		/* in case the skb won't be segmented, point it to itself */
2700 		skb->prev = skb;
2701 
2702 		skb = validate_xmit_skb(skb, dev);
2703 		if (!skb)
2704 			continue;
2705 
2706 		if (!head)
2707 			head = skb;
2708 		else
2709 			tail->next = skb;
2710 		/* If skb was segmented, skb->prev points to
2711 		 * the last segment. If not, it still contains skb.
2712 		 */
2713 		tail = skb->prev;
2714 	}
2715 	return head;
2716 }
2717 
2718 static void qdisc_pkt_len_init(struct sk_buff *skb)
2719 {
2720 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2721 
2722 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2723 
2724 	/* To get a more precise estimation of bytes sent on the wire,
2725 	 * we add to pkt_len the header size of all segments
2726 	 */
2727 	if (shinfo->gso_size)  {
2728 		unsigned int hdr_len;
2729 		u16 gso_segs = shinfo->gso_segs;
2730 
2731 		/* mac layer + network layer */
2732 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2733 
2734 		/* + transport layer */
2735 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2736 			hdr_len += tcp_hdrlen(skb);
2737 		else
2738 			hdr_len += sizeof(struct udphdr);
2739 
2740 		if (shinfo->gso_type & SKB_GSO_DODGY)
2741 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2742 						shinfo->gso_size);
2743 
2744 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2745 	}
2746 }
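
/*
 * Worked example for qdisc_pkt_len_init() above (added for exposition):
 * a TSO skb carrying 3000 bytes of TCP payload with gso_size 1000 (so
 * gso_segs == 3) and 54 bytes of Ethernet + IPv4 + TCP headers has
 * skb->len == 3054, but the wire will carry three copies of the
 * headers, so pkt_len is adjusted to 3054 + (3 - 1) * 54 == 3162,
 * matching the 3 * (1000 + 54) bytes actually sent.
 */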
2747 
2748 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2749 				 struct net_device *dev,
2750 				 struct netdev_queue *txq)
2751 {
2752 	spinlock_t *root_lock = qdisc_lock(q);
2753 	bool contended;
2754 	int rc;
2755 
2756 	qdisc_pkt_len_init(skb);
2757 	qdisc_calculate_pkt_len(skb, q);
2758 	/*
2759 	 * Heuristic to force contended enqueues to serialize on a
2760 	 * separate lock before trying to get qdisc main lock.
2761 	 * This permits __QDISC___STATE_RUNNING owner to get the lock more
2762 	 * often and dequeue packets faster.
2763 	 */
2764 	contended = qdisc_is_running(q);
2765 	if (unlikely(contended))
2766 		spin_lock(&q->busylock);
2767 
2768 	spin_lock(root_lock);
2769 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2770 		kfree_skb(skb);
2771 		rc = NET_XMIT_DROP;
2772 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2773 		   qdisc_run_begin(q)) {
2774 		/*
2775 		 * This is a work-conserving queue; there are no old skbs
2776 		 * waiting to be sent out; and the qdisc is not running -
2777 		 * xmit the skb directly.
2778 		 */
2779 
2780 		qdisc_bstats_update(q, skb);
2781 
2782 		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2783 			if (unlikely(contended)) {
2784 				spin_unlock(&q->busylock);
2785 				contended = false;
2786 			}
2787 			__qdisc_run(q);
2788 		} else
2789 			qdisc_run_end(q);
2790 
2791 		rc = NET_XMIT_SUCCESS;
2792 	} else {
2793 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2794 		if (qdisc_run_begin(q)) {
2795 			if (unlikely(contended)) {
2796 				spin_unlock(&q->busylock);
2797 				contended = false;
2798 			}
2799 			__qdisc_run(q);
2800 		}
2801 	}
2802 	spin_unlock(root_lock);
2803 	if (unlikely(contended))
2804 		spin_unlock(&q->busylock);
2805 	return rc;
2806 }
2807 
2808 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2809 static void skb_update_prio(struct sk_buff *skb)
2810 {
2811 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2812 
2813 	if (!skb->priority && skb->sk && map) {
2814 		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2815 
2816 		if (prioidx < map->priomap_len)
2817 			skb->priority = map->priomap[prioidx];
2818 	}
2819 }
2820 #else
2821 #define skb_update_prio(skb)
2822 #endif
2823 
2824 static DEFINE_PER_CPU(int, xmit_recursion);
2825 #define RECURSION_LIMIT 10
2826 
2827 /**
2828  *	dev_loopback_xmit - loop back @skb
2829  *	@skb: buffer to transmit
2830  */
2831 int dev_loopback_xmit(struct sk_buff *skb)
2832 {
2833 	skb_reset_mac_header(skb);
2834 	__skb_pull(skb, skb_network_offset(skb));
2835 	skb->pkt_type = PACKET_LOOPBACK;
2836 	skb->ip_summed = CHECKSUM_UNNECESSARY;
2837 	WARN_ON(!skb_dst(skb));
2838 	skb_dst_force(skb);
2839 	netif_rx_ni(skb);
2840 	return 0;
2841 }
2842 EXPORT_SYMBOL(dev_loopback_xmit);
2843 
2844 /**
2845  *	__dev_queue_xmit - transmit a buffer
2846  *	@skb: buffer to transmit
2847  *	@accel_priv: private data used for L2 forwarding offload
2848  *
2849  *	Queue a buffer for transmission to a network device. The caller must
2850  *	have set the device and priority and built the buffer before calling
2851  *	this function. The function can be called from an interrupt.
2852  *
2853  *	A negative errno code is returned on a failure. A success does not
2854  *	guarantee the frame will be transmitted as it may be dropped due
2855  *	to congestion or traffic shaping.
2856  *
2857  * -----------------------------------------------------------------------------------
2858  *      I notice this method can also return errors from the queue disciplines,
2859  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2860  *      be positive.
2861  *
2862  *      Regardless of the return value, the skb is consumed, so it is currently
2863  *      difficult to retry a send to this method.  (You can bump the ref count
2864  *      before sending to hold a reference for retry if you are careful.)
2865  *
2866  *      When calling this method, interrupts MUST be enabled.  This is because
2867  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2868  *          --BLG
2869  */
2870 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2871 {
2872 	struct net_device *dev = skb->dev;
2873 	struct netdev_queue *txq;
2874 	struct Qdisc *q;
2875 	int rc = -ENOMEM;
2876 
2877 	skb_reset_mac_header(skb);
2878 
2879 	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2880 		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2881 
2882 	/* Disable soft irqs for various locks below. Also
2883 	 * stops preemption for RCU.
2884 	 */
2885 	rcu_read_lock_bh();
2886 
2887 	skb_update_prio(skb);
2888 
2889 	/* If the device/qdisc doesn't need skb->dst, release it right now
2890 	 * while it's hot in this cpu's cache.
2891 	 */
2892 	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2893 		skb_dst_drop(skb);
2894 	else
2895 		skb_dst_force(skb);
2896 
2897 	txq = netdev_pick_tx(dev, skb, accel_priv);
2898 	q = rcu_dereference_bh(txq->qdisc);
2899 
2900 #ifdef CONFIG_NET_CLS_ACT
2901 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2902 #endif
2903 	trace_net_dev_queue(skb);
2904 	if (q->enqueue) {
2905 		rc = __dev_xmit_skb(skb, q, dev, txq);
2906 		goto out;
2907 	}
2908 
2909 	/* The device has no queue. Common case for software devices:
2910 	   loopback, all sorts of tunnels...
2911 
2912 	   Really, it is unlikely that netif_tx_lock protection is necessary
2913 	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2914 	   counters.)
2915 	   However, it is possible that they rely on the protection
2916 	   we take here.
2917 
2918 	   Check this and take the lock; it is not prone to deadlocks.
2919 	   Or just take the noqueue qdisc path, which is even simpler 8)
2920 	 */
2921 	if (dev->flags & IFF_UP) {
2922 		int cpu = smp_processor_id(); /* ok because BHs are off */
2923 
2924 		if (txq->xmit_lock_owner != cpu) {
2925 
2926 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2927 				goto recursion_alert;
2928 
2929 			skb = validate_xmit_skb(skb, dev);
2930 			if (!skb)
2931 				goto drop;
2932 
2933 			HARD_TX_LOCK(dev, txq, cpu);
2934 
2935 			if (!netif_xmit_stopped(txq)) {
2936 				__this_cpu_inc(xmit_recursion);
2937 				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2938 				__this_cpu_dec(xmit_recursion);
2939 				if (dev_xmit_complete(rc)) {
2940 					HARD_TX_UNLOCK(dev, txq);
2941 					goto out;
2942 				}
2943 			}
2944 			HARD_TX_UNLOCK(dev, txq);
2945 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2946 					     dev->name);
2947 		} else {
2948 			/* Recursion is detected! It is possible,
2949 			 * unfortunately
2950 			 */
2951 recursion_alert:
2952 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2953 					     dev->name);
2954 		}
2955 	}
2956 
2957 	rc = -ENETDOWN;
2958 drop:
2959 	rcu_read_unlock_bh();
2960 
2961 	atomic_long_inc(&dev->tx_dropped);
2962 	kfree_skb_list(skb);
2963 	return rc;
2964 out:
2965 	rcu_read_unlock_bh();
2966 	return rc;
2967 }
2968 
2969 int dev_queue_xmit(struct sk_buff *skb)
2970 {
2971 	return __dev_queue_xmit(skb, NULL);
2972 }
2973 EXPORT_SYMBOL(dev_queue_xmit);
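
/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): a caller building a raw Ethernet frame and handing it to
 * dev_queue_xmit(), following the contract described above (the skb is
 * always consumed; errors may be negative errno or positive NET_XMIT_*
 * values).  ETH_P_FOO and foo_send_frame() are hypothetical.
 */
#if 0
static int foo_send_frame(struct net_device *dev, const u8 *dst,
			  const void *payload, size_t len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);
	memcpy(skb_put(skb, len), payload, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_FOO);	/* hypothetical ethertype */

	if (dev_hard_header(skb, dev, ETH_P_FOO, dst, NULL, skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return dev_queue_xmit(skb);
}
#endif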
2974 
2975 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2976 {
2977 	return __dev_queue_xmit(skb, accel_priv);
2978 }
2979 EXPORT_SYMBOL(dev_queue_xmit_accel);
2980 
2981 
2982 /*=======================================================================
2983 			Receiver routines
2984   =======================================================================*/
2985 
2986 int netdev_max_backlog __read_mostly = 1000;
2987 EXPORT_SYMBOL(netdev_max_backlog);
2988 
2989 int netdev_tstamp_prequeue __read_mostly = 1;
2990 int netdev_budget __read_mostly = 300;
2991 int weight_p __read_mostly = 64;            /* old backlog weight */
2992 
2993 /* Called with irq disabled */
2994 static inline void ____napi_schedule(struct softnet_data *sd,
2995 				     struct napi_struct *napi)
2996 {
2997 	list_add_tail(&napi->poll_list, &sd->poll_list);
2998 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2999 }
3000 
3001 #ifdef CONFIG_RPS
3002 
3003 /* One global table that all flow-based protocols share. */
3004 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3005 EXPORT_SYMBOL(rps_sock_flow_table);
3006 
3007 struct static_key rps_needed __read_mostly;
3008 
3009 static struct rps_dev_flow *
3010 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3011 	    struct rps_dev_flow *rflow, u16 next_cpu)
3012 {
3013 	if (next_cpu != RPS_NO_CPU) {
3014 #ifdef CONFIG_RFS_ACCEL
3015 		struct netdev_rx_queue *rxqueue;
3016 		struct rps_dev_flow_table *flow_table;
3017 		struct rps_dev_flow *old_rflow;
3018 		u32 flow_id;
3019 		u16 rxq_index;
3020 		int rc;
3021 
3022 		/* Should we steer this flow to a different hardware queue? */
3023 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3024 		    !(dev->features & NETIF_F_NTUPLE))
3025 			goto out;
3026 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3027 		if (rxq_index == skb_get_rx_queue(skb))
3028 			goto out;
3029 
3030 		rxqueue = dev->_rx + rxq_index;
3031 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3032 		if (!flow_table)
3033 			goto out;
3034 		flow_id = skb_get_hash(skb) & flow_table->mask;
3035 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3036 							rxq_index, flow_id);
3037 		if (rc < 0)
3038 			goto out;
3039 		old_rflow = rflow;
3040 		rflow = &flow_table->flows[flow_id];
3041 		rflow->filter = rc;
3042 		if (old_rflow->filter == rflow->filter)
3043 			old_rflow->filter = RPS_NO_FILTER;
3044 	out:
3045 #endif
3046 		rflow->last_qtail =
3047 			per_cpu(softnet_data, next_cpu).input_queue_head;
3048 	}
3049 
3050 	rflow->cpu = next_cpu;
3051 	return rflow;
3052 }
3053 
3054 /*
3055  * get_rps_cpu is called from netif_receive_skb and returns the target
3056  * CPU from the RPS map of the receiving queue for a given skb.
3057  * rcu_read_lock must be held on entry.
3058  */
3059 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3060 		       struct rps_dev_flow **rflowp)
3061 {
3062 	struct netdev_rx_queue *rxqueue;
3063 	struct rps_map *map;
3064 	struct rps_dev_flow_table *flow_table;
3065 	struct rps_sock_flow_table *sock_flow_table;
3066 	int cpu = -1;
3067 	u16 tcpu;
3068 	u32 hash;
3069 
3070 	if (skb_rx_queue_recorded(skb)) {
3071 		u16 index = skb_get_rx_queue(skb);
3072 		if (unlikely(index >= dev->real_num_rx_queues)) {
3073 			WARN_ONCE(dev->real_num_rx_queues > 1,
3074 				  "%s received packet on queue %u, but number "
3075 				  "of RX queues is %u\n",
3076 				  dev->name, index, dev->real_num_rx_queues);
3077 			goto done;
3078 		}
3079 		rxqueue = dev->_rx + index;
3080 	} else
3081 		rxqueue = dev->_rx;
3082 
3083 	map = rcu_dereference(rxqueue->rps_map);
3084 	if (map) {
3085 		if (map->len == 1 &&
3086 		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
3087 			tcpu = map->cpus[0];
3088 			if (cpu_online(tcpu))
3089 				cpu = tcpu;
3090 			goto done;
3091 		}
3092 	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3093 		goto done;
3094 	}
3095 
3096 	skb_reset_network_header(skb);
3097 	hash = skb_get_hash(skb);
3098 	if (!hash)
3099 		goto done;
3100 
3101 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3102 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3103 	if (flow_table && sock_flow_table) {
3104 		u16 next_cpu;
3105 		struct rps_dev_flow *rflow;
3106 
3107 		rflow = &flow_table->flows[hash & flow_table->mask];
3108 		tcpu = rflow->cpu;
3109 
3110 		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3111 
3112 		/*
3113 		 * If the desired CPU (where last recvmsg was done) is
3114 		 * different from current CPU (one in the rx-queue flow
3115 		 * table entry), switch if one of the following holds:
3116 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
3117 		 *   - Current CPU is offline.
3118 		 *   - The current CPU's queue tail has advanced beyond the
3119 		 *     last packet that was enqueued using this table entry.
3120 		 *     This guarantees that all previous packets for the flow
3121 		 *     have been dequeued, thus preserving in order delivery.
3122 		 */
3123 		if (unlikely(tcpu != next_cpu) &&
3124 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3125 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3126 		      rflow->last_qtail)) >= 0)) {
3127 			tcpu = next_cpu;
3128 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3129 		}
3130 
3131 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3132 			*rflowp = rflow;
3133 			cpu = tcpu;
3134 			goto done;
3135 		}
3136 	}
3137 
3138 	if (map) {
3139 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3140 		if (cpu_online(tcpu)) {
3141 			cpu = tcpu;
3142 			goto done;
3143 		}
3144 	}
3145 
3146 done:
3147 	return cpu;
3148 }
3149 
3150 #ifdef CONFIG_RFS_ACCEL
3151 
3152 /**
3153  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3154  * @dev: Device on which the filter was set
3155  * @rxq_index: RX queue index
3156  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3157  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3158  *
3159  * Drivers that implement ndo_rx_flow_steer() should periodically call
3160  * this function for each installed filter and remove the filters for
3161  * which it returns %true.
3162  */
3163 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3164 			 u32 flow_id, u16 filter_id)
3165 {
3166 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3167 	struct rps_dev_flow_table *flow_table;
3168 	struct rps_dev_flow *rflow;
3169 	bool expire = true;
3170 	int cpu;
3171 
3172 	rcu_read_lock();
3173 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3174 	if (flow_table && flow_id <= flow_table->mask) {
3175 		rflow = &flow_table->flows[flow_id];
3176 		cpu = ACCESS_ONCE(rflow->cpu);
3177 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3178 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3179 			   rflow->last_qtail) <
3180 		     (int)(10 * flow_table->mask)))
3181 			expire = false;
3182 	}
3183 	rcu_read_unlock();
3184 	return expire;
3185 }
3186 EXPORT_SYMBOL(rps_may_expire_flow);
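
/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): a periodic scan that a driver implementing
 * ndo_rx_flow_steer() might run to expire stale hardware filters, as
 * suggested above.  struct foo_priv, its filter table and
 * foo_hw_remove_filter() are hypothetical.
 */
#if 0
static void foo_expire_rfs_filters(struct net_device *dev)
{
	struct foo_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < priv->num_rfs_filters; i++) {
		struct foo_filter *f = &priv->rfs_filters[i];

		if (!f->in_use)
			continue;
		if (rps_may_expire_flow(dev, f->rxq_index, f->flow_id, i)) {
			foo_hw_remove_filter(dev, f);
			f->in_use = false;
		}
	}
}
#endif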
3187 
3188 #endif /* CONFIG_RFS_ACCEL */
3189 
3190 /* Called from hardirq (IPI) context */
3191 static void rps_trigger_softirq(void *data)
3192 {
3193 	struct softnet_data *sd = data;
3194 
3195 	____napi_schedule(sd, &sd->backlog);
3196 	sd->received_rps++;
3197 }
3198 
3199 #endif /* CONFIG_RPS */
3200 
3201 /*
3202  * Check if this softnet_data structure belongs to another cpu.
3203  * If yes, queue it to our IPI list and return 1;
3204  * if no, return 0.
3205  */
3206 static int rps_ipi_queued(struct softnet_data *sd)
3207 {
3208 #ifdef CONFIG_RPS
3209 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3210 
3211 	if (sd != mysd) {
3212 		sd->rps_ipi_next = mysd->rps_ipi_list;
3213 		mysd->rps_ipi_list = sd;
3214 
3215 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3216 		return 1;
3217 	}
3218 #endif /* CONFIG_RPS */
3219 	return 0;
3220 }
3221 
3222 #ifdef CONFIG_NET_FLOW_LIMIT
3223 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3224 #endif
3225 
3226 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3227 {
3228 #ifdef CONFIG_NET_FLOW_LIMIT
3229 	struct sd_flow_limit *fl;
3230 	struct softnet_data *sd;
3231 	unsigned int old_flow, new_flow;
3232 
3233 	if (qlen < (netdev_max_backlog >> 1))
3234 		return false;
3235 
3236 	sd = this_cpu_ptr(&softnet_data);
3237 
3238 	rcu_read_lock();
3239 	fl = rcu_dereference(sd->flow_limit);
3240 	if (fl) {
3241 		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3242 		old_flow = fl->history[fl->history_head];
3243 		fl->history[fl->history_head] = new_flow;
3244 
3245 		fl->history_head++;
3246 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3247 
3248 		if (likely(fl->buckets[old_flow]))
3249 			fl->buckets[old_flow]--;
3250 
3251 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3252 			fl->count++;
3253 			rcu_read_unlock();
3254 			return true;
3255 		}
3256 	}
3257 	rcu_read_unlock();
3258 #endif
3259 	return false;
3260 }
3261 
3262 /*
3263  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3264  * queue (may be a remote CPU queue).
3265  */
3266 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3267 			      unsigned int *qtail)
3268 {
3269 	struct softnet_data *sd;
3270 	unsigned long flags;
3271 	unsigned int qlen;
3272 
3273 	sd = &per_cpu(softnet_data, cpu);
3274 
3275 	local_irq_save(flags);
3276 
3277 	rps_lock(sd);
3278 	qlen = skb_queue_len(&sd->input_pkt_queue);
3279 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3280 		if (qlen) {
3281 enqueue:
3282 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3283 			input_queue_tail_incr_save(sd, qtail);
3284 			rps_unlock(sd);
3285 			local_irq_restore(flags);
3286 			return NET_RX_SUCCESS;
3287 		}
3288 
3289 		/* Schedule NAPI for the backlog device.
3290 		 * We can use a non-atomic operation since we own the queue lock.
3291 		 */
3292 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3293 			if (!rps_ipi_queued(sd))
3294 				____napi_schedule(sd, &sd->backlog);
3295 		}
3296 		goto enqueue;
3297 	}
3298 
3299 	sd->dropped++;
3300 	rps_unlock(sd);
3301 
3302 	local_irq_restore(flags);
3303 
3304 	atomic_long_inc(&skb->dev->rx_dropped);
3305 	kfree_skb(skb);
3306 	return NET_RX_DROP;
3307 }
3308 
3309 static int netif_rx_internal(struct sk_buff *skb)
3310 {
3311 	int ret;
3312 
3313 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3314 
3315 	trace_netif_rx(skb);
3316 #ifdef CONFIG_RPS
3317 	if (static_key_false(&rps_needed)) {
3318 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3319 		int cpu;
3320 
3321 		preempt_disable();
3322 		rcu_read_lock();
3323 
3324 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3325 		if (cpu < 0)
3326 			cpu = smp_processor_id();
3327 
3328 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3329 
3330 		rcu_read_unlock();
3331 		preempt_enable();
3332 	} else
3333 #endif
3334 	{
3335 		unsigned int qtail;
3336 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3337 		put_cpu();
3338 	}
3339 	return ret;
3340 }
3341 
3342 /**
3343  *	netif_rx	-	post buffer to the network code
3344  *	@skb: buffer to post
3345  *
3346  *	This function receives a packet from a device driver and queues it for
3347  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3348  *	may be dropped during processing for congestion control or by the
3349  *	protocol layers.
3350  *
3351  *	return values:
3352  *	NET_RX_SUCCESS	(no congestion)
3353  *	NET_RX_DROP     (packet was dropped)
3354  *
3355  */
3356 
3357 int netif_rx(struct sk_buff *skb)
3358 {
3359 	trace_netif_rx_entry(skb);
3360 
3361 	return netif_rx_internal(skb);
3362 }
3363 EXPORT_SYMBOL(netif_rx);
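
/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): the classic non-NAPI receive path of a simple driver, which
 * builds an skb in its interrupt handler and posts it with netif_rx().
 * foo_read_frame_len() and foo_read_frame() are hypothetical helpers
 * that pull the frame out of the hardware.
 */
#if 0
static void foo_rx_interrupt(struct net_device *dev)
{
	unsigned int len = foo_read_frame_len(dev);
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	foo_read_frame(dev, skb_put(skb, len));
	skb->protocol = eth_type_trans(skb, dev);

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
	netif_rx(skb);
}
#endif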
3364 
3365 int netif_rx_ni(struct sk_buff *skb)
3366 {
3367 	int err;
3368 
3369 	trace_netif_rx_ni_entry(skb);
3370 
3371 	preempt_disable();
3372 	err = netif_rx_internal(skb);
3373 	if (local_softirq_pending())
3374 		do_softirq();
3375 	preempt_enable();
3376 
3377 	return err;
3378 }
3379 EXPORT_SYMBOL(netif_rx_ni);
3380 
3381 static void net_tx_action(struct softirq_action *h)
3382 {
3383 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3384 
3385 	if (sd->completion_queue) {
3386 		struct sk_buff *clist;
3387 
3388 		local_irq_disable();
3389 		clist = sd->completion_queue;
3390 		sd->completion_queue = NULL;
3391 		local_irq_enable();
3392 
3393 		while (clist) {
3394 			struct sk_buff *skb = clist;
3395 			clist = clist->next;
3396 
3397 			WARN_ON(atomic_read(&skb->users));
3398 			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3399 				trace_consume_skb(skb);
3400 			else
3401 				trace_kfree_skb(skb, net_tx_action);
3402 			__kfree_skb(skb);
3403 		}
3404 	}
3405 
3406 	if (sd->output_queue) {
3407 		struct Qdisc *head;
3408 
3409 		local_irq_disable();
3410 		head = sd->output_queue;
3411 		sd->output_queue = NULL;
3412 		sd->output_queue_tailp = &sd->output_queue;
3413 		local_irq_enable();
3414 
3415 		while (head) {
3416 			struct Qdisc *q = head;
3417 			spinlock_t *root_lock;
3418 
3419 			head = head->next_sched;
3420 
3421 			root_lock = qdisc_lock(q);
3422 			if (spin_trylock(root_lock)) {
3423 				smp_mb__before_atomic();
3424 				clear_bit(__QDISC_STATE_SCHED,
3425 					  &q->state);
3426 				qdisc_run(q);
3427 				spin_unlock(root_lock);
3428 			} else {
3429 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3430 					      &q->state)) {
3431 					__netif_reschedule(q);
3432 				} else {
3433 					smp_mb__before_atomic();
3434 					clear_bit(__QDISC_STATE_SCHED,
3435 						  &q->state);
3436 				}
3437 			}
3438 		}
3439 	}
3440 }
3441 
3442 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3443     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3444 /* This hook is defined here for ATM LANE */
3445 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3446 			     unsigned char *addr) __read_mostly;
3447 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3448 #endif
3449 
3450 #ifdef CONFIG_NET_CLS_ACT
3451 /* TODO: Maybe we should just force sch_ingress to be compiled in
3452  * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
3453  * instructions (a compare and two extra stores) when the ingress
3454  * scheduler is not built in but CONFIG_NET_CLS_ACT is enabled.
3455  * NOTE: This doesn't stop any functionality; if you don't have
3456  * the ingress scheduler, you just can't add policies on ingress.
3457  *
3458  */
3459 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3460 {
3461 	struct net_device *dev = skb->dev;
3462 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3463 	int result = TC_ACT_OK;
3464 	struct Qdisc *q;
3465 
3466 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3467 		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3468 				     skb->skb_iif, dev->ifindex);
3469 		return TC_ACT_SHOT;
3470 	}
3471 
3472 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3473 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3474 
3475 	q = rcu_dereference(rxq->qdisc);
3476 	if (q != &noop_qdisc) {
3477 		spin_lock(qdisc_lock(q));
3478 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3479 			result = qdisc_enqueue_root(skb, q);
3480 		spin_unlock(qdisc_lock(q));
3481 	}
3482 
3483 	return result;
3484 }
3485 
3486 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3487 					 struct packet_type **pt_prev,
3488 					 int *ret, struct net_device *orig_dev)
3489 {
3490 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3491 
3492 	if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3493 		goto out;
3494 
3495 	if (*pt_prev) {
3496 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3497 		*pt_prev = NULL;
3498 	}
3499 
3500 	switch (ing_filter(skb, rxq)) {
3501 	case TC_ACT_SHOT:
3502 	case TC_ACT_STOLEN:
3503 		kfree_skb(skb);
3504 		return NULL;
3505 	}
3506 
3507 out:
3508 	skb->tc_verd = 0;
3509 	return skb;
3510 }
3511 #endif
3512 
3513 /**
3514  *	netdev_rx_handler_register - register receive handler
3515  *	@dev: device to register a handler for
3516  *	@rx_handler: receive handler to register
3517  *	@rx_handler_data: data pointer that is used by rx handler
3518  *
3519  *	Register a receive handler for a device. This handler will then be
3520  *	called from __netif_receive_skb. A negative errno code is returned
3521  *	on a failure.
3522  *
3523  *	The caller must hold the rtnl_mutex.
3524  *
3525  *	For a general description of rx_handler, see enum rx_handler_result.
3526  */
3527 int netdev_rx_handler_register(struct net_device *dev,
3528 			       rx_handler_func_t *rx_handler,
3529 			       void *rx_handler_data)
3530 {
3531 	ASSERT_RTNL();
3532 
3533 	if (dev->rx_handler)
3534 		return -EBUSY;
3535 
3536 	/* Note: rx_handler_data must be set before rx_handler */
3537 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3538 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3539 
3540 	return 0;
3541 }
3542 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
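
/*
 * Illustrative sketch (added for exposition, not part of the original
 * source): how a stacked device (in the style of bonding, macvlan or
 * the bridge) might claim frames arriving on a lower device.  The upper
 * device is stored as rx_handler_data; foo_handle_frame() and
 * foo_bind_lower() are hypothetical.
 */
#if 0
static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct net_device *upper = rcu_dereference(skb->dev->rx_handler_data);

	skb->dev = upper;
	*pskb = skb;
	return RX_HANDLER_ANOTHER;	/* re-run RX processing on the upper dev */
}

static int foo_bind_lower(struct net_device *upper, struct net_device *lower)
{
	ASSERT_RTNL();
	return netdev_rx_handler_register(lower, foo_handle_frame, upper);
}
#endif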
3543 
3544 /**
3545  *	netdev_rx_handler_unregister - unregister receive handler
3546  *	@dev: device to unregister a handler from
3547  *
3548  *	Unregister a receive handler from a device.
3549  *
3550  *	The caller must hold the rtnl_mutex.
3551  */
3552 void netdev_rx_handler_unregister(struct net_device *dev)
3553 {
3554 
3555 	ASSERT_RTNL();
3556 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3557 	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3558 	 * section is guaranteed to see a non-NULL rx_handler_data
3559 	 * as well.
3560 	 */
3561 	synchronize_net();
3562 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3563 }
3564 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3565 
3566 /*
3567  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3568  * the special handling of PFMEMALLOC skbs.
3569  */
3570 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3571 {
3572 	switch (skb->protocol) {
3573 	case htons(ETH_P_ARP):
3574 	case htons(ETH_P_IP):
3575 	case htons(ETH_P_IPV6):
3576 	case htons(ETH_P_8021Q):
3577 	case htons(ETH_P_8021AD):
3578 		return true;
3579 	default:
3580 		return false;
3581 	}
3582 }
3583 
3584 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3585 {
3586 	struct packet_type *ptype, *pt_prev;
3587 	rx_handler_func_t *rx_handler;
3588 	struct net_device *orig_dev;
3589 	struct net_device *null_or_dev;
3590 	bool deliver_exact = false;
3591 	int ret = NET_RX_DROP;
3592 	__be16 type;
3593 
3594 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3595 
3596 	trace_netif_receive_skb(skb);
3597 
3598 	orig_dev = skb->dev;
3599 
3600 	skb_reset_network_header(skb);
3601 	if (!skb_transport_header_was_set(skb))
3602 		skb_reset_transport_header(skb);
3603 	skb_reset_mac_len(skb);
3604 
3605 	pt_prev = NULL;
3606 
3607 	rcu_read_lock();
3608 
3609 another_round:
3610 	skb->skb_iif = skb->dev->ifindex;
3611 
3612 	__this_cpu_inc(softnet_data.processed);
3613 
3614 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3615 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3616 		skb = skb_vlan_untag(skb);
3617 		if (unlikely(!skb))
3618 			goto unlock;
3619 	}
3620 
3621 #ifdef CONFIG_NET_CLS_ACT
3622 	if (skb->tc_verd & TC_NCLS) {
3623 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3624 		goto ncls;
3625 	}
3626 #endif
3627 
3628 	if (pfmemalloc)
3629 		goto skip_taps;
3630 
3631 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3632 		if (!ptype->dev || ptype->dev == skb->dev) {
3633 			if (pt_prev)
3634 				ret = deliver_skb(skb, pt_prev, orig_dev);
3635 			pt_prev = ptype;
3636 		}
3637 	}
3638 
3639 skip_taps:
3640 #ifdef CONFIG_NET_CLS_ACT
3641 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3642 	if (!skb)
3643 		goto unlock;
3644 ncls:
3645 #endif
3646 
3647 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3648 		goto drop;
3649 
3650 	if (vlan_tx_tag_present(skb)) {
3651 		if (pt_prev) {
3652 			ret = deliver_skb(skb, pt_prev, orig_dev);
3653 			pt_prev = NULL;
3654 		}
3655 		if (vlan_do_receive(&skb))
3656 			goto another_round;
3657 		else if (unlikely(!skb))
3658 			goto unlock;
3659 	}
3660 
3661 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3662 	if (rx_handler) {
3663 		if (pt_prev) {
3664 			ret = deliver_skb(skb, pt_prev, orig_dev);
3665 			pt_prev = NULL;
3666 		}
3667 		switch (rx_handler(&skb)) {
3668 		case RX_HANDLER_CONSUMED:
3669 			ret = NET_RX_SUCCESS;
3670 			goto unlock;
3671 		case RX_HANDLER_ANOTHER:
3672 			goto another_round;
3673 		case RX_HANDLER_EXACT:
3674 			deliver_exact = true;
3675 		case RX_HANDLER_PASS:
3676 			break;
3677 		default:
3678 			BUG();
3679 		}
3680 	}
3681 
3682 	if (unlikely(vlan_tx_tag_present(skb))) {
3683 		if (vlan_tx_tag_get_id(skb))
3684 			skb->pkt_type = PACKET_OTHERHOST;
3685 		/* Note: we might in the future use prio bits
3686 		 * and set skb->priority like in vlan_do_receive().
3687 		 * For the time being, just ignore the Priority Code Point.
3688 		 */
3689 		skb->vlan_tci = 0;
3690 	}
3691 
3692 	/* deliver only exact match when indicated */
3693 	null_or_dev = deliver_exact ? skb->dev : NULL;
3694 
3695 	type = skb->protocol;
3696 	list_for_each_entry_rcu(ptype,
3697 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3698 		if (ptype->type == type &&
3699 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3700 		     ptype->dev == orig_dev)) {
3701 			if (pt_prev)
3702 				ret = deliver_skb(skb, pt_prev, orig_dev);
3703 			pt_prev = ptype;
3704 		}
3705 	}
3706 
3707 	if (pt_prev) {
3708 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3709 			goto drop;
3710 		else
3711 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3712 	} else {
3713 drop:
3714 		atomic_long_inc(&skb->dev->rx_dropped);
3715 		kfree_skb(skb);
3716 		/* Jamal, now you will not be able to escape explaining
3717 		 * to me how you were going to use this. :-)
3718 		 */
3719 		ret = NET_RX_DROP;
3720 	}
3721 
3722 unlock:
3723 	rcu_read_unlock();
3724 	return ret;
3725 }
3726 
3727 static int __netif_receive_skb(struct sk_buff *skb)
3728 {
3729 	int ret;
3730 
3731 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3732 		unsigned long pflags = current->flags;
3733 
3734 		/*
3735 		 * PFMEMALLOC skbs are special, they should
3736 		 * - be delivered to SOCK_MEMALLOC sockets only
3737 		 * - stay away from userspace
3738 		 * - have bounded memory usage
3739 		 *
3740 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3741 		 * context down to all allocation sites.
3742 		 */
3743 		current->flags |= PF_MEMALLOC;
3744 		ret = __netif_receive_skb_core(skb, true);
3745 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3746 	} else
3747 		ret = __netif_receive_skb_core(skb, false);
3748 
3749 	return ret;
3750 }
3751 
3752 static int netif_receive_skb_internal(struct sk_buff *skb)
3753 {
3754 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3755 
3756 	if (skb_defer_rx_timestamp(skb))
3757 		return NET_RX_SUCCESS;
3758 
3759 #ifdef CONFIG_RPS
3760 	if (static_key_false(&rps_needed)) {
3761 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3762 		int cpu, ret;
3763 
3764 		rcu_read_lock();
3765 
3766 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3767 
3768 		if (cpu >= 0) {
3769 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3770 			rcu_read_unlock();
3771 			return ret;
3772 		}
3773 		rcu_read_unlock();
3774 	}
3775 #endif
3776 	return __netif_receive_skb(skb);
3777 }
3778 
3779 /**
3780  *	netif_receive_skb - process receive buffer from network
3781  *	@skb: buffer to process
3782  *
3783  *	netif_receive_skb() is the main receive data processing function.
3784  *	It always succeeds. The buffer may be dropped during processing
3785  *	for congestion control or by the protocol layers.
3786  *
3787  *	This function may only be called from softirq context and interrupts
3788  *	should be enabled.
3789  *
3790  *	Return values (usually ignored):
3791  *	NET_RX_SUCCESS: no congestion
3792  *	NET_RX_DROP: packet was dropped
3793  */
3794 int netif_receive_skb(struct sk_buff *skb)
3795 {
3796 	trace_netif_receive_skb_entry(skb);
3797 
3798 	return netif_receive_skb_internal(skb);
3799 }
3800 EXPORT_SYMBOL(netif_receive_skb);
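
/* Illustrative sketch, not part of the original file: how a hypothetical
 * driver might hand a received frame to netif_receive_skb() from softirq
 * context (for instance from its NAPI poll routine when GRO is not used).
 * foo_rx_one() and the copy-based RX scheme are assumptions made for this
 * example only.
 *
 *	static void foo_rx_one(struct net_device *dev, void *data, int len)
 *	{
 *		struct sk_buff *skb;
 *
 *		skb = netdev_alloc_skb_ip_align(dev, len);
 *		if (!skb) {
 *			dev->stats.rx_dropped++;
 *			return;
 *		}
 *		memcpy(skb_put(skb, len), data, len);
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_receive_skb(skb);
 *	}
 */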
3801 
3802 /* Network device is going away, flush any packets still pending
3803  * Called with irqs disabled.
3804  */
3805 static void flush_backlog(void *arg)
3806 {
3807 	struct net_device *dev = arg;
3808 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3809 	struct sk_buff *skb, *tmp;
3810 
3811 	rps_lock(sd);
3812 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3813 		if (skb->dev == dev) {
3814 			__skb_unlink(skb, &sd->input_pkt_queue);
3815 			kfree_skb(skb);
3816 			input_queue_head_incr(sd);
3817 		}
3818 	}
3819 	rps_unlock(sd);
3820 
3821 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3822 		if (skb->dev == dev) {
3823 			__skb_unlink(skb, &sd->process_queue);
3824 			kfree_skb(skb);
3825 			input_queue_head_incr(sd);
3826 		}
3827 	}
3828 }
3829 
3830 static int napi_gro_complete(struct sk_buff *skb)
3831 {
3832 	struct packet_offload *ptype;
3833 	__be16 type = skb->protocol;
3834 	struct list_head *head = &offload_base;
3835 	int err = -ENOENT;
3836 
3837 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3838 
3839 	if (NAPI_GRO_CB(skb)->count == 1) {
3840 		skb_shinfo(skb)->gso_size = 0;
3841 		goto out;
3842 	}
3843 
3844 	rcu_read_lock();
3845 	list_for_each_entry_rcu(ptype, head, list) {
3846 		if (ptype->type != type || !ptype->callbacks.gro_complete)
3847 			continue;
3848 
3849 		err = ptype->callbacks.gro_complete(skb, 0);
3850 		break;
3851 	}
3852 	rcu_read_unlock();
3853 
3854 	if (err) {
3855 		WARN_ON(&ptype->list == head);
3856 		kfree_skb(skb);
3857 		return NET_RX_SUCCESS;
3858 	}
3859 
3860 out:
3861 	return netif_receive_skb_internal(skb);
3862 }
3863 
3864 /* napi->gro_list contains packets ordered by age, with the
3865  * youngest packets at the head of the list.
3866  * Complete skbs in reverse order to reduce latencies.
3867  */
3868 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3869 {
3870 	struct sk_buff *skb, *prev = NULL;
3871 
3872 	/* scan list and build reverse chain */
3873 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3874 		skb->prev = prev;
3875 		prev = skb;
3876 	}
3877 
3878 	for (skb = prev; skb; skb = prev) {
3879 		skb->next = NULL;
3880 
3881 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3882 			return;
3883 
3884 		prev = skb->prev;
3885 		napi_gro_complete(skb);
3886 		napi->gro_count--;
3887 	}
3888 
3889 	napi->gro_list = NULL;
3890 }
3891 EXPORT_SYMBOL(napi_gro_flush);
3892 
3893 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3894 {
3895 	struct sk_buff *p;
3896 	unsigned int maclen = skb->dev->hard_header_len;
3897 	u32 hash = skb_get_hash_raw(skb);
3898 
3899 	for (p = napi->gro_list; p; p = p->next) {
3900 		unsigned long diffs;
3901 
3902 		NAPI_GRO_CB(p)->flush = 0;
3903 
3904 		if (hash != skb_get_hash_raw(p)) {
3905 			NAPI_GRO_CB(p)->same_flow = 0;
3906 			continue;
3907 		}
3908 
3909 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3910 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3911 		if (maclen == ETH_HLEN)
3912 			diffs |= compare_ether_header(skb_mac_header(p),
3913 						      skb_mac_header(skb));
3914 		else if (!diffs)
3915 			diffs = memcmp(skb_mac_header(p),
3916 				       skb_mac_header(skb),
3917 				       maclen);
3918 		NAPI_GRO_CB(p)->same_flow = !diffs;
3919 	}
3920 }
3921 
3922 static void skb_gro_reset_offset(struct sk_buff *skb)
3923 {
3924 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3925 	const skb_frag_t *frag0 = &pinfo->frags[0];
3926 
3927 	NAPI_GRO_CB(skb)->data_offset = 0;
3928 	NAPI_GRO_CB(skb)->frag0 = NULL;
3929 	NAPI_GRO_CB(skb)->frag0_len = 0;
3930 
3931 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3932 	    pinfo->nr_frags &&
3933 	    !PageHighMem(skb_frag_page(frag0))) {
3934 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3935 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3936 	}
3937 }
3938 
3939 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3940 {
3941 	struct skb_shared_info *pinfo = skb_shinfo(skb);
3942 
3943 	BUG_ON(skb->end - skb->tail < grow);
3944 
3945 	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3946 
3947 	skb->data_len -= grow;
3948 	skb->tail += grow;
3949 
3950 	pinfo->frags[0].page_offset += grow;
3951 	skb_frag_size_sub(&pinfo->frags[0], grow);
3952 
3953 	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3954 		skb_frag_unref(skb, 0);
3955 		memmove(pinfo->frags, pinfo->frags + 1,
3956 			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
3957 	}
3958 }
3959 
3960 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3961 {
3962 	struct sk_buff **pp = NULL;
3963 	struct packet_offload *ptype;
3964 	__be16 type = skb->protocol;
3965 	struct list_head *head = &offload_base;
3966 	int same_flow;
3967 	enum gro_result ret;
3968 	int grow;
3969 
3970 	if (!(skb->dev->features & NETIF_F_GRO))
3971 		goto normal;
3972 
3973 	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
3974 		goto normal;
3975 
3976 	gro_list_prepare(napi, skb);
3977 
3978 	rcu_read_lock();
3979 	list_for_each_entry_rcu(ptype, head, list) {
3980 		if (ptype->type != type || !ptype->callbacks.gro_receive)
3981 			continue;
3982 
3983 		skb_set_network_header(skb, skb_gro_offset(skb));
3984 		skb_reset_mac_len(skb);
3985 		NAPI_GRO_CB(skb)->same_flow = 0;
3986 		NAPI_GRO_CB(skb)->flush = 0;
3987 		NAPI_GRO_CB(skb)->free = 0;
3988 		NAPI_GRO_CB(skb)->udp_mark = 0;
3989 
3990 		/* Setup for GRO checksum validation */
3991 		switch (skb->ip_summed) {
3992 		case CHECKSUM_COMPLETE:
3993 			NAPI_GRO_CB(skb)->csum = skb->csum;
3994 			NAPI_GRO_CB(skb)->csum_valid = 1;
3995 			NAPI_GRO_CB(skb)->csum_cnt = 0;
3996 			break;
3997 		case CHECKSUM_UNNECESSARY:
3998 			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
3999 			NAPI_GRO_CB(skb)->csum_valid = 0;
4000 			break;
4001 		default:
4002 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4003 			NAPI_GRO_CB(skb)->csum_valid = 0;
4004 		}
4005 
4006 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4007 		break;
4008 	}
4009 	rcu_read_unlock();
4010 
4011 	if (&ptype->list == head)
4012 		goto normal;
4013 
4014 	same_flow = NAPI_GRO_CB(skb)->same_flow;
4015 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4016 
4017 	if (pp) {
4018 		struct sk_buff *nskb = *pp;
4019 
4020 		*pp = nskb->next;
4021 		nskb->next = NULL;
4022 		napi_gro_complete(nskb);
4023 		napi->gro_count--;
4024 	}
4025 
4026 	if (same_flow)
4027 		goto ok;
4028 
4029 	if (NAPI_GRO_CB(skb)->flush)
4030 		goto normal;
4031 
4032 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4033 		struct sk_buff *nskb = napi->gro_list;
4034 
4035 		/* locate the end of the list to select the 'oldest' flow */
4036 		while (nskb->next) {
4037 			pp = &nskb->next;
4038 			nskb = *pp;
4039 		}
4040 		*pp = NULL;
4041 		nskb->next = NULL;
4042 		napi_gro_complete(nskb);
4043 	} else {
4044 		napi->gro_count++;
4045 	}
4046 	NAPI_GRO_CB(skb)->count = 1;
4047 	NAPI_GRO_CB(skb)->age = jiffies;
4048 	NAPI_GRO_CB(skb)->last = skb;
4049 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4050 	skb->next = napi->gro_list;
4051 	napi->gro_list = skb;
4052 	ret = GRO_HELD;
4053 
4054 pull:
4055 	grow = skb_gro_offset(skb) - skb_headlen(skb);
4056 	if (grow > 0)
4057 		gro_pull_from_frag0(skb, grow);
4058 ok:
4059 	return ret;
4060 
4061 normal:
4062 	ret = GRO_NORMAL;
4063 	goto pull;
4064 }
4065 
4066 struct packet_offload *gro_find_receive_by_type(__be16 type)
4067 {
4068 	struct list_head *offload_head = &offload_base;
4069 	struct packet_offload *ptype;
4070 
4071 	list_for_each_entry_rcu(ptype, offload_head, list) {
4072 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4073 			continue;
4074 		return ptype;
4075 	}
4076 	return NULL;
4077 }
4078 EXPORT_SYMBOL(gro_find_receive_by_type);
4079 
4080 struct packet_offload *gro_find_complete_by_type(__be16 type)
4081 {
4082 	struct list_head *offload_head = &offload_base;
4083 	struct packet_offload *ptype;
4084 
4085 	list_for_each_entry_rcu(ptype, offload_head, list) {
4086 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4087 			continue;
4088 		return ptype;
4089 	}
4090 	return NULL;
4091 }
4092 EXPORT_SYMBOL(gro_find_complete_by_type);
4093 
4094 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4095 {
4096 	switch (ret) {
4097 	case GRO_NORMAL:
4098 		if (netif_receive_skb_internal(skb))
4099 			ret = GRO_DROP;
4100 		break;
4101 
4102 	case GRO_DROP:
4103 		kfree_skb(skb);
4104 		break;
4105 
4106 	case GRO_MERGED_FREE:
4107 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4108 			kmem_cache_free(skbuff_head_cache, skb);
4109 		else
4110 			__kfree_skb(skb);
4111 		break;
4112 
4113 	case GRO_HELD:
4114 	case GRO_MERGED:
4115 		break;
4116 	}
4117 
4118 	return ret;
4119 }
4120 
4121 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4122 {
4123 	trace_napi_gro_receive_entry(skb);
4124 
4125 	skb_gro_reset_offset(skb);
4126 
4127 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4128 }
4129 EXPORT_SYMBOL(napi_gro_receive);
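
/* Illustrative sketch, not part of the original file: a hypothetical NAPI
 * driver passing freshly built skbs to GRO from its poll loop instead of
 * calling netif_receive_skb() directly. foo_priv, foo_build_rx_skb() and
 * the priv->netdev layout are assumptions for this example only.
 *
 *	static int foo_clean_rx(struct foo_priv *priv, int budget)
 *	{
 *		struct sk_buff *skb;
 *		int done = 0;
 *
 *		while (done < budget && (skb = foo_build_rx_skb(priv))) {
 *			skb->protocol = eth_type_trans(skb, priv->netdev);
 *			napi_gro_receive(&priv->napi, skb);
 *			done++;
 *		}
 *		return done;
 *	}
 */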
4130 
4131 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4132 {
4133 	if (unlikely(skb->pfmemalloc)) {
4134 		consume_skb(skb);
4135 		return;
4136 	}
4137 	__skb_pull(skb, skb_headlen(skb));
4138 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4139 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4140 	skb->vlan_tci = 0;
4141 	skb->dev = napi->dev;
4142 	skb->skb_iif = 0;
4143 	skb->encapsulation = 0;
4144 	skb_shinfo(skb)->gso_type = 0;
4145 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4146 
4147 	napi->skb = skb;
4148 }
4149 
4150 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4151 {
4152 	struct sk_buff *skb = napi->skb;
4153 
4154 	if (!skb) {
4155 		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4156 		napi->skb = skb;
4157 	}
4158 	return skb;
4159 }
4160 EXPORT_SYMBOL(napi_get_frags);
4161 
4162 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4163 				      struct sk_buff *skb,
4164 				      gro_result_t ret)
4165 {
4166 	switch (ret) {
4167 	case GRO_NORMAL:
4168 	case GRO_HELD:
4169 		__skb_push(skb, ETH_HLEN);
4170 		skb->protocol = eth_type_trans(skb, skb->dev);
4171 		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4172 			ret = GRO_DROP;
4173 		break;
4174 
4175 	case GRO_DROP:
4176 	case GRO_MERGED_FREE:
4177 		napi_reuse_skb(napi, skb);
4178 		break;
4179 
4180 	case GRO_MERGED:
4181 		break;
4182 	}
4183 
4184 	return ret;
4185 }
4186 
4187 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4188  * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4189  * we copy the ethernet header into skb->data to have a common layout.
4190  */
4191 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4192 {
4193 	struct sk_buff *skb = napi->skb;
4194 	const struct ethhdr *eth;
4195 	unsigned int hlen = sizeof(*eth);
4196 
4197 	napi->skb = NULL;
4198 
4199 	skb_reset_mac_header(skb);
4200 	skb_gro_reset_offset(skb);
4201 
4202 	eth = skb_gro_header_fast(skb, 0);
4203 	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4204 		eth = skb_gro_header_slow(skb, hlen, 0);
4205 		if (unlikely(!eth)) {
4206 			napi_reuse_skb(napi, skb);
4207 			return NULL;
4208 		}
4209 	} else {
4210 		gro_pull_from_frag0(skb, hlen);
4211 		NAPI_GRO_CB(skb)->frag0 += hlen;
4212 		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4213 	}
4214 	__skb_pull(skb, hlen);
4215 
4216 	/*
4217 	 * This works because the only protocols we care about don't require
4218 	 * special handling.
4219 	 * We'll fix it up properly in napi_frags_finish()
4220 	 */
4221 	skb->protocol = eth->h_proto;
4222 
4223 	return skb;
4224 }
4225 
4226 gro_result_t napi_gro_frags(struct napi_struct *napi)
4227 {
4228 	struct sk_buff *skb = napi_frags_skb(napi);
4229 
4230 	if (!skb)
4231 		return GRO_DROP;
4232 
4233 	trace_napi_gro_frags_entry(skb);
4234 
4235 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4236 }
4237 EXPORT_SYMBOL(napi_gro_frags);
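
/* Illustrative sketch, not part of the original file: the napi_get_frags() /
 * napi_gro_frags() pattern used by drivers that receive directly into pages
 * and never build a linear header themselves; napi_gro_frags() pulls the
 * ethernet header from frag0 on its own. The page/offset/len values and
 * foo_rx_truesize() are assumptions for this example only.
 *
 *	static void foo_rx_frag(struct napi_struct *napi, struct page *page,
 *				unsigned int off, unsigned int len)
 *	{
 *		struct sk_buff *skb = napi_get_frags(napi);
 *
 *		if (!skb) {
 *			put_page(page);
 *			return;
 *		}
 *		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, off,
 *				len, foo_rx_truesize(len));
 *		napi_gro_frags(napi);
 *	}
 */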
4238 
4239 /* Compute the checksum from gro_offset and return the folded value
4240  * after adding in any pseudo checksum.
4241  */
4242 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4243 {
4244 	__wsum wsum;
4245 	__sum16 sum;
4246 
4247 	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4248 
4249 	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4250 	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4251 	if (likely(!sum)) {
4252 		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4253 		    !skb->csum_complete_sw)
4254 			netdev_rx_csum_fault(skb->dev);
4255 	}
4256 
4257 	NAPI_GRO_CB(skb)->csum = wsum;
4258 	NAPI_GRO_CB(skb)->csum_valid = 1;
4259 
4260 	return sum;
4261 }
4262 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4263 
4264 /*
4265  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4266  * Note: called with local irqs disabled, but exits with local irqs enabled.
4267  */
4268 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4269 {
4270 #ifdef CONFIG_RPS
4271 	struct softnet_data *remsd = sd->rps_ipi_list;
4272 
4273 	if (remsd) {
4274 		sd->rps_ipi_list = NULL;
4275 
4276 		local_irq_enable();
4277 
4278 		/* Send pending IPI's to kick RPS processing on remote cpus. */
4279 		while (remsd) {
4280 			struct softnet_data *next = remsd->rps_ipi_next;
4281 
4282 			if (cpu_online(remsd->cpu))
4283 				smp_call_function_single_async(remsd->cpu,
4284 							   &remsd->csd);
4285 			remsd = next;
4286 		}
4287 	} else
4288 #endif
4289 		local_irq_enable();
4290 }
4291 
4292 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4293 {
4294 #ifdef CONFIG_RPS
4295 	return sd->rps_ipi_list != NULL;
4296 #else
4297 	return false;
4298 #endif
4299 }
4300 
4301 static int process_backlog(struct napi_struct *napi, int quota)
4302 {
4303 	int work = 0;
4304 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4305 
4306 	/* Check if we have pending IPIs; it is better to send them now
4307 	 * than to wait until net_rx_action() ends.
4308 	 */
4309 	if (sd_has_rps_ipi_waiting(sd)) {
4310 		local_irq_disable();
4311 		net_rps_action_and_irq_enable(sd);
4312 	}
4313 
4314 	napi->weight = weight_p;
4315 	local_irq_disable();
4316 	while (1) {
4317 		struct sk_buff *skb;
4318 
4319 		while ((skb = __skb_dequeue(&sd->process_queue))) {
4320 			local_irq_enable();
4321 			__netif_receive_skb(skb);
4322 			local_irq_disable();
4323 			input_queue_head_incr(sd);
4324 			if (++work >= quota) {
4325 				local_irq_enable();
4326 				return work;
4327 			}
4328 		}
4329 
4330 		rps_lock(sd);
4331 		if (skb_queue_empty(&sd->input_pkt_queue)) {
4332 			/*
4333 			 * Inline a custom version of __napi_complete().
4334 			 * Only the current CPU owns and manipulates this napi,
4335 			 * and NAPI_STATE_SCHED is the only possible flag set
4336 			 * on the backlog.
4337 			 * We can use a plain write instead of clear_bit(),
4338 			 * and we don't need an smp_mb() memory barrier.
4339 			 */
4340 			napi->state = 0;
4341 			rps_unlock(sd);
4342 
4343 			break;
4344 		}
4345 
4346 		skb_queue_splice_tail_init(&sd->input_pkt_queue,
4347 					   &sd->process_queue);
4348 		rps_unlock(sd);
4349 	}
4350 	local_irq_enable();
4351 
4352 	return work;
4353 }
4354 
4355 /**
4356  * __napi_schedule - schedule for receive
4357  * @n: entry to schedule
4358  *
4359  * The entry's receive function will be scheduled to run.
4360  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4361  */
4362 void __napi_schedule(struct napi_struct *n)
4363 {
4364 	unsigned long flags;
4365 
4366 	local_irq_save(flags);
4367 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4368 	local_irq_restore(flags);
4369 }
4370 EXPORT_SYMBOL(__napi_schedule);
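
/* Illustrative sketch, not part of the original file: a hypothetical
 * interrupt handler scheduling NAPI. foo_priv, foo_mask_rx_irq() and the
 * irq wiring are assumptions for this example; napi_schedule_prep() and
 * __napi_schedule() are the real interfaces. In a hardirq handler, where
 * hard irqs are already masked, __napi_schedule_irqoff() below would do.
 *
 *	static irqreturn_t foo_interrupt(int irq, void *dev_id)
 *	{
 *		struct foo_priv *priv = dev_id;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			foo_mask_rx_irq(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */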
4371 
4372 /**
4373  * __napi_schedule_irqoff - schedule for receive
4374  * @n: entry to schedule
4375  *
4376  * Variant of __napi_schedule() assuming hard irqs are masked
4377  */
4378 void __napi_schedule_irqoff(struct napi_struct *n)
4379 {
4380 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4381 }
4382 EXPORT_SYMBOL(__napi_schedule_irqoff);
4383 
4384 void __napi_complete(struct napi_struct *n)
4385 {
4386 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4387 
4388 	list_del_init(&n->poll_list);
4389 	smp_mb__before_atomic();
4390 	clear_bit(NAPI_STATE_SCHED, &n->state);
4391 }
4392 EXPORT_SYMBOL(__napi_complete);
4393 
4394 void napi_complete_done(struct napi_struct *n, int work_done)
4395 {
4396 	unsigned long flags;
4397 
4398 	/*
4399 	 * Don't let napi dequeue from the CPU poll list,
4400 	 * just in case it is running on a different CPU.
4401 	 */
4402 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4403 		return;
4404 
4405 	if (n->gro_list) {
4406 		unsigned long timeout = 0;
4407 
4408 		if (work_done)
4409 			timeout = n->dev->gro_flush_timeout;
4410 
4411 		if (timeout)
4412 			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4413 				      HRTIMER_MODE_REL_PINNED);
4414 		else
4415 			napi_gro_flush(n, false);
4416 	}
4417 	if (likely(list_empty(&n->poll_list))) {
4418 		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4419 	} else {
4420 		/* If n->poll_list is not empty, we need to mask irqs */
4421 		local_irq_save(flags);
4422 		__napi_complete(n);
4423 		local_irq_restore(flags);
4424 	}
4425 }
4426 EXPORT_SYMBOL(napi_complete_done);
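
/* Illustrative sketch, not part of the original file: a hypothetical poll
 * routine pairing the scheduling above with napi_complete_done(), so that
 * gro_flush_timeout (if set on the device) can defer the GRO flush.
 * foo_priv, foo_clean_rx() and foo_unmask_rx_irq() are assumptions for
 * this example only.
 *
 *	static int foo_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
 *		int done = foo_clean_rx(priv, budget);
 *
 *		if (done < budget) {
 *			napi_complete_done(napi, done);
 *			foo_unmask_rx_irq(priv);
 *		}
 *		return done;
 *	}
 */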
4427 
4428 /* must be called under rcu_read_lock(), as we dont take a reference */
4429 struct napi_struct *napi_by_id(unsigned int napi_id)
4430 {
4431 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4432 	struct napi_struct *napi;
4433 
4434 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4435 		if (napi->napi_id == napi_id)
4436 			return napi;
4437 
4438 	return NULL;
4439 }
4440 EXPORT_SYMBOL_GPL(napi_by_id);
4441 
4442 void napi_hash_add(struct napi_struct *napi)
4443 {
4444 	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4445 
4446 		spin_lock(&napi_hash_lock);
4447 
4448 		/* 0 is not a valid id, and we also skip an id that is
4449 		 * already taken; we expect both events to be extremely rare.
4450 		 */
4451 		napi->napi_id = 0;
4452 		while (!napi->napi_id) {
4453 			napi->napi_id = ++napi_gen_id;
4454 			if (napi_by_id(napi->napi_id))
4455 				napi->napi_id = 0;
4456 		}
4457 
4458 		hlist_add_head_rcu(&napi->napi_hash_node,
4459 			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4460 
4461 		spin_unlock(&napi_hash_lock);
4462 	}
4463 }
4464 EXPORT_SYMBOL_GPL(napi_hash_add);
4465 
4466 /* Warning: the caller is responsible for making sure an RCU grace period
4467  * has elapsed before freeing the memory containing @napi.
4468  */
4469 void napi_hash_del(struct napi_struct *napi)
4470 {
4471 	spin_lock(&napi_hash_lock);
4472 
4473 	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4474 		hlist_del_rcu(&napi->napi_hash_node);
4475 
4476 	spin_unlock(&napi_hash_lock);
4477 }
4478 EXPORT_SYMBOL_GPL(napi_hash_del);
4479 
4480 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4481 {
4482 	struct napi_struct *napi;
4483 
4484 	napi = container_of(timer, struct napi_struct, timer);
4485 	if (napi->gro_list)
4486 		napi_schedule(napi);
4487 
4488 	return HRTIMER_NORESTART;
4489 }
4490 
4491 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4492 		    int (*poll)(struct napi_struct *, int), int weight)
4493 {
4494 	INIT_LIST_HEAD(&napi->poll_list);
4495 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4496 	napi->timer.function = napi_watchdog;
4497 	napi->gro_count = 0;
4498 	napi->gro_list = NULL;
4499 	napi->skb = NULL;
4500 	napi->poll = poll;
4501 	if (weight > NAPI_POLL_WEIGHT)
4502 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4503 			    weight, dev->name);
4504 	napi->weight = weight;
4505 	list_add(&napi->dev_list, &dev->napi_list);
4506 	napi->dev = dev;
4507 #ifdef CONFIG_NETPOLL
4508 	spin_lock_init(&napi->poll_lock);
4509 	napi->poll_owner = -1;
4510 #endif
4511 	set_bit(NAPI_STATE_SCHED, &napi->state);
4512 }
4513 EXPORT_SYMBOL(netif_napi_add);
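
/* Illustrative sketch, not part of the original file: registering the NAPI
 * context at probe time. netdev, priv and foo_poll() are assumptions for
 * this example; NAPI_POLL_WEIGHT is the usual default weight.
 *
 *	netif_napi_add(netdev, &priv->napi, foo_poll, NAPI_POLL_WEIGHT);
 *	...
 *	napi_enable(&priv->napi);	(typically done in ndo_open)
 *
 * The matching teardown is napi_disable() followed by netif_napi_del()
 * before the net_device is freed.
 */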
4514 
4515 void napi_disable(struct napi_struct *n)
4516 {
4517 	might_sleep();
4518 	set_bit(NAPI_STATE_DISABLE, &n->state);
4519 
4520 	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4521 		msleep(1);
4522 
4523 	hrtimer_cancel(&n->timer);
4524 
4525 	clear_bit(NAPI_STATE_DISABLE, &n->state);
4526 }
4527 EXPORT_SYMBOL(napi_disable);
4528 
4529 void netif_napi_del(struct napi_struct *napi)
4530 {
4531 	list_del_init(&napi->dev_list);
4532 	napi_free_frags(napi);
4533 
4534 	kfree_skb_list(napi->gro_list);
4535 	napi->gro_list = NULL;
4536 	napi->gro_count = 0;
4537 }
4538 EXPORT_SYMBOL(netif_napi_del);
4539 
4540 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4541 {
4542 	void *have;
4543 	int work, weight;
4544 
4545 	list_del_init(&n->poll_list);
4546 
4547 	have = netpoll_poll_lock(n);
4548 
4549 	weight = n->weight;
4550 
4551 	/* This NAPI_STATE_SCHED test is for avoiding a race
4552 	 * with netpoll's poll_napi().  Only the entity which
4553 	 * obtains the lock and sees NAPI_STATE_SCHED set will
4554 	 * actually make the ->poll() call.  Therefore we avoid
4555 	 * accidentally calling ->poll() when NAPI is not scheduled.
4556 	 */
4557 	work = 0;
4558 	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4559 		work = n->poll(n, weight);
4560 		trace_napi_poll(n);
4561 	}
4562 
4563 	WARN_ON_ONCE(work > weight);
4564 
4565 	if (likely(work < weight))
4566 		goto out_unlock;
4567 
4568 	/* Drivers must not modify the NAPI state if they
4569 	 * consume the entire weight.  In such cases this code
4570 	 * still "owns" the NAPI instance and therefore can
4571 	 * move the instance around on the list at-will.
4572 	 */
4573 	if (unlikely(napi_disable_pending(n))) {
4574 		napi_complete(n);
4575 		goto out_unlock;
4576 	}
4577 
4578 	if (n->gro_list) {
4579 		/* Flush packets that are too old.
4580 		 * If HZ < 1000, flush all packets.
4581 		 */
4582 		napi_gro_flush(n, HZ >= 1000);
4583 	}
4584 
4585 	/* Some drivers may have called napi_schedule
4586 	 * prior to exhausting their budget.
4587 	 */
4588 	if (unlikely(!list_empty(&n->poll_list))) {
4589 		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4590 			     n->dev ? n->dev->name : "backlog");
4591 		goto out_unlock;
4592 	}
4593 
4594 	list_add_tail(&n->poll_list, repoll);
4595 
4596 out_unlock:
4597 	netpoll_poll_unlock(have);
4598 
4599 	return work;
4600 }
4601 
4602 static void net_rx_action(struct softirq_action *h)
4603 {
4604 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4605 	unsigned long time_limit = jiffies + 2;
4606 	int budget = netdev_budget;
4607 	LIST_HEAD(list);
4608 	LIST_HEAD(repoll);
4609 
4610 	local_irq_disable();
4611 	list_splice_init(&sd->poll_list, &list);
4612 	local_irq_enable();
4613 
4614 	for (;;) {
4615 		struct napi_struct *n;
4616 
4617 		if (list_empty(&list)) {
4618 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4619 				return;
4620 			break;
4621 		}
4622 
4623 		n = list_first_entry(&list, struct napi_struct, poll_list);
4624 		budget -= napi_poll(n, &repoll);
4625 
4626 		/* If the softirq window is exhausted then punt.
4627 		 * Allow this to run for 2 jiffies, which gives
4628 		 * an average latency of 1.5/HZ.
4629 		 */
4630 		if (unlikely(budget <= 0 ||
4631 			     time_after_eq(jiffies, time_limit))) {
4632 			sd->time_squeeze++;
4633 			break;
4634 		}
4635 	}
4636 
4637 	local_irq_disable();
4638 
4639 	list_splice_tail_init(&sd->poll_list, &list);
4640 	list_splice_tail(&repoll, &list);
4641 	list_splice(&list, &sd->poll_list);
4642 	if (!list_empty(&sd->poll_list))
4643 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4644 
4645 	net_rps_action_and_irq_enable(sd);
4646 }
4647 
4648 struct netdev_adjacent {
4649 	struct net_device *dev;
4650 
4651 	/* upper master flag, there can only be one master device per list */
4652 	bool master;
4653 
4654 	/* counter for the number of times this device was added to us */
4655 	u16 ref_nr;
4656 
4657 	/* private field for the users */
4658 	void *private;
4659 
4660 	struct list_head list;
4661 	struct rcu_head rcu;
4662 };
4663 
4664 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4665 						 struct net_device *adj_dev,
4666 						 struct list_head *adj_list)
4667 {
4668 	struct netdev_adjacent *adj;
4669 
4670 	list_for_each_entry(adj, adj_list, list) {
4671 		if (adj->dev == adj_dev)
4672 			return adj;
4673 	}
4674 	return NULL;
4675 }
4676 
4677 /**
4678  * netdev_has_upper_dev - Check if device is linked to an upper device
4679  * @dev: device
4680  * @upper_dev: upper device to check
4681  *
4682  * Find out if a device is linked to the specified upper device and return true
4683  * in case it is. Note that this checks only the immediate upper device,
4684  * not the complete stack of devices. The caller must hold the RTNL lock.
4685  */
4686 bool netdev_has_upper_dev(struct net_device *dev,
4687 			  struct net_device *upper_dev)
4688 {
4689 	ASSERT_RTNL();
4690 
4691 	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4692 }
4693 EXPORT_SYMBOL(netdev_has_upper_dev);
4694 
4695 /**
4696  * netdev_has_any_upper_dev - Check if device is linked to some device
4697  * @dev: device
4698  *
4699  * Find out if a device is linked to an upper device and return true in case
4700  * it is. The caller must hold the RTNL lock.
4701  */
4702 static bool netdev_has_any_upper_dev(struct net_device *dev)
4703 {
4704 	ASSERT_RTNL();
4705 
4706 	return !list_empty(&dev->all_adj_list.upper);
4707 }
4708 
4709 /**
4710  * netdev_master_upper_dev_get - Get master upper device
4711  * @dev: device
4712  *
4713  * Find a master upper device and return pointer to it or NULL in case
4714  * it's not there. The caller must hold the RTNL lock.
4715  */
4716 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4717 {
4718 	struct netdev_adjacent *upper;
4719 
4720 	ASSERT_RTNL();
4721 
4722 	if (list_empty(&dev->adj_list.upper))
4723 		return NULL;
4724 
4725 	upper = list_first_entry(&dev->adj_list.upper,
4726 				 struct netdev_adjacent, list);
4727 	if (likely(upper->master))
4728 		return upper->dev;
4729 	return NULL;
4730 }
4731 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4732 
4733 void *netdev_adjacent_get_private(struct list_head *adj_list)
4734 {
4735 	struct netdev_adjacent *adj;
4736 
4737 	adj = list_entry(adj_list, struct netdev_adjacent, list);
4738 
4739 	return adj->private;
4740 }
4741 EXPORT_SYMBOL(netdev_adjacent_get_private);
4742 
4743 /**
4744  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4745  * @dev: device
4746  * @iter: list_head ** of the current position
4747  *
4748  * Gets the next device from the dev's upper list, starting from iter
4749  * position. The caller must hold RCU read lock.
4750  */
4751 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4752 						 struct list_head **iter)
4753 {
4754 	struct netdev_adjacent *upper;
4755 
4756 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4757 
4758 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4759 
4760 	if (&upper->list == &dev->adj_list.upper)
4761 		return NULL;
4762 
4763 	*iter = &upper->list;
4764 
4765 	return upper->dev;
4766 }
4767 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4768 
4769 /**
4770  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4771  * @dev: device
4772  * @iter: list_head ** of the current position
4773  *
4774  * Gets the next device from the dev's upper list, starting from iter
4775  * position. The caller must hold RCU read lock.
4776  */
4777 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4778 						     struct list_head **iter)
4779 {
4780 	struct netdev_adjacent *upper;
4781 
4782 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4783 
4784 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4785 
4786 	if (&upper->list == &dev->all_adj_list.upper)
4787 		return NULL;
4788 
4789 	*iter = &upper->list;
4790 
4791 	return upper->dev;
4792 }
4793 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4794 
4795 /**
4796  * netdev_lower_get_next_private - Get the next ->private from the
4797  *				   lower neighbour list
4798  * @dev: device
4799  * @iter: list_head ** of the current position
4800  *
4801  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4802  * list, starting from iter position. The caller must either hold the
4803  * RTNL lock or its own locking that guarantees that the neighbour lower
4804  * list will remain unchanged.
4805  */
4806 void *netdev_lower_get_next_private(struct net_device *dev,
4807 				    struct list_head **iter)
4808 {
4809 	struct netdev_adjacent *lower;
4810 
4811 	lower = list_entry(*iter, struct netdev_adjacent, list);
4812 
4813 	if (&lower->list == &dev->adj_list.lower)
4814 		return NULL;
4815 
4816 	*iter = lower->list.next;
4817 
4818 	return lower->private;
4819 }
4820 EXPORT_SYMBOL(netdev_lower_get_next_private);
4821 
4822 /**
4823  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4824  *				       lower neighbour list, RCU
4825  *				       variant
4826  * @dev: device
4827  * @iter: list_head ** of the current position
4828  *
4829  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4830  * list, starting from iter position. The caller must hold RCU read lock.
4831  */
4832 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4833 					struct list_head **iter)
4834 {
4835 	struct netdev_adjacent *lower;
4836 
4837 	WARN_ON_ONCE(!rcu_read_lock_held());
4838 
4839 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4840 
4841 	if (&lower->list == &dev->adj_list.lower)
4842 		return NULL;
4843 
4844 	*iter = &lower->list;
4845 
4846 	return lower->private;
4847 }
4848 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4849 
4850 /**
4851  * netdev_lower_get_next - Get the next device from the lower neighbour
4852  *                         list
4853  * @dev: device
4854  * @iter: list_head ** of the current position
4855  *
4856  * Gets the next netdev_adjacent from the dev's lower neighbour
4857  * list, starting from iter position. The caller must hold the RTNL lock or
4858  * its own locking that guarantees that the neighbour lower
4859  * list will remain unchanged.
4860  */
4861 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4862 {
4863 	struct netdev_adjacent *lower;
4864 
4865 	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4866 
4867 	if (&lower->list == &dev->adj_list.lower)
4868 		return NULL;
4869 
4870 	*iter = &lower->list;
4871 
4872 	return lower->dev;
4873 }
4874 EXPORT_SYMBOL(netdev_lower_get_next);
4875 
4876 /**
4877  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4878  *				       lower neighbour list, RCU
4879  *				       variant
4880  * @dev: device
4881  *
4882  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4883  * list. The caller must hold RCU read lock.
4884  */
4885 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4886 {
4887 	struct netdev_adjacent *lower;
4888 
4889 	lower = list_first_or_null_rcu(&dev->adj_list.lower,
4890 			struct netdev_adjacent, list);
4891 	if (lower)
4892 		return lower->private;
4893 	return NULL;
4894 }
4895 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4896 
4897 /**
4898  * netdev_master_upper_dev_get_rcu - Get master upper device
4899  * @dev: device
4900  *
4901  * Find a master upper device and return pointer to it or NULL in case
4902  * it's not there. The caller must hold the RCU read lock.
4903  */
4904 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4905 {
4906 	struct netdev_adjacent *upper;
4907 
4908 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
4909 				       struct netdev_adjacent, list);
4910 	if (upper && likely(upper->master))
4911 		return upper->dev;
4912 	return NULL;
4913 }
4914 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4915 
4916 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4917 			      struct net_device *adj_dev,
4918 			      struct list_head *dev_list)
4919 {
4920 	char linkname[IFNAMSIZ+7];
4921 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4922 		"upper_%s" : "lower_%s", adj_dev->name);
4923 	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4924 				 linkname);
4925 }
4926 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4927 			       char *name,
4928 			       struct list_head *dev_list)
4929 {
4930 	char linkname[IFNAMSIZ+7];
4931 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4932 		"upper_%s" : "lower_%s", name);
4933 	sysfs_remove_link(&(dev->dev.kobj), linkname);
4934 }
4935 
4936 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4937 						 struct net_device *adj_dev,
4938 						 struct list_head *dev_list)
4939 {
4940 	return (dev_list == &dev->adj_list.upper ||
4941 		dev_list == &dev->adj_list.lower) &&
4942 		net_eq(dev_net(dev), dev_net(adj_dev));
4943 }
4944 
4945 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4946 					struct net_device *adj_dev,
4947 					struct list_head *dev_list,
4948 					void *private, bool master)
4949 {
4950 	struct netdev_adjacent *adj;
4951 	int ret;
4952 
4953 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4954 
4955 	if (adj) {
4956 		adj->ref_nr++;
4957 		return 0;
4958 	}
4959 
4960 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4961 	if (!adj)
4962 		return -ENOMEM;
4963 
4964 	adj->dev = adj_dev;
4965 	adj->master = master;
4966 	adj->ref_nr = 1;
4967 	adj->private = private;
4968 	dev_hold(adj_dev);
4969 
4970 	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4971 		 adj_dev->name, dev->name, adj_dev->name);
4972 
4973 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
4974 		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4975 		if (ret)
4976 			goto free_adj;
4977 	}
4978 
4979 	/* Ensure that master link is always the first item in list. */
4980 	if (master) {
4981 		ret = sysfs_create_link(&(dev->dev.kobj),
4982 					&(adj_dev->dev.kobj), "master");
4983 		if (ret)
4984 			goto remove_symlinks;
4985 
4986 		list_add_rcu(&adj->list, dev_list);
4987 	} else {
4988 		list_add_tail_rcu(&adj->list, dev_list);
4989 	}
4990 
4991 	return 0;
4992 
4993 remove_symlinks:
4994 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
4995 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4996 free_adj:
4997 	kfree(adj);
4998 	dev_put(adj_dev);
4999 
5000 	return ret;
5001 }
5002 
5003 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5004 					 struct net_device *adj_dev,
5005 					 struct list_head *dev_list)
5006 {
5007 	struct netdev_adjacent *adj;
5008 
5009 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
5010 
5011 	if (!adj) {
5012 		pr_err("tried to remove device %s from %s\n",
5013 		       dev->name, adj_dev->name);
5014 		BUG();
5015 	}
5016 
5017 	if (adj->ref_nr > 1) {
5018 		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5019 			 adj->ref_nr-1);
5020 		adj->ref_nr--;
5021 		return;
5022 	}
5023 
5024 	if (adj->master)
5025 		sysfs_remove_link(&(dev->dev.kobj), "master");
5026 
5027 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5028 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5029 
5030 	list_del_rcu(&adj->list);
5031 	pr_debug("dev_put for %s, because link removed from %s to %s\n",
5032 		 adj_dev->name, dev->name, adj_dev->name);
5033 	dev_put(adj_dev);
5034 	kfree_rcu(adj, rcu);
5035 }
5036 
5037 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5038 					    struct net_device *upper_dev,
5039 					    struct list_head *up_list,
5040 					    struct list_head *down_list,
5041 					    void *private, bool master)
5042 {
5043 	int ret;
5044 
5045 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5046 					   master);
5047 	if (ret)
5048 		return ret;
5049 
5050 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5051 					   false);
5052 	if (ret) {
5053 		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5054 		return ret;
5055 	}
5056 
5057 	return 0;
5058 }
5059 
5060 static int __netdev_adjacent_dev_link(struct net_device *dev,
5061 				      struct net_device *upper_dev)
5062 {
5063 	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5064 						&dev->all_adj_list.upper,
5065 						&upper_dev->all_adj_list.lower,
5066 						NULL, false);
5067 }
5068 
5069 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5070 					       struct net_device *upper_dev,
5071 					       struct list_head *up_list,
5072 					       struct list_head *down_list)
5073 {
5074 	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5075 	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5076 }
5077 
5078 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5079 					 struct net_device *upper_dev)
5080 {
5081 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5082 					   &dev->all_adj_list.upper,
5083 					   &upper_dev->all_adj_list.lower);
5084 }
5085 
5086 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5087 						struct net_device *upper_dev,
5088 						void *private, bool master)
5089 {
5090 	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5091 
5092 	if (ret)
5093 		return ret;
5094 
5095 	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5096 					       &dev->adj_list.upper,
5097 					       &upper_dev->adj_list.lower,
5098 					       private, master);
5099 	if (ret) {
5100 		__netdev_adjacent_dev_unlink(dev, upper_dev);
5101 		return ret;
5102 	}
5103 
5104 	return 0;
5105 }
5106 
5107 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5108 						   struct net_device *upper_dev)
5109 {
5110 	__netdev_adjacent_dev_unlink(dev, upper_dev);
5111 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5112 					   &dev->adj_list.upper,
5113 					   &upper_dev->adj_list.lower);
5114 }
5115 
5116 static int __netdev_upper_dev_link(struct net_device *dev,
5117 				   struct net_device *upper_dev, bool master,
5118 				   void *private)
5119 {
5120 	struct netdev_adjacent *i, *j, *to_i, *to_j;
5121 	int ret = 0;
5122 
5123 	ASSERT_RTNL();
5124 
5125 	if (dev == upper_dev)
5126 		return -EBUSY;
5127 
5128 	/* To prevent loops, check if dev is not upper device to upper_dev. */
5129 	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5130 		return -EBUSY;
5131 
5132 	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5133 		return -EEXIST;
5134 
5135 	if (master && netdev_master_upper_dev_get(dev))
5136 		return -EBUSY;
5137 
5138 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5139 						   master);
5140 	if (ret)
5141 		return ret;
5142 
5143 	/* Now that we linked these devs, make all of upper_dev's
5144 	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5145 	 * vice versa, and don't forget the devices themselves. All of these
5146 	 * links are non-neighbours.
5147 	 */
5148 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5149 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5150 			pr_debug("Interlinking %s with %s, non-neighbour\n",
5151 				 i->dev->name, j->dev->name);
5152 			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5153 			if (ret)
5154 				goto rollback_mesh;
5155 		}
5156 	}
5157 
5158 	/* add dev to every upper_dev's upper device */
5159 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5160 		pr_debug("linking %s's upper device %s with %s\n",
5161 			 upper_dev->name, i->dev->name, dev->name);
5162 		ret = __netdev_adjacent_dev_link(dev, i->dev);
5163 		if (ret)
5164 			goto rollback_upper_mesh;
5165 	}
5166 
5167 	/* add upper_dev to every dev's lower device */
5168 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5169 		pr_debug("linking %s's lower device %s with %s\n", dev->name,
5170 			 i->dev->name, upper_dev->name);
5171 		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5172 		if (ret)
5173 			goto rollback_lower_mesh;
5174 	}
5175 
5176 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5177 	return 0;
5178 
5179 rollback_lower_mesh:
5180 	to_i = i;
5181 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5182 		if (i == to_i)
5183 			break;
5184 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5185 	}
5186 
5187 	i = NULL;
5188 
5189 rollback_upper_mesh:
5190 	to_i = i;
5191 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5192 		if (i == to_i)
5193 			break;
5194 		__netdev_adjacent_dev_unlink(dev, i->dev);
5195 	}
5196 
5197 	i = j = NULL;
5198 
5199 rollback_mesh:
5200 	to_i = i;
5201 	to_j = j;
5202 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5203 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5204 			if (i == to_i && j == to_j)
5205 				break;
5206 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5207 		}
5208 		if (i == to_i)
5209 			break;
5210 	}
5211 
5212 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5213 
5214 	return ret;
5215 }
5216 
5217 /**
5218  * netdev_upper_dev_link - Add a link to the upper device
5219  * @dev: device
5220  * @upper_dev: new upper device
5221  *
5222  * Adds a link to device which is upper to this one. The caller must hold
5223  * the RTNL lock. On a failure a negative errno code is returned.
5224  * On success the reference counts are adjusted and the function
5225  * returns zero.
5226  */
5227 int netdev_upper_dev_link(struct net_device *dev,
5228 			  struct net_device *upper_dev)
5229 {
5230 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5231 }
5232 EXPORT_SYMBOL(netdev_upper_dev_link);
5233 
5234 /**
5235  * netdev_master_upper_dev_link - Add a master link to the upper device
5236  * @dev: device
5237  * @upper_dev: new upper device
5238  *
5239  * Adds a link to device which is upper to this one. In this case, only
5240  * one master upper device can be linked, although other non-master devices
5241  * might be linked as well. The caller must hold the RTNL lock.
5242  * On a failure a negative errno code is returned. On success the reference
5243  * counts are adjusted and the function returns zero.
5244  */
5245 int netdev_master_upper_dev_link(struct net_device *dev,
5246 				 struct net_device *upper_dev)
5247 {
5248 	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5249 }
5250 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5251 
5252 int netdev_master_upper_dev_link_private(struct net_device *dev,
5253 					 struct net_device *upper_dev,
5254 					 void *private)
5255 {
5256 	return __netdev_upper_dev_link(dev, upper_dev, true, private);
5257 }
5258 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5259 
5260 /**
5261  * netdev_upper_dev_unlink - Removes a link to upper device
5262  * @dev: device
5263  * @upper_dev: upper device to unlink
5264  *
5265  * Removes a link to device which is upper to this one. The caller must hold
5266  * the RTNL lock.
5267  */
5268 void netdev_upper_dev_unlink(struct net_device *dev,
5269 			     struct net_device *upper_dev)
5270 {
5271 	struct netdev_adjacent *i, *j;
5272 	ASSERT_RTNL();
5273 
5274 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5275 
5276 	/* Here is the tricky part. We must remove all dev's lower
5277 	 * devices from all upper_dev's upper devices and vice
5278 	 * versa, to maintain the graph relationship.
5279 	 */
5280 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5281 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5282 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5283 
5284 	/* also remove the devices themselves from the lower/upper
5285 	 * device lists
5286 	 */
5287 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5288 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5289 
5290 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5291 		__netdev_adjacent_dev_unlink(dev, i->dev);
5292 
5293 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5294 }
5295 EXPORT_SYMBOL(netdev_upper_dev_unlink);
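
/* Illustrative sketch, not part of the original file: how a master/slave
 * style driver might use the adjacency API above. bond_dev and slave_dev
 * are hypothetical net_device pointers already looked up by the caller;
 * both calls require the RTNL lock.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev);
 *	if (!err) {
 *		...use the link...
 *		netdev_upper_dev_unlink(slave_dev, bond_dev);
 *	}
 *	rtnl_unlock();
 */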
5296 
5297 static void netdev_adjacent_add_links(struct net_device *dev)
5298 {
5299 	struct netdev_adjacent *iter;
5300 
5301 	struct net *net = dev_net(dev);
5302 
5303 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5304 		if (!net_eq(net,dev_net(iter->dev)))
5305 			continue;
5306 		netdev_adjacent_sysfs_add(iter->dev, dev,
5307 					  &iter->dev->adj_list.lower);
5308 		netdev_adjacent_sysfs_add(dev, iter->dev,
5309 					  &dev->adj_list.upper);
5310 	}
5311 
5312 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5313 		if (!net_eq(net,dev_net(iter->dev)))
5314 			continue;
5315 		netdev_adjacent_sysfs_add(iter->dev, dev,
5316 					  &iter->dev->adj_list.upper);
5317 		netdev_adjacent_sysfs_add(dev, iter->dev,
5318 					  &dev->adj_list.lower);
5319 	}
5320 }
5321 
5322 static void netdev_adjacent_del_links(struct net_device *dev)
5323 {
5324 	struct netdev_adjacent *iter;
5325 
5326 	struct net *net = dev_net(dev);
5327 
5328 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5329 		if (!net_eq(net,dev_net(iter->dev)))
5330 			continue;
5331 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5332 					  &iter->dev->adj_list.lower);
5333 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5334 					  &dev->adj_list.upper);
5335 	}
5336 
5337 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5338 		if (!net_eq(net,dev_net(iter->dev)))
5339 			continue;
5340 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5341 					  &iter->dev->adj_list.upper);
5342 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5343 					  &dev->adj_list.lower);
5344 	}
5345 }
5346 
5347 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5348 {
5349 	struct netdev_adjacent *iter;
5350 
5351 	struct net *net = dev_net(dev);
5352 
5353 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5354 		if (!net_eq(net,dev_net(iter->dev)))
5355 			continue;
5356 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5357 					  &iter->dev->adj_list.lower);
5358 		netdev_adjacent_sysfs_add(iter->dev, dev,
5359 					  &iter->dev->adj_list.lower);
5360 	}
5361 
5362 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5363 		if (!net_eq(net,dev_net(iter->dev)))
5364 			continue;
5365 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5366 					  &iter->dev->adj_list.upper);
5367 		netdev_adjacent_sysfs_add(iter->dev, dev,
5368 					  &iter->dev->adj_list.upper);
5369 	}
5370 }
5371 
5372 void *netdev_lower_dev_get_private(struct net_device *dev,
5373 				   struct net_device *lower_dev)
5374 {
5375 	struct netdev_adjacent *lower;
5376 
5377 	if (!lower_dev)
5378 		return NULL;
5379 	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5380 	if (!lower)
5381 		return NULL;
5382 
5383 	return lower->private;
5384 }
5385 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5386 
5387 
5388 int dev_get_nest_level(struct net_device *dev,
5389 		       bool (*type_check)(struct net_device *dev))
5390 {
5391 	struct net_device *lower = NULL;
5392 	struct list_head *iter;
5393 	int max_nest = -1;
5394 	int nest;
5395 
5396 	ASSERT_RTNL();
5397 
5398 	netdev_for_each_lower_dev(dev, lower, iter) {
5399 		nest = dev_get_nest_level(lower, type_check);
5400 		if (max_nest < nest)
5401 			max_nest = nest;
5402 	}
5403 
5404 	if (type_check(dev))
5405 		max_nest++;
5406 
5407 	return max_nest;
5408 }
5409 EXPORT_SYMBOL(dev_get_nest_level);
5410 
5411 static void dev_change_rx_flags(struct net_device *dev, int flags)
5412 {
5413 	const struct net_device_ops *ops = dev->netdev_ops;
5414 
5415 	if (ops->ndo_change_rx_flags)
5416 		ops->ndo_change_rx_flags(dev, flags);
5417 }
5418 
5419 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5420 {
5421 	unsigned int old_flags = dev->flags;
5422 	kuid_t uid;
5423 	kgid_t gid;
5424 
5425 	ASSERT_RTNL();
5426 
5427 	dev->flags |= IFF_PROMISC;
5428 	dev->promiscuity += inc;
5429 	if (dev->promiscuity == 0) {
5430 		/*
5431 		 * Avoid overflow.
5432 		 * If inc causes overflow, untouch promisc and return error.
5433 		 */
5434 		if (inc < 0)
5435 			dev->flags &= ~IFF_PROMISC;
5436 		else {
5437 			dev->promiscuity -= inc;
5438 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5439 				dev->name);
5440 			return -EOVERFLOW;
5441 		}
5442 	}
5443 	if (dev->flags != old_flags) {
5444 		pr_info("device %s %s promiscuous mode\n",
5445 			dev->name,
5446 			dev->flags & IFF_PROMISC ? "entered" : "left");
5447 		if (audit_enabled) {
5448 			current_uid_gid(&uid, &gid);
5449 			audit_log(current->audit_context, GFP_ATOMIC,
5450 				AUDIT_ANOM_PROMISCUOUS,
5451 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5452 				dev->name, (dev->flags & IFF_PROMISC),
5453 				(old_flags & IFF_PROMISC),
5454 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
5455 				from_kuid(&init_user_ns, uid),
5456 				from_kgid(&init_user_ns, gid),
5457 				audit_get_sessionid(current));
5458 		}
5459 
5460 		dev_change_rx_flags(dev, IFF_PROMISC);
5461 	}
5462 	if (notify)
5463 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
5464 	return 0;
5465 }
5466 
5467 /**
5468  *	dev_set_promiscuity	- update promiscuity count on a device
5469  *	@dev: device
5470  *	@inc: modifier
5471  *
5472  *	Add or remove promiscuity from a device. While the count in the device
5473  *	remains above zero the interface remains promiscuous. Once it hits zero
5474  *	the device reverts back to normal filtering operation. A negative inc
5475  *	value is used to drop promiscuity on the device.
5476  *	Return 0 if successful or a negative errno code on error.
5477  */
5478 int dev_set_promiscuity(struct net_device *dev, int inc)
5479 {
5480 	unsigned int old_flags = dev->flags;
5481 	int err;
5482 
5483 	err = __dev_set_promiscuity(dev, inc, true);
5484 	if (err < 0)
5485 		return err;
5486 	if (dev->flags != old_flags)
5487 		dev_set_rx_mode(dev);
5488 	return err;
5489 }
5490 EXPORT_SYMBOL(dev_set_promiscuity);
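
/* Illustrative sketch, not part of the original file: a packet-capture style
 * user taking and later dropping a promiscuity reference. The surrounding
 * code is hypothetical; the calls must be made under the RTNL lock.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	(enter promiscuous mode)
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);		(drop our reference)
 *	rtnl_unlock();
 */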
5491 
5492 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5493 {
5494 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5495 
5496 	ASSERT_RTNL();
5497 
5498 	dev->flags |= IFF_ALLMULTI;
5499 	dev->allmulti += inc;
5500 	if (dev->allmulti == 0) {
5501 		/*
5502 		 * Avoid overflow.
5503 		 * If inc causes overflow, untouch allmulti and return error.
5504 		 */
5505 		if (inc < 0)
5506 			dev->flags &= ~IFF_ALLMULTI;
5507 		else {
5508 			dev->allmulti -= inc;
5509 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5510 				dev->name);
5511 			return -EOVERFLOW;
5512 		}
5513 	}
5514 	if (dev->flags ^ old_flags) {
5515 		dev_change_rx_flags(dev, IFF_ALLMULTI);
5516 		dev_set_rx_mode(dev);
5517 		if (notify)
5518 			__dev_notify_flags(dev, old_flags,
5519 					   dev->gflags ^ old_gflags);
5520 	}
5521 	return 0;
5522 }
5523 
5524 /**
5525  *	dev_set_allmulti	- update allmulti count on a device
5526  *	@dev: device
5527  *	@inc: modifier
5528  *
5529  *	Add or remove reception of all multicast frames to a device. While the
5530  *	count in the device remains above zero the interface remains listening
5531  *	to all multicast frames. Once it hits zero the device reverts back to normal
5532  *	filtering operation. A negative @inc value is used to drop the counter
5533  *	when releasing a resource needing all multicasts.
5534  *	Return 0 if successful or a negative errno code on error.
5535  */
5536 
5537 int dev_set_allmulti(struct net_device *dev, int inc)
5538 {
5539 	return __dev_set_allmulti(dev, inc, true);
5540 }
5541 EXPORT_SYMBOL(dev_set_allmulti);
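
/* Illustrative sketch, not part of the original file: a hypothetical
 * multicast routing user bumping the allmulti count while it needs every
 * multicast frame, mirroring the promiscuity example above.
 *
 *	int err;
 *
 *	err = dev_set_allmulti(dev, 1);		(under the RTNL lock)
 *	...
 *	dev_set_allmulti(dev, -1);
 */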
5542 
5543 /*
5544  *	Upload unicast and multicast address lists to device and
5545  *	configure RX filtering. When the device doesn't support unicast
5546  *	filtering it is put in promiscuous mode while unicast addresses
5547  *	are present.
5548  */
5549 void __dev_set_rx_mode(struct net_device *dev)
5550 {
5551 	const struct net_device_ops *ops = dev->netdev_ops;
5552 
5553 	/* dev_open will call this function so the list will stay sane. */
5554 	if (!(dev->flags&IFF_UP))
5555 		return;
5556 
5557 	if (!netif_device_present(dev))
5558 		return;
5559 
5560 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5561 		/* Unicast address changes may only happen under the rtnl,
5562 		 * therefore calling __dev_set_promiscuity here is safe.
5563 		 */
5564 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5565 			__dev_set_promiscuity(dev, 1, false);
5566 			dev->uc_promisc = true;
5567 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5568 			__dev_set_promiscuity(dev, -1, false);
5569 			dev->uc_promisc = false;
5570 		}
5571 	}
5572 
5573 	if (ops->ndo_set_rx_mode)
5574 		ops->ndo_set_rx_mode(dev);
5575 }
5576 
5577 void dev_set_rx_mode(struct net_device *dev)
5578 {
5579 	netif_addr_lock_bh(dev);
5580 	__dev_set_rx_mode(dev);
5581 	netif_addr_unlock_bh(dev);
5582 }
5583 
5584 /**
5585  *	dev_get_flags - get flags reported to userspace
5586  *	@dev: device
5587  *
5588  *	Get the combination of flag bits exported through APIs to userspace.
5589  */
5590 unsigned int dev_get_flags(const struct net_device *dev)
5591 {
5592 	unsigned int flags;
5593 
5594 	flags = (dev->flags & ~(IFF_PROMISC |
5595 				IFF_ALLMULTI |
5596 				IFF_RUNNING |
5597 				IFF_LOWER_UP |
5598 				IFF_DORMANT)) |
5599 		(dev->gflags & (IFF_PROMISC |
5600 				IFF_ALLMULTI));
5601 
5602 	if (netif_running(dev)) {
5603 		if (netif_oper_up(dev))
5604 			flags |= IFF_RUNNING;
5605 		if (netif_carrier_ok(dev))
5606 			flags |= IFF_LOWER_UP;
5607 		if (netif_dormant(dev))
5608 			flags |= IFF_DORMANT;
5609 	}
5610 
5611 	return flags;
5612 }
5613 EXPORT_SYMBOL(dev_get_flags);
5614 
5615 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5616 {
5617 	unsigned int old_flags = dev->flags;
5618 	int ret;
5619 
5620 	ASSERT_RTNL();
5621 
5622 	/*
5623 	 *	Set the flags on our device.
5624 	 */
5625 
5626 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5627 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5628 			       IFF_AUTOMEDIA)) |
5629 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5630 				    IFF_ALLMULTI));
5631 
5632 	/*
5633 	 *	Load in the correct multicast list now the flags have changed.
5634 	 */
5635 
5636 	if ((old_flags ^ flags) & IFF_MULTICAST)
5637 		dev_change_rx_flags(dev, IFF_MULTICAST);
5638 
5639 	dev_set_rx_mode(dev);
5640 
5641 	/*
5642 	 *	Have we downed the interface? We handle IFF_UP ourselves
5643 	 *	according to user attempts to set it, rather than blindly
5644 	 *	setting it.
5645 	 */
5646 
5647 	ret = 0;
5648 	if ((old_flags ^ flags) & IFF_UP)
5649 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5650 
5651 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
5652 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
5653 		unsigned int old_flags = dev->flags;
5654 
5655 		dev->gflags ^= IFF_PROMISC;
5656 
5657 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
5658 			if (dev->flags != old_flags)
5659 				dev_set_rx_mode(dev);
5660 	}
5661 
5662 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5663 	   is important. Some (broken) drivers set IFF_PROMISC when
5664 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
5665 	 */
5666 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5667 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5668 
5669 		dev->gflags ^= IFF_ALLMULTI;
5670 		__dev_set_allmulti(dev, inc, false);
5671 	}
5672 
5673 	return ret;
5674 }
5675 
5676 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5677 			unsigned int gchanges)
5678 {
5679 	unsigned int changes = dev->flags ^ old_flags;
5680 
5681 	if (gchanges)
5682 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5683 
5684 	if (changes & IFF_UP) {
5685 		if (dev->flags & IFF_UP)
5686 			call_netdevice_notifiers(NETDEV_UP, dev);
5687 		else
5688 			call_netdevice_notifiers(NETDEV_DOWN, dev);
5689 	}
5690 
5691 	if (dev->flags & IFF_UP &&
5692 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5693 		struct netdev_notifier_change_info change_info;
5694 
5695 		change_info.flags_changed = changes;
5696 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5697 					      &change_info.info);
5698 	}
5699 }
5700 
5701 /**
5702  *	dev_change_flags - change device settings
5703  *	@dev: device
5704  *	@flags: device state flags
5705  *
5706  *	Change settings on a device based on its state flags. The flags are
5707  *	in the userspace-exported format.
5708  */
5709 int dev_change_flags(struct net_device *dev, unsigned int flags)
5710 {
5711 	int ret;
5712 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5713 
5714 	ret = __dev_change_flags(dev, flags);
5715 	if (ret < 0)
5716 		return ret;
5717 
5718 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5719 	__dev_notify_flags(dev, old_flags, changes);
5720 	return ret;
5721 }
5722 EXPORT_SYMBOL(dev_change_flags);
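
/*
 * Illustrative sketch (not part of the original file): bringing an interface
 * administratively up from kernel code.  dev_change_flags() expects the RTNL
 * semaphore to be held, exactly as the ioctl and netlink paths hold it.
 */
static int __maybe_unused example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;
}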
5723 
5724 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5725 {
5726 	const struct net_device_ops *ops = dev->netdev_ops;
5727 
5728 	if (ops->ndo_change_mtu)
5729 		return ops->ndo_change_mtu(dev, new_mtu);
5730 
5731 	dev->mtu = new_mtu;
5732 	return 0;
5733 }
5734 
5735 /**
5736  *	dev_set_mtu - Change maximum transfer unit
5737  *	@dev: device
5738  *	@new_mtu: new transfer unit
5739  *
5740  *	Change the maximum transfer size of the network device.
5741  */
5742 int dev_set_mtu(struct net_device *dev, int new_mtu)
5743 {
5744 	int err, orig_mtu;
5745 
5746 	if (new_mtu == dev->mtu)
5747 		return 0;
5748 
5749 	/*	MTU must not be negative.	 */
5750 	if (new_mtu < 0)
5751 		return -EINVAL;
5752 
5753 	if (!netif_device_present(dev))
5754 		return -ENODEV;
5755 
5756 	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5757 	err = notifier_to_errno(err);
5758 	if (err)
5759 		return err;
5760 
5761 	orig_mtu = dev->mtu;
5762 	err = __dev_set_mtu(dev, new_mtu);
5763 
5764 	if (!err) {
5765 		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5766 		err = notifier_to_errno(err);
5767 		if (err) {
5768 			/* setting mtu back and notifying everyone again,
5769 			 * so that they have a chance to revert changes.
5770 			 */
5771 			__dev_set_mtu(dev, orig_mtu);
5772 			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5773 		}
5774 	}
5775 	return err;
5776 }
5777 EXPORT_SYMBOL(dev_set_mtu);
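
/*
 * Illustrative sketch (not part of the original file): changing the MTU of a
 * device looked up by name.  The name "eth0" and the 1400-byte MTU are
 * assumptions for illustration; the RTNL must be held around the lookup and
 * the change.
 */
static int __maybe_unused example_shrink_mtu(struct net *net)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, "eth0");
	if (dev)
		err = dev_set_mtu(dev, 1400);
	rtnl_unlock();
	return err;
}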
5778 
5779 /**
5780  *	dev_set_group - Change group this device belongs to
5781  *	@dev: device
5782  *	@new_group: group this device should belong to
5783  */
5784 void dev_set_group(struct net_device *dev, int new_group)
5785 {
5786 	dev->group = new_group;
5787 }
5788 EXPORT_SYMBOL(dev_set_group);
5789 
5790 /**
5791  *	dev_set_mac_address - Change Media Access Control Address
5792  *	@dev: device
5793  *	@sa: new address
5794  *
5795  *	Change the hardware (MAC) address of the device
5796  */
5797 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5798 {
5799 	const struct net_device_ops *ops = dev->netdev_ops;
5800 	int err;
5801 
5802 	if (!ops->ndo_set_mac_address)
5803 		return -EOPNOTSUPP;
5804 	if (sa->sa_family != dev->type)
5805 		return -EINVAL;
5806 	if (!netif_device_present(dev))
5807 		return -ENODEV;
5808 	err = ops->ndo_set_mac_address(dev, sa);
5809 	if (err)
5810 		return err;
5811 	dev->addr_assign_type = NET_ADDR_SET;
5812 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5813 	add_device_randomness(dev->dev_addr, dev->addr_len);
5814 	return 0;
5815 }
5816 EXPORT_SYMBOL(dev_set_mac_address);
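
/*
 * Illustrative sketch (not part of the original file): setting a new hardware
 * address from kernel code.  The address bytes are made up (a locally
 * administered MAC); the caller is assumed to hold the RTNL semaphore, as the
 * ioctl and netlink paths do.
 */
static int __maybe_unused example_set_mac(struct net_device *dev)
{
	static const u8 new_mac[ETH_ALEN] = {
		0x02, 0x00, 0x00, 0x00, 0x00, 0x01
	};
	struct sockaddr sa;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, new_mac, ETH_ALEN);
	return dev_set_mac_address(dev, &sa);
}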
5817 
5818 /**
5819  *	dev_change_carrier - Change device carrier
5820  *	@dev: device
5821  *	@new_carrier: new value
5822  *
5823  *	Change device carrier
5824  */
5825 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5826 {
5827 	const struct net_device_ops *ops = dev->netdev_ops;
5828 
5829 	if (!ops->ndo_change_carrier)
5830 		return -EOPNOTSUPP;
5831 	if (!netif_device_present(dev))
5832 		return -ENODEV;
5833 	return ops->ndo_change_carrier(dev, new_carrier);
5834 }
5835 EXPORT_SYMBOL(dev_change_carrier);
5836 
5837 /**
5838  *	dev_get_phys_port_id - Get device physical port ID
5839  *	@dev: device
5840  *	@ppid: port ID
5841  *
5842  *	Get device physical port ID
5843  */
5844 int dev_get_phys_port_id(struct net_device *dev,
5845 			 struct netdev_phys_item_id *ppid)
5846 {
5847 	const struct net_device_ops *ops = dev->netdev_ops;
5848 
5849 	if (!ops->ndo_get_phys_port_id)
5850 		return -EOPNOTSUPP;
5851 	return ops->ndo_get_phys_port_id(dev, ppid);
5852 }
5853 EXPORT_SYMBOL(dev_get_phys_port_id);
5854 
5855 /**
5856  *	dev_new_index	-	allocate an ifindex
5857  *	@net: the applicable net namespace
5858  *
5859  *	Returns a suitable unique value for a new device interface
5860  *	number.  The caller must hold the rtnl semaphore or the
5861  *	dev_base_lock to be sure it remains unique.
5862  */
5863 static int dev_new_index(struct net *net)
5864 {
5865 	int ifindex = net->ifindex;
5866 	for (;;) {
5867 		if (++ifindex <= 0)
5868 			ifindex = 1;
5869 		if (!__dev_get_by_index(net, ifindex))
5870 			return net->ifindex = ifindex;
5871 	}
5872 }
5873 
5874 /* Delayed registration/unregistration */
5875 static LIST_HEAD(net_todo_list);
5876 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5877 
5878 static void net_set_todo(struct net_device *dev)
5879 {
5880 	list_add_tail(&dev->todo_list, &net_todo_list);
5881 	dev_net(dev)->dev_unreg_count++;
5882 }
5883 
5884 static void rollback_registered_many(struct list_head *head)
5885 {
5886 	struct net_device *dev, *tmp;
5887 	LIST_HEAD(close_head);
5888 
5889 	BUG_ON(dev_boot_phase);
5890 	ASSERT_RTNL();
5891 
5892 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5893 		/* Some devices end up here without ever having been
5894 		 * registered, as part of initialization unwind. Remove
5895 		 * those devices and proceed with the remaining.
5896 		 */
5897 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5898 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5899 				 dev->name, dev);
5900 
5901 			WARN_ON(1);
5902 			list_del(&dev->unreg_list);
5903 			continue;
5904 		}
5905 		dev->dismantle = true;
5906 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5907 	}
5908 
5909 	/* If the device is running, close it first. */
5910 	list_for_each_entry(dev, head, unreg_list)
5911 		list_add_tail(&dev->close_list, &close_head);
5912 	dev_close_many(&close_head);
5913 
5914 	list_for_each_entry(dev, head, unreg_list) {
5915 		/* And unlink it from device chain. */
5916 		unlist_netdevice(dev);
5917 
5918 		dev->reg_state = NETREG_UNREGISTERING;
5919 	}
5920 
5921 	synchronize_net();
5922 
5923 	list_for_each_entry(dev, head, unreg_list) {
5924 		struct sk_buff *skb = NULL;
5925 
5926 		/* Shutdown queueing discipline. */
5927 		dev_shutdown(dev);
5928 
5929 
5930 		/* Notify protocols that we are about to destroy
5931 		   this device. They should clean up all of their state.
5932 		*/
5933 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5934 
5935 		if (!dev->rtnl_link_ops ||
5936 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5937 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
5938 						     GFP_KERNEL);
5939 
5940 		/*
5941 		 *	Flush the unicast and multicast chains
5942 		 */
5943 		dev_uc_flush(dev);
5944 		dev_mc_flush(dev);
5945 
5946 		if (dev->netdev_ops->ndo_uninit)
5947 			dev->netdev_ops->ndo_uninit(dev);
5948 
5949 		if (skb)
5950 			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
5951 
5952 		/* The notifier chain MUST detach all upper devices from us. */
5953 		WARN_ON(netdev_has_any_upper_dev(dev));
5954 
5955 		/* Remove entries from kobject tree */
5956 		netdev_unregister_kobject(dev);
5957 #ifdef CONFIG_XPS
5958 		/* Remove XPS queueing entries */
5959 		netif_reset_xps_queues_gt(dev, 0);
5960 #endif
5961 	}
5962 
5963 	synchronize_net();
5964 
5965 	list_for_each_entry(dev, head, unreg_list)
5966 		dev_put(dev);
5967 }
5968 
5969 static void rollback_registered(struct net_device *dev)
5970 {
5971 	LIST_HEAD(single);
5972 
5973 	list_add(&dev->unreg_list, &single);
5974 	rollback_registered_many(&single);
5975 	list_del(&single);
5976 }
5977 
5978 static netdev_features_t netdev_fix_features(struct net_device *dev,
5979 	netdev_features_t features)
5980 {
5981 	/* Fix illegal checksum combinations */
5982 	if ((features & NETIF_F_HW_CSUM) &&
5983 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5984 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5985 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5986 	}
5987 
5988 	/* TSO requires that SG is present as well. */
5989 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5990 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5991 		features &= ~NETIF_F_ALL_TSO;
5992 	}
5993 
5994 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5995 					!(features & NETIF_F_IP_CSUM)) {
5996 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5997 		features &= ~NETIF_F_TSO;
5998 		features &= ~NETIF_F_TSO_ECN;
5999 	}
6000 
6001 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6002 					 !(features & NETIF_F_IPV6_CSUM)) {
6003 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6004 		features &= ~NETIF_F_TSO6;
6005 	}
6006 
6007 	/* TSO ECN requires that TSO is present as well. */
6008 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6009 		features &= ~NETIF_F_TSO_ECN;
6010 
6011 	/* Software GSO depends on SG. */
6012 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6013 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6014 		features &= ~NETIF_F_GSO;
6015 	}
6016 
6017 	/* UFO needs SG and checksumming */
6018 	if (features & NETIF_F_UFO) {
6019 		/* maybe split UFO into V4 and V6? */
6020 		if (!((features & NETIF_F_GEN_CSUM) ||
6021 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6022 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6023 			netdev_dbg(dev,
6024 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6025 			features &= ~NETIF_F_UFO;
6026 		}
6027 
6028 		if (!(features & NETIF_F_SG)) {
6029 			netdev_dbg(dev,
6030 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6031 			features &= ~NETIF_F_UFO;
6032 		}
6033 	}
6034 
6035 #ifdef CONFIG_NET_RX_BUSY_POLL
6036 	if (dev->netdev_ops->ndo_busy_poll)
6037 		features |= NETIF_F_BUSY_POLL;
6038 	else
6039 #endif
6040 		features &= ~NETIF_F_BUSY_POLL;
6041 
6042 	return features;
6043 }
6044 
6045 int __netdev_update_features(struct net_device *dev)
6046 {
6047 	netdev_features_t features;
6048 	int err = 0;
6049 
6050 	ASSERT_RTNL();
6051 
6052 	features = netdev_get_wanted_features(dev);
6053 
6054 	if (dev->netdev_ops->ndo_fix_features)
6055 		features = dev->netdev_ops->ndo_fix_features(dev, features);
6056 
6057 	/* driver might be less strict about feature dependencies */
6058 	features = netdev_fix_features(dev, features);
6059 
6060 	if (dev->features == features)
6061 		return 0;
6062 
6063 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6064 		&dev->features, &features);
6065 
6066 	if (dev->netdev_ops->ndo_set_features)
6067 		err = dev->netdev_ops->ndo_set_features(dev, features);
6068 
6069 	if (unlikely(err < 0)) {
6070 		netdev_err(dev,
6071 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
6072 			err, &features, &dev->features);
6073 		return -1;
6074 	}
6075 
6076 	if (!err)
6077 		dev->features = features;
6078 
6079 	return 1;
6080 }
6081 
6082 /**
6083  *	netdev_update_features - recalculate device features
6084  *	@dev: the device to check
6085  *
6086  *	Recalculate dev->features set and send notifications if it
6087  *	has changed. Should be called after driver- or hardware-dependent
6088  *	conditions that influence the features might have changed.
6089  */
6090 void netdev_update_features(struct net_device *dev)
6091 {
6092 	if (__netdev_update_features(dev))
6093 		netdev_features_change(dev);
6094 }
6095 EXPORT_SYMBOL(netdev_update_features);
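
/*
 * Illustrative sketch (not part of the original file): a driver re-evaluating
 * its feature set after a hardware-dependent condition changed.  The private
 * structure and its "tso_capable" field are hypothetical; the call is assumed
 * to happen in a context that already holds the RTNL (e.g. an ndo or ethtool
 * callback).
 */
struct example_feat_priv {
	bool tso_capable;	/* hypothetical capability discovered at runtime */
};

static void __maybe_unused example_hw_caps_changed(struct net_device *dev,
						   bool tso_capable)
{
	struct example_feat_priv *priv = netdev_priv(dev);

	priv->tso_capable = tso_capable;
	/* Re-runs ndo_fix_features()/netdev_fix_features() and notifies. */
	netdev_update_features(dev);
}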
6096 
6097 /**
6098  *	netdev_change_features - recalculate device features
6099  *	@dev: the device to check
6100  *
6101  *	Recalculate dev->features set and send notifications even
6102  *	if they have not changed. Should be called instead of
6103  *	netdev_update_features() if dev->vlan_features might also
6104  *	have changed, so that the changes can be propagated to stacked
6105  *	VLAN devices.
6106  */
6107 void netdev_change_features(struct net_device *dev)
6108 {
6109 	__netdev_update_features(dev);
6110 	netdev_features_change(dev);
6111 }
6112 EXPORT_SYMBOL(netdev_change_features);
6113 
6114 /**
6115  *	netif_stacked_transfer_operstate -	transfer operstate
6116  *	@rootdev: the root or lower level device to transfer state from
6117  *	@dev: the device to transfer operstate to
6118  *
6119  *	Transfer operational state from root to device. This is normally
6120  *	called when a stacking relationship exists between the root
6121  *	device and the device (a leaf device).
6122  */
6123 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6124 					struct net_device *dev)
6125 {
6126 	if (rootdev->operstate == IF_OPER_DORMANT)
6127 		netif_dormant_on(dev);
6128 	else
6129 		netif_dormant_off(dev);
6130 
6131 	if (netif_carrier_ok(rootdev)) {
6132 		if (!netif_carrier_ok(dev))
6133 			netif_carrier_on(dev);
6134 	} else {
6135 		if (netif_carrier_ok(dev))
6136 			netif_carrier_off(dev);
6137 	}
6138 }
6139 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6140 
6141 #ifdef CONFIG_SYSFS
6142 static int netif_alloc_rx_queues(struct net_device *dev)
6143 {
6144 	unsigned int i, count = dev->num_rx_queues;
6145 	struct netdev_rx_queue *rx;
6146 
6147 	BUG_ON(count < 1);
6148 
6149 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
6150 	if (!rx)
6151 		return -ENOMEM;
6152 
6153 	dev->_rx = rx;
6154 
6155 	for (i = 0; i < count; i++)
6156 		rx[i].dev = dev;
6157 	return 0;
6158 }
6159 #endif
6160 
6161 static void netdev_init_one_queue(struct net_device *dev,
6162 				  struct netdev_queue *queue, void *_unused)
6163 {
6164 	/* Initialize queue lock */
6165 	spin_lock_init(&queue->_xmit_lock);
6166 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6167 	queue->xmit_lock_owner = -1;
6168 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6169 	queue->dev = dev;
6170 #ifdef CONFIG_BQL
6171 	dql_init(&queue->dql, HZ);
6172 #endif
6173 }
6174 
6175 static void netif_free_tx_queues(struct net_device *dev)
6176 {
6177 	kvfree(dev->_tx);
6178 }
6179 
6180 static int netif_alloc_netdev_queues(struct net_device *dev)
6181 {
6182 	unsigned int count = dev->num_tx_queues;
6183 	struct netdev_queue *tx;
6184 	size_t sz = count * sizeof(*tx);
6185 
6186 	BUG_ON(count < 1 || count > 0xffff);
6187 
6188 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6189 	if (!tx) {
6190 		tx = vzalloc(sz);
6191 		if (!tx)
6192 			return -ENOMEM;
6193 	}
6194 	dev->_tx = tx;
6195 
6196 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6197 	spin_lock_init(&dev->tx_global_lock);
6198 
6199 	return 0;
6200 }
6201 
6202 /**
6203  *	register_netdevice	- register a network device
6204  *	@dev: device to register
6205  *
6206  *	Take a completed network device structure and add it to the kernel
6207  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6208  *	chain. 0 is returned on success. A negative errno code is returned
6209  *	on a failure to set up the device, or if the name is a duplicate.
6210  *
6211  *	Callers must hold the rtnl semaphore. You may want
6212  *	register_netdev() instead of this.
6213  *
6214  *	BUGS:
6215  *	The locking appears insufficient to guarantee two parallel registers
6216  *	will not get the same name.
6217  */
6218 
6219 int register_netdevice(struct net_device *dev)
6220 {
6221 	int ret;
6222 	struct net *net = dev_net(dev);
6223 
6224 	BUG_ON(dev_boot_phase);
6225 	ASSERT_RTNL();
6226 
6227 	might_sleep();
6228 
6229 	/* When net_devices are persistent, this will be fatal. */
6230 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6231 	BUG_ON(!net);
6232 
6233 	spin_lock_init(&dev->addr_list_lock);
6234 	netdev_set_addr_lockdep_class(dev);
6235 
6236 	dev->iflink = -1;
6237 
6238 	ret = dev_get_valid_name(net, dev, dev->name);
6239 	if (ret < 0)
6240 		goto out;
6241 
6242 	/* Init, if this function is available */
6243 	if (dev->netdev_ops->ndo_init) {
6244 		ret = dev->netdev_ops->ndo_init(dev);
6245 		if (ret) {
6246 			if (ret > 0)
6247 				ret = -EIO;
6248 			goto out;
6249 		}
6250 	}
6251 
6252 	if (((dev->hw_features | dev->features) &
6253 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
6254 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6255 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6256 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6257 		ret = -EINVAL;
6258 		goto err_uninit;
6259 	}
6260 
6261 	ret = -EBUSY;
6262 	if (!dev->ifindex)
6263 		dev->ifindex = dev_new_index(net);
6264 	else if (__dev_get_by_index(net, dev->ifindex))
6265 		goto err_uninit;
6266 
6267 	if (dev->iflink == -1)
6268 		dev->iflink = dev->ifindex;
6269 
6270 	/* Transfer changeable features to wanted_features and enable
6271 	 * software offloads (GSO and GRO).
6272 	 */
6273 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
6274 	dev->features |= NETIF_F_SOFT_FEATURES;
6275 	dev->wanted_features = dev->features & dev->hw_features;
6276 
6277 	if (!(dev->flags & IFF_LOOPBACK)) {
6278 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
6279 	}
6280 
6281 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6282 	 */
6283 	dev->vlan_features |= NETIF_F_HIGHDMA;
6284 
6285 	/* Make NETIF_F_SG inheritable to tunnel devices.
6286 	 */
6287 	dev->hw_enc_features |= NETIF_F_SG;
6288 
6289 	/* Make NETIF_F_SG inheritable to MPLS.
6290 	 */
6291 	dev->mpls_features |= NETIF_F_SG;
6292 
6293 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6294 	ret = notifier_to_errno(ret);
6295 	if (ret)
6296 		goto err_uninit;
6297 
6298 	ret = netdev_register_kobject(dev);
6299 	if (ret)
6300 		goto err_uninit;
6301 	dev->reg_state = NETREG_REGISTERED;
6302 
6303 	__netdev_update_features(dev);
6304 
6305 	/*
6306 	 *	Default initial state at registry is that the
6307 	 *	Default initial state at registration is that the
6308 	 */
6309 
6310 	set_bit(__LINK_STATE_PRESENT, &dev->state);
6311 
6312 	linkwatch_init_dev(dev);
6313 
6314 	dev_init_scheduler(dev);
6315 	dev_hold(dev);
6316 	list_netdevice(dev);
6317 	add_device_randomness(dev->dev_addr, dev->addr_len);
6318 
6319 	/* If the device has a permanent device address, the driver should
6320 	 * set dev_addr, and addr_assign_type should be left as
6321 	 * NET_ADDR_PERM (the default value).
6322 	 */
6323 	if (dev->addr_assign_type == NET_ADDR_PERM)
6324 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6325 
6326 	/* Notify protocols, that a new device appeared. */
6327 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6328 	ret = notifier_to_errno(ret);
6329 	if (ret) {
6330 		rollback_registered(dev);
6331 		dev->reg_state = NETREG_UNREGISTERED;
6332 	}
6333 	/*
6334 	 *	Prevent userspace races by waiting until the network
6335 	 *	device is fully set up before sending notifications.
6336 	 */
6337 	if (!dev->rtnl_link_ops ||
6338 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6339 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6340 
6341 out:
6342 	return ret;
6343 
6344 err_uninit:
6345 	if (dev->netdev_ops->ndo_uninit)
6346 		dev->netdev_ops->ndo_uninit(dev);
6347 	goto out;
6348 }
6349 EXPORT_SYMBOL(register_netdevice);
6350 
6351 /**
6352  *	init_dummy_netdev	- init a dummy network device for NAPI
6353  *	@dev: device to init
6354  *
6355  *	This takes a network device structure and initializes the minimum
6356  *	number of fields so it can be used to schedule NAPI polls without
6357  *	registering a full-blown interface. This is to be used by drivers
6358  *	that need to tie several hardware interfaces to a single NAPI
6359  *	poll scheduler due to HW limitations.
6360  */
6361 int init_dummy_netdev(struct net_device *dev)
6362 {
6363 	/* Clear everything. Note we don't initialize spinlocks
6364 	 * as they aren't supposed to be taken by any of the
6365 	 * NAPI code and this dummy netdev is supposed to be
6366 	 * only ever used for NAPI polls
6367 	 */
6368 	memset(dev, 0, sizeof(struct net_device));
6369 
6370 	/* make sure we BUG if trying to hit standard
6371 	 * register/unregister code path
6372 	 */
6373 	dev->reg_state = NETREG_DUMMY;
6374 
6375 	/* NAPI wants this */
6376 	INIT_LIST_HEAD(&dev->napi_list);
6377 
6378 	/* a dummy interface is started by default */
6379 	set_bit(__LINK_STATE_PRESENT, &dev->state);
6380 	set_bit(__LINK_STATE_START, &dev->state);
6381 
6382 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
6383 	 * because users of this 'device' don't need to change
6384 	 * its refcount.
6385 	 */
6386 
6387 	return 0;
6388 }
6389 EXPORT_SYMBOL_GPL(init_dummy_netdev);
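
/*
 * Illustrative sketch (not part of the original file): a driver embedding a
 * dummy netdev so that several hardware channels can share the NAPI
 * machinery.  "example_adapter", "example_poll" and the weight of 64 are
 * assumptions for illustration.
 */
struct example_adapter {
	struct net_device napi_dev;	/* never registered, NAPI anchor only */
	struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
	/* ... process up to @budget received packets here ... */
	napi_complete(napi);
	return 0;
}

static void __maybe_unused example_adapter_init(struct example_adapter *ad)
{
	init_dummy_netdev(&ad->napi_dev);
	netif_napi_add(&ad->napi_dev, &ad->napi, example_poll, 64);
	napi_enable(&ad->napi);
}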
6390 
6391 
6392 /**
6393  *	register_netdev	- register a network device
6394  *	@dev: device to register
6395  *
6396  *	Take a completed network device structure and add it to the kernel
6397  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6398  *	chain. 0 is returned on success. A negative errno code is returned
6399  *	on a failure to set up the device, or if the name is a duplicate.
6400  *
6401  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
6402  *	and expands the device name if you passed a format string to
6403  *	alloc_netdev.
6404  */
6405 int register_netdev(struct net_device *dev)
6406 {
6407 	int err;
6408 
6409 	rtnl_lock();
6410 	err = register_netdevice(dev);
6411 	rtnl_unlock();
6412 	return err;
6413 }
6414 EXPORT_SYMBOL(register_netdev);
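
/*
 * Illustrative sketch (not part of the original file): the usual probe-time
 * registration sequence.  The private structure is hypothetical, and a real
 * driver would fill in dev->netdev_ops and the hardware address before
 * registering; alloc_etherdev() is assumed from <linux/etherdevice.h>.
 */
struct example_eth_priv {
	void __iomem *regs;	/* hypothetical MMIO mapping */
};

static struct net_device * __maybe_unused example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct example_eth_priv));
	if (!dev)
		return NULL;

	/* ... set dev->netdev_ops, dev->dev_addr, features, etc. ... */

	err = register_netdev(dev);	/* takes and releases the RTNL */
	if (err) {
		free_netdev(dev);
		return NULL;
	}
	return dev;
}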
6415 
6416 int netdev_refcnt_read(const struct net_device *dev)
6417 {
6418 	int i, refcnt = 0;
6419 
6420 	for_each_possible_cpu(i)
6421 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6422 	return refcnt;
6423 }
6424 EXPORT_SYMBOL(netdev_refcnt_read);
6425 
6426 /**
6427  * netdev_wait_allrefs - wait until all references are gone.
6428  * @dev: target net_device
6429  *
6430  * This is called when unregistering network devices.
6431  *
6432  * Any protocol or device that holds a reference should register
6433  * for netdevice notification, and clean up and put back the
6434  * reference if they receive an UNREGISTER event.
6435  * We can get stuck here if buggy protocols don't correctly
6436  * call dev_put.
6437  */
6438 static void netdev_wait_allrefs(struct net_device *dev)
6439 {
6440 	unsigned long rebroadcast_time, warning_time;
6441 	int refcnt;
6442 
6443 	linkwatch_forget_dev(dev);
6444 
6445 	rebroadcast_time = warning_time = jiffies;
6446 	refcnt = netdev_refcnt_read(dev);
6447 
6448 	while (refcnt != 0) {
6449 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6450 			rtnl_lock();
6451 
6452 			/* Rebroadcast unregister notification */
6453 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6454 
6455 			__rtnl_unlock();
6456 			rcu_barrier();
6457 			rtnl_lock();
6458 
6459 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6460 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6461 				     &dev->state)) {
6462 				/* We must not have linkwatch events
6463 				 * pending on unregister. If this
6464 				 * happens, we simply run the queue
6465 				 * unscheduled, resulting in a noop
6466 				 * for this device.
6467 				 */
6468 				linkwatch_run_queue();
6469 			}
6470 
6471 			__rtnl_unlock();
6472 
6473 			rebroadcast_time = jiffies;
6474 		}
6475 
6476 		msleep(250);
6477 
6478 		refcnt = netdev_refcnt_read(dev);
6479 
6480 		if (time_after(jiffies, warning_time + 10 * HZ)) {
6481 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6482 				 dev->name, refcnt);
6483 			warning_time = jiffies;
6484 		}
6485 	}
6486 }
6487 
6488 /* The sequence is:
6489  *
6490  *	rtnl_lock();
6491  *	...
6492  *	register_netdevice(x1);
6493  *	register_netdevice(x2);
6494  *	...
6495  *	unregister_netdevice(y1);
6496  *	unregister_netdevice(y2);
6497  *      ...
6498  *	rtnl_unlock();
6499  *	free_netdev(y1);
6500  *	free_netdev(y2);
6501  *
6502  * We are invoked by rtnl_unlock().
6503  * This allows us to deal with problems:
6504  * 1) We can delete sysfs objects which invoke hotplug
6505  *    without deadlocking with linkwatch via keventd.
6506  * 2) Since we run with the RTNL semaphore not held, we can sleep
6507  *    safely in order to wait for the netdev refcnt to drop to zero.
6508  *
6509  * We must not return until all unregister events added during
6510  * the interval the lock was held have been completed.
6511  */
6512 void netdev_run_todo(void)
6513 {
6514 	struct list_head list;
6515 
6516 	/* Snapshot list, allow later requests */
6517 	list_replace_init(&net_todo_list, &list);
6518 
6519 	__rtnl_unlock();
6520 
6521 
6522 	/* Wait for rcu callbacks to finish before next phase */
6523 	if (!list_empty(&list))
6524 		rcu_barrier();
6525 
6526 	while (!list_empty(&list)) {
6527 		struct net_device *dev
6528 			= list_first_entry(&list, struct net_device, todo_list);
6529 		list_del(&dev->todo_list);
6530 
6531 		rtnl_lock();
6532 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6533 		__rtnl_unlock();
6534 
6535 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6536 			pr_err("network todo '%s' but state %d\n",
6537 			       dev->name, dev->reg_state);
6538 			dump_stack();
6539 			continue;
6540 		}
6541 
6542 		dev->reg_state = NETREG_UNREGISTERED;
6543 
6544 		on_each_cpu(flush_backlog, dev, 1);
6545 
6546 		netdev_wait_allrefs(dev);
6547 
6548 		/* paranoia */
6549 		BUG_ON(netdev_refcnt_read(dev));
6550 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6551 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6552 		WARN_ON(dev->dn_ptr);
6553 
6554 		if (dev->destructor)
6555 			dev->destructor(dev);
6556 
6557 		/* Report a network device has been unregistered */
6558 		rtnl_lock();
6559 		dev_net(dev)->dev_unreg_count--;
6560 		__rtnl_unlock();
6561 		wake_up(&netdev_unregistering_wq);
6562 
6563 		/* Free network device */
6564 		kobject_put(&dev->dev.kobj);
6565 	}
6566 }
6567 
6568 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6569  * fields in the same order, with only the type differing.
6570  */
6571 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6572 			     const struct net_device_stats *netdev_stats)
6573 {
6574 #if BITS_PER_LONG == 64
6575 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6576 	memcpy(stats64, netdev_stats, sizeof(*stats64));
6577 #else
6578 	size_t i, n = sizeof(*stats64) / sizeof(u64);
6579 	const unsigned long *src = (const unsigned long *)netdev_stats;
6580 	u64 *dst = (u64 *)stats64;
6581 
6582 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6583 		     sizeof(*stats64) / sizeof(u64));
6584 	for (i = 0; i < n; i++)
6585 		dst[i] = src[i];
6586 #endif
6587 }
6588 EXPORT_SYMBOL(netdev_stats_to_stats64);
6589 
6590 /**
6591  *	dev_get_stats	- get network device statistics
6592  *	@dev: device to get statistics from
6593  *	@storage: place to store stats
6594  *
6595  *	Get network statistics from device. Return @storage.
6596  *	The device driver may provide its own method by setting
6597  *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
6598  *	otherwise the internal statistics structure is used.
6599  */
6600 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6601 					struct rtnl_link_stats64 *storage)
6602 {
6603 	const struct net_device_ops *ops = dev->netdev_ops;
6604 
6605 	if (ops->ndo_get_stats64) {
6606 		memset(storage, 0, sizeof(*storage));
6607 		ops->ndo_get_stats64(dev, storage);
6608 	} else if (ops->ndo_get_stats) {
6609 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6610 	} else {
6611 		netdev_stats_to_stats64(storage, &dev->stats);
6612 	}
6613 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6614 	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6615 	return storage;
6616 }
6617 EXPORT_SYMBOL(dev_get_stats);
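
/*
 * Illustrative sketch (not part of the original file): snapshotting a
 * device's 64-bit counters.  dev_get_stats() fills the caller-provided
 * structure, so a stack variable is sufficient.
 */
static u64 __maybe_unused example_rx_bytes(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	return stats.rx_bytes;
}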
6618 
6619 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6620 {
6621 	struct netdev_queue *queue = dev_ingress_queue(dev);
6622 
6623 #ifdef CONFIG_NET_CLS_ACT
6624 	if (queue)
6625 		return queue;
6626 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6627 	if (!queue)
6628 		return NULL;
6629 	netdev_init_one_queue(dev, queue, NULL);
6630 	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
6631 	queue->qdisc_sleeping = &noop_qdisc;
6632 	rcu_assign_pointer(dev->ingress_queue, queue);
6633 #endif
6634 	return queue;
6635 }
6636 
6637 static const struct ethtool_ops default_ethtool_ops;
6638 
6639 void netdev_set_default_ethtool_ops(struct net_device *dev,
6640 				    const struct ethtool_ops *ops)
6641 {
6642 	if (dev->ethtool_ops == &default_ethtool_ops)
6643 		dev->ethtool_ops = ops;
6644 }
6645 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6646 
6647 void netdev_freemem(struct net_device *dev)
6648 {
6649 	char *addr = (char *)dev - dev->padded;
6650 
6651 	kvfree(addr);
6652 }
6653 
6654 /**
6655  *	alloc_netdev_mqs - allocate network device
6656  *	@sizeof_priv:		size of private data to allocate space for
6657  *	@name:			device name format string
6658  *	@name_assign_type: 	origin of device name
6659  *	@setup:			callback to initialize device
6660  *	@txqs:			the number of TX subqueues to allocate
6661  *	@rxqs:			the number of RX subqueues to allocate
6662  *
6663  *	Allocates a struct net_device with private data area for driver use
6664  *	and performs basic initialization.  Also allocates subqueue structs
6665  *	for each queue on the device.
6666  */
6667 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6668 		unsigned char name_assign_type,
6669 		void (*setup)(struct net_device *),
6670 		unsigned int txqs, unsigned int rxqs)
6671 {
6672 	struct net_device *dev;
6673 	size_t alloc_size;
6674 	struct net_device *p;
6675 
6676 	BUG_ON(strlen(name) >= sizeof(dev->name));
6677 
6678 	if (txqs < 1) {
6679 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6680 		return NULL;
6681 	}
6682 
6683 #ifdef CONFIG_SYSFS
6684 	if (rxqs < 1) {
6685 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6686 		return NULL;
6687 	}
6688 #endif
6689 
6690 	alloc_size = sizeof(struct net_device);
6691 	if (sizeof_priv) {
6692 		/* ensure 32-byte alignment of private area */
6693 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6694 		alloc_size += sizeof_priv;
6695 	}
6696 	/* ensure 32-byte alignment of whole construct */
6697 	alloc_size += NETDEV_ALIGN - 1;
6698 
6699 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6700 	if (!p)
6701 		p = vzalloc(alloc_size);
6702 	if (!p)
6703 		return NULL;
6704 
6705 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6706 	dev->padded = (char *)dev - (char *)p;
6707 
6708 	dev->pcpu_refcnt = alloc_percpu(int);
6709 	if (!dev->pcpu_refcnt)
6710 		goto free_dev;
6711 
6712 	if (dev_addr_init(dev))
6713 		goto free_pcpu;
6714 
6715 	dev_mc_init(dev);
6716 	dev_uc_init(dev);
6717 
6718 	dev_net_set(dev, &init_net);
6719 
6720 	dev->gso_max_size = GSO_MAX_SIZE;
6721 	dev->gso_max_segs = GSO_MAX_SEGS;
6722 	dev->gso_min_segs = 0;
6723 
6724 	INIT_LIST_HEAD(&dev->napi_list);
6725 	INIT_LIST_HEAD(&dev->unreg_list);
6726 	INIT_LIST_HEAD(&dev->close_list);
6727 	INIT_LIST_HEAD(&dev->link_watch_list);
6728 	INIT_LIST_HEAD(&dev->adj_list.upper);
6729 	INIT_LIST_HEAD(&dev->adj_list.lower);
6730 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
6731 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
6732 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6733 	setup(dev);
6734 
6735 	dev->num_tx_queues = txqs;
6736 	dev->real_num_tx_queues = txqs;
6737 	if (netif_alloc_netdev_queues(dev))
6738 		goto free_all;
6739 
6740 #ifdef CONFIG_SYSFS
6741 	dev->num_rx_queues = rxqs;
6742 	dev->real_num_rx_queues = rxqs;
6743 	if (netif_alloc_rx_queues(dev))
6744 		goto free_all;
6745 #endif
6746 
6747 	strcpy(dev->name, name);
6748 	dev->name_assign_type = name_assign_type;
6749 	dev->group = INIT_NETDEV_GROUP;
6750 	if (!dev->ethtool_ops)
6751 		dev->ethtool_ops = &default_ethtool_ops;
6752 	return dev;
6753 
6754 free_all:
6755 	free_netdev(dev);
6756 	return NULL;
6757 
6758 free_pcpu:
6759 	free_percpu(dev->pcpu_refcnt);
6760 free_dev:
6761 	netdev_freemem(dev);
6762 	return NULL;
6763 }
6764 EXPORT_SYMBOL(alloc_netdev_mqs);
6765 
6766 /**
6767  *	free_netdev - free network device
6768  *	@dev: device
6769  *
6770  *	This function does the last stage of destroying an allocated device
6771  * 	interface. The reference to the device object is released.
6772  *	If this is the last reference then it will be freed.
6773  */
6774 void free_netdev(struct net_device *dev)
6775 {
6776 	struct napi_struct *p, *n;
6777 
6778 	release_net(dev_net(dev));
6779 
6780 	netif_free_tx_queues(dev);
6781 #ifdef CONFIG_SYSFS
6782 	kfree(dev->_rx);
6783 #endif
6784 
6785 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6786 
6787 	/* Flush device addresses */
6788 	dev_addr_flush(dev);
6789 
6790 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6791 		netif_napi_del(p);
6792 
6793 	free_percpu(dev->pcpu_refcnt);
6794 	dev->pcpu_refcnt = NULL;
6795 
6796 	/*  Compatibility with error handling in drivers */
6797 	if (dev->reg_state == NETREG_UNINITIALIZED) {
6798 		netdev_freemem(dev);
6799 		return;
6800 	}
6801 
6802 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6803 	dev->reg_state = NETREG_RELEASED;
6804 
6805 	/* will free via device release */
6806 	put_device(&dev->dev);
6807 }
6808 EXPORT_SYMBOL(free_netdev);
6809 
6810 /**
6811  *	synchronize_net -  Synchronize with packet receive processing
6812  *
6813  *	Wait for packets currently being received to be done.
6814  *	Does not block later packets from starting.
6815  */
6816 void synchronize_net(void)
6817 {
6818 	might_sleep();
6819 	if (rtnl_is_locked())
6820 		synchronize_rcu_expedited();
6821 	else
6822 		synchronize_rcu();
6823 }
6824 EXPORT_SYMBOL(synchronize_net);
6825 
6826 /**
6827  *	unregister_netdevice_queue - remove device from the kernel
6828  *	@dev: device
6829  *	@head: list
6830  *
6831  *	This function shuts down a device interface and removes it
6832  *	from the kernel tables.
6833  *	If head is not NULL, the device is queued to be unregistered later.
6834  *
6835  *	Callers must hold the rtnl semaphore.  You may want
6836  *	unregister_netdev() instead of this.
6837  */
6838 
6839 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6840 {
6841 	ASSERT_RTNL();
6842 
6843 	if (head) {
6844 		list_move_tail(&dev->unreg_list, head);
6845 	} else {
6846 		rollback_registered(dev);
6847 		/* Finish processing unregister after unlock */
6848 		net_set_todo(dev);
6849 	}
6850 }
6851 EXPORT_SYMBOL(unregister_netdevice_queue);
6852 
6853 /**
6854  *	unregister_netdevice_many - unregister many devices
6855  *	@head: list of devices
6856  *
6857  *  Note: As most callers use a stack-allocated list_head,
6858  *  we force a list_del() to make sure the stack won't be corrupted later.
6859  */
6860 void unregister_netdevice_many(struct list_head *head)
6861 {
6862 	struct net_device *dev;
6863 
6864 	if (!list_empty(head)) {
6865 		rollback_registered_many(head);
6866 		list_for_each_entry(dev, head, unreg_list)
6867 			net_set_todo(dev);
6868 		list_del(head);
6869 	}
6870 }
6871 EXPORT_SYMBOL(unregister_netdevice_many);
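
/*
 * Illustrative sketch (not part of the original file): batching the removal
 * of several devices so the expensive synchronize_net() calls in
 * rollback_registered_many() are shared.  "devs" and "n" are assumptions;
 * the caller holds the RTNL.
 */
static void __maybe_unused example_unregister_batch(struct net_device **devs,
						    unsigned int n)
{
	LIST_HEAD(kill_list);
	unsigned int i;

	ASSERT_RTNL();

	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);	/* also list_del()s the head */
}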
6872 
6873 /**
6874  *	unregister_netdev - remove device from the kernel
6875  *	@dev: device
6876  *
6877  *	This function shuts down a device interface and removes it
6878  *	from the kernel tables.
6879  *
6880  *	This is just a wrapper for unregister_netdevice that takes
6881  *	the rtnl semaphore.  In general you want to use this and not
6882  *	unregister_netdevice.
6883  */
6884 void unregister_netdev(struct net_device *dev)
6885 {
6886 	rtnl_lock();
6887 	unregister_netdevice(dev);
6888 	rtnl_unlock();
6889 }
6890 EXPORT_SYMBOL(unregister_netdev);
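
/*
 * Illustrative sketch (not part of the original file): the matching teardown
 * for a device registered with register_netdev().  unregister_netdev() takes
 * the RTNL itself; free_netdev() is called afterwards, once the unregister
 * has completed (see the sequence comment above netdev_run_todo()).
 */
static void __maybe_unused example_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* rtnl_lock + unregister_netdevice */
	free_netdev(dev);		/* drop the allocation */
}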
6891 
6892 /**
6893  *	dev_change_net_namespace - move device to a different network namespace
6894  *	@dev: device
6895  *	@net: network namespace
6896  *	@pat: If not NULL name pattern to try if the current device name
6897  *	      is already taken in the destination network namespace.
6898  *
6899  *	This function shuts down a device interface and moves it
6900  *	to a new network namespace. On success 0 is returned, on
6901  *	a failure a negative errno code is returned.
6902  *
6903  *	Callers must hold the rtnl semaphore.
6904  */
6905 
6906 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6907 {
6908 	int err;
6909 
6910 	ASSERT_RTNL();
6911 
6912 	/* Don't allow namespace local devices to be moved. */
6913 	err = -EINVAL;
6914 	if (dev->features & NETIF_F_NETNS_LOCAL)
6915 		goto out;
6916 
6917 	/* Ensure the device has been registered */
6918 	if (dev->reg_state != NETREG_REGISTERED)
6919 		goto out;
6920 
6921 	/* Get out if there is nothing to do */
6922 	err = 0;
6923 	if (net_eq(dev_net(dev), net))
6924 		goto out;
6925 
6926 	/* Pick the destination device name, and ensure
6927 	 * we can use it in the destination network namespace.
6928 	 */
6929 	err = -EEXIST;
6930 	if (__dev_get_by_name(net, dev->name)) {
6931 		/* We get here if we can't use the current device name */
6932 		if (!pat)
6933 			goto out;
6934 		if (dev_get_valid_name(net, dev, pat) < 0)
6935 			goto out;
6936 	}
6937 
6938 	/*
6939 	 * And now a mini version of register_netdevice and unregister_netdevice.
6940 	 */
6941 
6942 	/* If the device is running, close it first. */
6943 	dev_close(dev);
6944 
6945 	/* And unlink it from device chain */
6946 	err = -ENODEV;
6947 	unlist_netdevice(dev);
6948 
6949 	synchronize_net();
6950 
6951 	/* Shutdown queueing discipline. */
6952 	dev_shutdown(dev);
6953 
6954 	/* Notify protocols that we are about to destroy
6955 	   this device. They should clean up all of their state.
6956 
6957 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6958 	   This is wanted because this way 8021q and macvlan know
6959 	   the device is just moving and can keep their slaves up.
6960 	*/
6961 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6962 	rcu_barrier();
6963 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6964 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6965 
6966 	/*
6967 	 *	Flush the unicast and multicast chains
6968 	 */
6969 	dev_uc_flush(dev);
6970 	dev_mc_flush(dev);
6971 
6972 	/* Send a netdev-removed uevent to the old namespace */
6973 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6974 	netdev_adjacent_del_links(dev);
6975 
6976 	/* Actually switch the network namespace */
6977 	dev_net_set(dev, net);
6978 
6979 	/* If there is an ifindex conflict assign a new one */
6980 	if (__dev_get_by_index(net, dev->ifindex)) {
6981 		int iflink = (dev->iflink == dev->ifindex);
6982 		dev->ifindex = dev_new_index(net);
6983 		if (iflink)
6984 			dev->iflink = dev->ifindex;
6985 	}
6986 
6987 	/* Send a netdev-add uevent to the new namespace */
6988 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6989 	netdev_adjacent_add_links(dev);
6990 
6991 	/* Fixup kobjects */
6992 	err = device_rename(&dev->dev, dev->name);
6993 	WARN_ON(err);
6994 
6995 	/* Add the device back in the hashes */
6996 	list_netdevice(dev);
6997 
6998 	/* Notify protocols, that a new device appeared. */
6999 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7000 
7001 	/*
7002 	 *	Prevent userspace races by waiting until the network
7003 	 *	device is fully set up before sending notifications.
7004 	 */
7005 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7006 
7007 	synchronize_net();
7008 	err = 0;
7009 out:
7010 	return err;
7011 }
7012 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
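
/*
 * Illustrative sketch (not part of the original file): moving a device into
 * another namespace, falling back to a "moved%d" name pattern if its current
 * name is already taken there.  The pattern is an assumption; the caller
 * holds the RTNL.
 */
static int __maybe_unused example_move_dev(struct net_device *dev,
					   struct net *target)
{
	ASSERT_RTNL();

	return dev_change_net_namespace(dev, target, "moved%d");
}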
7013 
7014 static int dev_cpu_callback(struct notifier_block *nfb,
7015 			    unsigned long action,
7016 			    void *ocpu)
7017 {
7018 	struct sk_buff **list_skb;
7019 	struct sk_buff *skb;
7020 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
7021 	struct softnet_data *sd, *oldsd;
7022 
7023 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7024 		return NOTIFY_OK;
7025 
7026 	local_irq_disable();
7027 	cpu = smp_processor_id();
7028 	sd = &per_cpu(softnet_data, cpu);
7029 	oldsd = &per_cpu(softnet_data, oldcpu);
7030 
7031 	/* Find end of our completion_queue. */
7032 	list_skb = &sd->completion_queue;
7033 	while (*list_skb)
7034 		list_skb = &(*list_skb)->next;
7035 	/* Append completion queue from offline CPU. */
7036 	*list_skb = oldsd->completion_queue;
7037 	oldsd->completion_queue = NULL;
7038 
7039 	/* Append output queue from offline CPU. */
7040 	if (oldsd->output_queue) {
7041 		*sd->output_queue_tailp = oldsd->output_queue;
7042 		sd->output_queue_tailp = oldsd->output_queue_tailp;
7043 		oldsd->output_queue = NULL;
7044 		oldsd->output_queue_tailp = &oldsd->output_queue;
7045 	}
7046 	/* Append NAPI poll list from offline CPU, with one exception:
7047 	 * process_backlog() must be called by the CPU owning the percpu backlog.
7048 	 * We properly handle process_queue & input_pkt_queue later.
7049 	 */
7050 	while (!list_empty(&oldsd->poll_list)) {
7051 		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7052 							    struct napi_struct,
7053 							    poll_list);
7054 
7055 		list_del_init(&napi->poll_list);
7056 		if (napi->poll == process_backlog)
7057 			napi->state = 0;
7058 		else
7059 			____napi_schedule(sd, napi);
7060 	}
7061 
7062 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
7063 	local_irq_enable();
7064 
7065 	/* Process offline CPU's input_pkt_queue */
7066 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7067 		netif_rx_internal(skb);
7068 		input_queue_head_incr(oldsd);
7069 	}
7070 	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7071 		netif_rx_internal(skb);
7072 		input_queue_head_incr(oldsd);
7073 	}
7074 
7075 	return NOTIFY_OK;
7076 }
7077 
7078 
7079 /**
7080  *	netdev_increment_features - increment feature set by one
7081  *	@all: current feature set
7082  *	@one: new feature set
7083  *	@mask: mask feature set
7084  *
7085  *	Computes a new feature set after adding a device with feature set
7086  *	@one to the master device with current feature set @all.  Will not
7087  *	enable anything that is off in @mask. Returns the new feature set.
7088  */
7089 netdev_features_t netdev_increment_features(netdev_features_t all,
7090 	netdev_features_t one, netdev_features_t mask)
7091 {
7092 	if (mask & NETIF_F_GEN_CSUM)
7093 		mask |= NETIF_F_ALL_CSUM;
7094 	mask |= NETIF_F_VLAN_CHALLENGED;
7095 
7096 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7097 	all &= one | ~NETIF_F_ALL_FOR_ALL;
7098 
7099 	/* If one device supports hw checksumming, set for all. */
7100 	if (all & NETIF_F_GEN_CSUM)
7101 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7102 
7103 	return all;
7104 }
7105 EXPORT_SYMBOL(netdev_increment_features);
7106 
7107 static struct hlist_head * __net_init netdev_create_hash(void)
7108 {
7109 	int i;
7110 	struct hlist_head *hash;
7111 
7112 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7113 	if (hash != NULL)
7114 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
7115 			INIT_HLIST_HEAD(&hash[i]);
7116 
7117 	return hash;
7118 }
7119 
7120 /* Initialize per network namespace state */
7121 static int __net_init netdev_init(struct net *net)
7122 {
7123 	if (net != &init_net)
7124 		INIT_LIST_HEAD(&net->dev_base_head);
7125 
7126 	net->dev_name_head = netdev_create_hash();
7127 	if (net->dev_name_head == NULL)
7128 		goto err_name;
7129 
7130 	net->dev_index_head = netdev_create_hash();
7131 	if (net->dev_index_head == NULL)
7132 		goto err_idx;
7133 
7134 	return 0;
7135 
7136 err_idx:
7137 	kfree(net->dev_name_head);
7138 err_name:
7139 	return -ENOMEM;
7140 }
7141 
7142 /**
7143  *	netdev_drivername - network driver for the device
7144  *	@dev: network device
7145  *
7146  *	Determine network driver for device.
7147  */
7148 const char *netdev_drivername(const struct net_device *dev)
7149 {
7150 	const struct device_driver *driver;
7151 	const struct device *parent;
7152 	const char *empty = "";
7153 
7154 	parent = dev->dev.parent;
7155 	if (!parent)
7156 		return empty;
7157 
7158 	driver = parent->driver;
7159 	if (driver && driver->name)
7160 		return driver->name;
7161 	return empty;
7162 }
7163 
7164 static void __netdev_printk(const char *level, const struct net_device *dev,
7165 			    struct va_format *vaf)
7166 {
7167 	if (dev && dev->dev.parent) {
7168 		dev_printk_emit(level[1] - '0',
7169 				dev->dev.parent,
7170 				"%s %s %s%s: %pV",
7171 				dev_driver_string(dev->dev.parent),
7172 				dev_name(dev->dev.parent),
7173 				netdev_name(dev), netdev_reg_state(dev),
7174 				vaf);
7175 	} else if (dev) {
7176 		printk("%s%s%s: %pV",
7177 		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7178 	} else {
7179 		printk("%s(NULL net_device): %pV", level, vaf);
7180 	}
7181 }
7182 
7183 void netdev_printk(const char *level, const struct net_device *dev,
7184 		   const char *format, ...)
7185 {
7186 	struct va_format vaf;
7187 	va_list args;
7188 
7189 	va_start(args, format);
7190 
7191 	vaf.fmt = format;
7192 	vaf.va = &args;
7193 
7194 	__netdev_printk(level, dev, &vaf);
7195 
7196 	va_end(args);
7197 }
7198 EXPORT_SYMBOL(netdev_printk);
7199 
7200 #define define_netdev_printk_level(func, level)			\
7201 void func(const struct net_device *dev, const char *fmt, ...)	\
7202 {								\
7203 	struct va_format vaf;					\
7204 	va_list args;						\
7205 								\
7206 	va_start(args, fmt);					\
7207 								\
7208 	vaf.fmt = fmt;						\
7209 	vaf.va = &args;						\
7210 								\
7211 	__netdev_printk(level, dev, &vaf);			\
7212 								\
7213 	va_end(args);						\
7214 }								\
7215 EXPORT_SYMBOL(func);
7216 
7217 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7218 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7219 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7220 define_netdev_printk_level(netdev_err, KERN_ERR);
7221 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7222 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7223 define_netdev_printk_level(netdev_info, KERN_INFO);
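
/*
 * Illustrative sketch (not part of the original file): the per-level helpers
 * generated above are used like printk(), but automatically prefix the
 * message with the driver and device name.
 */
static void __maybe_unused example_report_tx_timeout(struct net_device *dev,
						     unsigned int queue)
{
	netdev_err(dev, "TX timeout on queue %u\n", queue);
}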
7224 
7225 static void __net_exit netdev_exit(struct net *net)
7226 {
7227 	kfree(net->dev_name_head);
7228 	kfree(net->dev_index_head);
7229 }
7230 
7231 static struct pernet_operations __net_initdata netdev_net_ops = {
7232 	.init = netdev_init,
7233 	.exit = netdev_exit,
7234 };
7235 
7236 static void __net_exit default_device_exit(struct net *net)
7237 {
7238 	struct net_device *dev, *aux;
7239 	/*
7240 	 * Push all migratable network devices back to the
7241 	 * initial network namespace
7242 	 */
7243 	rtnl_lock();
7244 	for_each_netdev_safe(net, dev, aux) {
7245 		int err;
7246 		char fb_name[IFNAMSIZ];
7247 
7248 		/* Ignore unmovable devices (e.g. loopback) */
7249 		if (dev->features & NETIF_F_NETNS_LOCAL)
7250 			continue;
7251 
7252 		/* Leave virtual devices for the generic cleanup */
7253 		if (dev->rtnl_link_ops)
7254 			continue;
7255 
7256 		/* Push remaining network devices to init_net */
7257 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7258 		err = dev_change_net_namespace(dev, &init_net, fb_name);
7259 		if (err) {
7260 			pr_emerg("%s: failed to move %s to init_net: %d\n",
7261 				 __func__, dev->name, err);
7262 			BUG();
7263 		}
7264 	}
7265 	rtnl_unlock();
7266 }
7267 
7268 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7269 {
7270 	/* Return with the rtnl_lock held when there are no network
7271 	 * devices unregistering in any network namespace in net_list.
7272 	 */
7273 	struct net *net;
7274 	bool unregistering;
7275 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
7276 
7277 	add_wait_queue(&netdev_unregistering_wq, &wait);
7278 	for (;;) {
7279 		unregistering = false;
7280 		rtnl_lock();
7281 		list_for_each_entry(net, net_list, exit_list) {
7282 			if (net->dev_unreg_count > 0) {
7283 				unregistering = true;
7284 				break;
7285 			}
7286 		}
7287 		if (!unregistering)
7288 			break;
7289 		__rtnl_unlock();
7290 
7291 		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7292 	}
7293 	remove_wait_queue(&netdev_unregistering_wq, &wait);
7294 }
7295 
7296 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7297 {
7298 	/* At exit all network devices must be removed from a network
7299 	 * namespace.  Do this in the reverse order of registration.
7300 	 * Do this across as many network namespaces as possible to
7301 	 * improve batching efficiency.
7302 	 */
7303 	struct net_device *dev;
7304 	struct net *net;
7305 	LIST_HEAD(dev_kill_list);
7306 
7307 	/* To prevent network device cleanup code from dereferencing
7308 	 * loopback devices or network devices that have been freed,
7309 	 * wait here for all pending unregistrations to complete
7310 	 * before unregistering the loopback device and allowing the
7311 	 * network namespace to be freed.
7312 	 *
7313 	 * The netdev todo list containing all network device
7314 	 * unregistrations that happen in default_device_exit_batch
7315 	 * will run in the rtnl_unlock() at the end of
7316 	 * default_device_exit_batch.
7317 	 */
7318 	rtnl_lock_unregistering(net_list);
7319 	list_for_each_entry(net, net_list, exit_list) {
7320 		for_each_netdev_reverse(net, dev) {
7321 			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7322 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7323 			else
7324 				unregister_netdevice_queue(dev, &dev_kill_list);
7325 		}
7326 	}
7327 	unregister_netdevice_many(&dev_kill_list);
7328 	rtnl_unlock();
7329 }
7330 
7331 static struct pernet_operations __net_initdata default_device_ops = {
7332 	.exit = default_device_exit,
7333 	.exit_batch = default_device_exit_batch,
7334 };
7335 
7336 /*
7337  *	Initialize the DEV module. At boot time this walks the device list and
7338  *	unhooks any devices that fail to initialise (normally hardware not
7339  *	present) and leaves us with a valid list of present and active devices.
7340  *
7341  */
7342 
7343 /*
7344  *       This is called single threaded during boot, so no need
7345  *       to take the rtnl semaphore.
7346  */
7347 static int __init net_dev_init(void)
7348 {
7349 	int i, rc = -ENOMEM;
7350 
7351 	BUG_ON(!dev_boot_phase);
7352 
7353 	if (dev_proc_init())
7354 		goto out;
7355 
7356 	if (netdev_kobject_init())
7357 		goto out;
7358 
7359 	INIT_LIST_HEAD(&ptype_all);
7360 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
7361 		INIT_LIST_HEAD(&ptype_base[i]);
7362 
7363 	INIT_LIST_HEAD(&offload_base);
7364 
7365 	if (register_pernet_subsys(&netdev_net_ops))
7366 		goto out;
7367 
7368 	/*
7369 	 *	Initialise the packet receive queues.
7370 	 */
7371 
7372 	for_each_possible_cpu(i) {
7373 		struct softnet_data *sd = &per_cpu(softnet_data, i);
7374 
7375 		skb_queue_head_init(&sd->input_pkt_queue);
7376 		skb_queue_head_init(&sd->process_queue);
7377 		INIT_LIST_HEAD(&sd->poll_list);
7378 		sd->output_queue_tailp = &sd->output_queue;
7379 #ifdef CONFIG_RPS
7380 		sd->csd.func = rps_trigger_softirq;
7381 		sd->csd.info = sd;
7382 		sd->cpu = i;
7383 #endif
7384 
7385 		sd->backlog.poll = process_backlog;
7386 		sd->backlog.weight = weight_p;
7387 	}
7388 
7389 	dev_boot_phase = 0;
7390 
7391 	/* The loopback device is special: if any other network device
7392 	 * is present in a network namespace, the loopback device must
7393 	 * be present. Since we now dynamically allocate and free the
7394 	 * loopback device, ensure this invariant is maintained by
7395 	 * keeping the loopback device as the first device on the
7396 	 * list of network devices, ensuring the loopback device
7397 	 * is the first device that appears and the last network device
7398 	 * that disappears.
7399 	 */
7400 	if (register_pernet_device(&loopback_net_ops))
7401 		goto out;
7402 
7403 	if (register_pernet_device(&default_device_ops))
7404 		goto out;
7405 
7406 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7407 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7408 
7409 	hotcpu_notifier(dev_cpu_callback, 0);
7410 	dst_init();
7411 	rc = 0;
7412 out:
7413 	return rc;
7414 }
7415 
7416 subsys_initcall(net_dev_init);
7417