xref: /linux/net/core/dev.c (revision 824b1a97400338e47ba7f878a42cd89d80cd5b81)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <net/busy_poll.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/stat.h>
102 #include <net/dst.h>
103 #include <net/dst_metadata.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/module.h>
110 #include <linux/netpoll.h>
111 #include <linux/rcupdate.h>
112 #include <linux/delay.h>
113 #include <net/iw_handler.h>
114 #include <asm/current.h>
115 #include <linux/audit.h>
116 #include <linux/dmaengine.h>
117 #include <linux/err.h>
118 #include <linux/ctype.h>
119 #include <linux/if_arp.h>
120 #include <linux/if_vlan.h>
121 #include <linux/ip.h>
122 #include <net/ip.h>
123 #include <net/mpls.h>
124 #include <linux/ipv6.h>
125 #include <linux/in.h>
126 #include <linux/jhash.h>
127 #include <linux/random.h>
128 #include <trace/events/napi.h>
129 #include <trace/events/net.h>
130 #include <trace/events/skb.h>
131 #include <linux/pci.h>
132 #include <linux/inetdevice.h>
133 #include <linux/cpu_rmap.h>
134 #include <linux/static_key.h>
135 #include <linux/hashtable.h>
136 #include <linux/vmalloc.h>
137 #include <linux/if_macvlan.h>
138 #include <linux/errqueue.h>
139 #include <linux/hrtimer.h>
140 #include <linux/netfilter_ingress.h>
141 #include <linux/sctp.h>
142 
143 #include "net-sysfs.h"
144 
145 /* Instead of increasing this, you should create a hash table. */
146 #define MAX_GRO_SKBS 8
147 
148 /* This should be increased if a protocol with a bigger head is added. */
149 #define GRO_MAX_HEAD (MAX_HEADER + 128)
150 
151 static DEFINE_SPINLOCK(ptype_lock);
152 static DEFINE_SPINLOCK(offload_lock);
153 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
154 struct list_head ptype_all __read_mostly;	/* Taps */
155 static struct list_head offload_base __read_mostly;
156 
157 static int netif_rx_internal(struct sk_buff *skb);
158 static int call_netdevice_notifiers_info(unsigned long val,
159 					 struct net_device *dev,
160 					 struct netdev_notifier_info *info);
161 
162 /*
163  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
164  * semaphore.
165  *
166  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
167  *
168  * Writers must hold the rtnl semaphore while they loop through the
169  * dev_base_head list, and hold dev_base_lock for writing when they do the
170  * actual updates.  This allows pure readers to access the list even
171  * while a writer is preparing to update it.
172  *
173  * To put it another way, dev_base_lock is held for writing only to
174  * protect against pure readers; the rtnl semaphore provides the
175  * protection against other writers.
176  *
177  * See, for example usages, register_netdevice() and
178  * unregister_netdevice(), which must be called with the rtnl
179  * semaphore held.
180  */
181 DEFINE_RWLOCK(dev_base_lock);
182 EXPORT_SYMBOL(dev_base_lock);
183 
184 /* protects napi_hash addition/deletion and napi_gen_id */
185 static DEFINE_SPINLOCK(napi_hash_lock);
186 
187 static unsigned int napi_gen_id = NR_CPUS;
188 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
189 
190 static seqcount_t devnet_rename_seq;
191 
192 static inline void dev_base_seq_inc(struct net *net)
193 {
194 	while (++net->dev_base_seq == 0);
195 }
196 
/* Hash @name into the bucket head of @net's device-name hash table.
 * The length is clamped to IFNAMSIZ so an unterminated buffer is
 * never read past its end.
 */
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}
203 
204 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
205 {
206 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
207 }
208 
/* Serialize writers of @sd's input packet queue; the queue's own
 * spinlock doubles as the RPS lock.  Compiles to nothing when
 * CONFIG_RPS is disabled.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}
215 
/* Counterpart of rps_lock(); releases @sd's input queue spinlock.
 * Compiles to nothing when CONFIG_RPS is disabled.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
222 
/* Device list insertion
 * Links @dev into the per-namespace device list and the name/ifindex
 * hash tables.  Caller must hold the RTNL semaphore; dev_base_lock is
 * taken for writing so pure readers stay consistent.
 */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	/* let lockless readers detect that the list changed */
	dev_base_seq_inc(net);
}
239 
/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev,
 * because lockless readers may still hold references obtained via the
 * RCU-protected lists unlinked below.
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	/* bump the generation counter for pure readers */
	dev_base_seq_inc(dev_net(dev));
}
256 
257 /*
258  *	Our notifier list
259  */
260 
261 static RAW_NOTIFIER_HEAD(netdev_chain);
262 
263 /*
264  *	Device drivers call our routines to queue packets here. We empty the
265  *	queue in the local softnet handler.
266  */
267 
268 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
269 EXPORT_PER_CPU_SYMBOL(softnet_data);
270 
#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel arrays and
 * MUST be kept in sync entry-for-entry; netdev_lock_pos() indexes
 * both with the same position.
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Human-readable lock-class names, one per netdev_lock_type[] entry. */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lockdep class per device type for the xmit and addr-list locks. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
312 
313 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
314 {
315 	int i;
316 
317 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
318 		if (netdev_lock_type[i] == dev_type)
319 			return i;
320 	/* the last key is used by default */
321 	return ARRAY_SIZE(netdev_lock_type) - 1;
322 }
323 
/* Give a tx-queue _xmit_lock the lockdep class matching @dev_type so
 * nesting between different device types is tracked correctly.
 */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}
333 
/* Give @dev's addr_list_lock the lockdep class matching dev->type. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
/* Without lockdep the class annotations compile away to no-ops. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif
352 
353 /*******************************************************************************
354 
355 		Protocol management and registration routines
356 
357 *******************************************************************************/
358 
359 /*
360  *	Add a protocol ID to the list. Now that the input handler is
361  *	smarter we can dispense with all the messy stuff that used to be
362  *	here.
363  *
364  *	BEWARE!!! Protocol handlers, mangling input packets,
365  *	MUST BE last in hash buckets and checking protocol handlers
366  *	MUST start from promiscuous ptype_all chain in net_bh.
367  *	It is true now, do not change it.
368  *	Explanation follows: if protocol handler, mangling packet, will
369  *	be the first on list, it is not able to sense, that packet
370  *	is cloned and should be copied-on-write, so that it will
371  *	change it and subsequent readers will get broken packet.
372  *							--ANK (980803)
373  */
374 
/* Select the handler list for @pt: wildcard (ETH_P_ALL) taps go on the
 * global ptype_all list (or the device's own tap list when @pt is bound
 * to a device); all other protocols hash into ptype_base (or the
 * device's ptype_specific list) by protocol number.
 */
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
383 
/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	/* ptype_lock orders concurrent add/remove; readers traverse the
	 * list under RCU only.
	 */
	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
406 
/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *      The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	/* search the one bucket @pt could have been added to */
	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	/* not finding @pt indicates an unbalanced add/remove by the caller */
	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);
439 
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	/* wait for an RCU grace period so in-flight readers finish */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
459 
460 
/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	/* find the first entry with a higher priority value */
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	/* If the loop ran to completion, @elem aliases the list head and
	 * @po is appended; otherwise it is inserted before @elem.  Either
	 * way the list stays sorted by ascending priority.
	 */
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
486 
/**
 *	__dev_remove_offload	 - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *      The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	/* not finding @po indicates an unbalanced add/remove by the caller */
	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}
518 
/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	/* wait for an RCU grace period so in-flight readers finish */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
538 
539 /******************************************************************************
540 
541 		      Device Boot-time Settings Routines
542 
543 *******************************************************************************/
544 
545 /* Boot time configuration table */
546 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
547 
/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		/* a slot whose name starts with '\0' or ' ' is free */
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	/* i == NETDEV_BOOT_SETUP_MAX means the table was full */
	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}
574 
/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 * 	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		/* skip free slots (name starts with '\0' or ' ') */
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq 	= s[i].map.irq;
			dev->base_addr 	= s[i].map.base_addr;
			dev->mem_start 	= s[i].map.mem_start;
			dev->mem_end 	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);
602 
603 
604 /**
605  *	netdev_boot_base	- get address from boot time settings
606  *	@prefix: prefix for network device
607  *	@unit: id for network device
608  *
609  * 	Check boot time settings for the base address of device.
610  *	The found settings are set for the device to be used
611  *	later in the device probing.
612  *	Returns 0 if no settings found.
613  */
614 unsigned long netdev_boot_base(const char *prefix, int unit)
615 {
616 	const struct netdev_boot_setup *s = dev_boot_setup;
617 	char name[IFNAMSIZ];
618 	int i;
619 
620 	sprintf(name, "%s%d", prefix, unit);
621 
622 	/*
623 	 * If device already registered then return base of 1
624 	 * to indicate not to probe for this interface
625 	 */
626 	if (__dev_get_by_name(&init_net, name))
627 		return 1;
628 
629 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
630 		if (!strcmp(name, s[i].name))
631 			return s[i].map.base_addr;
632 	return 0;
633 }
634 
/*
 * Saves at boot time configured settings for any netdevice.
 * Parses the "netdev=irq,base,mem_start,mem_end,name" kernel
 * command-line option; ints[0] is the count of integers parsed.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	/* after get_options(), @str points at the trailing device name */
	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
663 
664 /*******************************************************************************
665 
666 			    Device Interface Subroutines
667 
668 *******************************************************************************/
669 
670 /**
671  *	dev_get_iflink	- get 'iflink' value of a interface
672  *	@dev: targeted interface
673  *
674  *	Indicates the ifindex the interface is linked to.
675  *	Physical interfaces have the same 'ifindex' and 'iflink' values.
676  */
677 
678 int dev_get_iflink(const struct net_device *dev)
679 {
680 	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
681 		return dev->netdev_ops->ndo_get_iflink(dev);
682 
683 	return dev->ifindex;
684 }
685 EXPORT_SYMBOL(dev_get_iflink);
686 
/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. Following API allows
 *	user to get this info.
 *
 *	Returns 0 on success, -EINVAL if @dev has no hook or the tunnel
 *	info is not TX-side, -ENOMEM if the metadata dst cannot be uncloned.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	/* unclone so the driver may write egress info into the dst */
	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
712 
/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	/* comparison is bounded to IFNAMSIZ, matching how names are stored */
	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);
737 
/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 * 	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	/* lockless walk; entries may be concurrently unlinked by writers */
	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
762 
/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *found;

	/* take a reference while still inside the RCU read section */
	rcu_read_lock();
	found = dev_get_by_name_rcu(net, name);
	if (found)
		dev_hold(found);
	rcu_read_unlock();

	return found;
}
EXPORT_SYMBOL(dev_get_by_name);
787 
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);
812 
/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	/* lockless walk; entries may be concurrently unlinked by writers */
	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);
836 
837 
/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *found;

	/* take a reference while still inside the RCU read section */
	rcu_read_lock();
	found = dev_get_by_index_rcu(net, ifindex);
	if (found)
		dev_hold(found);
	rcu_read_unlock();

	return found;
}
EXPORT_SYMBOL(dev_get_by_index);
861 
/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	/* snapshot the rename sequence; a racing rename is detected by
	 * read_seqcount_retry() below and triggers another pass
	 */
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	/* dev->name is at most IFNAMSIZ bytes including the NUL */
	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}
895 
/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	/* linear scan: no hash exists for hardware addresses */
	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
923 
924 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
925 {
926 	struct net_device *dev;
927 
928 	ASSERT_RTNL();
929 	for_each_netdev(net, dev)
930 		if (dev->type == type)
931 			return dev;
932 
933 	return NULL;
934 }
935 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
936 
937 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
938 {
939 	struct net_device *dev, *ret = NULL;
940 
941 	rcu_read_lock();
942 	for_each_netdev_rcu(net, dev)
943 		if (dev->type == type) {
944 			dev_hold(dev);
945 			ret = dev;
946 			break;
947 		}
948 	rcu_read_unlock();
949 	return ret;
950 }
951 EXPORT_SYMBOL(dev_getfirstbyhwtype);
952 
953 /**
954  *	__dev_get_by_flags - find any device with given flags
955  *	@net: the applicable net namespace
956  *	@if_flags: IFF_* values
957  *	@mask: bitmask of bits in if_flags to check
958  *
959  *	Search for any interface with the given flags. Returns NULL if a device
960  *	is not found or a pointer to the device. Must be called inside
961  *	rtnl_lock(), and result refcount is unchanged.
962  */
963 
964 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
965 				      unsigned short mask)
966 {
967 	struct net_device *dev, *ret;
968 
969 	ASSERT_RTNL();
970 
971 	ret = NULL;
972 	for_each_netdev(net, dev) {
973 		if (((dev->flags ^ if_flags) & mask) == 0) {
974 			ret = dev;
975 			break;
976 		}
977 	}
978 	return ret;
979 }
980 EXPORT_SYMBOL(__dev_get_by_flags);
981 
982 /**
983  *	dev_valid_name - check if name is okay for network device
984  *	@name: name string
985  *
986  *	Network device names need to be valid file names to
987  *	to allow sysfs to work.  We also disallow any kind of
988  *	whitespace.
989  */
990 bool dev_valid_name(const char *name)
991 {
992 	if (*name == '\0')
993 		return false;
994 	if (strlen(name) >= IFNAMSIZ)
995 		return false;
996 	if (!strcmp(name, ".") || !strcmp(name, ".."))
997 		return false;
998 
999 	while (*name) {
1000 		if (*name == '/' || *name == ':' || isspace(*name))
1001 			return false;
1002 		name++;
1003 	}
1004 	return true;
1005 }
1006 EXPORT_SYMBOL(dev_valid_name);
1007 
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;	/* unit number; stays 0 when @name has no '%' */
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			/* parse existing name against the pattern to get its unit */
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		/* lowest unused unit number wins */
		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	/* final collision check covers both the '%d' and literal-name cases */
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
1073 
1074 /**
1075  *	dev_alloc_name - allocate a name for a device
1076  *	@dev: device
1077  *	@name: name format string
1078  *
1079  *	Passed a format string - eg "lt%d" it will try and find a suitable
1080  *	id. It scans list of devices to build up a free map, then chooses
1081  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1082  *	while allocating the name and adding the device in order to avoid
1083  *	duplicates.
1084  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1085  *	Returns the number of the unit assigned or a negative errno code.
1086  */
1087 
1088 int dev_alloc_name(struct net_device *dev, const char *name)
1089 {
1090 	char buf[IFNAMSIZ];
1091 	struct net *net;
1092 	int ret;
1093 
1094 	BUG_ON(!dev_net(dev));
1095 	net = dev_net(dev);
1096 	ret = __dev_alloc_name(net, name, buf);
1097 	if (ret >= 0)
1098 		strlcpy(dev->name, buf, IFNAMSIZ);
1099 	return ret;
1100 }
1101 EXPORT_SYMBOL(dev_alloc_name);
1102 
1103 static int dev_alloc_name_ns(struct net *net,
1104 			     struct net_device *dev,
1105 			     const char *name)
1106 {
1107 	char buf[IFNAMSIZ];
1108 	int ret;
1109 
1110 	ret = __dev_alloc_name(net, name, buf);
1111 	if (ret >= 0)
1112 		strlcpy(dev->name, buf, IFNAMSIZ);
1113 	return ret;
1114 }
1115 
1116 static int dev_get_valid_name(struct net *net,
1117 			      struct net_device *dev,
1118 			      const char *name)
1119 {
1120 	BUG_ON(!net);
1121 
1122 	if (!dev_valid_name(name))
1123 		return -EINVAL;
1124 
1125 	if (strchr(name, '%'))
1126 		return dev_alloc_name_ns(net, dev, name);
1127 	else if (__dev_get_by_name(net, name))
1128 		return -EEXIST;
1129 	else if (dev->name != name)
1130 		strlcpy(dev->name, name, IFNAMSIZ);
1131 
1132 	return 0;
1133 }
1134 
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d".
 *	for wildcarding.
 *
 *	Returns 0 on success or a negative errno.  The device must be
 *	down (-EBUSY otherwise).  Renaming a device already carrying the
 *	requested name is a no-op.  On notifier veto the old name is
 *	restored via the rollback path below.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	/* seqcount lets lockless readers of dev->name detect a rename
	 * in progress and retry */
	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		/* sysfs rename failed: restore name and assign type */
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	/* re-hash the device under its new name; the synchronize_rcu()
	 * between unhash and re-hash lets RCU name-lookups drain before
	 * the entry reappears in a different bucket */
	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/* first notifier failure: swap old/new and redo the
			 * rename in reverse to back the change out */
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			/* rollback itself was vetoed: give up, keep state */
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
1223 
1224 /**
1225  *	dev_set_alias - change ifalias of a device
1226  *	@dev: device
1227  *	@alias: name up to IFALIASZ
1228  *	@len: limit of bytes to copy from info
1229  *
1230  *	Set ifalias for a device,
1231  */
1232 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1233 {
1234 	char *new_ifalias;
1235 
1236 	ASSERT_RTNL();
1237 
1238 	if (len >= IFALIASZ)
1239 		return -EINVAL;
1240 
1241 	if (!len) {
1242 		kfree(dev->ifalias);
1243 		dev->ifalias = NULL;
1244 		return 0;
1245 	}
1246 
1247 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1248 	if (!new_ifalias)
1249 		return -ENOMEM;
1250 	dev->ifalias = new_ifalias;
1251 
1252 	strlcpy(dev->ifalias, alias, len+1);
1253 	return len;
1254 }
1255 
1256 
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.  Simply fires
 *	NETDEV_FEAT_CHANGE down the netdev notifier chain; any return
 *	value from the notifiers is ignored.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
1268 
1269 /**
1270  *	netdev_state_change - device changes state
1271  *	@dev: device to cause notification
1272  *
1273  *	Called to indicate a device has changed state. This function calls
1274  *	the notifier chains for netdev_chain and sends a NEWLINK message
1275  *	to the routing socket.
1276  */
1277 void netdev_state_change(struct net_device *dev)
1278 {
1279 	if (dev->flags & IFF_UP) {
1280 		struct netdev_notifier_change_info change_info;
1281 
1282 		change_info.flags_changed = 0;
1283 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1284 					      &change_info.info);
1285 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1286 	}
1287 }
1288 EXPORT_SYMBOL(netdev_state_change);
1289 
/**
 * 	netdev_notify_peers - notify network peers about existence of @dev
 * 	@dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 *
 * Takes RTNL itself; must not be called with RTNL already held.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
1307 
1308 static int __dev_open(struct net_device *dev)
1309 {
1310 	const struct net_device_ops *ops = dev->netdev_ops;
1311 	int ret;
1312 
1313 	ASSERT_RTNL();
1314 
1315 	if (!netif_device_present(dev))
1316 		return -ENODEV;
1317 
1318 	/* Block netpoll from trying to do any rx path servicing.
1319 	 * If we don't do this there is a chance ndo_poll_controller
1320 	 * or ndo_poll may be running while we open the device
1321 	 */
1322 	netpoll_poll_disable(dev);
1323 
1324 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1325 	ret = notifier_to_errno(ret);
1326 	if (ret)
1327 		return ret;
1328 
1329 	set_bit(__LINK_STATE_START, &dev->state);
1330 
1331 	if (ops->ndo_validate_addr)
1332 		ret = ops->ndo_validate_addr(dev);
1333 
1334 	if (!ret && ops->ndo_open)
1335 		ret = ops->ndo_open(dev);
1336 
1337 	netpoll_poll_enable(dev);
1338 
1339 	if (ret)
1340 		clear_bit(__LINK_STATE_START, &dev->state);
1341 	else {
1342 		dev->flags |= IFF_UP;
1343 		dev_set_rx_mode(dev);
1344 		dev_activate(dev);
1345 		add_device_randomness(dev->dev_addr, dev->addr_len);
1346 	}
1347 
1348 	return ret;
1349 }
1350 
1351 /**
1352  *	dev_open	- prepare an interface for use.
1353  *	@dev:	device to open
1354  *
1355  *	Takes a device from down to up state. The device's private open
1356  *	function is invoked and then the multicast lists are loaded. Finally
1357  *	the device is moved into the up state and a %NETDEV_UP message is
1358  *	sent to the netdev notifier chain.
1359  *
1360  *	Calling this function on an active interface is a nop. On a failure
1361  *	a negative errno code is returned.
1362  */
1363 int dev_open(struct net_device *dev)
1364 {
1365 	int ret;
1366 
1367 	if (dev->flags & IFF_UP)
1368 		return 0;
1369 
1370 	ret = __dev_open(dev);
1371 	if (ret < 0)
1372 		return ret;
1373 
1374 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1375 	call_netdevice_notifiers(NETDEV_UP, dev);
1376 
1377 	return ret;
1378 }
1379 EXPORT_SYMBOL(dev_open);
1380 
/* Take every device on @head (linked via close_list) from up to down:
 * fire NETDEV_GOING_DOWN, clear __LINK_STATE_START, deactivate the qdiscs,
 * then call each driver's ndo_stop and clear IFF_UP.  Always returns 0.
 * Caller holds RTNL; may sleep.
 */
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of it's
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	/* stop the qdiscs for all devices before calling into drivers */
	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		/* re-enable the netpoll rx block taken above */
		netpoll_poll_enable(dev);
	}

	return 0;
}
1426 
1427 static int __dev_close(struct net_device *dev)
1428 {
1429 	int retval;
1430 	LIST_HEAD(single);
1431 
1432 	list_add(&dev->close_list, &single);
1433 	retval = __dev_close_many(&single);
1434 	list_del(&single);
1435 
1436 	return retval;
1437 }
1438 
1439 int dev_close_many(struct list_head *head, bool unlink)
1440 {
1441 	struct net_device *dev, *tmp;
1442 
1443 	/* Remove the devices that don't need to be closed */
1444 	list_for_each_entry_safe(dev, tmp, head, close_list)
1445 		if (!(dev->flags & IFF_UP))
1446 			list_del_init(&dev->close_list);
1447 
1448 	__dev_close_many(head);
1449 
1450 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1451 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1452 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1453 		if (unlink)
1454 			list_del_init(&dev->close_list);
1455 	}
1456 
1457 	return 0;
1458 }
1459 EXPORT_SYMBOL(dev_close_many);
1460 
1461 /**
1462  *	dev_close - shutdown an interface.
1463  *	@dev: device to shutdown
1464  *
1465  *	This function moves an active device into down state. A
1466  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1467  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1468  *	chain.
1469  */
1470 int dev_close(struct net_device *dev)
1471 {
1472 	if (dev->flags & IFF_UP) {
1473 		LIST_HEAD(single);
1474 
1475 		list_add(&dev->close_list, &single);
1476 		dev_close_many(&single, true);
1477 		list_del(&single);
1478 	}
1479 	return 0;
1480 }
1481 EXPORT_SYMBOL(dev_close);
1482 
1483 
1484 /**
1485  *	dev_disable_lro - disable Large Receive Offload on a device
1486  *	@dev: device
1487  *
1488  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1489  *	called under RTNL.  This is needed if received packets may be
1490  *	forwarded to another interface.
1491  */
1492 void dev_disable_lro(struct net_device *dev)
1493 {
1494 	struct net_device *lower_dev;
1495 	struct list_head *iter;
1496 
1497 	dev->wanted_features &= ~NETIF_F_LRO;
1498 	netdev_update_features(dev);
1499 
1500 	if (unlikely(dev->features & NETIF_F_LRO))
1501 		netdev_WARN(dev, "failed to disable LRO!\n");
1502 
1503 	netdev_for_each_lower_dev(dev, lower_dev, iter)
1504 		dev_disable_lro(lower_dev);
1505 }
1506 EXPORT_SYMBOL(dev_disable_lro);
1507 
/* Invoke a single notifier block @nb with event @val for @dev, wrapping
 * the device in an on-stack netdev_notifier_info.  Returns the raw
 * notifier return value (NOTIFY_*).
 */
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}
1516 
/* While nonzero (early boot), register_netdevice_notifier() skips
 * replaying NETDEV_REGISTER/NETDEV_UP events for existing devices.
 * Presumably cleared once core network init has run — the clearing
 * site is elsewhere in this file (not visible in this chunk).
 */
static int dev_boot_phase = 1;
1518 
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 * 	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	/* no devices exist yet during early boot: nothing to replay */
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			/* replay REGISTER (and UP, below) for existing devices */
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	/* undo the replay for every device handled before the failure:
	 * synthesize GOING_DOWN/DOWN for up devices, then UNREGISTER */
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1585 
1586 /**
1587  *	unregister_netdevice_notifier - unregister a network notifier block
1588  *	@nb: notifier
1589  *
1590  *	Unregister a notifier previously registered by
1591  *	register_netdevice_notifier(). The notifier is unlinked into the
1592  *	kernel structures and may then be reused. A negative errno code
1593  *	is returned on a failure.
1594  *
1595  * 	After unregistering unregister and down device events are synthesized
1596  *	for all devices on the device list to the removed notifier to remove
1597  *	the need for special case cleanup code.
1598  */
1599 
1600 int unregister_netdevice_notifier(struct notifier_block *nb)
1601 {
1602 	struct net_device *dev;
1603 	struct net *net;
1604 	int err;
1605 
1606 	rtnl_lock();
1607 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1608 	if (err)
1609 		goto unlock;
1610 
1611 	for_each_net(net) {
1612 		for_each_netdev(net, dev) {
1613 			if (dev->flags & IFF_UP) {
1614 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1615 							dev);
1616 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1617 			}
1618 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1619 		}
1620 	}
1621 unlock:
1622 	rtnl_unlock();
1623 	return err;
1624 }
1625 EXPORT_SYMBOL(unregister_netdevice_notifier);
1626 
/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().  @info lets callers pass an
 *	event-specific structure that embeds netdev_notifier_info.
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	/* stash @dev inside @info so notifiers can recover it */
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}
1645 
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().  Convenience wrapper that
 *	supplies a plain on-stack netdev_notifier_info.
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
1662 
#ifdef CONFIG_NET_INGRESS
/* Static-branch key: enabled while at least one user of the ingress
 * hook exists, counted by the inc/dec helpers below. */
static struct static_key ingress_needed __read_mostly;

/* Take a reference on the ingress hook, enabling the static branch. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop a reference on the ingress hook; the branch is disabled when the
 * count reaches zero. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1678 
/* Static-branch key: true while at least one caller of
 * net_enable_timestamp() wants skbs timestamped. */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif
1687 
/* Request skb timestamping.  Pairs with net_disable_timestamp(). */
void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	/* settle decrements deferred by net_disable_timestamp() in irq
	 * context: one pending decrement cancels against this increment,
	 * so only (deferred - 1) slow_dec calls are needed and the
	 * slow_inc is skipped entirely */
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		while (--deferred)
			static_key_slow_dec(&netstamp_needed);
		return;
	}
#endif
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
1702 
/* Drop a timestamping request taken with net_enable_timestamp(). */
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	/* static_key_slow_dec() may sleep, so from irq context only
	 * record the decrement; net_enable_timestamp() settles it later */
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
1714 
/* Clear any stale timestamp on @skb and, when timestamping has been
 * requested via net_enable_timestamp(), stamp it with the current time. */
static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}
1721 
/* Timestamp SKB iff timestamping is enabled, COND holds, and the skb is
 * not already stamped.  A macro so that COND is evaluated only behind
 * the static key. */
#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}						\
1727 
1728 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1729 {
1730 	unsigned int len;
1731 
1732 	if (!(dev->flags & IFF_UP))
1733 		return false;
1734 
1735 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1736 	if (skb->len <= len)
1737 		return true;
1738 
1739 	/* if TSO is enabled, we don't care about the length as the packet
1740 	 * could be forwarded without being segmented before
1741 	 */
1742 	if (skb_is_gso(skb))
1743 		return true;
1744 
1745 	return false;
1746 }
1747 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1748 
/* Prepare @skb for injection into @dev's receive path: drop it (counting
 * the drop on @dev) if its page frags cannot be orphaned or it is not
 * forwardable, otherwise scrub namespace-sensitive state and rewrite its
 * protocol/checksum as a freshly received Ethernet frame.
 * Returns 0 on success or NET_RX_DROP (skb already freed).
 */
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
	    unlikely(!is_skb_forwardable(dev, skb))) {
		atomic_long_inc(&dev->rx_dropped);
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	/* clear state that must not leak across namespaces */
	skb_scrub_packet(skb, true);
	skb->priority = 0;
	skb->protocol = eth_type_trans(skb, dev);
	/* eth_type_trans pulled the Ethernet header; fix up the csum */
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	return 0;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
1766 
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	/* only hand the skb to the rx path if preparation succeeded */
	return err ? err : netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1790 
/* Hand @skb to one packet_type handler, taking an extra reference so the
 * caller keeps ownership of the skb.  Fails with -ENOMEM if the skb's
 * zero-copy frags cannot be orphaned first.
 */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	/* the handler consumes one reference; keep ours alive */
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
1800 
1801 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1802 					  struct packet_type **pt,
1803 					  struct net_device *orig_dev,
1804 					  __be16 type,
1805 					  struct list_head *ptype_list)
1806 {
1807 	struct packet_type *ptype, *pt_prev = *pt;
1808 
1809 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1810 		if (ptype->type != type)
1811 			continue;
1812 		if (pt_prev)
1813 			deliver_skb(skb, pt_prev, orig_dev);
1814 		pt_prev = ptype;
1815 	}
1816 	*pt = pt_prev;
1817 }
1818 
1819 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1820 {
1821 	if (!ptype->af_packet_priv || !skb->sk)
1822 		return false;
1823 
1824 	if (ptype->id_match)
1825 		return ptype->id_match(ptype, skb->sk);
1826 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1827 		return true;
1828 
1829 	return false;
1830 }
1831 
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 *
 *	Walks the global ptype_all list and then the device's own
 *	ptype_all list.  Uses the delayed-delivery pattern: pt_prev holds
 *	the last matching tap so the final one receives skb2 directly in
 *	out_unlock, consuming the clone without an extra reference.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		if (pt_prev) {
			/* deliver to the previous match; remember this one */
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	/* after the global list, repeat for the device-local taps */
	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	/* last match consumes the clone (no extra reference taken) */
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
1894 
1895 /**
1896  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1897  * @dev: Network device
1898  * @txq: number of queues available
1899  *
1900  * If real_num_tx_queues is changed the tc mappings may no longer be
1901  * valid. To resolve this verify the tc mapping remains valid and if
1902  * not NULL the mapping. With no priorities mapping to this
1903  * offset/count pair it will no longer be used. In the worst case TC0
1904  * is invalid nothing can be done so disable priority mappings. If is
1905  * expected that drivers will fix this mapping if they can before
1906  * calling netif_set_real_num_tx_queues.
1907  */
1908 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1909 {
1910 	int i;
1911 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1912 
1913 	/* If TC0 is invalidated disable TC mapping */
1914 	if (tc->offset + tc->count > txq) {
1915 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1916 		dev->num_tc = 0;
1917 		return;
1918 	}
1919 
1920 	/* Invalidated prio to tc mappings set to TC0 */
1921 	for (i = 1; i < TC_BITMASK + 1; i++) {
1922 		int q = netdev_get_prio_tc_map(dev, i);
1923 
1924 		tc = &dev->tc_to_txq[q];
1925 		if (tc->offset + tc->count > txq) {
1926 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1927 				i, q);
1928 			netdev_set_prio_tc_map(dev, i, 0);
1929 		}
1930 	}
1931 }
1932 
#ifdef CONFIG_XPS
/* Serializes all writers of the per-device XPS (transmit packet
 * steering) maps; readers use RCU. */
static DEFINE_MUTEX(xps_map_mutex);
/* Dereference an XPS map pointer, asserting xps_map_mutex is held. */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1937 
/* Remove tx queue @index from @cpu's XPS map, if present.
 * Returns the CPU's map when it still exists afterwards (whether or not
 * @index was found in it), or NULL when the map was absent or was freed
 * because @index was its last entry.  Removal is swap-with-last, so
 * queue order within a map is not preserved.  Caller holds xps_map_mutex.
 */
static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
					int cpu, u16 index)
{
	struct xps_map *map = NULL;
	int pos;

	if (dev_maps)
		map = xmap_dereference(dev_maps->cpu_map[cpu]);

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] == index) {
			if (map->len > 1) {
				/* swap-remove: overwrite with the last entry */
				map->queues[pos] = map->queues[--map->len];
			} else {
				/* last entry: unpublish and free the map via RCU */
				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
				kfree_rcu(map, rcu);
				map = NULL;
			}
			break;
		}
	}

	return map;
}
1962 
/* Remove every tx queue numbered >= @index from all CPUs' XPS maps and
 * reset those queues' NUMA hints.  If no CPU retains a map afterwards,
 * the whole xps_dev_maps structure is freed.
 */
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	struct xps_dev_maps *dev_maps;
	int cpu, i;
	bool active = false;

	mutex_lock(&xps_map_mutex);
	dev_maps = xmap_dereference(dev->xps_maps);

	if (!dev_maps)
		goto out_no_maps;

	for_each_possible_cpu(cpu) {
		/* strip queues index..num_tx_queues-1; stop early once this
		 * CPU's map disappears (or was absent to begin with) */
		for (i = index; i < dev->num_tx_queues; i++) {
			if (!remove_xps_queue(dev_maps, cpu, i))
				break;
		}
		/* full sweep completed with the map intact -> still in use */
		if (i == dev->num_tx_queues)
			active = true;
	}

	if (!active) {
		/* no CPU references any queue anymore: drop the whole table */
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

	for (i = index; i < dev->num_tx_queues; i++)
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
					     NUMA_NO_NODE);

out_no_maps:
	mutex_unlock(&xps_map_mutex);
}
1996 
/* Return an XPS map for @cpu that can hold queue @index: the existing
 * @map when @index is already present or there is spare capacity,
 * otherwise a newly allocated map (double the previous capacity, or
 * XPS_MIN_MAP_ALLOC when starting fresh) with the old entries copied in.
 * Returns NULL on allocation failure.  The caller is responsible for
 * publishing the new map and freeing the old one.
 */
static struct xps_map *expand_xps_map(struct xps_map *map,
				      int cpu, u16 index)
{
	struct xps_map *new_map;
	int alloc_len = XPS_MIN_MAP_ALLOC;
	int i, pos;

	/* already present? nothing to do */
	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] != index)
			continue;
		return map;
	}

	/* Need to add queue to this CPU's existing map */
	if (map) {
		if (pos < map->alloc_len)
			return map;

		alloc_len = map->alloc_len * 2;
	}

	/* Need to allocate new map to store queue on this CPU's map */
	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
			       cpu_to_node(cpu));
	if (!new_map)
		return NULL;

	/* pos == old map->len here: copy all existing entries */
	for (i = 0; i < pos; i++)
		new_map->queues[i] = map->queues[i];
	new_map->alloc_len = alloc_len;
	new_map->len = pos;

	return new_map;
}
2031 
/**
 * netif_set_xps_queue - set the XPS (transmit packet steering) CPU set
 *	for a transmit queue
 * @dev: network device
 * @mask: CPUs that should transmit via queue @index
 * @index: transmit queue index being configured
 *
 * Rebuilds dev->xps_maps so that @index appears in the per-CPU map of
 * every online CPU in @mask and is removed from all other CPUs.  The
 * old maps are released via RCU.  Returns 0 on success or -ENOMEM.
 * Serialized against other map updates by xps_map_mutex.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	struct xps_map *map, *new_map;
	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
	int cpu, numa_node_id = -2;	/* -2: unset, -1: CPUs span nodes */
	bool active = false;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_online_cpu(cpu) {
		if (!cpumask_test_cpu(cpu, mask))
			continue;

		/* the top-level table is allocated lazily, on the first
		 * CPU that actually needs an entry
		 */
		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;

		/* make sure this CPU's map can hold @index (may realloc) */
		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
	}

	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			/* append only if not already present */
			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			/* track whether all selected CPUs share one node */
			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
		}

	}

	/* publish the new table; readers see either old or new, never a mix */
	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (dev_maps) {
		for_each_possible_cpu(cpu) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			/* per-CPU maps carried over must not be freed */
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}

		kfree_rcu(dev_maps, rcu);
	}

	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
			continue;

		if (remove_xps_queue(dev_maps, cpu, index))
			active = true;
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;
		/* maps inherited from the old table were never published
		 * by us and must survive; only free what expand_xps_map()
		 * allocated for this call (plain kfree: never visible to
		 * RCU readers)
		 */
		if (new_map && new_map != map)
			kfree(new_map);
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
2156 
2157 #endif
2158 /*
2159  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2160  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2161  */
2162 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2163 {
2164 	int rc;
2165 
2166 	if (txq < 1 || txq > dev->num_tx_queues)
2167 		return -EINVAL;
2168 
2169 	if (dev->reg_state == NETREG_REGISTERED ||
2170 	    dev->reg_state == NETREG_UNREGISTERING) {
2171 		ASSERT_RTNL();
2172 
2173 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2174 						  txq);
2175 		if (rc)
2176 			return rc;
2177 
2178 		if (dev->num_tc)
2179 			netif_setup_tc(dev, txq);
2180 
2181 		if (txq < dev->real_num_tx_queues) {
2182 			qdisc_reset_all_tx_gt(dev, txq);
2183 #ifdef CONFIG_XPS
2184 			netif_reset_xps_queues_gt(dev, txq);
2185 #endif
2186 		}
2187 	}
2188 
2189 	dev->real_num_tx_queues = txq;
2190 	return 0;
2191 }
2192 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2193 
2194 #ifdef CONFIG_SYSFS
2195 /**
2196  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2197  *	@dev: Network device
2198  *	@rxq: Actual number of RX queues
2199  *
2200  *	This must be called either with the rtnl_lock held or before
2201  *	registration of the net device.  Returns 0 on success, or a
2202  *	negative error code.  If called before registration, it always
2203  *	succeeds.
2204  */
2205 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2206 {
2207 	int rc;
2208 
2209 	if (rxq < 1 || rxq > dev->num_rx_queues)
2210 		return -EINVAL;
2211 
2212 	if (dev->reg_state == NETREG_REGISTERED) {
2213 		ASSERT_RTNL();
2214 
2215 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2216 						  rxq);
2217 		if (rc)
2218 			return rc;
2219 	}
2220 
2221 	dev->real_num_rx_queues = rxq;
2222 	return 0;
2223 }
2224 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2225 #endif
2226 
2227 /**
2228  * netif_get_num_default_rss_queues - default number of RSS queues
2229  *
2230  * This routine should set an upper limit on the number of RSS queues
2231  * used by default by multiqueue devices.
2232  */
2233 int netif_get_num_default_rss_queues(void)
2234 {
2235 	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2236 }
2237 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2238 
/* Append @q to this CPU's output_queue list and kick the NET_TX softirq,
 * which will run the qdisc.  Interrupts are disabled around the list
 * manipulation so a hardirq on this CPU cannot observe a half-linked list.
 * Caller must have won the __QDISC_STATE_SCHED bit (see __netif_schedule).
 */
static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = this_cpu_ptr(&softnet_data);
	q->next_sched = NULL;
	/* tail insertion: output_queue_tailp points at the last next_sched */
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
2252 
2253 void __netif_schedule(struct Qdisc *q)
2254 {
2255 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2256 		__netif_reschedule(q);
2257 }
2258 EXPORT_SYMBOL(__netif_schedule);
2259 
/* Control-block overlay used while an skb sits on the per-CPU completion
 * queue awaiting its deferred free (see __dev_kfree_skb_irq).
 */
struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};

/* View skb->cb as the deferred-free control block. */
static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
	return (struct dev_kfree_skb_cb *)skb->cb;
}
2268 
/* Schedule the qdisc attached to @txq, unless the queue is stopped or
 * frozen (any XOFF bit set).  RCU protects the qdisc pointer, which can
 * be swapped while the device is up.
 */
void netif_schedule_queue(struct netdev_queue *txq)
{
	rcu_read_lock();
	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
		struct Qdisc *q = rcu_dereference(txq->qdisc);

		__netif_schedule(q);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);
2280 
2281 /**
2282  *	netif_wake_subqueue - allow sending packets on subqueue
2283  *	@dev: network device
2284  *	@queue_index: sub queue index
2285  *
2286  * Resume individual transmit queue of a device with multiple transmit queues.
2287  */
2288 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2289 {
2290 	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2291 
2292 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2293 		struct Qdisc *q;
2294 
2295 		rcu_read_lock();
2296 		q = rcu_dereference(txq->qdisc);
2297 		__netif_schedule(q);
2298 		rcu_read_unlock();
2299 	}
2300 }
2301 EXPORT_SYMBOL(netif_wake_subqueue);
2302 
2303 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2304 {
2305 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2306 		struct Qdisc *q;
2307 
2308 		rcu_read_lock();
2309 		q = rcu_dereference(dev_queue->qdisc);
2310 		__netif_schedule(q);
2311 		rcu_read_unlock();
2312 	}
2313 }
2314 EXPORT_SYMBOL(netif_tx_wake_queue);
2315 
/* Free @skb from a context where kfree_skb() cannot be called directly
 * (hardirq / irqs disabled).  Drops one reference; if that was the last
 * one, the skb is pushed onto this CPU's completion queue and the actual
 * free happens later in the NET_TX softirq.
 */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (likely(atomic_read(&skb->users) == 1)) {
		/* sole owner fast path: avoid an atomic RMW; the barrier
		 * presumably pairs with the release in the generic skb
		 * refcount drop — NOTE(review): confirm against kfree_skb()
		 */
		smp_rmb();
		atomic_set(&skb->users, 0);
	} else if (likely(!atomic_dec_and_test(&skb->users))) {
		/* other references remain — nothing to free yet */
		return;
	}
	/* record why the skb is freed for the softirq-side tracepoints */
	get_kfree_skb_cb(skb)->reason = reason;
	local_irq_save(flags);
	/* push onto the per-CPU completion list; irqs off so a hardirq
	 * on this CPU cannot see a torn list
	 */
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
2334 
2335 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2336 {
2337 	if (in_irq() || irqs_disabled())
2338 		__dev_kfree_skb_irq(skb, reason);
2339 	else
2340 		dev_kfree_skb(skb);
2341 }
2342 EXPORT_SYMBOL(__dev_kfree_skb_any);
2343 
2344 
2345 /**
2346  * netif_device_detach - mark device as removed
2347  * @dev: network device
2348  *
2349  * Mark device as removed from system and therefore no longer available.
2350  */
2351 void netif_device_detach(struct net_device *dev)
2352 {
2353 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2354 	    netif_running(dev)) {
2355 		netif_tx_stop_all_queues(dev);
2356 	}
2357 }
2358 EXPORT_SYMBOL(netif_device_detach);
2359 
2360 /**
2361  * netif_device_attach - mark device as attached
2362  * @dev: network device
2363  *
2364  * Mark device as attached from system and restart if needed.
2365  */
2366 void netif_device_attach(struct net_device *dev)
2367 {
2368 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2369 	    netif_running(dev)) {
2370 		netif_tx_wake_all_queues(dev);
2371 		__netdev_watchdog_up(dev);
2372 	}
2373 }
2374 EXPORT_SYMBOL(netif_device_attach);
2375 
2376 /*
2377  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2378  * to be used as a distribution range.
2379  */
2380 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2381 		  unsigned int num_tx_queues)
2382 {
2383 	u32 hash;
2384 	u16 qoffset = 0;
2385 	u16 qcount = num_tx_queues;
2386 
2387 	if (skb_rx_queue_recorded(skb)) {
2388 		hash = skb_get_rx_queue(skb);
2389 		while (unlikely(hash >= num_tx_queues))
2390 			hash -= num_tx_queues;
2391 		return hash;
2392 	}
2393 
2394 	if (dev->num_tc) {
2395 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2396 		qoffset = dev->tc_to_txq[tc].offset;
2397 		qcount = dev->tc_to_txq[tc].count;
2398 	}
2399 
2400 	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2401 }
2402 EXPORT_SYMBOL(__skb_tx_hash);
2403 
/* Emit a ratelimited one-shot warning with the offload-relevant state of
 * an skb whose checksum/GSO setup is inconsistent with device features.
 */
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
	static const netdev_features_t null_features = 0;
	struct net_device *dev = skb->dev;
	const char *name = "";

	if (!net_ratelimit())
		return;

	if (dev) {
		/* prefer the driver name; fall back to the netdev name */
		if (dev->dev.parent)
			name = dev_driver_string(dev->dev.parent);
		else
			name = netdev_name(dev);
	}
	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
	     "gso_type=%d ip_summed=%d\n",
	     name, dev ? &dev->features : &null_features,
	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
	     skb_shinfo(skb)->gso_type, skb->ip_summed);
}
2426 
2427 /*
2428  * Invalidate hardware checksum when packet is to be mangled, and
2429  * complete checksum manually on outgoing path.
2430  */
2431 int skb_checksum_help(struct sk_buff *skb)
2432 {
2433 	__wsum csum;
2434 	int ret = 0, offset;
2435 
2436 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2437 		goto out_set_summed;
2438 
2439 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2440 		skb_warn_bad_offload(skb);
2441 		return -EINVAL;
2442 	}
2443 
2444 	/* Before computing a checksum, we should make sure no frag could
2445 	 * be modified by an external entity : checksum could be wrong.
2446 	 */
2447 	if (skb_has_shared_frag(skb)) {
2448 		ret = __skb_linearize(skb);
2449 		if (ret)
2450 			goto out;
2451 	}
2452 
2453 	offset = skb_checksum_start_offset(skb);
2454 	BUG_ON(offset >= skb_headlen(skb));
2455 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2456 
2457 	offset += skb->csum_offset;
2458 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2459 
2460 	if (skb_cloned(skb) &&
2461 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2462 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2463 		if (ret)
2464 			goto out;
2465 	}
2466 
2467 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2468 out_set_summed:
2469 	skb->ip_summed = CHECKSUM_NONE;
2470 out:
2471 	return ret;
2472 }
2473 EXPORT_SYMBOL(skb_checksum_help);
2474 
2475 /* skb_csum_offload_check - Driver helper function to determine if a device
2476  * with limited checksum offload capabilities is able to offload the checksum
2477  * for a given packet.
2478  *
2479  * Arguments:
2480  *   skb - sk_buff for the packet in question
2481  *   spec - contains the description of what device can offload
2482  *   csum_encapped - returns true if the checksum being offloaded is
2483  *	      encpasulated. That is it is checksum for the transport header
2484  *	      in the inner headers.
2485  *   checksum_help - when set indicates that helper function should
2486  *	      call skb_checksum_help if offload checks fail
2487  *
2488  * Returns:
2489  *   true: Packet has passed the checksum checks and should be offloadable to
2490  *	   the device (a driver may still need to check for additional
2491  *	   restrictions of its device)
2492  *   false: Checksum is not offloadable. If checksum_help was set then
2493  *	   skb_checksum_help was called to resolve checksum for non-GSO
2494  *	   packets and when IP protocol is not SCTP
2495  */
2496 bool __skb_csum_offload_chk(struct sk_buff *skb,
2497 			    const struct skb_csum_offl_spec *spec,
2498 			    bool *csum_encapped,
2499 			    bool csum_help)
2500 {
2501 	struct iphdr *iph;
2502 	struct ipv6hdr *ipv6;
2503 	void *nhdr;
2504 	int protocol;
2505 	u8 ip_proto;
2506 
2507 	if (skb->protocol == htons(ETH_P_8021Q) ||
2508 	    skb->protocol == htons(ETH_P_8021AD)) {
2509 		if (!spec->vlan_okay)
2510 			goto need_help;
2511 	}
2512 
2513 	/* We check whether the checksum refers to a transport layer checksum in
2514 	 * the outermost header or an encapsulated transport layer checksum that
2515 	 * corresponds to the inner headers of the skb. If the checksum is for
2516 	 * something else in the packet we need help.
2517 	 */
2518 	if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2519 		/* Non-encapsulated checksum */
2520 		protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2521 		nhdr = skb_network_header(skb);
2522 		*csum_encapped = false;
2523 		if (spec->no_not_encapped)
2524 			goto need_help;
2525 	} else if (skb->encapsulation && spec->encap_okay &&
2526 		   skb_checksum_start_offset(skb) ==
2527 		   skb_inner_transport_offset(skb)) {
2528 		/* Encapsulated checksum */
2529 		*csum_encapped = true;
2530 		switch (skb->inner_protocol_type) {
2531 		case ENCAP_TYPE_ETHER:
2532 			protocol = eproto_to_ipproto(skb->inner_protocol);
2533 			break;
2534 		case ENCAP_TYPE_IPPROTO:
2535 			protocol = skb->inner_protocol;
2536 			break;
2537 		}
2538 		nhdr = skb_inner_network_header(skb);
2539 	} else {
2540 		goto need_help;
2541 	}
2542 
2543 	switch (protocol) {
2544 	case IPPROTO_IP:
2545 		if (!spec->ipv4_okay)
2546 			goto need_help;
2547 		iph = nhdr;
2548 		ip_proto = iph->protocol;
2549 		if (iph->ihl != 5 && !spec->ip_options_okay)
2550 			goto need_help;
2551 		break;
2552 	case IPPROTO_IPV6:
2553 		if (!spec->ipv6_okay)
2554 			goto need_help;
2555 		if (spec->no_encapped_ipv6 && *csum_encapped)
2556 			goto need_help;
2557 		ipv6 = nhdr;
2558 		nhdr += sizeof(*ipv6);
2559 		ip_proto = ipv6->nexthdr;
2560 		break;
2561 	default:
2562 		goto need_help;
2563 	}
2564 
2565 ip_proto_again:
2566 	switch (ip_proto) {
2567 	case IPPROTO_TCP:
2568 		if (!spec->tcp_okay ||
2569 		    skb->csum_offset != offsetof(struct tcphdr, check))
2570 			goto need_help;
2571 		break;
2572 	case IPPROTO_UDP:
2573 		if (!spec->udp_okay ||
2574 		    skb->csum_offset != offsetof(struct udphdr, check))
2575 			goto need_help;
2576 		break;
2577 	case IPPROTO_SCTP:
2578 		if (!spec->sctp_okay ||
2579 		    skb->csum_offset != offsetof(struct sctphdr, checksum))
2580 			goto cant_help;
2581 		break;
2582 	case NEXTHDR_HOP:
2583 	case NEXTHDR_ROUTING:
2584 	case NEXTHDR_DEST: {
2585 		u8 *opthdr = nhdr;
2586 
2587 		if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2588 			goto need_help;
2589 
2590 		ip_proto = opthdr[0];
2591 		nhdr += (opthdr[1] + 1) << 3;
2592 
2593 		goto ip_proto_again;
2594 	}
2595 	default:
2596 		goto need_help;
2597 	}
2598 
2599 	/* Passed the tests for offloading checksum */
2600 	return true;
2601 
2602 need_help:
2603 	if (csum_help && !skb_shinfo(skb)->gso_size)
2604 		skb_checksum_help(skb);
2605 cant_help:
2606 	return false;
2607 }
2608 EXPORT_SYMBOL(__skb_csum_offload_chk);
2609 
/* Resolve the skb's network-layer protocol, looking through an ETH_P_TEB
 * (transparent ethernet bridging) outer type and any VLAN tags.  On
 * success *depth is updated to the offset of the network header; returns
 * 0 if the headers cannot be pulled.
 */
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
	__be16 type = skb->protocol;

	/* Tunnel gso handlers can set protocol to ethernet. */
	if (type == htons(ETH_P_TEB)) {
		struct ethhdr *eth;

		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
			return 0;

		eth = (struct ethhdr *)skb_mac_header(skb);
		type = eth->h_proto;
	}

	return __vlan_get_protocol(skb, type, depth);
}
2627 
2628 /**
2629  *	skb_mac_gso_segment - mac layer segmentation handler.
2630  *	@skb: buffer to segment
2631  *	@features: features for the output path (see dev->features)
2632  */
2633 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2634 				    netdev_features_t features)
2635 {
2636 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2637 	struct packet_offload *ptype;
2638 	int vlan_depth = skb->mac_len;
2639 	__be16 type = skb_network_protocol(skb, &vlan_depth);
2640 
2641 	if (unlikely(!type))
2642 		return ERR_PTR(-EINVAL);
2643 
2644 	__skb_pull(skb, vlan_depth);
2645 
2646 	rcu_read_lock();
2647 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2648 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2649 			segs = ptype->callbacks.gso_segment(skb, features);
2650 			break;
2651 		}
2652 	}
2653 	rcu_read_unlock();
2654 
2655 	__skb_push(skb, skb->data - skb_mac_header(skb));
2656 
2657 	return segs;
2658 }
2659 EXPORT_SYMBOL(skb_mac_gso_segment);
2660 
2661 
2662 /* openvswitch calls this on rx path, so we need a different check.
2663  */
2664 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2665 {
2666 	if (tx_path)
2667 		return skb->ip_summed != CHECKSUM_PARTIAL;
2668 	else
2669 		return skb->ip_summed == CHECKSUM_NONE;
2670 }
2671 
2672 /**
2673  *	__skb_gso_segment - Perform segmentation on skb.
2674  *	@skb: buffer to segment
2675  *	@features: features for the output path (see dev->features)
2676  *	@tx_path: whether it is called in TX path
2677  *
2678  *	This function segments the given skb and returns a list of segments.
2679  *
2680  *	It may return NULL if the skb requires no segmentation.  This is
2681  *	only possible when GSO is used for verifying header integrity.
2682  */
2683 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2684 				  netdev_features_t features, bool tx_path)
2685 {
2686 	if (unlikely(skb_needs_check(skb, tx_path))) {
2687 		int err;
2688 
2689 		skb_warn_bad_offload(skb);
2690 
2691 		err = skb_cow_head(skb, 0);
2692 		if (err < 0)
2693 			return ERR_PTR(err);
2694 	}
2695 
2696 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2697 	SKB_GSO_CB(skb)->encap_level = 0;
2698 
2699 	skb_reset_mac_header(skb);
2700 	skb_reset_mac_len(skb);
2701 
2702 	return skb_mac_gso_segment(skb, features);
2703 }
2704 EXPORT_SYMBOL(__skb_gso_segment);
2705 
2706 /* Take action when hardware reception checksum errors are detected. */
2707 #ifdef CONFIG_BUG
2708 void netdev_rx_csum_fault(struct net_device *dev)
2709 {
2710 	if (net_ratelimit()) {
2711 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2712 		dump_stack();
2713 	}
2714 }
2715 EXPORT_SYMBOL(netdev_rx_csum_fault);
2716 #endif
2717 
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns nonzero when the skb holds page fragments this device cannot
 * DMA to directly (highmem pages without NETIF_F_HIGHDMA, or pages above
 * the parent device's DMA mask on physically-addressed buses).  Only
 * relevant on CONFIG_HIGHMEM kernels; otherwise always 0.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
			/* frag page must be fully reachable by the device */
			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2750 
/* If MPLS offload request, verify we are testing hardware MPLS features
 * instead of standard features for the netdev.  With CONFIG_NET_MPLS_GSO
 * disabled, MPLS is never offloaded and @features passes through as-is.
 */
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	/* restrict to what the device advertises for MPLS traffic */
	if (eth_p_mpls(type))
		features &= skb->dev->mpls_features;

	return features;
}
#else
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	return features;
}
#endif
2772 
2773 static netdev_features_t harmonize_features(struct sk_buff *skb,
2774 	netdev_features_t features)
2775 {
2776 	int tmp;
2777 	__be16 type;
2778 
2779 	type = skb_network_protocol(skb, &tmp);
2780 	features = net_mpls_features(skb, features, type);
2781 
2782 	if (skb->ip_summed != CHECKSUM_NONE &&
2783 	    !can_checksum_protocol(features, type)) {
2784 		features &= ~NETIF_F_CSUM_MASK;
2785 	} else if (illegal_highdma(skb->dev, skb)) {
2786 		features &= ~NETIF_F_SG;
2787 	}
2788 
2789 	return features;
2790 }
2791 
/* ndo_features_check implementation for devices that impose no
 * per-packet feature restrictions: returns @features unchanged.
 */
netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
EXPORT_SYMBOL(passthru_features_check);
2799 
/* Default per-packet feature check used when the driver provides no
 * ndo_features_check: only applies the generic VLAN restrictions.
 */
static netdev_features_t dflt_features_check(const struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}
2806 
/* Compute the effective offload feature set for transmitting @skb on its
 * device, starting from dev->features and successively masking out
 * anything the skb's GSO size, encapsulation, VLAN tagging, driver
 * checks or protocol make unusable.
 */
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features = dev->features;
	u16 gso_segs = skb_shinfo(skb)->gso_segs;

	/* segment count outside the device's supported window: no GSO */
	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
		features &= ~NETIF_F_GSO_MASK;

	/* If encapsulation offload request, verify we are testing
	 * hardware encapsulation features instead of standard
	 * features for the netdev
	 */
	if (skb->encapsulation)
		features &= dev->hw_enc_features;

	if (skb_vlan_tagged(skb))
		features = netdev_intersect_features(features,
						     dev->vlan_features |
						     NETIF_F_HW_VLAN_CTAG_TX |
						     NETIF_F_HW_VLAN_STAG_TX);

	/* let the driver veto features for this particular packet */
	if (dev->netdev_ops->ndo_features_check)
		features &= dev->netdev_ops->ndo_features_check(skb, dev,
								features);
	else
		features &= dflt_features_check(skb, dev, features);

	return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);
2838 
/* Hand a single skb to the driver's transmit routine, delivering a copy
 * to any registered packet taps first.  @more hints the driver that
 * another skb will follow immediately (xmit_more batching).  Returns the
 * driver's NETDEV_TX_* code.
 */
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
		    struct netdev_queue *txq, bool more)
{
	unsigned int len;
	int rc;

	/* feed taps (e.g. tcpdump) before the driver can modify the skb */
	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
		dev_queue_xmit_nit(skb, dev);

	/* snapshot len now: the driver may free the skb */
	len = skb->len;
	trace_net_dev_start_xmit(skb, dev);
	rc = netdev_start_xmit(skb, dev, txq, more);
	trace_net_dev_xmit(skb, rc, dev, len);

	return rc;
}
2855 
/* Transmit a linked list of skbs (linked via skb->next) on @txq.  Stops
 * when the driver reports failure or the queue becomes stopped.  Returns
 * the first untransmitted skb (with the remainder still linked behind
 * it) or NULL if everything was sent; the final status goes to *ret.
 */
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
				    struct netdev_queue *txq, int *ret)
{
	struct sk_buff *skb = first;
	int rc = NETDEV_TX_OK;

	while (skb) {
		struct sk_buff *next = skb->next;

		/* unlink before handing to the driver; "next != NULL"
		 * doubles as the xmit_more hint
		 */
		skb->next = NULL;
		rc = xmit_one(skb, dev, txq, next != NULL);
		if (unlikely(!dev_xmit_complete(rc))) {
			/* driver kept the skb: relink the rest for retry */
			skb->next = next;
			goto out;
		}

		skb = next;
		if (netif_xmit_stopped(txq) && skb) {
			rc = NETDEV_TX_BUSY;
			break;
		}
	}

out:
	*ret = rc;
	return skb;
}
2883 
2884 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2885 					  netdev_features_t features)
2886 {
2887 	if (skb_vlan_tag_present(skb) &&
2888 	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2889 		skb = __vlan_hwaccel_push_inside(skb);
2890 	return skb;
2891 }
2892 
/* Make a single skb fit what the device can actually transmit: insert
 * VLAN tags in software if needed, segment GSO packets the device cannot
 * handle, linearize when SG is unavailable, and finish checksums the
 * hardware cannot compute.  Returns the resulting skb (possibly a list
 * of segments) or NULL on failure; on failure the skb is freed.
 */
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
	netdev_features_t features;

	/* already validated as part of a list - presumably skb->next set
	 * means "already processed"; NOTE(review): confirm against callers
	 */
	if (skb->next)
		return skb;

	features = netif_skb_features(skb);
	skb = validate_xmit_vlan(skb, features);
	if (unlikely(!skb))
		goto out_null;

	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs)) {
			goto out_kfree_skb;
		} else if (segs) {
			/* replace the aggregate with its segment list */
			consume_skb(skb);
			skb = segs;
		}
	} else {
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
							       skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
							 skb_checksum_start_offset(skb));
			if (!(features & NETIF_F_CSUM_MASK) &&
			    skb_checksum_help(skb))
				goto out_kfree_skb;
		}
	}

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	return NULL;
}
2944 
/* Run validate_xmit_skb() over a whole skb list, splicing any segment
 * lists it produces back into one flat list.  Skbs that fail validation
 * are dropped (validate_xmit_skb frees them).  Returns the new head, or
 * NULL if nothing survived.
 */
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
{
	struct sk_buff *next, *head = NULL, *tail;

	for (; skb != NULL; skb = next) {
		next = skb->next;
		skb->next = NULL;

		/* in case skb wont be segmented, point to itself */
		skb->prev = skb;

		skb = validate_xmit_skb(skb, dev);
		if (!skb)
			continue;

		if (!head)
			head = skb;
		else
			tail->next = skb;
		/* If skb was segmented, skb->prev points to
		 * the last segment. If not, it still contains skb.
		 */
		tail = skb->prev;
	}
	return head;
}
2971 
/* Initialize qdisc_skb_cb(skb)->pkt_len with the skb length, inflated
 * for GSO packets so queueing disciplines account for the header bytes
 * that each emitted segment will add on the wire.
 */
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);

	qdisc_skb_cb(skb)->pkt_len = skb->len;

	/* To get more precise estimation of bytes sent on wire,
	 * we add to pkt_len the headers size of all segments
	 */
	if (shinfo->gso_size)  {
		unsigned int hdr_len;
		u16 gso_segs = shinfo->gso_segs;

		/* mac layer + network layer */
		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);

		/* + transport layer */
		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
			hdr_len += tcp_hdrlen(skb);
		else
			hdr_len += sizeof(struct udphdr);

		/* untrusted (DODGY) sources may lie about gso_segs:
		 * recompute it from the payload size
		 */
		if (shinfo->gso_type & SKB_GSO_DODGY)
			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
						shinfo->gso_size);

		/* each extra segment repeats the headers once */
		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
	}
}
3001 
/* Enqueue @skb on qdisc @q (or transmit directly when the qdisc allows
 * bypass and is idle) and run the qdisc.  Returns a NET_XMIT_* code.
 * Uses q->busylock to throttle contending senders before they fight
 * over the qdisc root lock.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_pkt_len_init(skb);
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC___STATE_RUNNING owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		/* qdisc is being torn down: drop the packet */
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* more work to do: release busylock before the
			 * potentially long __qdisc_run()
			 */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
3061 
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
/* Fill in skb->priority from the sending socket's net_prio cgroup map,
 * but only if no priority was set explicitly.  Caller must be in an
 * RCU-bh read-side section (rcu_dereference_bh on the priomap).
 */
static void skb_update_prio(struct sk_buff *skb)
{
	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);

	if (!skb->priority && skb->sk && map) {
		unsigned int prioidx =
			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);

		if (prioidx < map->priomap_len)
			skb->priority = map->priomap[prioidx];
	}
}
#else
/* net_prio cgroup disabled: no-op */
#define skb_update_prio(skb)
#endif
3078 
/* Per-CPU depth counter guarding against unbounded dev_queue_xmit()
 * recursion (e.g. through stacked/virtual devices); transmits bail out
 * once RECURSION_LIMIT is reached.
 */
DEFINE_PER_CPU(int, xmit_recursion);
EXPORT_SYMBOL(xmit_recursion);

#define RECURSION_LIMIT 10
3083 
3084 /**
3085  *	dev_loopback_xmit - loop back @skb
3086  *	@net: network namespace this loopback is happening in
3087  *	@sk:  sk needed to be a netfilter okfn
3088  *	@skb: buffer to transmit
3089  */
3090 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3091 {
3092 	skb_reset_mac_header(skb);
3093 	__skb_pull(skb, skb_network_offset(skb));
3094 	skb->pkt_type = PACKET_LOOPBACK;
3095 	skb->ip_summed = CHECKSUM_UNNECESSARY;
3096 	WARN_ON(!skb_dst(skb));
3097 	skb_dst_force(skb);
3098 	netif_rx_ni(skb);
3099 	return 0;
3100 }
3101 EXPORT_SYMBOL(dev_loopback_xmit);
3102 
/* Look up a transmit queue for @skb via the device's XPS maps, keyed by
 * the CPU recorded in skb->sender_cpu (stored offset by one; caller,
 * netdev_pick_tx, guarantees it is set).  Returns the queue index or -1
 * when XPS is unconfigured for that CPU or the mapped queue is stale.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();
	dev_maps = rcu_dereference(dev->xps_maps);
	if (dev_maps) {
		map = rcu_dereference(
		    dev_maps->cpu_map[skb->sender_cpu - 1]);
		if (map) {
			if (map->len == 1)
				queue_index = map->queues[0];
			else
				/* spread flows across this CPU's queues */
				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
									   map->len)];
			/* map may reference queues beyond the current
			 * real_num_tx_queues after a shrink; reject those
			 */
			if (unlikely(queue_index >= dev->real_num_tx_queues))
				queue_index = -1;
		}
	}
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
3132 
/* Default transmit queue selection: reuse the queue cached on the
 * socket when valid, otherwise pick via XPS (falling back to the flow
 * hash) and cache the choice on full sockets that have a dst, so a
 * flow keeps hitting the same queue.
 */
static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	int queue_index = sk_tx_queue_get(sk);

	/* recompute when nothing is cached, out-of-order delivery is
	 * acceptable (ooo_okay), or the cached queue no longer exists
	 */
	if (queue_index < 0 || skb->ooo_okay ||
	    queue_index >= dev->real_num_tx_queues) {
		int new_index = get_xps_queue(dev, skb);
		if (new_index < 0)
			new_index = skb_tx_hash(dev, skb);

		/* only cache on full sockets with a route attached */
		if (queue_index != new_index && sk &&
		    sk_fullsock(sk) &&
		    rcu_access_pointer(sk->sk_dst_cache))
			sk_tx_queue_set(sk, new_index);

		queue_index = new_index;
	}

	return queue_index;
}
3154 
3155 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3156 				    struct sk_buff *skb,
3157 				    void *accel_priv)
3158 {
3159 	int queue_index = 0;
3160 
3161 #ifdef CONFIG_XPS
3162 	u32 sender_cpu = skb->sender_cpu - 1;
3163 
3164 	if (sender_cpu >= (u32)NR_CPUS)
3165 		skb->sender_cpu = raw_smp_processor_id() + 1;
3166 #endif
3167 
3168 	if (dev->real_num_tx_queues != 1) {
3169 		const struct net_device_ops *ops = dev->netdev_ops;
3170 		if (ops->ndo_select_queue)
3171 			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3172 							    __netdev_pick_tx);
3173 		else
3174 			queue_index = __netdev_pick_tx(dev, skb);
3175 
3176 		if (!accel_priv)
3177 			queue_index = netdev_cap_txqueue(dev, queue_index);
3178 	}
3179 
3180 	skb_set_queue_mapping(skb, queue_index);
3181 	return netdev_get_tx_queue(dev, queue_index);
3182 }
3183 
3184 /**
3185  *	__dev_queue_xmit - transmit a buffer
3186  *	@skb: buffer to transmit
3187  *	@accel_priv: private data used for L2 forwarding offload
3188  *
3189  *	Queue a buffer for transmission to a network device. The caller must
3190  *	have set the device and priority and built the buffer before calling
3191  *	this function. The function can be called from an interrupt.
3192  *
3193  *	A negative errno code is returned on a failure. A success does not
3194  *	guarantee the frame will be transmitted as it may be dropped due
3195  *	to congestion or traffic shaping.
3196  *
3197  * -----------------------------------------------------------------------------------
3198  *      I notice this method can also return errors from the queue disciplines,
3199  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3200  *      be positive.
3201  *
3202  *      Regardless of the return value, the skb is consumed, so it is currently
3203  *      difficult to retry a send to this method.  (You can bump the ref count
3204  *      before sending to hold a reference for retry if you are careful.)
3205  *
3206  *      When calling this method, interrupts MUST be enabled.  This is because
3207  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3208  *          --BLG
3209  */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	/* If device/qdisc don't need skb->dst, release it right now while
	 * its hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

#ifdef CONFIG_NET_SWITCHDEV
	/* Don't forward if offload device already forwarded */
	if (skb->offload_fwd_mark &&
	    skb->offload_fwd_mark == dev->offload_fwd_mark) {
		consume_skb(skb);
		rc = NET_XMIT_SUCCESS;
		goto out;
	}
#endif

	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	/* Common case: a real qdisc is attached; hand the skb to it. */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {

			/* Bail out if this CPU has re-entered the transmit
			 * path too deeply (stacked virtual devices).
			 */
			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev);
			if (!skb)
				goto drop;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	rc = -ENETDOWN;
drop:
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
3318 
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue @skb on its device's transmit path; see __dev_queue_xmit()
 *	above for the full contract and return-value semantics.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);
3324 
/**
 *	dev_queue_xmit_accel - transmit a buffer with L2 offload state
 *	@skb: buffer to transmit
 *	@accel_priv: private data passed to the driver's queue-selection hook
 *
 *	Same as dev_queue_xmit() but forwards @accel_priv down to
 *	netdev_pick_tx()/ndo_select_queue(); see __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3330 
3331 
3332 /*=======================================================================
3333 			Receiver routines
3334   =======================================================================*/
3335 
3336 int netdev_max_backlog __read_mostly = 1000;
3337 EXPORT_SYMBOL(netdev_max_backlog);
3338 
3339 int netdev_tstamp_prequeue __read_mostly = 1;
3340 int netdev_budget __read_mostly = 300;
3341 int weight_p __read_mostly = 64;            /* old backlog weight */
3342 
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	/* Queue @napi for polling on @sd, then kick the RX softirq. */
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
3350 
3351 #ifdef CONFIG_RPS
3352 
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
/* Mask selecting the CPU-number bits inside a sock flow table entry
 * (the remaining bits hold part of the flow hash; see get_rps_cpu()).
 */
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

/* Static branch gating the RPS steering paths in the receive entry points. */
struct static_key rps_needed __read_mostly;
3360 
/* Record @next_cpu as the target CPU of @rflow and, with RFS acceleration,
 * ask the driver to re-steer the flow's hardware filter toward an RX queue
 * mapped to that CPU.  May return a different rps_dev_flow than was passed
 * in when the flow moves to a new filter slot.  Called under rcu_read_lock()
 * (see get_rps_cpu()).
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		/* Invalidate the old slot if it pointed at the same filter. */
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}
3405 
3406 /*
3407  * get_rps_cpu is called from netif_receive_skb and returns the target
3408  * CPU from the RPS map of the receiving queue for a given skb.
3409  * rcu_read_lock must be held on entry.
3410  */
3411 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3412 		       struct rps_dev_flow **rflowp)
3413 {
3414 	const struct rps_sock_flow_table *sock_flow_table;
3415 	struct netdev_rx_queue *rxqueue = dev->_rx;
3416 	struct rps_dev_flow_table *flow_table;
3417 	struct rps_map *map;
3418 	int cpu = -1;
3419 	u32 tcpu;
3420 	u32 hash;
3421 
3422 	if (skb_rx_queue_recorded(skb)) {
3423 		u16 index = skb_get_rx_queue(skb);
3424 
3425 		if (unlikely(index >= dev->real_num_rx_queues)) {
3426 			WARN_ONCE(dev->real_num_rx_queues > 1,
3427 				  "%s received packet on queue %u, but number "
3428 				  "of RX queues is %u\n",
3429 				  dev->name, index, dev->real_num_rx_queues);
3430 			goto done;
3431 		}
3432 		rxqueue += index;
3433 	}
3434 
3435 	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3436 
3437 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3438 	map = rcu_dereference(rxqueue->rps_map);
3439 	if (!flow_table && !map)
3440 		goto done;
3441 
3442 	skb_reset_network_header(skb);
3443 	hash = skb_get_hash(skb);
3444 	if (!hash)
3445 		goto done;
3446 
3447 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3448 	if (flow_table && sock_flow_table) {
3449 		struct rps_dev_flow *rflow;
3450 		u32 next_cpu;
3451 		u32 ident;
3452 
3453 		/* First check into global flow table if there is a match */
3454 		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3455 		if ((ident ^ hash) & ~rps_cpu_mask)
3456 			goto try_rps;
3457 
3458 		next_cpu = ident & rps_cpu_mask;
3459 
3460 		/* OK, now we know there is a match,
3461 		 * we can look at the local (per receive queue) flow table
3462 		 */
3463 		rflow = &flow_table->flows[hash & flow_table->mask];
3464 		tcpu = rflow->cpu;
3465 
3466 		/*
3467 		 * If the desired CPU (where last recvmsg was done) is
3468 		 * different from current CPU (one in the rx-queue flow
3469 		 * table entry), switch if one of the following holds:
3470 		 *   - Current CPU is unset (>= nr_cpu_ids).
3471 		 *   - Current CPU is offline.
3472 		 *   - The current CPU's queue tail has advanced beyond the
3473 		 *     last packet that was enqueued using this table entry.
3474 		 *     This guarantees that all previous packets for the flow
3475 		 *     have been dequeued, thus preserving in order delivery.
3476 		 */
3477 		if (unlikely(tcpu != next_cpu) &&
3478 		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3479 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3480 		      rflow->last_qtail)) >= 0)) {
3481 			tcpu = next_cpu;
3482 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3483 		}
3484 
3485 		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3486 			*rflowp = rflow;
3487 			cpu = tcpu;
3488 			goto done;
3489 		}
3490 	}
3491 
3492 try_rps:
3493 
3494 	if (map) {
3495 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3496 		if (cpu_online(tcpu)) {
3497 			cpu = tcpu;
3498 			goto done;
3499 		}
3500 	}
3501 
3502 done:
3503 	return cpu;
3504 }
3505 
3506 #ifdef CONFIG_RFS_ACCEL
3507 
3508 /**
3509  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3510  * @dev: Device on which the filter was set
3511  * @rxq_index: RX queue index
3512  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3513  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3514  *
3515  * Drivers that implement ndo_rx_flow_steer() should periodically call
3516  * this function for each installed filter and remove the filters for
3517  * which it returns %true.
3518  */
3519 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3520 			 u32 flow_id, u16 filter_id)
3521 {
3522 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3523 	struct rps_dev_flow_table *flow_table;
3524 	struct rps_dev_flow *rflow;
3525 	bool expire = true;
3526 	unsigned int cpu;
3527 
3528 	rcu_read_lock();
3529 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3530 	if (flow_table && flow_id <= flow_table->mask) {
3531 		rflow = &flow_table->flows[flow_id];
3532 		cpu = ACCESS_ONCE(rflow->cpu);
3533 		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3534 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3535 			   rflow->last_qtail) <
3536 		     (int)(10 * flow_table->mask)))
3537 			expire = false;
3538 	}
3539 	rcu_read_unlock();
3540 	return expire;
3541 }
3542 EXPORT_SYMBOL(rps_may_expire_flow);
3543 
3544 #endif /* CONFIG_RFS_ACCEL */
3545 
/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	/* Run this CPU's backlog NAPI and account the received RPS IPI. */
	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}
3554 
3555 #endif /* CONFIG_RPS */
3556 
3557 /*
3558  * Check if this softnet_data structure is another cpu one
3559  * If yes, queue it to our IPI list and return 1
3560  * If no, return 0
3561  */
3562 static int rps_ipi_queued(struct softnet_data *sd)
3563 {
3564 #ifdef CONFIG_RPS
3565 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3566 
3567 	if (sd != mysd) {
3568 		sd->rps_ipi_next = mysd->rps_ipi_list;
3569 		mysd->rps_ipi_list = sd;
3570 
3571 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3572 		return 1;
3573 	}
3574 #endif /* CONFIG_RPS */
3575 	return 0;
3576 }
3577 
#ifdef CONFIG_NET_FLOW_LIMIT
/* NOTE(review): presumably the default bucket count used when allocating
 * sd->flow_limit (allocation site not in this hunk); see skb_flow_limit().
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3581 
/* Heavy-hitter detection: returns true when @skb should be dropped
 * because its flow accounts for more than half of the recent enqueue
 * history on this CPU — evaluated only once the backlog is at least
 * half full (@qlen vs netdev_max_backlog).
 */
static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
{
#ifdef CONFIG_NET_FLOW_LIMIT
	struct sd_flow_limit *fl;
	struct softnet_data *sd;
	unsigned int old_flow, new_flow;

	if (qlen < (netdev_max_backlog >> 1))
		return false;

	sd = this_cpu_ptr(&softnet_data);

	rcu_read_lock();
	fl = rcu_dereference(sd->flow_limit);
	if (fl) {
		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
		/* Slide the history ring: retire the oldest sample and
		 * record the new flow in its place.
		 */
		old_flow = fl->history[fl->history_head];
		fl->history[fl->history_head] = new_flow;

		fl->history_head++;
		fl->history_head &= FLOW_LIMIT_HISTORY - 1;

		if (likely(fl->buckets[old_flow]))
			fl->buckets[old_flow]--;

		/* Drop once this flow fills over half the history window. */
		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
			fl->count++;
			rcu_read_unlock();
			return true;
		}
	}
	rcu_read_unlock();
#endif
	return false;
}
3617 
3618 /*
3619  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3620  * queue (may be a remote CPU queue).
3621  */
3622 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3623 			      unsigned int *qtail)
3624 {
3625 	struct softnet_data *sd;
3626 	unsigned long flags;
3627 	unsigned int qlen;
3628 
3629 	sd = &per_cpu(softnet_data, cpu);
3630 
3631 	local_irq_save(flags);
3632 
3633 	rps_lock(sd);
3634 	if (!netif_running(skb->dev))
3635 		goto drop;
3636 	qlen = skb_queue_len(&sd->input_pkt_queue);
3637 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3638 		if (qlen) {
3639 enqueue:
3640 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3641 			input_queue_tail_incr_save(sd, qtail);
3642 			rps_unlock(sd);
3643 			local_irq_restore(flags);
3644 			return NET_RX_SUCCESS;
3645 		}
3646 
3647 		/* Schedule NAPI for backlog device
3648 		 * We can use non atomic operation since we own the queue lock
3649 		 */
3650 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3651 			if (!rps_ipi_queued(sd))
3652 				____napi_schedule(sd, &sd->backlog);
3653 		}
3654 		goto enqueue;
3655 	}
3656 
3657 drop:
3658 	sd->dropped++;
3659 	rps_unlock(sd);
3660 
3661 	local_irq_restore(flags);
3662 
3663 	atomic_long_inc(&skb->dev->rx_dropped);
3664 	kfree_skb(skb);
3665 	return NET_RX_DROP;
3666 }
3667 
/* Common body of netif_rx()/netif_rx_ni(): choose a target CPU (RPS
 * steering when enabled, the local CPU otherwise) and enqueue @skb on
 * that CPU's backlog.
 */
static int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);
#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		/* Pin this CPU so smp_processor_id() stays meaningful
		 * across the steering decision and the enqueue.
		 */
		preempt_disable();
		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		unsigned int qtail;
		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}
3700 
3701 /**
3702  *	netif_rx	-	post buffer to the network code
3703  *	@skb: buffer to post
3704  *
3705  *	This function receives a packet from a device driver and queues it for
3706  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3707  *	may be dropped during processing for congestion control or by the
3708  *	protocol layers.
3709  *
3710  *	return values:
3711  *	NET_RX_SUCCESS	(no congestion)
3712  *	NET_RX_DROP     (packet was dropped)
3713  *
3714  */
3715 
3716 int netif_rx(struct sk_buff *skb)
3717 {
3718 	trace_netif_rx_entry(skb);
3719 
3720 	return netif_rx_internal(skb);
3721 }
3722 EXPORT_SYMBOL(netif_rx);
3723 
/* Process-context variant of netif_rx(): after queueing, run any softirq
 * the enqueue may have raised (e.g. NET_RX_SOFTIRQ) before re-enabling
 * preemption.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	err = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}
EXPORT_SYMBOL(netif_rx_ni);
3739 
/* NET_TX_SOFTIRQ handler: frees skbs queued for deferred release and
 * runs the qdiscs that were scheduled for transmission on this CPU.
 */
static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		/* Detach the whole list with irqs off, then free at leisure. */
		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;
			clist = clist->next;

			WARN_ON(atomic_read(&skb->users));
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
				trace_consume_skb(skb);
			else
				trace_kfree_skb(skb, net_tx_action);
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				smp_mb__before_atomic();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				/* Root lock contended: reschedule the qdisc
				 * unless it is being deactivated.
				 */
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_atomic();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}
3800 
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE.
 * NOTE(review): presumably installed by the bridge module when loaded —
 * the assignment site is not in this file section; verify before relying
 * on it being non-NULL.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3808 
/* Run the ingress (tc) classifier chain attached to skb->dev.
 * Returns the skb for further processing, or NULL when an action
 * dropped, stole/queued, or redirected the packet.
 */
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
					 struct packet_type **pt_prev,
					 int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result cl_res;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!cl)
		return skb;
	if (*pt_prev) {
		/* Flush the pending tap delivery before classification. */
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(cl->q);
		/* fall through: SHOT also frees the skb */
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		kfree_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
3858 
3859 /**
3860  *	netdev_rx_handler_register - register receive handler
3861  *	@dev: device to register a handler for
3862  *	@rx_handler: receive handler to register
3863  *	@rx_handler_data: data pointer that is used by rx handler
3864  *
3865  *	Register a receive handler for a device. This handler will then be
3866  *	called from __netif_receive_skb. A negative errno code is returned
3867  *	on a failure.
3868  *
3869  *	The caller must hold the rtnl_mutex.
3870  *
3871  *	For a general description of rx_handler, see enum rx_handler_result.
3872  */
3873 int netdev_rx_handler_register(struct net_device *dev,
3874 			       rx_handler_func_t *rx_handler,
3875 			       void *rx_handler_data)
3876 {
3877 	ASSERT_RTNL();
3878 
3879 	if (dev->rx_handler)
3880 		return -EBUSY;
3881 
3882 	/* Note: rx_handler_data must be set before rx_handler */
3883 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3884 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3885 
3886 	return 0;
3887 }
3888 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3889 
3890 /**
3891  *	netdev_rx_handler_unregister - unregister receive handler
3892  *	@dev: device to unregister a handler from
3893  *
3894  *	Unregister a receive handler from a device.
3895  *
3896  *	The caller must hold the rtnl_mutex.
3897  */
3898 void netdev_rx_handler_unregister(struct net_device *dev)
3899 {
3900 
3901 	ASSERT_RTNL();
3902 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3903 	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3904 	 * section has a guarantee to see a non NULL rx_handler_data
3905 	 * as well.
3906 	 */
3907 	synchronize_net();
3908 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3909 }
3910 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3911 
3912 /*
3913  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3914  * the special handling of PFMEMALLOC skbs.
3915  */
3916 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3917 {
3918 	switch (skb->protocol) {
3919 	case htons(ETH_P_ARP):
3920 	case htons(ETH_P_IP):
3921 	case htons(ETH_P_IPV6):
3922 	case htons(ETH_P_8021Q):
3923 	case htons(ETH_P_8021AD):
3924 		return true;
3925 	default:
3926 		return false;
3927 	}
3928 }
3929 
/* Run the netfilter ingress hook when one is active on skb->dev,
 * flushing any pending tap delivery first.  The caller stops processing
 * the skb when this returns a negative value.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	if (nf_hook_ingress_active(skb)) {
		if (*pt_prev) {
			*ret = deliver_skb(skb, *pt_prev, orig_dev);
			*pt_prev = NULL;
		}

		return nf_hook_ingress(skb);
	}
#endif /* CONFIG_NETFILTER_INGRESS */
	return 0;
}
3945 
/* Core receive dispatcher: runs taps, ingress classification, VLAN and
 * rx_handler processing, then delivers @skb to matching protocol handlers.
 * @pfmemalloc: true when the skb came from memory reserves; such packets
 * skip taps and are only delivered to pfmemalloc-aware protocols.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	/* Deliver to global, then per-device, taps. */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

skip_taps:
#ifdef CONFIG_NET_INGRESS
	if (static_key_false(&ingress_needed)) {
		skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
		if (!skb)
			goto out;

		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
			goto out;
	}
#endif
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
ncls:
#endif
	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(skb_vlan_tag_present(skb))) {
		if (skb_vlan_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	type = skb->protocol;

	/* deliver only exact match when indicated */
	if (likely(!deliver_exact)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &ptype_base[ntohs(type) &
						   PTYPE_HASH_MASK]);
	}

	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
			       &orig_dev->ptype_specific);

	if (unlikely(skb->dev != orig_dev)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &skb->dev->ptype_specific);
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	return ret;
}
4097 
/* Per-skb receive wrapper: grants PF_MEMALLOC context while processing
 * skbs allocated from memory reserves so that their processing can dip
 * into those reserves too.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
		unsigned long pflags = current->flags;

		/*
		 * PFMEMALLOC skbs are special, they should
		 * - be delivered to SOCK_MEMALLOC sockets only
		 * - stay away from userspace
		 * - have bounded memory usage
		 *
		 * Use PF_MEMALLOC as this saves us from propagating the allocation
		 * context down to all allocation sites.
		 */
		current->flags |= PF_MEMALLOC;
		ret = __netif_receive_skb_core(skb, true);
		tsk_restore_flags(current, pflags, PF_MEMALLOC);
	} else
		ret = __netif_receive_skb_core(skb, false);

	return ret;
}
4122 
/* Common receive entry (used by netif_receive_skb() and GRO completion):
 * let a hardware rx-timestamp handler claim the skb, then either steer it
 * to a remote CPU's backlog (RPS) or process it on this CPU.
 */
static int netif_receive_skb_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	if (skb_defer_rx_timestamp(skb))
		return NET_RX_SUCCESS;

	rcu_read_lock();

#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu = get_rps_cpu(skb->dev, skb, &rflow);

		if (cpu >= 0) {
			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
			rcu_read_unlock();
			return ret;
		}
	}
#endif
	ret = __netif_receive_skb(skb);
	rcu_read_unlock();
	return ret;
}
4150 
4151 /**
4152  *	netif_receive_skb - process receive buffer from network
4153  *	@skb: buffer to process
4154  *
4155  *	netif_receive_skb() is the main receive data processing function.
4156  *	It always succeeds. The buffer may be dropped during processing
4157  *	for congestion control or by the protocol layers.
4158  *
4159  *	This function may only be called from softirq context and interrupts
4160  *	should be enabled.
4161  *
4162  *	Return values (usually ignored):
4163  *	NET_RX_SUCCESS: no congestion
4164  *	NET_RX_DROP: packet was dropped
4165  */
4166 int netif_receive_skb(struct sk_buff *skb)
4167 {
4168 	trace_netif_receive_skb_entry(skb);
4169 
4170 	return netif_receive_skb_internal(skb);
4171 }
4172 EXPORT_SYMBOL(netif_receive_skb);
4173 
/* Network device is going away, flush any packets still pending
 * Called with irqs disabled.
 * Purges skbs belonging to @arg (a struct net_device) from both the
 * input_pkt_queue and the process_queue of this CPU's backlog.
 */
static void flush_backlog(void *arg)
{
	struct net_device *dev = arg;
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	struct sk_buff *skb, *tmp;

	rps_lock(sd);
	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
		if (skb->dev == dev) {
			__skb_unlink(skb, &sd->input_pkt_queue);
			kfree_skb(skb);
			input_queue_head_incr(sd);
		}
	}
	rps_unlock(sd);

	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
		if (skb->dev == dev) {
			__skb_unlink(skb, &sd->process_queue);
			kfree_skb(skb);
			input_queue_head_incr(sd);
		}
	}
}
4201 
/* Finish a GRO session for @skb: run the matching protocol's
 * gro_complete callback (skipped for a single unmerged segment) and
 * hand the result to the regular receive path.
 */
static int napi_gro_complete(struct sk_buff *skb)
{
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int err = -ENOENT;

	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));

	if (NAPI_GRO_CB(skb)->count == 1) {
		/* Nothing was merged; this is not a GSO packet after all. */
		skb_shinfo(skb)->gso_size = 0;
		goto out;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;

		err = ptype->callbacks.gro_complete(skb, 0);
		break;
	}
	rcu_read_unlock();

	if (err) {
		WARN_ON(&ptype->list == head);
		kfree_skb(skb);
		return NET_RX_SUCCESS;
	}

out:
	return netif_receive_skb_internal(skb);
}
4235 
/* napi->gro_list contains packets ordered by age.
 * youngest packets at the head of it.
 * Complete skbs in reverse order to reduce latencies.
 */
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
	struct sk_buff *skb, *prev = NULL;

	/* scan list and build reverse chain */
	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
		skb->prev = prev;
		prev = skb;
	}

	/* Walk the reverse chain, i.e. oldest flow first */
	for (skb = prev; skb; skb = prev) {
		skb->next = NULL;

		/* flush_old mode: stop (return, not break) once we reach a
		 * flow that saw a packet in the current jiffy; the younger
		 * remainder stays queued on napi->gro_list.
		 */
		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
			return;

		prev = skb->prev;
		napi_gro_complete(skb);
		napi->gro_count--;
	}

	napi->gro_list = NULL;
}
EXPORT_SYMBOL(napi_gro_flush);
4264 
/* Pre-compute NAPI_GRO_CB(p)->same_flow for every held skb: set when @p
 * appears to belong to the same flow as @skb (same hash, device, vlan tag
 * and MAC header).  Also clears the per-skb flush flag.
 */
static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff *p;
	unsigned int maclen = skb->dev->hard_header_len;
	u32 hash = skb_get_hash_raw(skb);

	for (p = napi->gro_list; p; p = p->next) {
		unsigned long diffs;

		NAPI_GRO_CB(p)->flush = 0;

		/* Cheap test first: different rx hash cannot be same flow */
		if (hash != skb_get_hash_raw(p)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
		diffs |= p->vlan_tci ^ skb->vlan_tci;
		/* Fast Ethernet-header compare when possible, otherwise a
		 * full memcmp over the (longer) link-layer header.
		 */
		if (maclen == ETH_HLEN)
			diffs |= compare_ether_header(skb_mac_header(p),
						      skb_mac_header(skb));
		else if (!diffs)
			diffs = memcmp(skb_mac_header(p),
				       skb_mac_header(skb),
				       maclen);
		NAPI_GRO_CB(p)->same_flow = !diffs;
	}
}
4293 
/* Initialize the GRO control block offsets for a freshly received skb.
 * If all packet data lives in the first page fragment (empty linear part)
 * and that page is directly addressable, record it as frag0 so header
 * accesses can avoid pulling data into the linear area.
 */
static void skb_gro_reset_offset(struct sk_buff *skb)
{
	const struct skb_shared_info *pinfo = skb_shinfo(skb);
	const skb_frag_t *frag0 = &pinfo->frags[0];

	NAPI_GRO_CB(skb)->data_offset = 0;
	NAPI_GRO_CB(skb)->frag0 = NULL;
	NAPI_GRO_CB(skb)->frag0_len = 0;

	/* mac header at tail means the linear area is empty; highmem pages
	 * cannot be used since frag0 is accessed without kmap.
	 */
	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
	    pinfo->nr_frags &&
	    !PageHighMem(skb_frag_page(frag0))) {
		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
	}
}
4310 
/* Copy @grow bytes from frag0 into the skb's linear area, adjusting the
 * first fragment accordingly.  If the fragment becomes empty it is dropped
 * and the remaining frags are shifted down.
 */
static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
{
	struct skb_shared_info *pinfo = skb_shinfo(skb);

	/* caller must guarantee enough tailroom */
	BUG_ON(skb->end - skb->tail < grow);

	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

	/* bytes moved from paged data to linear data: skb->len unchanged */
	skb->data_len -= grow;
	skb->tail += grow;

	pinfo->frags[0].page_offset += grow;
	skb_frag_size_sub(&pinfo->frags[0], grow);

	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
		skb_frag_unref(skb, 0);
		memmove(pinfo->frags, pinfo->frags + 1,
			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
	}
}
4331 
/* Core GRO engine: try to merge @skb into one of the flows held on
 * napi->gro_list via the matching protocol offload, or start holding it
 * as a new flow.  Returns the GRO_* disposition for napi_skb_finish() /
 * napi_frags_finish().
 */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	enum gro_result ret;
	int grow;

	/* Device opted out of GRO: pass through */
	if (!(skb->dev->features & NETIF_F_GRO))
		goto normal;

	/* Already-segmented, frag-listed or bad-checksum packets are not
	 * candidates for aggregation.
	 */
	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
		goto normal;

	gro_list_prepare(napi, skb);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;
		NAPI_GRO_CB(skb)->udp_mark = 0;
		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;

		/* Setup for GRO checksum validation */
		switch (skb->ip_summed) {
		case CHECKSUM_COMPLETE:
			NAPI_GRO_CB(skb)->csum = skb->csum;
			NAPI_GRO_CB(skb)->csum_valid = 1;
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			break;
		case CHECKSUM_UNNECESSARY:
			/* csum_cnt: number of checksums the hw validated */
			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
			NAPI_GRO_CB(skb)->csum_valid = 0;
			break;
		default:
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			NAPI_GRO_CB(skb)->csum_valid = 0;
		}

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* loop ran to completion: no offload handler for this protocol */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/* gro_receive returned a flow that must be flushed now (e.g. it is
	 * complete): unlink it and push it up the stack.
	 */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	/* skb was merged into an existing held flow */
	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush)
		goto normal;

	/* New flow.  If the list is full, evict the oldest held flow (the
	 * tail, since the list is kept youngest-first).
	 */
	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
		struct sk_buff *nskb = napi->gro_list;

		/* locate the end of the list to select the 'oldest' flow */
		while (nskb->next) {
			pp = &nskb->next;
			nskb = *pp;
		}
		*pp = NULL;
		nskb->next = NULL;
		napi_gro_complete(nskb);
	} else {
		napi->gro_count++;
	}
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	NAPI_GRO_CB(skb)->last = skb;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	/* If header parsing consumed bytes still sitting in frag0, pull
	 * them into the linear area before handing the skb onwards.
	 */
	grow = skb_gro_offset(skb) - skb_headlen(skb);
	if (grow > 0)
		gro_pull_from_frag0(skb, grow);
ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}
4438 
4439 struct packet_offload *gro_find_receive_by_type(__be16 type)
4440 {
4441 	struct list_head *offload_head = &offload_base;
4442 	struct packet_offload *ptype;
4443 
4444 	list_for_each_entry_rcu(ptype, offload_head, list) {
4445 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4446 			continue;
4447 		return ptype;
4448 	}
4449 	return NULL;
4450 }
4451 EXPORT_SYMBOL(gro_find_receive_by_type);
4452 
4453 struct packet_offload *gro_find_complete_by_type(__be16 type)
4454 {
4455 	struct list_head *offload_head = &offload_base;
4456 	struct packet_offload *ptype;
4457 
4458 	list_for_each_entry_rcu(ptype, offload_head, list) {
4459 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4460 			continue;
4461 		return ptype;
4462 	}
4463 	return NULL;
4464 }
4465 EXPORT_SYMBOL(gro_find_complete_by_type);
4466 
/* Act on the disposition returned by dev_gro_receive() for a linear-path
 * (napi_gro_receive) skb: deliver, drop, free, or leave it held.
 */
static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
	switch (ret) {
	case GRO_NORMAL:
		/* not aggregated: deliver to the stack now */
		if (netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
		kfree_skb(skb);
		break;

	case GRO_MERGED_FREE:
		/* data was merged into another skb; only the head (and
		 * possibly its data) remains to be freed.
		 */
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			kmem_cache_free(skbuff_head_cache, skb);
		else
			__kfree_skb(skb);
		break;

	case GRO_HELD:
	case GRO_MERGED:
		/* skb now lives on napi->gro_list (or in another skb) */
		break;
	}

	return ret;
}
4493 
/* Driver entry point for GRO on skbs with a populated linear header area.
 * Returns the GRO_* disposition of the packet.
 */
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	skb_mark_napi_id(skb, napi);
	trace_napi_gro_receive_entry(skb);

	skb_gro_reset_offset(skb);

	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);
4504 
/* Recycle an skb handed out by napi_get_frags() so the driver can reuse
 * it for the next packet instead of allocating a fresh one.
 */
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
	/* pfmemalloc skbs come from emergency reserves: give them back
	 * rather than caching them.
	 */
	if (unlikely(skb->pfmemalloc)) {
		consume_skb(skb);
		return;
	}
	__skb_pull(skb, skb_headlen(skb));
	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
	skb->vlan_tci = 0;
	skb->dev = napi->dev;
	skb->skb_iif = 0;
	skb->encapsulation = 0;
	skb_shinfo(skb)->gso_type = 0;
	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));

	napi->skb = skb;
}
4523 
4524 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4525 {
4526 	struct sk_buff *skb = napi->skb;
4527 
4528 	if (!skb) {
4529 		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4530 		if (skb) {
4531 			napi->skb = skb;
4532 			skb_mark_napi_id(skb, napi);
4533 		}
4534 	}
4535 	return skb;
4536 }
4537 EXPORT_SYMBOL(napi_get_frags);
4538 
/* Act on the disposition returned by dev_gro_receive() for a frags-path
 * (napi_gro_frags) skb: deliver, recycle via napi_reuse_skb(), or leave
 * it held on the GRO list.
 */
static gro_result_t napi_frags_finish(struct napi_struct *napi,
				      struct sk_buff *skb,
				      gro_result_t ret)
{
	switch (ret) {
	case GRO_NORMAL:
	case GRO_HELD:
		/* napi_frags_skb() pulled the Ethernet header; restore it
		 * and derive skb->protocol the normal way before delivery.
		 */
		__skb_push(skb, ETH_HLEN);
		skb->protocol = eth_type_trans(skb, skb->dev);
		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
	case GRO_MERGED_FREE:
		/* skb head is no longer needed: cache it for reuse */
		napi_reuse_skb(napi, skb);
		break;

	case GRO_MERGED:
		break;
	}

	return ret;
}
4563 
/* Upper GRO stack assumes network header starts at gro_offset=0
 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 * We copy ethernet header into skb->data to have a common layout.
 */
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;
	const struct ethhdr *eth;
	unsigned int hlen = sizeof(*eth);

	napi->skb = NULL;

	skb_reset_mac_header(skb);
	skb_gro_reset_offset(skb);

	eth = skb_gro_header_fast(skb, 0);
	if (unlikely(skb_gro_header_hard(skb, hlen))) {
		/* Ethernet header spans fragments: take the slow pull path */
		eth = skb_gro_header_slow(skb, hlen, 0);
		if (unlikely(!eth)) {
			/* malformed packet; recycle the skb and bail out */
			napi_reuse_skb(napi, skb);
			return NULL;
		}
	} else {
		/* Header fully inside frag0: copy it into the linear area
		 * and advance the frag0 window past it.
		 */
		gro_pull_from_frag0(skb, hlen);
		NAPI_GRO_CB(skb)->frag0 += hlen;
		NAPI_GRO_CB(skb)->frag0_len -= hlen;
	}
	__skb_pull(skb, hlen);

	/*
	 * This works because the only protocols we care about don't require
	 * special handling.
	 * We'll fix it up properly in napi_frags_finish()
	 */
	skb->protocol = eth->h_proto;

	return skb;
}
4602 
/* Driver entry point for GRO on pure-frags skbs previously obtained via
 * napi_get_frags().  Returns the GRO_* disposition of the packet.
 */
gro_result_t napi_gro_frags(struct napi_struct *napi)
{
	struct sk_buff *skb = napi_frags_skb(napi);

	if (!skb)
		return GRO_DROP;

	trace_napi_gro_frags_entry(skb);

	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
}
EXPORT_SYMBOL(napi_gro_frags);
4615 
/* Compute the checksum from gro_offset and return the folded value
 * after adding in any pseudo checksum.
 */
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
{
	__wsum wsum;
	__sum16 sum;

	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);

	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
	if (likely(!sum)) {
		/* hardware claimed a full checksum but software says it is
		 * fine: report the discrepancy unless software already
		 * completed the checksum itself.
		 */
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
			netdev_rx_csum_fault(skb->dev);
	}

	/* cache the packet checksum for subsequent GRO processing */
	NAPI_GRO_CB(skb)->csum = wsum;
	NAPI_GRO_CB(skb)->csum_valid = 1;

	return sum;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);
4640 
4641 /*
4642  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4643  * Note: called with local irq disabled, but exits with local irq enabled.
4644  */
4645 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4646 {
4647 #ifdef CONFIG_RPS
4648 	struct softnet_data *remsd = sd->rps_ipi_list;
4649 
4650 	if (remsd) {
4651 		sd->rps_ipi_list = NULL;
4652 
4653 		local_irq_enable();
4654 
4655 		/* Send pending IPI's to kick RPS processing on remote cpus. */
4656 		while (remsd) {
4657 			struct softnet_data *next = remsd->rps_ipi_next;
4658 
4659 			if (cpu_online(remsd->cpu))
4660 				smp_call_function_single_async(remsd->cpu,
4661 							   &remsd->csd);
4662 			remsd = next;
4663 		}
4664 	} else
4665 #endif
4666 		local_irq_enable();
4667 }
4668 
/* True when this cpu has queued RPS IPIs that still need to be sent
 * (always false when RPS is compiled out).
 */
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	return sd->rps_ipi_list != NULL;
#else
	return false;
#endif
}
4677 
/* NAPI poll callback for the per-cpu backlog device: drain packets queued
 * by netif_rx()/RPS into the stack, up to @quota.  Returns the number of
 * packets processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd_has_rps_ipi_waiting(sd)) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}

	napi->weight = weight_p;
	local_irq_disable();
	while (1) {
		struct sk_buff *skb;

		/* drain process_queue with irqs enabled around the actual
		 * protocol processing; queue manipulation stays irq-safe.
		 */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			rcu_read_lock();
			local_irq_enable();
			__netif_receive_skb(skb);
			rcu_read_unlock();
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		rps_lock(sd);
		if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			napi->state = 0;
			rps_unlock(sd);

			break;
		}

		/* refill process_queue from the shared input queue and
		 * loop to process it.
		 */
		skb_queue_splice_tail_init(&sd->input_pkt_queue,
					   &sd->process_queue);
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
4733 
4734 /**
4735  * __napi_schedule - schedule for receive
4736  * @n: entry to schedule
4737  *
4738  * The entry's receive function will be scheduled to run.
4739  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4740  */
4741 void __napi_schedule(struct napi_struct *n)
4742 {
4743 	unsigned long flags;
4744 
4745 	local_irq_save(flags);
4746 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4747 	local_irq_restore(flags);
4748 }
4749 EXPORT_SYMBOL(__napi_schedule);
4750 
4751 /**
4752  * __napi_schedule_irqoff - schedule for receive
4753  * @n: entry to schedule
4754  *
4755  * Variant of __napi_schedule() assuming hard irqs are masked
4756  */
4757 void __napi_schedule_irqoff(struct napi_struct *n)
4758 {
4759 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4760 }
4761 EXPORT_SYMBOL(__napi_schedule_irqoff);
4762 
/* Remove @n from the poll list and release NAPI_STATE_SCHED.
 * Caller must hold irqs off (poll list is manipulated under irq-off).
 */
void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));

	list_del_init(&n->poll_list);
	/* list removal must be visible before SCHED is cleared, so that a
	 * re-schedule observes a consistent state.
	 */
	smp_mb__before_atomic();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
4772 
/* Mark NAPI processing of @n as complete, flushing (or arming a timer to
 * flush) any packets still held for GRO.  @work_done is the number of
 * packets processed in this poll round.
 */
void napi_complete_done(struct napi_struct *n, int work_done)
{
	unsigned long flags;

	/*
	 * don't let napi dequeue from the cpu poll list
	 * just in case its running on a different cpu
	 */
	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
		return;

	if (n->gro_list) {
		unsigned long timeout = 0;

		/* only defer the flush when this round actually saw
		 * traffic and the device configured a flush timeout.
		 */
		if (work_done)
			timeout = n->dev->gro_flush_timeout;

		if (timeout)
			hrtimer_start(&n->timer, ns_to_ktime(timeout),
				      HRTIMER_MODE_REL_PINNED);
		else
			napi_gro_flush(n, false);
	}
	if (likely(list_empty(&n->poll_list))) {
		/* not queued anywhere: a plain clear of SCHED suffices */
		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
	} else {
		/* If n->poll_list is not empty, we need to mask irqs */
		local_irq_save(flags);
		__napi_complete(n);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(napi_complete_done);
4806 
4807 /* must be called under rcu_read_lock(), as we dont take a reference */
4808 static struct napi_struct *napi_by_id(unsigned int napi_id)
4809 {
4810 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4811 	struct napi_struct *napi;
4812 
4813 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4814 		if (napi->napi_id == napi_id)
4815 			return napi;
4816 
4817 	return NULL;
4818 }
4819 
#if defined(CONFIG_NET_RX_BUSY_POLL)
#define BUSY_POLL_BUDGET 8
/* Busy-poll the NAPI context associated with @sk until data arrives, the
 * loop times out, or (when @nonblock) a single pass completes.
 * Returns true when the socket receive queue is non-empty.
 */
bool sk_busy_loop(struct sock *sk, int nonblock)
{
	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
	int (*busy_poll)(struct napi_struct *dev);
	struct napi_struct *napi;
	int rc = false;

	rcu_read_lock();

	napi = napi_by_id(sk->sk_napi_id);
	if (!napi)
		goto out;

	/* Note: ndo_busy_poll method is optional in linux-4.5 */
	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;

	do {
		rc = 0;
		local_bh_disable();
		if (busy_poll) {
			/* driver provides its own busy-poll fast path */
			rc = busy_poll(napi);
		} else if (napi_schedule_prep(napi)) {
			/* we own the NAPI context: run its poll routine
			 * directly with a small budget.
			 */
			void *have = netpoll_poll_lock(napi);

			if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
				rc = napi->poll(napi, BUSY_POLL_BUDGET);
				trace_napi_poll(napi);
				if (rc == BUSY_POLL_BUDGET) {
					/* budget exhausted: complete and
					 * hand back to softirq processing.
					 */
					napi_complete_done(napi, rc);
					napi_schedule(napi);
				}
			}
			netpoll_poll_unlock(have);
		}
		if (rc > 0)
			NET_ADD_STATS_BH(sock_net(sk),
					 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
		local_bh_enable();

		if (rc == LL_FLUSH_FAILED)
			break; /* permanent failure */

		cpu_relax();
	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
		 !need_resched() && !busy_loop_timeout(end_time));

	rc = !skb_queue_empty(&sk->sk_receive_queue);
out:
	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL(sk_busy_loop);

#endif /* CONFIG_NET_RX_BUSY_POLL */
4876 
/* Assign @napi a unique napi_id and insert it into the global napi hash
 * so busy polling can find it.  No-op if busy polling is disabled for
 * this context or it is already hashed.
 */
void napi_hash_add(struct napi_struct *napi)
{
	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
		return;

	spin_lock(&napi_hash_lock);

	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
	do {
		/* handles wrap-around of the generation counter too */
		if (unlikely(++napi_gen_id < NR_CPUS + 1))
			napi_gen_id = NR_CPUS + 1;
	} while (napi_by_id(napi_gen_id));
	napi->napi_id = napi_gen_id;

	hlist_add_head_rcu(&napi->napi_hash_node,
			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);

	spin_unlock(&napi_hash_lock);
}
EXPORT_SYMBOL_GPL(napi_hash_add);
4898 
/* Warning : caller is responsible to make sure rcu grace period
 * is respected before freeing memory containing @napi
 */
bool napi_hash_del(struct napi_struct *napi)
{
	bool rcu_sync_needed = false;

	spin_lock(&napi_hash_lock);

	/* only unhash if we were the ones holding the HASHED bit */
	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
		rcu_sync_needed = true;
		hlist_del_rcu(&napi->napi_hash_node);
	}
	spin_unlock(&napi_hash_lock);
	/* true when RCU readers may still see the hash node */
	return rcu_sync_needed;
}
EXPORT_SYMBOL_GPL(napi_hash_del);
4916 
/* gro_flush_timeout timer callback: reschedule the NAPI context so any
 * still-held GRO packets get flushed by a new poll round.
 */
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
	struct napi_struct *napi;

	napi = container_of(timer, struct napi_struct, timer);
	if (napi->gro_list)
		napi_schedule(napi);

	return HRTIMER_NORESTART;
}
4927 
/**
 *	netif_napi_add - initialize a NAPI context
 *	@dev: network device
 *	@napi: NAPI context to initialize
 *	@poll: polling function
 *	@weight: poll budget per invocation
 *
 *	Initializes @napi, links it to @dev and registers it in the napi
 *	hash.  The context starts with NAPI_STATE_SCHED set, i.e. disabled
 *	until napi_enable()/napi_schedule() logic releases it.
 */
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
		    int (*poll)(struct napi_struct *, int), int weight)
{
	INIT_LIST_HEAD(&napi->poll_list);
	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	napi->timer.function = napi_watchdog;
	napi->gro_count = 0;
	napi->gro_list = NULL;
	napi->skb = NULL;
	napi->poll = poll;
	if (weight > NAPI_POLL_WEIGHT)
		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
			    weight, dev->name);
	napi->weight = weight;
	list_add(&napi->dev_list, &dev->napi_list);
	napi->dev = dev;
#ifdef CONFIG_NETPOLL
	spin_lock_init(&napi->poll_lock);
	napi->poll_owner = -1;
#endif
	set_bit(NAPI_STATE_SCHED, &napi->state);
	napi_hash_add(napi);
}
EXPORT_SYMBOL(netif_napi_add);
4952 
/**
 *	napi_disable - prevent NAPI from scheduling
 *	@n: NAPI context
 *
 *	Stops NAPI from being scheduled on this context and waits until any
 *	outstanding poll completes.  May sleep; process context only.
 */
void napi_disable(struct napi_struct *n)
{
	might_sleep();
	set_bit(NAPI_STATE_DISABLE, &n->state);

	/* wait for ownership of both the SCHED and NPSVC bits */
	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
		msleep(1);
	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
		msleep(1);

	/* stop a pending gro_flush_timeout watchdog */
	hrtimer_cancel(&n->timer);

	clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
4968 
/* Must be called in process context */
void netif_napi_del(struct napi_struct *napi)
{
	might_sleep();
	/* wait out RCU readers if the context was still hashed */
	if (napi_hash_del(napi))
		synchronize_net();
	list_del_init(&napi->dev_list);
	/* release any cached napi_get_frags() skb */
	napi_free_frags(napi);

	/* drop packets still held for GRO */
	kfree_skb_list(napi->gro_list);
	napi->gro_list = NULL;
	napi->gro_count = 0;
}
EXPORT_SYMBOL(netif_napi_del);
4983 
/* Run one poll round for @n with its configured weight.  If it exhausted
 * the budget and is still active, move it to @repoll for another round.
 * Returns the number of packets the driver reported as processed.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list);

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi().  Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call.  Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
		work = n->poll(n, weight);
		trace_napi_poll(n);
	}

	WARN_ON_ONCE(work > weight);

	/* work < weight means the driver called napi_complete itself */
	if (likely(work < weight))
		goto out_unlock;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight.  In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		goto out_unlock;
	}

	if (n->gro_list) {
		/* flush too old packets
		 * If HZ < 1000, flush all packets.
		 */
		napi_gro_flush(n, HZ >= 1000);
	}

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		goto out_unlock;
	}

	list_add_tail(&n->poll_list, repoll);

out_unlock:
	netpoll_poll_unlock(have);

	return work;
}
5045 
/* NET_RX softirq handler: poll each scheduled NAPI context on this cpu
 * until the global budget or the 2-jiffy time limit is exhausted, then
 * re-raise the softirq for whatever is left.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	/* snapshot the poll list so new arrivals don't extend this run */
	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		if (list_empty(&list)) {
			/* fully drained and nothing to repoll: done */
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				return;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	/* merge leftovers and repoll candidates back onto the poll list */
	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	/* re-enables irqs and kicks remote cpus for RPS */
	net_rps_action_and_irq_enable(sd);
}
5091 
/* One entry in a device's upper/lower adjacency lists, describing a link
 * to a neighbouring (stacked) net_device.
 */
struct netdev_adjacent {
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	struct list_head list;
	struct rcu_head rcu;
};
5107 
5108 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5109 						 struct list_head *adj_list)
5110 {
5111 	struct netdev_adjacent *adj;
5112 
5113 	list_for_each_entry(adj, adj_list, list) {
5114 		if (adj->dev == adj_dev)
5115 			return adj;
5116 	}
5117 	return NULL;
5118 }
5119 
5120 /**
5121  * netdev_has_upper_dev - Check if device is linked to an upper device
5122  * @dev: device
5123  * @upper_dev: upper device to check
5124  *
5125  * Find out if a device is linked to specified upper device and return true
5126  * in case it is. Note that this checks only immediate upper device,
5127  * not through a complete stack of devices. The caller must hold the RTNL lock.
5128  */
5129 bool netdev_has_upper_dev(struct net_device *dev,
5130 			  struct net_device *upper_dev)
5131 {
5132 	ASSERT_RTNL();
5133 
5134 	return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5135 }
5136 EXPORT_SYMBOL(netdev_has_upper_dev);
5137 
5138 /**
5139  * netdev_has_any_upper_dev - Check if device is linked to some device
5140  * @dev: device
5141  *
5142  * Find out if a device is linked to an upper device and return true in case
5143  * it is. The caller must hold the RTNL lock.
5144  */
5145 static bool netdev_has_any_upper_dev(struct net_device *dev)
5146 {
5147 	ASSERT_RTNL();
5148 
5149 	return !list_empty(&dev->all_adj_list.upper);
5150 }
5151 
5152 /**
5153  * netdev_master_upper_dev_get - Get master upper device
5154  * @dev: device
5155  *
5156  * Find a master upper device and return pointer to it or NULL in case
5157  * it's not there. The caller must hold the RTNL lock.
5158  */
5159 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5160 {
5161 	struct netdev_adjacent *upper;
5162 
5163 	ASSERT_RTNL();
5164 
5165 	if (list_empty(&dev->adj_list.upper))
5166 		return NULL;
5167 
5168 	upper = list_first_entry(&dev->adj_list.upper,
5169 				 struct netdev_adjacent, list);
5170 	if (likely(upper->master))
5171 		return upper->dev;
5172 	return NULL;
5173 }
5174 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5175 
5176 void *netdev_adjacent_get_private(struct list_head *adj_list)
5177 {
5178 	struct netdev_adjacent *adj;
5179 
5180 	adj = list_entry(adj_list, struct netdev_adjacent, list);
5181 
5182 	return adj->private;
5183 }
5184 EXPORT_SYMBOL(netdev_adjacent_get_private);
5185 
5186 /**
5187  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5188  * @dev: device
5189  * @iter: list_head ** of the current position
5190  *
5191  * Gets the next device from the dev's upper list, starting from iter
5192  * position. The caller must hold RCU read lock.
5193  */
5194 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5195 						 struct list_head **iter)
5196 {
5197 	struct netdev_adjacent *upper;
5198 
5199 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5200 
5201 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5202 
5203 	if (&upper->list == &dev->adj_list.upper)
5204 		return NULL;
5205 
5206 	*iter = &upper->list;
5207 
5208 	return upper->dev;
5209 }
5210 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5211 
5212 /**
5213  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5214  * @dev: device
5215  * @iter: list_head ** of the current position
5216  *
5217  * Gets the next device from the dev's upper list, starting from iter
5218  * position. The caller must hold RCU read lock.
5219  */
5220 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5221 						     struct list_head **iter)
5222 {
5223 	struct netdev_adjacent *upper;
5224 
5225 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5226 
5227 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5228 
5229 	if (&upper->list == &dev->all_adj_list.upper)
5230 		return NULL;
5231 
5232 	*iter = &upper->list;
5233 
5234 	return upper->dev;
5235 }
5236 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5237 
5238 /**
5239  * netdev_lower_get_next_private - Get the next ->private from the
5240  *				   lower neighbour list
5241  * @dev: device
5242  * @iter: list_head ** of the current position
5243  *
5244  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5245  * list, starting from iter position. The caller must hold either hold the
5246  * RTNL lock or its own locking that guarantees that the neighbour lower
5247  * list will remain unchanged.
5248  */
5249 void *netdev_lower_get_next_private(struct net_device *dev,
5250 				    struct list_head **iter)
5251 {
5252 	struct netdev_adjacent *lower;
5253 
5254 	lower = list_entry(*iter, struct netdev_adjacent, list);
5255 
5256 	if (&lower->list == &dev->adj_list.lower)
5257 		return NULL;
5258 
5259 	*iter = lower->list.next;
5260 
5261 	return lower->private;
5262 }
5263 EXPORT_SYMBOL(netdev_lower_get_next_private);
5264 
5265 /**
5266  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5267  *				       lower neighbour list, RCU
5268  *				       variant
5269  * @dev: device
5270  * @iter: list_head ** of the current position
5271  *
5272  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5273  * list, starting from iter position. The caller must hold RCU read lock.
5274  */
5275 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5276 					struct list_head **iter)
5277 {
5278 	struct netdev_adjacent *lower;
5279 
5280 	WARN_ON_ONCE(!rcu_read_lock_held());
5281 
5282 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5283 
5284 	if (&lower->list == &dev->adj_list.lower)
5285 		return NULL;
5286 
5287 	*iter = &lower->list;
5288 
5289 	return lower->private;
5290 }
5291 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5292 
5293 /**
5294  * netdev_lower_get_next - Get the next device from the lower neighbour
5295  *                         list
5296  * @dev: device
5297  * @iter: list_head ** of the current position
5298  *
5299  * Gets the next netdev_adjacent from the dev's lower neighbour
5300  * list, starting from iter position. The caller must hold RTNL lock or
5301  * its own locking that guarantees that the neighbour lower
5302  * list will remain unchanged.
5303  */
5304 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5305 {
5306 	struct netdev_adjacent *lower;
5307 
5308 	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5309 
5310 	if (&lower->list == &dev->adj_list.lower)
5311 		return NULL;
5312 
5313 	*iter = &lower->list;
5314 
5315 	return lower->dev;
5316 }
5317 EXPORT_SYMBOL(netdev_lower_get_next);
5318 
5319 /**
5320  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5321  *				       lower neighbour list, RCU
5322  *				       variant
5323  * @dev: device
5324  *
5325  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5326  * list. The caller must hold RCU read lock.
5327  */
5328 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5329 {
5330 	struct netdev_adjacent *lower;
5331 
5332 	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5333 			struct netdev_adjacent, list);
5334 	if (lower)
5335 		return lower->private;
5336 	return NULL;
5337 }
5338 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5339 
5340 /**
5341  * netdev_master_upper_dev_get_rcu - Get master upper device
5342  * @dev: device
5343  *
5344  * Find a master upper device and return pointer to it or NULL in case
5345  * it's not there. The caller must hold the RCU read lock.
5346  */
5347 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5348 {
5349 	struct netdev_adjacent *upper;
5350 
5351 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5352 				       struct netdev_adjacent, list);
5353 	if (upper && likely(upper->master))
5354 		return upper->dev;
5355 	return NULL;
5356 }
5357 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5358 
5359 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5360 			      struct net_device *adj_dev,
5361 			      struct list_head *dev_list)
5362 {
5363 	char linkname[IFNAMSIZ+7];
5364 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5365 		"upper_%s" : "lower_%s", adj_dev->name);
5366 	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5367 				 linkname);
5368 }
5369 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5370 			       char *name,
5371 			       struct list_head *dev_list)
5372 {
5373 	char linkname[IFNAMSIZ+7];
5374 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5375 		"upper_%s" : "lower_%s", name);
5376 	sysfs_remove_link(&(dev->dev.kobj), linkname);
5377 }
5378 
5379 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5380 						 struct net_device *adj_dev,
5381 						 struct list_head *dev_list)
5382 {
5383 	return (dev_list == &dev->adj_list.upper ||
5384 		dev_list == &dev->adj_list.lower) &&
5385 		net_eq(dev_net(dev), dev_net(adj_dev));
5386 }
5387 
/* Record @adj_dev as adjacent to @dev on @dev_list (one of @dev's
 * adj_list/all_adj_list heads).  If an entry already exists just take
 * another reference.  A new entry pins @adj_dev via dev_hold(), gets a
 * sysfs link when it is a direct neighbour in the same netns, and is
 * placed first in the list when @master is set.  Returns 0 or a
 * negative errno; on failure everything done here is undone.
 */
static int __netdev_adjacent_dev_insert(struct net_device *dev,
					struct net_device *adj_dev,
					struct list_head *dev_list,
					void *private, bool master)
{
	struct netdev_adjacent *adj;
	int ret;

	adj = __netdev_find_adj(adj_dev, dev_list);

	if (adj) {
		/* Already adjacent: just bump the reference count. */
		adj->ref_nr++;
		return 0;
	}

	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
	if (!adj)
		return -ENOMEM;

	adj->dev = adj_dev;
	adj->master = master;
	adj->ref_nr = 1;
	adj->private = private;
	dev_hold(adj_dev);

	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
		 adj_dev->name, dev->name, adj_dev->name);

	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
		if (ret)
			goto free_adj;
	}

	/* Ensure that master link is always the first item in list. */
	if (master) {
		ret = sysfs_create_link(&(dev->dev.kobj),
					&(adj_dev->dev.kobj), "master");
		if (ret)
			goto remove_symlinks;

		list_add_rcu(&adj->list, dev_list);
	} else {
		list_add_tail_rcu(&adj->list, dev_list);
	}

	return 0;

remove_symlinks:
	/* Undo the neighbour symlink created above, then the hold/alloc. */
	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
	kfree(adj);
	dev_put(adj_dev);

	return ret;
}
5445 
/* Drop one reference to @adj_dev's adjacency entry on @dev's @dev_list.
 * The entry is only torn down when the last reference goes away: the
 * sysfs links are removed, the entry is unlinked RCU-safely, and the
 * dev_hold() taken at insert time is released.
 */
static void __netdev_adjacent_dev_remove(struct net_device *dev,
					 struct net_device *adj_dev,
					 struct list_head *dev_list)
{
	struct netdev_adjacent *adj;

	adj = __netdev_find_adj(adj_dev, dev_list);

	if (!adj) {
		/* Removing a link that was never inserted is a fatal bug. */
		pr_err("tried to remove device %s from %s\n",
		       dev->name, adj_dev->name);
		BUG();
	}

	if (adj->ref_nr > 1) {
		/* Other users remain; just drop one reference. */
		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
			 adj->ref_nr-1);
		adj->ref_nr--;
		return;
	}

	if (adj->master)
		sysfs_remove_link(&(dev->dev.kobj), "master");

	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);

	list_del_rcu(&adj->list);
	pr_debug("dev_put for %s, because link removed from %s to %s\n",
		 adj_dev->name, dev->name, adj_dev->name);
	dev_put(adj_dev);
	/* Defer the free until concurrent RCU walkers are done. */
	kfree_rcu(adj, rcu);
}
5479 
/* Create the symmetric pair of adjacency entries: @upper_dev on @dev's
 * @up_list and @dev on @upper_dev's @down_list.  If the second insert
 * fails the first is rolled back, so the pair is created atomically
 * from the caller's point of view.
 */
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
					    struct net_device *upper_dev,
					    struct list_head *up_list,
					    struct list_head *down_list,
					    void *private, bool master)
{
	int ret;

	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
					   master);
	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
					   false);
	if (ret) {
		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
		return ret;
	}

	return 0;
}
5502 
/* Link @dev and @upper_dev in the transitive (all_adj_list) graph only;
 * such links are non-neighbour and carry no private data or master flag.
 */
static int __netdev_adjacent_dev_link(struct net_device *dev,
				      struct net_device *upper_dev)
{
	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
						&dev->all_adj_list.upper,
						&upper_dev->all_adj_list.lower,
						NULL, false);
}
5511 
/* Remove the symmetric pair of adjacency entries created by
 * __netdev_adjacent_dev_link_lists().
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
5520 
/* Undo __netdev_adjacent_dev_link(): drop the transitive-graph link
 * between @dev and @upper_dev.
 */
static void __netdev_adjacent_dev_unlink(struct net_device *dev,
					 struct net_device *upper_dev)
{
	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
					   &dev->all_adj_list.upper,
					   &upper_dev->all_adj_list.lower);
}
5528 
/* Link @dev and @upper_dev both in the transitive graph and as direct
 * neighbours (adj_list), with @private/@master applied to the neighbour
 * entries.  Rolls the transitive link back if the neighbour link fails.
 */
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
						struct net_device *upper_dev,
						void *private, bool master)
{
	int ret = __netdev_adjacent_dev_link(dev, upper_dev);

	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
					       &dev->adj_list.upper,
					       &upper_dev->adj_list.lower,
					       private, master);
	if (ret) {
		__netdev_adjacent_dev_unlink(dev, upper_dev);
		return ret;
	}

	return 0;
}
5549 
/* Undo __netdev_adjacent_dev_link_neighbour(): remove both the
 * transitive-graph link and the direct neighbour link.
 */
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
						   struct net_device *upper_dev)
{
	__netdev_adjacent_dev_unlink(dev, upper_dev);
	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
					   &dev->adj_list.upper,
					   &upper_dev->adj_list.lower);
}
5558 
/* Core linker for dev -> upper_dev.  Performs loop/duplicate checks,
 * fires the PRECHANGEUPPER/CHANGEUPPER notifiers, creates the direct
 * neighbour link, then interlinks the transitive closure of both
 * devices' all_adj_list graphs.  Any failure unwinds every link made
 * so far.  Returns 0 or a negative errno.  RTNL must be held.
 */
static int __netdev_upper_dev_link(struct net_device *dev,
				   struct net_device *upper_dev, bool master,
				   void *upper_priv, void *upper_info)
{
	struct netdev_notifier_changeupper_info changeupper_info;
	struct netdev_adjacent *i, *j, *to_i, *to_j;
	int ret = 0;

	ASSERT_RTNL();

	if (dev == upper_dev)
		return -EBUSY;

	/* To prevent loops, check if dev is not upper device to upper_dev. */
	if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
		return -EBUSY;

	if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
		return -EEXIST;

	if (master && netdev_master_upper_dev_get(dev))
		return -EBUSY;

	changeupper_info.upper_dev = upper_dev;
	changeupper_info.master = master;
	changeupper_info.linking = true;
	changeupper_info.upper_info = upper_info;

	/* Give listeners a chance to veto the link before any state changes. */
	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
					    &changeupper_info.info);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
						   master);
	if (ret)
		return ret;

	/* Now that we linked these devs, make all the upper_dev's
	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
	 * vice versa, and don't forget the devices itself. All of these
	 * links are non-neighbours.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			pr_debug("Interlinking %s with %s, non-neighbour\n",
				 i->dev->name, j->dev->name);
			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
			if (ret)
				goto rollback_mesh;
		}
	}

	/* add dev to every upper_dev's upper device */
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		pr_debug("linking %s's upper device %s with %s\n",
			 upper_dev->name, i->dev->name, dev->name);
		ret = __netdev_adjacent_dev_link(dev, i->dev);
		if (ret)
			goto rollback_upper_mesh;
	}

	/* add upper_dev to every dev's lower device */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		pr_debug("linking %s's lower device %s with %s\n", dev->name,
			 i->dev->name, upper_dev->name);
		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
		if (ret)
			goto rollback_lower_mesh;
	}

	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
					    &changeupper_info.info);
	ret = notifier_to_errno(ret);
	if (ret)
		goto rollback_lower_mesh;

	return 0;

	/* The rollback labels unwind the loops above in reverse order;
	 * to_i/to_j mark how far each loop got before the failure, and the
	 * fall-through between labels completes the earlier stages.
	 */
rollback_lower_mesh:
	to_i = i;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
	}

	i = NULL;

rollback_upper_mesh:
	to_i = i;
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(dev, i->dev);
	}

	i = j = NULL;

rollback_mesh:
	to_i = i;
	to_j = j;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			if (i == to_i && j == to_j)
				break;
			__netdev_adjacent_dev_unlink(i->dev, j->dev);
		}
		if (i == to_i)
			break;
	}

	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	return ret;
}
5676 
5677 /**
5678  * netdev_upper_dev_link - Add a link to the upper device
5679  * @dev: device
5680  * @upper_dev: new upper device
5681  *
5682  * Adds a link to device which is upper to this one. The caller must hold
5683  * the RTNL lock. On a failure a negative errno code is returned.
5684  * On success the reference counts are adjusted and the function
5685  * returns zero.
5686  */
5687 int netdev_upper_dev_link(struct net_device *dev,
5688 			  struct net_device *upper_dev)
5689 {
5690 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5691 }
5692 EXPORT_SYMBOL(netdev_upper_dev_link);
5693 
5694 /**
5695  * netdev_master_upper_dev_link - Add a master link to the upper device
5696  * @dev: device
5697  * @upper_dev: new upper device
5698  * @upper_priv: upper device private
5699  * @upper_info: upper info to be passed down via notifier
5700  *
5701  * Adds a link to device which is upper to this one. In this case, only
5702  * one master upper device can be linked, although other non-master devices
5703  * might be linked as well. The caller must hold the RTNL lock.
5704  * On a failure a negative errno code is returned. On success the reference
5705  * counts are adjusted and the function returns zero.
5706  */
5707 int netdev_master_upper_dev_link(struct net_device *dev,
5708 				 struct net_device *upper_dev,
5709 				 void *upper_priv, void *upper_info)
5710 {
5711 	return __netdev_upper_dev_link(dev, upper_dev, true,
5712 				       upper_priv, upper_info);
5713 }
5714 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5715 
5716 /**
5717  * netdev_upper_dev_unlink - Removes a link to upper device
5718  * @dev: device
5719  * @upper_dev: new upper device
5720  *
5721  * Removes a link to device which is upper to this one. The caller must hold
5722  * the RTNL lock.
5723  */
5724 void netdev_upper_dev_unlink(struct net_device *dev,
5725 			     struct net_device *upper_dev)
5726 {
5727 	struct netdev_notifier_changeupper_info changeupper_info;
5728 	struct netdev_adjacent *i, *j;
5729 	ASSERT_RTNL();
5730 
5731 	changeupper_info.upper_dev = upper_dev;
5732 	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5733 	changeupper_info.linking = false;
5734 
5735 	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5736 				      &changeupper_info.info);
5737 
5738 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5739 
5740 	/* Here is the tricky part. We must remove all dev's lower
5741 	 * devices from all upper_dev's upper devices and vice
5742 	 * versa, to maintain the graph relationship.
5743 	 */
5744 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5745 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5746 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5747 
5748 	/* remove also the devices itself from lower/upper device
5749 	 * list
5750 	 */
5751 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5752 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5753 
5754 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5755 		__netdev_adjacent_dev_unlink(dev, i->dev);
5756 
5757 	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5758 				      &changeupper_info.info);
5759 }
5760 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5761 
5762 /**
5763  * netdev_bonding_info_change - Dispatch event about slave change
5764  * @dev: device
5765  * @bonding_info: info to dispatch
5766  *
5767  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5768  * The caller must hold the RTNL lock.
5769  */
5770 void netdev_bonding_info_change(struct net_device *dev,
5771 				struct netdev_bonding_info *bonding_info)
5772 {
5773 	struct netdev_notifier_bonding_info	info;
5774 
5775 	memcpy(&info.bonding_info, bonding_info,
5776 	       sizeof(struct netdev_bonding_info));
5777 	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5778 				      &info.info);
5779 }
5780 EXPORT_SYMBOL(netdev_bonding_info_change);
5781 
5782 static void netdev_adjacent_add_links(struct net_device *dev)
5783 {
5784 	struct netdev_adjacent *iter;
5785 
5786 	struct net *net = dev_net(dev);
5787 
5788 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5789 		if (!net_eq(net,dev_net(iter->dev)))
5790 			continue;
5791 		netdev_adjacent_sysfs_add(iter->dev, dev,
5792 					  &iter->dev->adj_list.lower);
5793 		netdev_adjacent_sysfs_add(dev, iter->dev,
5794 					  &dev->adj_list.upper);
5795 	}
5796 
5797 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5798 		if (!net_eq(net,dev_net(iter->dev)))
5799 			continue;
5800 		netdev_adjacent_sysfs_add(iter->dev, dev,
5801 					  &iter->dev->adj_list.upper);
5802 		netdev_adjacent_sysfs_add(dev, iter->dev,
5803 					  &dev->adj_list.lower);
5804 	}
5805 }
5806 
5807 static void netdev_adjacent_del_links(struct net_device *dev)
5808 {
5809 	struct netdev_adjacent *iter;
5810 
5811 	struct net *net = dev_net(dev);
5812 
5813 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5814 		if (!net_eq(net,dev_net(iter->dev)))
5815 			continue;
5816 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5817 					  &iter->dev->adj_list.lower);
5818 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5819 					  &dev->adj_list.upper);
5820 	}
5821 
5822 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5823 		if (!net_eq(net,dev_net(iter->dev)))
5824 			continue;
5825 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5826 					  &iter->dev->adj_list.upper);
5827 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5828 					  &dev->adj_list.lower);
5829 	}
5830 }
5831 
5832 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5833 {
5834 	struct netdev_adjacent *iter;
5835 
5836 	struct net *net = dev_net(dev);
5837 
5838 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5839 		if (!net_eq(net,dev_net(iter->dev)))
5840 			continue;
5841 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5842 					  &iter->dev->adj_list.lower);
5843 		netdev_adjacent_sysfs_add(iter->dev, dev,
5844 					  &iter->dev->adj_list.lower);
5845 	}
5846 
5847 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5848 		if (!net_eq(net,dev_net(iter->dev)))
5849 			continue;
5850 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5851 					  &iter->dev->adj_list.upper);
5852 		netdev_adjacent_sysfs_add(iter->dev, dev,
5853 					  &iter->dev->adj_list.upper);
5854 	}
5855 }
5856 
5857 void *netdev_lower_dev_get_private(struct net_device *dev,
5858 				   struct net_device *lower_dev)
5859 {
5860 	struct netdev_adjacent *lower;
5861 
5862 	if (!lower_dev)
5863 		return NULL;
5864 	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5865 	if (!lower)
5866 		return NULL;
5867 
5868 	return lower->private;
5869 }
5870 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5871 
5872 
5873 int dev_get_nest_level(struct net_device *dev,
5874 		       bool (*type_check)(const struct net_device *dev))
5875 {
5876 	struct net_device *lower = NULL;
5877 	struct list_head *iter;
5878 	int max_nest = -1;
5879 	int nest;
5880 
5881 	ASSERT_RTNL();
5882 
5883 	netdev_for_each_lower_dev(dev, lower, iter) {
5884 		nest = dev_get_nest_level(lower, type_check);
5885 		if (max_nest < nest)
5886 			max_nest = nest;
5887 	}
5888 
5889 	if (type_check(dev))
5890 		max_nest++;
5891 
5892 	return max_nest;
5893 }
5894 EXPORT_SYMBOL(dev_get_nest_level);
5895 
5896 /**
5897  * netdev_lower_change - Dispatch event about lower device state change
5898  * @lower_dev: device
5899  * @lower_state_info: state to dispatch
5900  *
5901  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
5902  * The caller must hold the RTNL lock.
5903  */
5904 void netdev_lower_state_changed(struct net_device *lower_dev,
5905 				void *lower_state_info)
5906 {
5907 	struct netdev_notifier_changelowerstate_info changelowerstate_info;
5908 
5909 	ASSERT_RTNL();
5910 	changelowerstate_info.lower_state_info = lower_state_info;
5911 	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
5912 				      &changelowerstate_info.info);
5913 }
5914 EXPORT_SYMBOL(netdev_lower_state_changed);
5915 
5916 static void dev_change_rx_flags(struct net_device *dev, int flags)
5917 {
5918 	const struct net_device_ops *ops = dev->netdev_ops;
5919 
5920 	if (ops->ndo_change_rx_flags)
5921 		ops->ndo_change_rx_flags(dev, flags);
5922 }
5923 
/* Adjust dev->promiscuity by @inc and keep IFF_PROMISC in sync.  The
 * flag is set optimistically and cleared again when the counter lands
 * on zero; a counter overflow from a positive @inc is rejected with
 * -EOVERFLOW.  Transitions are logged, reported to the audit subsystem
 * and pushed to the driver; @notify additionally sends flag-change
 * notifications.  RTNL must be held.
 */
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags;
	kuid_t uid;
	kgid_t gid;

	ASSERT_RTNL();

	dev->flags |= IFF_PROMISC;
	dev->promiscuity += inc;
	if (dev->promiscuity == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch promisc and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_PROMISC;
		else {
			dev->promiscuity -= inc;
			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags != old_flags) {
		pr_info("device %s %s promiscuous mode\n",
			dev->name,
			dev->flags & IFF_PROMISC ? "entered" : "left");
		if (audit_enabled) {
			current_uid_gid(&uid, &gid);
			audit_log(current->audit_context, GFP_ATOMIC,
				AUDIT_ANOM_PROMISCUOUS,
				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
				dev->name, (dev->flags & IFF_PROMISC),
				(old_flags & IFF_PROMISC),
				from_kuid(&init_user_ns, audit_get_loginuid(current)),
				from_kuid(&init_user_ns, uid),
				from_kgid(&init_user_ns, gid),
				audit_get_sessionid(current));
		}

		dev_change_rx_flags(dev, IFF_PROMISC);
	}
	if (notify)
		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
	return 0;
}
5971 
5972 /**
5973  *	dev_set_promiscuity	- update promiscuity count on a device
5974  *	@dev: device
5975  *	@inc: modifier
5976  *
5977  *	Add or remove promiscuity from a device. While the count in the device
5978  *	remains above zero the interface remains promiscuous. Once it hits zero
5979  *	the device reverts back to normal filtering operation. A negative inc
5980  *	value is used to drop promiscuity on the device.
5981  *	Return 0 if successful or a negative errno code on error.
5982  */
5983 int dev_set_promiscuity(struct net_device *dev, int inc)
5984 {
5985 	unsigned int old_flags = dev->flags;
5986 	int err;
5987 
5988 	err = __dev_set_promiscuity(dev, inc, true);
5989 	if (err < 0)
5990 		return err;
5991 	if (dev->flags != old_flags)
5992 		dev_set_rx_mode(dev);
5993 	return err;
5994 }
5995 EXPORT_SYMBOL(dev_set_promiscuity);
5996 
/* Adjust dev->allmulti by @inc and keep IFF_ALLMULTI in sync, mirroring
 * __dev_set_promiscuity(): the flag is set optimistically and cleared
 * when the counter reaches zero, and a positive-@inc overflow returns
 * -EOVERFLOW.  @notify additionally sends flag-change notifications.
 * RTNL must be held.
 */
static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;

	ASSERT_RTNL();

	dev->flags |= IFF_ALLMULTI;
	dev->allmulti += inc;
	if (dev->allmulti == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch allmulti and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_ALLMULTI;
		else {
			dev->allmulti -= inc;
			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags ^ old_flags) {
		dev_change_rx_flags(dev, IFF_ALLMULTI);
		dev_set_rx_mode(dev);
		if (notify)
			__dev_notify_flags(dev, old_flags,
					   dev->gflags ^ old_gflags);
	}
	return 0;
}
6028 
6029 /**
6030  *	dev_set_allmulti	- update allmulti count on a device
6031  *	@dev: device
6032  *	@inc: modifier
6033  *
6034  *	Add or remove reception of all multicast frames to a device. While the
6035  *	count in the device remains above zero the interface remains listening
6036  *	to all interfaces. Once it hits zero the device reverts back to normal
6037  *	filtering operation. A negative @inc value is used to drop the counter
6038  *	when releasing a resource needing all multicasts.
6039  *	Return 0 if successful or a negative errno code on error.
6040  */
6041 
6042 int dev_set_allmulti(struct net_device *dev, int inc)
6043 {
6044 	return __dev_set_allmulti(dev, inc, true);
6045 }
6046 EXPORT_SYMBOL(dev_set_allmulti);
6047 
6048 /*
6049  *	Upload unicast and multicast address lists to device and
6050  *	configure RX filtering. When the device doesn't support unicast
6051  *	filtering it is put in promiscuous mode while unicast addresses
6052  *	are present.
6053  */
6054 void __dev_set_rx_mode(struct net_device *dev)
6055 {
6056 	const struct net_device_ops *ops = dev->netdev_ops;
6057 
6058 	/* dev_open will call this function so the list will stay sane. */
6059 	if (!(dev->flags&IFF_UP))
6060 		return;
6061 
6062 	if (!netif_device_present(dev))
6063 		return;
6064 
6065 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6066 		/* Unicast addresses changes may only happen under the rtnl,
6067 		 * therefore calling __dev_set_promiscuity here is safe.
6068 		 */
6069 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6070 			__dev_set_promiscuity(dev, 1, false);
6071 			dev->uc_promisc = true;
6072 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6073 			__dev_set_promiscuity(dev, -1, false);
6074 			dev->uc_promisc = false;
6075 		}
6076 	}
6077 
6078 	if (ops->ndo_set_rx_mode)
6079 		ops->ndo_set_rx_mode(dev);
6080 }
6081 
/* Locked wrapper: like __dev_set_rx_mode() but takes the device's
 * address-list lock itself.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
6088 
6089 /**
6090  *	dev_get_flags - get flags reported to userspace
6091  *	@dev: device
6092  *
6093  *	Get the combination of flag bits exported through APIs to userspace.
6094  */
6095 unsigned int dev_get_flags(const struct net_device *dev)
6096 {
6097 	unsigned int flags;
6098 
6099 	flags = (dev->flags & ~(IFF_PROMISC |
6100 				IFF_ALLMULTI |
6101 				IFF_RUNNING |
6102 				IFF_LOWER_UP |
6103 				IFF_DORMANT)) |
6104 		(dev->gflags & (IFF_PROMISC |
6105 				IFF_ALLMULTI));
6106 
6107 	if (netif_running(dev)) {
6108 		if (netif_oper_up(dev))
6109 			flags |= IFF_RUNNING;
6110 		if (netif_carrier_ok(dev))
6111 			flags |= IFF_LOWER_UP;
6112 		if (netif_dormant(dev))
6113 			flags |= IFF_DORMANT;
6114 	}
6115 
6116 	return flags;
6117 }
6118 EXPORT_SYMBOL(dev_get_flags);
6119 
6120 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6121 {
6122 	unsigned int old_flags = dev->flags;
6123 	int ret;
6124 
6125 	ASSERT_RTNL();
6126 
6127 	/*
6128 	 *	Set the flags on our device.
6129 	 */
6130 
6131 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6132 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6133 			       IFF_AUTOMEDIA)) |
6134 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6135 				    IFF_ALLMULTI));
6136 
6137 	/*
6138 	 *	Load in the correct multicast list now the flags have changed.
6139 	 */
6140 
6141 	if ((old_flags ^ flags) & IFF_MULTICAST)
6142 		dev_change_rx_flags(dev, IFF_MULTICAST);
6143 
6144 	dev_set_rx_mode(dev);
6145 
6146 	/*
6147 	 *	Have we downed the interface. We handle IFF_UP ourselves
6148 	 *	according to user attempts to set it, rather than blindly
6149 	 *	setting it.
6150 	 */
6151 
6152 	ret = 0;
6153 	if ((old_flags ^ flags) & IFF_UP)
6154 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6155 
6156 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6157 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6158 		unsigned int old_flags = dev->flags;
6159 
6160 		dev->gflags ^= IFF_PROMISC;
6161 
6162 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6163 			if (dev->flags != old_flags)
6164 				dev_set_rx_mode(dev);
6165 	}
6166 
6167 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6168 	   is important. Some (broken) drivers set IFF_PROMISC, when
6169 	   IFF_ALLMULTI is requested not asking us and not reporting.
6170 	 */
6171 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6172 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6173 
6174 		dev->gflags ^= IFF_ALLMULTI;
6175 		__dev_set_allmulti(dev, inc, false);
6176 	}
6177 
6178 	return ret;
6179 }
6180 
/* Emit the notifications matching a flags transition: an rtnetlink
 * message when user-visible bits changed (@gchanges), NETDEV_UP/DOWN
 * when IFF_UP flipped, and NETDEV_CHANGE for other significant bits
 * while the device is up.
 */
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
			unsigned int gchanges)
{
	unsigned int changes = dev->flags ^ old_flags;

	if (gchanges)
		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);

	if (changes & IFF_UP) {
		if (dev->flags & IFF_UP)
			call_netdevice_notifiers(NETDEV_UP, dev);
		else
			call_netdevice_notifiers(NETDEV_DOWN, dev);
	}

	if (dev->flags & IFF_UP &&
	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = changes;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
	}
}
6205 
6206 /**
6207  *	dev_change_flags - change device settings
6208  *	@dev: device
6209  *	@flags: device state flags
6210  *
6211  *	Change settings on device based state flags. The flags are
6212  *	in the userspace exported format.
6213  */
6214 int dev_change_flags(struct net_device *dev, unsigned int flags)
6215 {
6216 	int ret;
6217 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6218 
6219 	ret = __dev_change_flags(dev, flags);
6220 	if (ret < 0)
6221 		return ret;
6222 
6223 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6224 	__dev_notify_flags(dev, old_flags, changes);
6225 	return ret;
6226 }
6227 EXPORT_SYMBOL(dev_change_flags);
6228 
6229 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6230 {
6231 	const struct net_device_ops *ops = dev->netdev_ops;
6232 
6233 	if (ops->ndo_change_mtu)
6234 		return ops->ndo_change_mtu(dev, new_mtu);
6235 
6236 	dev->mtu = new_mtu;
6237 	return 0;
6238 }
6239 
6240 /**
6241  *	dev_set_mtu - Change maximum transfer unit
6242  *	@dev: device
6243  *	@new_mtu: new transfer unit
6244  *
6245  *	Change the maximum transfer size of the network device.
6246  */
6247 int dev_set_mtu(struct net_device *dev, int new_mtu)
6248 {
6249 	int err, orig_mtu;
6250 
6251 	if (new_mtu == dev->mtu)
6252 		return 0;
6253 
6254 	/*	MTU must be positive.	 */
6255 	if (new_mtu < 0)
6256 		return -EINVAL;
6257 
6258 	if (!netif_device_present(dev))
6259 		return -ENODEV;
6260 
6261 	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6262 	err = notifier_to_errno(err);
6263 	if (err)
6264 		return err;
6265 
6266 	orig_mtu = dev->mtu;
6267 	err = __dev_set_mtu(dev, new_mtu);
6268 
6269 	if (!err) {
6270 		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6271 		err = notifier_to_errno(err);
6272 		if (err) {
6273 			/* setting mtu back and notifying everyone again,
6274 			 * so that they have a chance to revert changes.
6275 			 */
6276 			__dev_set_mtu(dev, orig_mtu);
6277 			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6278 		}
6279 	}
6280 	return err;
6281 }
6282 EXPORT_SYMBOL(dev_set_mtu);
6283 
6284 /**
6285  *	dev_set_group - Change group this device belongs to
6286  *	@dev: device
6287  *	@new_group: group this device should belong to
6288  */
6289 void dev_set_group(struct net_device *dev, int new_group)
6290 {
6291 	dev->group = new_group;
6292 }
6293 EXPORT_SYMBOL(dev_set_group);
6294 
6295 /**
6296  *	dev_set_mac_address - Change Media Access Control Address
6297  *	@dev: device
6298  *	@sa: new address
6299  *
6300  *	Change the hardware (MAC) address of the device
6301  */
6302 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6303 {
6304 	const struct net_device_ops *ops = dev->netdev_ops;
6305 	int err;
6306 
6307 	if (!ops->ndo_set_mac_address)
6308 		return -EOPNOTSUPP;
6309 	if (sa->sa_family != dev->type)
6310 		return -EINVAL;
6311 	if (!netif_device_present(dev))
6312 		return -ENODEV;
6313 	err = ops->ndo_set_mac_address(dev, sa);
6314 	if (err)
6315 		return err;
6316 	dev->addr_assign_type = NET_ADDR_SET;
6317 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6318 	add_device_randomness(dev->dev_addr, dev->addr_len);
6319 	return 0;
6320 }
6321 EXPORT_SYMBOL(dev_set_mac_address);
6322 
6323 /**
6324  *	dev_change_carrier - Change device carrier
6325  *	@dev: device
6326  *	@new_carrier: new value
6327  *
6328  *	Change device carrier
6329  */
6330 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6331 {
6332 	const struct net_device_ops *ops = dev->netdev_ops;
6333 
6334 	if (!ops->ndo_change_carrier)
6335 		return -EOPNOTSUPP;
6336 	if (!netif_device_present(dev))
6337 		return -ENODEV;
6338 	return ops->ndo_change_carrier(dev, new_carrier);
6339 }
6340 EXPORT_SYMBOL(dev_change_carrier);
6341 
6342 /**
6343  *	dev_get_phys_port_id - Get device physical port ID
6344  *	@dev: device
6345  *	@ppid: port ID
6346  *
6347  *	Get device physical port ID
6348  */
6349 int dev_get_phys_port_id(struct net_device *dev,
6350 			 struct netdev_phys_item_id *ppid)
6351 {
6352 	const struct net_device_ops *ops = dev->netdev_ops;
6353 
6354 	if (!ops->ndo_get_phys_port_id)
6355 		return -EOPNOTSUPP;
6356 	return ops->ndo_get_phys_port_id(dev, ppid);
6357 }
6358 EXPORT_SYMBOL(dev_get_phys_port_id);
6359 
/**
 *	dev_get_phys_port_name - Get device physical port name
 *	@dev: device
 *	@name: port name
 *	@len: limit of bytes to copy to name
 *
 *	Get device physical port name
 *
 *	Returns -EOPNOTSUPP if the driver does not implement
 *	ndo_get_phys_port_name, otherwise the driver's return value.
 */
int dev_get_phys_port_name(struct net_device *dev,
			   char *name, size_t len)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_get_phys_port_name)
		return -EOPNOTSUPP;
	return ops->ndo_get_phys_port_name(dev, name, len);
}
EXPORT_SYMBOL(dev_get_phys_port_name);
6377 
/**
 *	dev_change_proto_down - update protocol port state information
 *	@dev: device
 *	@proto_down: new value
 *
 *	This info can be used by switch drivers to set the phys state of the
 *	port.
 *
 *	Returns -EOPNOTSUPP if the driver has no ndo_change_proto_down,
 *	-ENODEV if the device is not present, otherwise the driver's
 *	return value.
 */
int dev_change_proto_down(struct net_device *dev, bool proto_down)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_change_proto_down)
		return -EOPNOTSUPP;
	if (!netif_device_present(dev))
		return -ENODEV;
	return ops->ndo_change_proto_down(dev, proto_down);
}
EXPORT_SYMBOL(dev_change_proto_down);
6397 
6398 /**
6399  *	dev_new_index	-	allocate an ifindex
6400  *	@net: the applicable net namespace
6401  *
6402  *	Returns a suitable unique value for a new device interface
6403  *	number.  The caller must hold the rtnl semaphore or the
6404  *	dev_base_lock to be sure it remains unique.
6405  */
6406 static int dev_new_index(struct net *net)
6407 {
6408 	int ifindex = net->ifindex;
6409 	for (;;) {
6410 		if (++ifindex <= 0)
6411 			ifindex = 1;
6412 		if (!__dev_get_by_index(net, ifindex))
6413 			return net->ifindex = ifindex;
6414 	}
6415 }
6416 
/* Delayed registration/unregisteration */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);

/* Queue @dev for the teardown work netdev_run_todo() performs after
 * rtnl_unlock(); also bumps the per-netns unregister count that
 * netdev_run_todo() decrements when done.  No lock is taken here —
 * presumably serialized by RTNL; verify against callers.
 */
static void net_set_todo(struct net_device *dev)
{
	list_add_tail(&dev->todo_list, &net_todo_list);
	dev_net(dev)->dev_unreg_count++;
}
6426 
/* Common unregistration path for a list of devices linked on
 * ->unreg_list: close, unlink, notify, flush addresses, uninit, and
 * finally drop the registration reference.  Caller holds RTNL.
 */
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head, true);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
		/* Flush packets queued for this dev on every CPU's backlog. */
		on_each_cpu(flush_backlog, dev, 1);
	}

	/* Let in-flight readers of the old device list drain. */
	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		struct sk_buff *skb = NULL;

		/* Shutdown queueing discipline. */
		dev_shutdown(dev);


		/* Notify protocols, that we are about to destroy
		   this device. They should clean all the things.
		*/
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		/* Build the RTM_DELLINK message while device state is
		 * still intact; it is sent after ndo_uninit below.
		 */
		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
						     GFP_KERNEL);

		/*
		 *	Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	/* Drop the reference taken at registration time. */
	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}
6512 
/* Unregister a single device by wrapping it in a one-entry list and
 * reusing rollback_registered_many().
 */
static void rollback_registered(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->unreg_list, &single);
	rollback_registered_many(&single);
	list_del(&single);
}
6521 
6522 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6523 	struct net_device *upper, netdev_features_t features)
6524 {
6525 	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6526 	netdev_features_t feature;
6527 	int feature_bit;
6528 
6529 	for_each_netdev_feature(&upper_disables, feature_bit) {
6530 		feature = __NETIF_F_BIT(feature_bit);
6531 		if (!(upper->wanted_features & feature)
6532 		    && (features & feature)) {
6533 			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6534 				   &feature, upper->name);
6535 			features &= ~feature;
6536 		}
6537 	}
6538 
6539 	return features;
6540 }
6541 
6542 static void netdev_sync_lower_features(struct net_device *upper,
6543 	struct net_device *lower, netdev_features_t features)
6544 {
6545 	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6546 	netdev_features_t feature;
6547 	int feature_bit;
6548 
6549 	for_each_netdev_feature(&upper_disables, feature_bit) {
6550 		feature = __NETIF_F_BIT(feature_bit);
6551 		if (!(features & feature) && (lower->features & feature)) {
6552 			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6553 				   &feature, lower->name);
6554 			lower->wanted_features &= ~feature;
6555 			netdev_update_features(lower);
6556 
6557 			if (unlikely(lower->features & feature))
6558 				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6559 					    &feature, lower->name);
6560 		}
6561 	}
6562 }
6563 
/* Enforce inter-feature dependencies: clear feature bits whose
 * prerequisites (SG, checksum offload, TSO, ...) are missing from
 * @features, and return the sanitized set.
 */
static netdev_features_t netdev_fix_features(struct net_device *dev,
	netdev_features_t features)
{
	/* Fix illegal checksum combinations */
	if ((features & NETIF_F_HW_CSUM) &&
	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	/* TSO requires that SG is present as well. */
	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
		features &= ~NETIF_F_ALL_TSO;
	}

	/* IPv4 TSO needs generic HW checksumming or IP checksumming. */
	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
					!(features & NETIF_F_IP_CSUM)) {
		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO;
		features &= ~NETIF_F_TSO_ECN;
	}

	/* Likewise for IPv6 TSO. */
	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
					 !(features & NETIF_F_IPV6_CSUM)) {
		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO6;
	}

	/* TSO ECN requires that TSO is present as well. */
	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
		features &= ~NETIF_F_TSO_ECN;

	/* Software GSO depends on SG. */
	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
		features &= ~NETIF_F_GSO;
	}

	/* UFO needs SG and checksumming */
	if (features & NETIF_F_UFO) {
		/* maybe split UFO into V4 and V6? */
		if (!(features & NETIF_F_HW_CSUM) &&
		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
			netdev_dbg(dev,
				"Dropping NETIF_F_UFO since no checksum offload features.\n");
			features &= ~NETIF_F_UFO;
		}

		if (!(features & NETIF_F_SG)) {
			netdev_dbg(dev,
				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
			features &= ~NETIF_F_UFO;
		}
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* BUSY_POLL follows the presence of ndo_busy_poll; note the
	 * dangling "else" pairs with the unconditional clear below
	 * when the config option is disabled.
	 */
	if (dev->netdev_ops->ndo_busy_poll)
		features |= NETIF_F_BUSY_POLL;
	else
#endif
		features &= ~NETIF_F_BUSY_POLL;

	return features;
}
6630 
/* Recompute dev->features from wanted features, driver fixups and
 * stacking constraints, then apply the result via ndo_set_features.
 * Returns 1 when features were committed (caller should notify),
 * 0 otherwise.  Must be called under RTNL.
 */
int __netdev_update_features(struct net_device *dev)
{
	struct net_device *upper, *lower;
	netdev_features_t features;
	struct list_head *iter;
	int err = -1;	/* -1: nothing committed yet */

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	/* some features can't be enabled if they're off on an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

	if (dev->features == features)
		goto sync_lower;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		&dev->features, &features);

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
	else
		err = 0;

	if (unlikely(err < 0)) {
		netdev_err(dev,
			"set_features() failed (%d); wanted %pNF, left %pNF\n",
			err, &features, &dev->features);
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
	}

sync_lower:
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

	if (!err)
		dev->features = features;

	return err < 0 ? 0 : 1;
}
6685 
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 *
 *	The notification is only sent when __netdev_update_features()
 *	reports an actual change.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev))
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6700 
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 *
 *	Unlike netdev_update_features(), the notification is sent
 *	unconditionally.
 */
void netdev_change_features(struct net_device *dev)
{
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6717 
6718 /**
6719  *	netif_stacked_transfer_operstate -	transfer operstate
6720  *	@rootdev: the root or lower level device to transfer state from
6721  *	@dev: the device to transfer operstate to
6722  *
6723  *	Transfer operational state from root to device. This is normally
6724  *	called when a stacking relationship exists between the root
6725  *	device and the device(a leaf device).
6726  */
6727 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6728 					struct net_device *dev)
6729 {
6730 	if (rootdev->operstate == IF_OPER_DORMANT)
6731 		netif_dormant_on(dev);
6732 	else
6733 		netif_dormant_off(dev);
6734 
6735 	if (netif_carrier_ok(rootdev)) {
6736 		if (!netif_carrier_ok(dev))
6737 			netif_carrier_on(dev);
6738 	} else {
6739 		if (netif_carrier_ok(dev))
6740 			netif_carrier_off(dev);
6741 	}
6742 }
6743 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6744 
#ifdef CONFIG_SYSFS
/* Allocate dev->_rx and point each entry back at @dev.  Falls back to
 * vmalloc when the contiguous allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int qidx, count = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	size_t bytes = count * sizeof(*queues);

	BUG_ON(count < 1);

	queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!queues)
		queues = vzalloc(bytes);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;
	for (qidx = 0; qidx < count; qidx++)
		queues[qidx].dev = dev;

	return 0;
}
#endif
6767 
/* Per-queue init callback, invoked for each TX queue via
 * netdev_for_each_tx_queue(); @_unused matches that iterator's
 * callback signature.
 */
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue, void *_unused)
{
	/* Initialize queue lock */
	spin_lock_init(&queue->_xmit_lock);
	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
	/* -1: no CPU currently owns the xmit lock */
	queue->xmit_lock_owner = -1;
	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
	queue->dev = dev;
#ifdef CONFIG_BQL
	dql_init(&queue->dql, HZ);
#endif
}
6781 
/* Release the TX queue array allocated by netif_alloc_netdev_queues();
 * kvfree() handles both the kzalloc and vzalloc cases.
 */
static void netif_free_tx_queues(struct net_device *dev)
{
	kvfree(dev->_tx);
}
6786 
6787 static int netif_alloc_netdev_queues(struct net_device *dev)
6788 {
6789 	unsigned int count = dev->num_tx_queues;
6790 	struct netdev_queue *tx;
6791 	size_t sz = count * sizeof(*tx);
6792 
6793 	if (count < 1 || count > 0xffff)
6794 		return -EINVAL;
6795 
6796 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6797 	if (!tx) {
6798 		tx = vzalloc(sz);
6799 		if (!tx)
6800 			return -ENOMEM;
6801 	}
6802 	dev->_tx = tx;
6803 
6804 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6805 	spin_lock_init(&dev->tx_global_lock);
6806 
6807 	return 0;
6808 }
6809 
6810 void netif_tx_stop_all_queues(struct net_device *dev)
6811 {
6812 	unsigned int i;
6813 
6814 	for (i = 0; i < dev->num_tx_queues; i++) {
6815 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6816 		netif_tx_stop_queue(txq);
6817 	}
6818 }
6819 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6820 
/**
 *	register_netdevice	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	register_netdev() instead of this.
 *
 *	BUGS:
 *	The locking appears insufficient to guarantee two parallel registers
 *	will not get the same name.
 */

int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);

	ret = dev_get_valid_name(net, dev, dev->name);
	if (ret < 0)
		goto out;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			/* normalize bogus positive driver returns */
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	/* A driver advertising VLAN filtering must also supply both
	 * VLAN add/kill callbacks.
	 */
	if (((dev->hw_features | dev->features) &
	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
		ret = -EINVAL;
		goto err_uninit;
	}

	/* Allocate an ifindex if the caller left it unset; a caller-set
	 * index must not collide with an existing device.
	 */
	ret = -EBUSY;
	if (!dev->ifindex)
		dev->ifindex = dev_new_index(net);
	else if (__dev_get_by_index(net, dev->ifindex))
		goto err_uninit;

	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
	dev->hw_features |= NETIF_F_SOFT_FEATURES;
	dev->features |= NETIF_F_SOFT_FEATURES;
	dev->wanted_features = dev->features & dev->hw_features;

	if (!(dev->flags & IFF_LOOPBACK)) {
		dev->hw_features |= NETIF_F_NOCACHE_COPY;
	}

	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
	 */
	dev->vlan_features |= NETIF_F_HIGHDMA;

	/* Make NETIF_F_SG inheritable to tunnel devices.
	 */
	dev->hw_enc_features |= NETIF_F_SG;

	/* Make NETIF_F_SG inheritable to MPLS.
	 */
	dev->mpls_features |= NETIF_F_SG;

	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	__netdev_update_features(dev);

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	linkwatch_init_dev(dev);

	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);

	/* If the device has permanent device address, driver should
	 * set dev_addr and also addr_assign_type should be set to
	 * NET_ADDR_PERM (default value).
	 */
	if (dev->addr_assign_type == NET_ADDR_PERM)
		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		/* a notifier vetoed the device: unwind the registration */
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

out:
	return ret;

err_uninit:
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);
6963 EXPORT_SYMBOL(register_netdevice);
6964 
/**
 *	init_dummy_netdev	- init a dummy network device for NAPI
 *	@dev: device to init
 *
 *	This takes a network device structure and initialize the minimum
 *	amount of fields so it can be used to schedule NAPI polls without
 *	registering a full blown interface. This is to be used by drivers
 *	that need to tie several hardware interfaces to a single NAPI
 *	poll scheduler due to HW limitations.
 */
int init_dummy_netdev(struct net_device *dev)
{
	/* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
	 * NAPI code and this dummy netdev is supposed to be
	 * only ever used for NAPI polls
	 */
	memset(dev, 0, sizeof(struct net_device));

	/* make sure we BUG if trying to hit standard
	 * register/unregister code path
	 */
	dev->reg_state = NETREG_DUMMY;

	/* NAPI wants this */
	INIT_LIST_HEAD(&dev->napi_list);

	/* a dummy interface is started by default */
	set_bit(__LINK_STATE_PRESENT, &dev->state);
	set_bit(__LINK_STATE_START, &dev->state);

	/* Note : We don't allocate pcpu_refcnt for dummy devices,
	 * because users of this 'device' don't need to change
	 * its refcount.
	 */

	return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);
7004 
7005 
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	/* All the real work happens in register_netdevice(); this
	 * wrapper only provides the required RTNL locking.
	 */
	rtnl_lock();
	err = register_netdevice(dev);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);
7029 
7030 int netdev_refcnt_read(const struct net_device *dev)
7031 {
7032 	int i, refcnt = 0;
7033 
7034 	for_each_possible_cpu(i)
7035 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7036 	return refcnt;
7037 }
7038 EXPORT_SYMBOL(netdev_refcnt_read);
7039 
/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	/* Poll every 250ms until the refcount drains to zero. */
	while (refcnt != 0) {
		/* Once a second, re-send the unregister notification in
		 * case a reference holder missed it.
		 */
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			/* Let pending RCU callbacks run before sending
			 * the FINAL notification.
			 */
			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		/* Complain loudly every 10 seconds while stuck. */
		if (time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
7101 
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *      ...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();


	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		rtnl_lock();
		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
		__rtnl_unlock();

		/* Only devices queued by the unregister path belong here. */
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		/* Block until every dev_hold() has been dev_put(). */
		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Report a network device has been unregistered */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
7181 
/* Convert net_device_stats to rtnl_link_stats64.  They have the same
 * fields in the same order, with only the type differing.
 */
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
			     const struct net_device_stats *netdev_stats)
{
#if BITS_PER_LONG == 64
	/* On 64-bit the layouts are identical: one memcpy suffices. */
	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
	memcpy(stats64, netdev_stats, sizeof(*stats64));
#else
	/* On 32-bit, widen each unsigned long field to u64 individually. */
	size_t i, n = sizeof(*stats64) / sizeof(u64);
	const unsigned long *src = (const unsigned long *)netdev_stats;
	u64 *dst = (u64 *)stats64;

	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
		     sizeof(*stats64) / sizeof(u64));
	for (i = 0; i < n; i++)
		dst[i] = src[i];
#endif
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
7203 
7204 /**
7205  *	dev_get_stats	- get network device statistics
7206  *	@dev: device to get statistics from
7207  *	@storage: place to store stats
7208  *
7209  *	Get network statistics from device. Return @storage.
7210  *	The device driver may provide its own method by setting
7211  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7212  *	otherwise the internal statistics structure is used.
7213  */
7214 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7215 					struct rtnl_link_stats64 *storage)
7216 {
7217 	const struct net_device_ops *ops = dev->netdev_ops;
7218 
7219 	if (ops->ndo_get_stats64) {
7220 		memset(storage, 0, sizeof(*storage));
7221 		ops->ndo_get_stats64(dev, storage);
7222 	} else if (ops->ndo_get_stats) {
7223 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7224 	} else {
7225 		netdev_stats_to_stats64(storage, &dev->stats);
7226 	}
7227 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7228 	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7229 	return storage;
7230 }
7231 EXPORT_SYMBOL(dev_get_stats);
7232 
/* Return the device's ingress queue, creating it on first use when
 * CONFIG_NET_CLS_ACT is enabled.  Without that option this just
 * reports the existing (possibly NULL) pointer.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (queue)
		return queue;
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return NULL;
	netdev_init_one_queue(dev, queue, NULL);
	/* Start with the no-op qdisc until a real one is attached. */
	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
	queue->qdisc_sleeping = &noop_qdisc;
	rcu_assign_pointer(dev->ingress_queue, queue);
#endif
	return queue;
}
7250 
/* All-zero sentinel assigned by alloc_netdev_mqs() when the driver
 * provided no ethtool_ops of its own.
 */
static const struct ethtool_ops default_ethtool_ops;

/* Install @ops on @dev, but only while the device still carries the
 * zeroed default — a driver-provided ethtool_ops is never overridden.
 */
void netdev_set_default_ethtool_ops(struct net_device *dev,
				    const struct ethtool_ops *ops)
{
	if (dev->ethtool_ops == &default_ethtool_ops)
		dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7260 
/* Free the raw allocation behind a net_device from alloc_netdev_mqs();
 * ->padded undoes the NETDEV_ALIGN offset applied at allocation time.
 */
void netdev_freemem(struct net_device *dev)
{
	char *addr = (char *)dev - dev->padded;

	kvfree(addr);
}
7267 
/**
 *	alloc_netdev_mqs - allocate network device
 *	@sizeof_priv:		size of private data to allocate space for
 *	@name:			device name format string
 *	@name_assign_type: 	origin of device name
 *	@setup:			callback to initialize device
 *	@txqs:			the number of TX subqueues to allocate
 *	@rxqs:			the number of RX subqueues to allocate
 *
 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
 *	for each queue on the device.
 *
 *	Returns the new device, or NULL on allocation failure or invalid
 *	queue counts.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

#ifdef CONFIG_SYSFS
	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}
#endif

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	/* Try a cheap contiguous allocation first, then fall back to
	 * vmalloc.
	 */
	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!p)
		p = vzalloc(alloc_size);
	if (!p)
		return NULL;

	/* Record the alignment offset so netdev_freemem() can undo it. */
	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;
	dev->gso_min_segs = 0;

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->all_adj_list.upper);
	INIT_LIST_HEAD(&dev->all_adj_list.lower);
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	/* Let the driver's setup() callback override the defaults above. */
	setup(dev);

	if (!dev->tx_queue_len)
		dev->priv_flags |= IFF_NO_QUEUE;

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

#ifdef CONFIG_SYSFS
	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;
#endif

	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;

	nf_hook_ingress_init(dev);

	return dev;

free_all:
	/* Queues were allocated: free_netdev() knows how to unwind. */
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
free_dev:
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
7387 
7388 /**
7389  *	free_netdev - free network device
7390  *	@dev: device
7391  *
7392  *	This function does the last stage of destroying an allocated device
7393  * 	interface. The reference to the device object is released.
7394  *	If this is the last reference then it will be freed.
7395  *	Must be called in process context.
7396  */
7397 void free_netdev(struct net_device *dev)
7398 {
7399 	struct napi_struct *p, *n;
7400 
7401 	might_sleep();
7402 	netif_free_tx_queues(dev);
7403 #ifdef CONFIG_SYSFS
7404 	kvfree(dev->_rx);
7405 #endif
7406 
7407 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7408 
7409 	/* Flush device addresses */
7410 	dev_addr_flush(dev);
7411 
7412 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7413 		netif_napi_del(p);
7414 
7415 	free_percpu(dev->pcpu_refcnt);
7416 	dev->pcpu_refcnt = NULL;
7417 
7418 	/*  Compatibility with error handling in drivers */
7419 	if (dev->reg_state == NETREG_UNINITIALIZED) {
7420 		netdev_freemem(dev);
7421 		return;
7422 	}
7423 
7424 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7425 	dev->reg_state = NETREG_RELEASED;
7426 
7427 	/* will free via device release */
7428 	put_device(&dev->dev);
7429 }
7430 EXPORT_SYMBOL(free_netdev);
7431 
7432 /**
7433  *	synchronize_net -  Synchronize with packet receive processing
7434  *
7435  *	Wait for packets currently being received to be done.
7436  *	Does not block later packets from starting.
7437  */
7438 void synchronize_net(void)
7439 {
7440 	might_sleep();
7441 	if (rtnl_is_locked())
7442 		synchronize_rcu_expedited();
7443 	else
7444 		synchronize_rcu();
7445 }
7446 EXPORT_SYMBOL(synchronize_net);
7447 
7448 /**
7449  *	unregister_netdevice_queue - remove device from the kernel
7450  *	@dev: device
7451  *	@head: list
7452  *
7453  *	This function shuts down a device interface and removes it
7454  *	from the kernel tables.
7455  *	If head not NULL, device is queued to be unregistered later.
7456  *
7457  *	Callers must hold the rtnl semaphore.  You may want
7458  *	unregister_netdev() instead of this.
7459  */
7460 
7461 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7462 {
7463 	ASSERT_RTNL();
7464 
7465 	if (head) {
7466 		list_move_tail(&dev->unreg_list, head);
7467 	} else {
7468 		rollback_registered(dev);
7469 		/* Finish processing unregister after unlock */
7470 		net_set_todo(dev);
7471 	}
7472 }
7473 EXPORT_SYMBOL(unregister_netdevice_queue);
7474 
7475 /**
7476  *	unregister_netdevice_many - unregister many devices
7477  *	@head: list of devices
7478  *
7479  *  Note: As most callers use a stack allocated list_head,
7480  *  we force a list_del() to make sure stack wont be corrupted later.
7481  */
7482 void unregister_netdevice_many(struct list_head *head)
7483 {
7484 	struct net_device *dev;
7485 
7486 	if (!list_empty(head)) {
7487 		rollback_registered_many(head);
7488 		list_for_each_entry(dev, head, unreg_list)
7489 			net_set_todo(dev);
7490 		list_del(head);
7491 	}
7492 }
7493 EXPORT_SYMBOL(unregister_netdevice_many);
7494 
7495 /**
7496  *	unregister_netdev - remove device from the kernel
7497  *	@dev: device
7498  *
7499  *	This function shuts down a device interface and removes it
7500  *	from the kernel tables.
7501  *
7502  *	This is just a wrapper for unregister_netdevice that takes
7503  *	the rtnl semaphore.  In general you want to use this and not
7504  *	unregister_netdevice.
7505  */
7506 void unregister_netdev(struct net_device *dev)
7507 {
7508 	rtnl_lock();
7509 	unregister_netdevice(dev);
7510 	rtnl_unlock();
7511 }
7512 EXPORT_SYMBOL(unregister_netdev);
7513 
7514 /**
7515  *	dev_change_net_namespace - move device to different nethost namespace
7516  *	@dev: device
7517  *	@net: network namespace
7518  *	@pat: If not NULL name pattern to try if the current device name
7519  *	      is already taken in the destination network namespace.
7520  *
7521  *	This function shuts down a device interface and moves it
7522  *	to a new network namespace. On success 0 is returned, on
7523  *	a failure a netagive errno code is returned.
7524  *
7525  *	Callers must hold the rtnl semaphore.
7526  */
7527 
7528 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7529 {
7530 	int err;
7531 
7532 	ASSERT_RTNL();
7533 
7534 	/* Don't allow namespace local devices to be moved. */
7535 	err = -EINVAL;
7536 	if (dev->features & NETIF_F_NETNS_LOCAL)
7537 		goto out;
7538 
7539 	/* Ensure the device has been registrered */
7540 	if (dev->reg_state != NETREG_REGISTERED)
7541 		goto out;
7542 
7543 	/* Get out if there is nothing todo */
7544 	err = 0;
7545 	if (net_eq(dev_net(dev), net))
7546 		goto out;
7547 
7548 	/* Pick the destination device name, and ensure
7549 	 * we can use it in the destination network namespace.
7550 	 */
7551 	err = -EEXIST;
7552 	if (__dev_get_by_name(net, dev->name)) {
7553 		/* We get here if we can't use the current device name */
7554 		if (!pat)
7555 			goto out;
7556 		if (dev_get_valid_name(net, dev, pat) < 0)
7557 			goto out;
7558 	}
7559 
7560 	/*
7561 	 * And now a mini version of register_netdevice unregister_netdevice.
7562 	 */
7563 
7564 	/* If device is running close it first. */
7565 	dev_close(dev);
7566 
7567 	/* And unlink it from device chain */
7568 	err = -ENODEV;
7569 	unlist_netdevice(dev);
7570 
7571 	synchronize_net();
7572 
7573 	/* Shutdown queueing discipline. */
7574 	dev_shutdown(dev);
7575 
7576 	/* Notify protocols, that we are about to destroy
7577 	   this device. They should clean all the things.
7578 
7579 	   Note that dev->reg_state stays at NETREG_REGISTERED.
7580 	   This is wanted because this way 8021q and macvlan know
7581 	   the device is just moving and can keep their slaves up.
7582 	*/
7583 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7584 	rcu_barrier();
7585 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7586 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7587 
7588 	/*
7589 	 *	Flush the unicast and multicast chains
7590 	 */
7591 	dev_uc_flush(dev);
7592 	dev_mc_flush(dev);
7593 
7594 	/* Send a netdev-removed uevent to the old namespace */
7595 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7596 	netdev_adjacent_del_links(dev);
7597 
7598 	/* Actually switch the network namespace */
7599 	dev_net_set(dev, net);
7600 
7601 	/* If there is an ifindex conflict assign a new one */
7602 	if (__dev_get_by_index(net, dev->ifindex))
7603 		dev->ifindex = dev_new_index(net);
7604 
7605 	/* Send a netdev-add uevent to the new namespace */
7606 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7607 	netdev_adjacent_add_links(dev);
7608 
7609 	/* Fixup kobjects */
7610 	err = device_rename(&dev->dev, dev->name);
7611 	WARN_ON(err);
7612 
7613 	/* Add the device back in the hashes */
7614 	list_netdevice(dev);
7615 
7616 	/* Notify protocols, that a new device appeared. */
7617 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7618 
7619 	/*
7620 	 *	Prevent userspace races by waiting until the network
7621 	 *	device is fully setup before sending notifications.
7622 	 */
7623 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7624 
7625 	synchronize_net();
7626 	err = 0;
7627 out:
7628 	return err;
7629 }
7630 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7631 
/* CPU hotplug callback: when a CPU goes offline (CPU_DEAD), splice its
 * per-cpu softnet queues onto the CPU running this notifier, then
 * re-inject any packets the dead CPU still had queued.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	/* Only act once the CPU is fully dead; ignore all other events. */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	/* IRQs off while we manipulate both CPUs' softnet_data. */
	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception :
	 * process_backlog() must be called by cpu owning percpu backlog.
	 * We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	/* Kick the TX softirq so the spliced completion/output queues drain. */
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}
7695 
7696 
7697 /**
7698  *	netdev_increment_features - increment feature set by one
7699  *	@all: current feature set
7700  *	@one: new feature set
7701  *	@mask: mask feature set
7702  *
7703  *	Computes a new feature set after adding a device with feature set
7704  *	@one to the master device with current feature set @all.  Will not
7705  *	enable anything that is off in @mask. Returns the new feature set.
7706  */
7707 netdev_features_t netdev_increment_features(netdev_features_t all,
7708 	netdev_features_t one, netdev_features_t mask)
7709 {
7710 	if (mask & NETIF_F_HW_CSUM)
7711 		mask |= NETIF_F_CSUM_MASK;
7712 	mask |= NETIF_F_VLAN_CHALLENGED;
7713 
7714 	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7715 	all &= one | ~NETIF_F_ALL_FOR_ALL;
7716 
7717 	/* If one device supports hw checksumming, set for all. */
7718 	if (all & NETIF_F_HW_CSUM)
7719 		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7720 
7721 	return all;
7722 }
7723 EXPORT_SYMBOL(netdev_increment_features);
7724 
7725 static struct hlist_head * __net_init netdev_create_hash(void)
7726 {
7727 	int i;
7728 	struct hlist_head *hash;
7729 
7730 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7731 	if (hash != NULL)
7732 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
7733 			INIT_HLIST_HEAD(&hash[i]);
7734 
7735 	return hash;
7736 }
7737 
7738 /* Initialize per network namespace state */
7739 static int __net_init netdev_init(struct net *net)
7740 {
7741 	if (net != &init_net)
7742 		INIT_LIST_HEAD(&net->dev_base_head);
7743 
7744 	net->dev_name_head = netdev_create_hash();
7745 	if (net->dev_name_head == NULL)
7746 		goto err_name;
7747 
7748 	net->dev_index_head = netdev_create_hash();
7749 	if (net->dev_index_head == NULL)
7750 		goto err_idx;
7751 
7752 	return 0;
7753 
7754 err_idx:
7755 	kfree(net->dev_name_head);
7756 err_name:
7757 	return -ENOMEM;
7758 }
7759 
7760 /**
7761  *	netdev_drivername - network driver for the device
7762  *	@dev: network device
7763  *
7764  *	Determine network driver for device.
7765  */
7766 const char *netdev_drivername(const struct net_device *dev)
7767 {
7768 	const struct device_driver *driver;
7769 	const struct device *parent;
7770 	const char *empty = "";
7771 
7772 	parent = dev->dev.parent;
7773 	if (!parent)
7774 		return empty;
7775 
7776 	driver = parent->driver;
7777 	if (driver && driver->name)
7778 		return driver->name;
7779 	return empty;
7780 }
7781 
7782 static void __netdev_printk(const char *level, const struct net_device *dev,
7783 			    struct va_format *vaf)
7784 {
7785 	if (dev && dev->dev.parent) {
7786 		dev_printk_emit(level[1] - '0',
7787 				dev->dev.parent,
7788 				"%s %s %s%s: %pV",
7789 				dev_driver_string(dev->dev.parent),
7790 				dev_name(dev->dev.parent),
7791 				netdev_name(dev), netdev_reg_state(dev),
7792 				vaf);
7793 	} else if (dev) {
7794 		printk("%s%s%s: %pV",
7795 		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7796 	} else {
7797 		printk("%s(NULL net_device): %pV", level, vaf);
7798 	}
7799 }
7800 
/**
 *	netdev_printk - emit a log message tagged with a network device
 *	@level: kernel log level (KERN_* prefix string)
 *	@dev: network device the message concerns (may be NULL)
 *	@format: printf-style format string
 *
 *	Wraps the variadic arguments in a struct va_format and hands them
 *	to __netdev_printk(), which adds driver/device/interface prefixes
 *	where available.
 */
void netdev_printk(const char *level, const struct net_device *dev,
		   const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__netdev_printk(level, dev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(netdev_printk);
7817 
/* Generate and export one netdev_<level>() helper function that forwards
 * its varargs to __netdev_printk() at the given KERN_* level.
 */
#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

/* Instantiate one helper per log severity. */
define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
7842 
/* Tear down per-namespace state: free the name and ifindex hash tables
 * allocated by netdev_init().
 */
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}
7848 
/* Per-namespace setup/teardown of the netdev hash tables. */
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
7853 
7854 static void __net_exit default_device_exit(struct net *net)
7855 {
7856 	struct net_device *dev, *aux;
7857 	/*
7858 	 * Push all migratable network devices back to the
7859 	 * initial network namespace
7860 	 */
7861 	rtnl_lock();
7862 	for_each_netdev_safe(net, dev, aux) {
7863 		int err;
7864 		char fb_name[IFNAMSIZ];
7865 
7866 		/* Ignore unmoveable devices (i.e. loopback) */
7867 		if (dev->features & NETIF_F_NETNS_LOCAL)
7868 			continue;
7869 
7870 		/* Leave virtual devices for the generic cleanup */
7871 		if (dev->rtnl_link_ops)
7872 			continue;
7873 
7874 		/* Push remaining network devices to init_net */
7875 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7876 		err = dev_change_net_namespace(dev, &init_net, fb_name);
7877 		if (err) {
7878 			pr_emerg("%s: failed to move %s to init_net: %d\n",
7879 				 __func__, dev->name, err);
7880 			BUG();
7881 		}
7882 	}
7883 	rtnl_unlock();
7884 }
7885 
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 *
	 * Strategy: register on netdev_unregistering_wq, then repeatedly
	 * take the rtnl lock and scan each namespace's dev_unreg_count.
	 * If any namespace still has pending unregistrations, drop the
	 * lock and sleep until woken, then rescan.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;	/* leave with rtnl held, as promised */
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}
7913 
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed
	 * wait here for all pending unregistrations to complete,
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list containing all network devices
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			/* Prefer the device's own dellink() teardown;
			 * otherwise queue it for a plain batched unregister.
			 */
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}
7948 
/* Migrates/destroys a dying namespace's devices; exit_batch amortizes the
 * unregister cost across namespaces.
 */
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
7953 
7954 /*
7955  *	Initialize the DEV module. At boot time this walks the device list and
7956  *	unhooks any devices that fail to initialise (normally hardware not
7957  *	present) and leaves us with a valid list of present and active devices.
7958  *
7959  */
7960 
7961 /*
7962  *       This is called single threaded during boot, so no need
7963  *       to take the rtnl semaphore.
7964  */
7965 static int __init net_dev_init(void)
7966 {
7967 	int i, rc = -ENOMEM;
7968 
7969 	BUG_ON(!dev_boot_phase);
7970 
7971 	if (dev_proc_init())
7972 		goto out;
7973 
7974 	if (netdev_kobject_init())
7975 		goto out;
7976 
7977 	INIT_LIST_HEAD(&ptype_all);
7978 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
7979 		INIT_LIST_HEAD(&ptype_base[i]);
7980 
7981 	INIT_LIST_HEAD(&offload_base);
7982 
7983 	if (register_pernet_subsys(&netdev_net_ops))
7984 		goto out;
7985 
7986 	/*
7987 	 *	Initialise the packet receive queues.
7988 	 */
7989 
7990 	for_each_possible_cpu(i) {
7991 		struct softnet_data *sd = &per_cpu(softnet_data, i);
7992 
7993 		skb_queue_head_init(&sd->input_pkt_queue);
7994 		skb_queue_head_init(&sd->process_queue);
7995 		INIT_LIST_HEAD(&sd->poll_list);
7996 		sd->output_queue_tailp = &sd->output_queue;
7997 #ifdef CONFIG_RPS
7998 		sd->csd.func = rps_trigger_softirq;
7999 		sd->csd.info = sd;
8000 		sd->cpu = i;
8001 #endif
8002 
8003 		sd->backlog.poll = process_backlog;
8004 		sd->backlog.weight = weight_p;
8005 	}
8006 
8007 	dev_boot_phase = 0;
8008 
8009 	/* The loopback device is special if any other network devices
8010 	 * is present in a network namespace the loopback device must
8011 	 * be present. Since we now dynamically allocate and free the
8012 	 * loopback device ensure this invariant is maintained by
8013 	 * keeping the loopback device as the first device on the
8014 	 * list of network devices.  Ensuring the loopback devices
8015 	 * is the first device that appears and the last network device
8016 	 * that disappears.
8017 	 */
8018 	if (register_pernet_device(&loopback_net_ops))
8019 		goto out;
8020 
8021 	if (register_pernet_device(&default_device_ops))
8022 		goto out;
8023 
8024 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8025 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8026 
8027 	hotcpu_notifier(dev_cpu_callback, 0);
8028 	dst_subsys_init();
8029 	rc = 0;
8030 out:
8031 	return rc;
8032 }
8033 
8034 subsys_initcall(net_dev_init);
8035