xref: /linux/net/core/dev.c (revision 04d8a0a5f3b6887543850d991a5e37c4ec90e250)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <linux/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <linux/bpf.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <net/busy_poll.h>
101 #include <linux/rtnetlink.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/dst_metadata.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/if_arp.h>
121 #include <linux/if_vlan.h>
122 #include <linux/ip.h>
123 #include <net/ip.h>
124 #include <net/mpls.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/static_key.h>
136 #include <linux/hashtable.h>
137 #include <linux/vmalloc.h>
138 #include <linux/if_macvlan.h>
139 #include <linux/errqueue.h>
140 #include <linux/hrtimer.h>
141 #include <linux/netfilter_ingress.h>
142 #include <linux/crash_dump.h>
143 
144 #include "net-sysfs.h"
145 
146 /* Instead of increasing this, you should create a hash table. */
147 #define MAX_GRO_SKBS 8
148 
149 /* This should be increased if a protocol with a bigger head is added. */
150 #define GRO_MAX_HEAD (MAX_HEADER + 128)
151 
152 static DEFINE_SPINLOCK(ptype_lock);
153 static DEFINE_SPINLOCK(offload_lock);
154 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
155 struct list_head ptype_all __read_mostly;	/* Taps */
156 static struct list_head offload_base __read_mostly;
157 
158 static int netif_rx_internal(struct sk_buff *skb);
159 static int call_netdevice_notifiers_info(unsigned long val,
160 					 struct net_device *dev,
161 					 struct netdev_notifier_info *info);
162 
163 /*
164  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
165  * semaphore.
166  *
167  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
168  *
169  * Writers must hold the rtnl semaphore while they loop through the
170  * dev_base_head list, and hold dev_base_lock for writing when they do the
171  * actual updates.  This allows pure readers to access the list even
172  * while a writer is preparing to update it.
173  *
174  * To put it another way, dev_base_lock is held for writing only to
175  * protect against pure readers; the rtnl semaphore provides the
176  * protection against other writers.
177  *
178  * For example usages, see register_netdevice() and
179  * unregister_netdevice(), which must be called with the rtnl
180  * semaphore held.
181  */
182 DEFINE_RWLOCK(dev_base_lock);
183 EXPORT_SYMBOL(dev_base_lock);
184 
185 /* protects napi_hash addition/deletion and napi_gen_id */
186 static DEFINE_SPINLOCK(napi_hash_lock);
187 
188 static unsigned int napi_gen_id = NR_CPUS;
189 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
190 
191 static seqcount_t devnet_rename_seq;
192 
193 static inline void dev_base_seq_inc(struct net *net)
194 {
195 	while (++net->dev_base_seq == 0);
196 }
197 
198 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
199 {
200 	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
201 
202 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203 }
204 
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208 }
209 
210 static inline void rps_lock(struct softnet_data *sd)
211 {
212 #ifdef CONFIG_RPS
213 	spin_lock(&sd->input_pkt_queue.lock);
214 #endif
215 }
216 
217 static inline void rps_unlock(struct softnet_data *sd)
218 {
219 #ifdef CONFIG_RPS
220 	spin_unlock(&sd->input_pkt_queue.lock);
221 #endif
222 }
223 
224 /* Device list insertion */
225 static void list_netdevice(struct net_device *dev)
226 {
227 	struct net *net = dev_net(dev);
228 
229 	ASSERT_RTNL();
230 
231 	write_lock_bh(&dev_base_lock);
232 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234 	hlist_add_head_rcu(&dev->index_hlist,
235 			   dev_index_hash(net, dev->ifindex));
236 	write_unlock_bh(&dev_base_lock);
237 
238 	dev_base_seq_inc(net);
239 }
240 
241 /* Device list removal
242  * caller must respect an RCU grace period before freeing/reusing dev
243  */
244 static void unlist_netdevice(struct net_device *dev)
245 {
246 	ASSERT_RTNL();
247 
248 	/* Unlink dev from the device chain */
249 	write_lock_bh(&dev_base_lock);
250 	list_del_rcu(&dev->dev_list);
251 	hlist_del_rcu(&dev->name_hlist);
252 	hlist_del_rcu(&dev->index_hlist);
253 	write_unlock_bh(&dev_base_lock);
254 
255 	dev_base_seq_inc(dev_net(dev));
256 }
257 
258 /*
259  *	Our notifier list
260  */
261 
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263 
264 /*
265  *	Device drivers call our routines to queue packets here. We empty the
266  *	queue in the local softnet handler.
267  */
268 
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271 
272 #ifdef CONFIG_LOCKDEP
273 /*
274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275  * according to dev->type
276  */
277 static const unsigned short netdev_lock_type[] =
278 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
291 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
292 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
293 
294 static const char *const netdev_lock_name[] =
295 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
308 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
309 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
310 
311 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
312 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
313 
314 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315 {
316 	int i;
317 
318 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319 		if (netdev_lock_type[i] == dev_type)
320 			return i;
321 	/* the last key is used by default */
322 	return ARRAY_SIZE(netdev_lock_type) - 1;
323 }
324 
325 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326 						 unsigned short dev_type)
327 {
328 	int i;
329 
330 	i = netdev_lock_pos(dev_type);
331 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332 				   netdev_lock_name[i]);
333 }
334 
335 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336 {
337 	int i;
338 
339 	i = netdev_lock_pos(dev->type);
340 	lockdep_set_class_and_name(&dev->addr_list_lock,
341 				   &netdev_addr_lock_key[i],
342 				   netdev_lock_name[i]);
343 }
344 #else
345 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346 						 unsigned short dev_type)
347 {
348 }
349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
350 {
351 }
352 #endif
353 
354 /*******************************************************************************
355 
356 		Protocol management and registration routines
357 
358 *******************************************************************************/
359 
360 /*
361  *	Add a protocol ID to the list. Now that the input handler is
362  *	smarter we can dispense with all the messy stuff that used to be
363  *	here.
364  *
365  *	BEWARE!!! Protocol handlers that mangle input packets
366  *	MUST BE last in the hash buckets, and checking protocol handlers
367  *	MUST start from the promiscuous ptype_all chain in net_bh.
368  *	It is true now, do not change it.
369  *	Explanation: if a packet-mangling protocol handler were
370  *	first on the list, it could not sense that the packet
371  *	is cloned and should be copied-on-write, so it would
372  *	change it and subsequent readers would get a broken packet.
373  *							--ANK (980803)
374  */
375 
376 static inline struct list_head *ptype_head(const struct packet_type *pt)
377 {
378 	if (pt->type == htons(ETH_P_ALL))
379 		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
380 	else
381 		return pt->dev ? &pt->dev->ptype_specific :
382 				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383 }
384 
385 /**
386  *	dev_add_pack - add packet handler
387  *	@pt: packet type declaration
388  *
389  *	Add a protocol handler to the networking stack. The passed &packet_type
390  *	is linked into kernel lists and may not be freed until it has been
391  *	removed from the kernel lists.
392  *
393  *	This call does not sleep, therefore it cannot
394  *	guarantee that all CPUs that are in the middle of receiving packets
395  *	will see the new packet type (until the next received packet).
396  */
397 
398 void dev_add_pack(struct packet_type *pt)
399 {
400 	struct list_head *head = ptype_head(pt);
401 
402 	spin_lock(&ptype_lock);
403 	list_add_rcu(&pt->list, head);
404 	spin_unlock(&ptype_lock);
405 }
406 EXPORT_SYMBOL(dev_add_pack);
407 
408 /**
409  *	__dev_remove_pack	 - remove packet handler
410  *	@pt: packet type declaration
411  *
412  *	Remove a protocol handler that was previously added to the kernel
413  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
414  *	from the kernel lists and can be freed or reused once this function
415  *	returns.
416  *
417  *      The packet type might still be in use by receivers
418  *	and must not be freed until after all the CPUs have gone
419  *	through a quiescent state.
420  */
421 void __dev_remove_pack(struct packet_type *pt)
422 {
423 	struct list_head *head = ptype_head(pt);
424 	struct packet_type *pt1;
425 
426 	spin_lock(&ptype_lock);
427 
428 	list_for_each_entry(pt1, head, list) {
429 		if (pt == pt1) {
430 			list_del_rcu(&pt->list);
431 			goto out;
432 		}
433 	}
434 
435 	pr_warn("dev_remove_pack: %p not found\n", pt);
436 out:
437 	spin_unlock(&ptype_lock);
438 }
439 EXPORT_SYMBOL(__dev_remove_pack);
440 
441 /**
442  *	dev_remove_pack	 - remove packet handler
443  *	@pt: packet type declaration
444  *
445  *	Remove a protocol handler that was previously added to the kernel
446  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
447  *	from the kernel lists and can be freed or reused once this function
448  *	returns.
449  *
450  *	This call sleeps to guarantee that no CPU is looking at the packet
451  *	type after return.
452  */
453 void dev_remove_pack(struct packet_type *pt)
454 {
455 	__dev_remove_pack(pt);
456 
457 	synchronize_net();
458 }
459 EXPORT_SYMBOL(dev_remove_pack);
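/*
 * Usage sketch (illustrative only, not part of this file): a hypothetical
 * module registers a tap for IPv4 frames with dev_add_pack() and removes
 * it again with dev_remove_pack().  The names my_ipv4_rcv and
 * my_ipv4_ptype are placeholders; a handler must consume (for example
 * free) every skb it is handed.
 *
 *	static int my_ipv4_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ipv4_ptype __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_IP),
 *		.func	= my_ipv4_rcv,
 *	};
 *
 *	At module init:
 *		dev_add_pack(&my_ipv4_ptype);
 *	At module exit:
 *		dev_remove_pack(&my_ipv4_ptype);
 */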
460 
461 
462 /**
463  *	dev_add_offload - register offload handlers
464  *	@po: protocol offload declaration
465  *
466  *	Add protocol offload handlers to the networking stack. The passed
467  *	&proto_offload is linked into kernel lists and may not be freed until
468  *	it has been removed from the kernel lists.
469  *
470  *	This call does not sleep, therefore it cannot
471  *	guarantee that all CPUs that are in the middle of receiving packets
472  *	will see the new offload handlers (until the next received packet).
473  */
474 void dev_add_offload(struct packet_offload *po)
475 {
476 	struct packet_offload *elem;
477 
478 	spin_lock(&offload_lock);
479 	list_for_each_entry(elem, &offload_base, list) {
480 		if (po->priority < elem->priority)
481 			break;
482 	}
483 	list_add_rcu(&po->list, elem->list.prev);
484 	spin_unlock(&offload_lock);
485 }
486 EXPORT_SYMBOL(dev_add_offload);
487 
488 /**
489  *	__dev_remove_offload	 - remove offload handler
490  *	@po: packet offload declaration
491  *
492  *	Remove a protocol offload handler that was previously added to the
493  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
494  *	is removed from the kernel lists and can be freed or reused once this
495  *	function returns.
496  *
497  *      The packet type might still be in use by receivers
498  *	and must not be freed until after all the CPUs have gone
499  *	through a quiescent state.
500  */
501 static void __dev_remove_offload(struct packet_offload *po)
502 {
503 	struct list_head *head = &offload_base;
504 	struct packet_offload *po1;
505 
506 	spin_lock(&offload_lock);
507 
508 	list_for_each_entry(po1, head, list) {
509 		if (po == po1) {
510 			list_del_rcu(&po->list);
511 			goto out;
512 		}
513 	}
514 
515 	pr_warn("dev_remove_offload: %p not found\n", po);
516 out:
517 	spin_unlock(&offload_lock);
518 }
519 
520 /**
521  *	dev_remove_offload	 - remove packet offload handler
522  *	@po: packet offload declaration
523  *
524  *	Remove a packet offload handler that was previously added to the kernel
525  *	offload handlers by dev_add_offload(). The passed &offload_type is
526  *	removed from the kernel lists and can be freed or reused once this
527  *	function returns.
528  *
529  *	This call sleeps to guarantee that no CPU is looking at the packet
530  *	type after return.
531  */
532 void dev_remove_offload(struct packet_offload *po)
533 {
534 	__dev_remove_offload(po);
535 
536 	synchronize_net();
537 }
538 EXPORT_SYMBOL(dev_remove_offload);
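/*
 * Usage sketch (illustrative only): protocol modules register GRO/GSO
 * callbacks through a struct packet_offload, roughly as the MPLS GSO code
 * does.  my_gso_segment and my_offload are hypothetical placeholders.
 *
 *	static struct packet_offload my_offload __read_mostly = {
 *		.type		= cpu_to_be16(ETH_P_MPLS_UC),
 *		.priority	= 15,
 *		.callbacks	= {
 *			.gso_segment	= my_gso_segment,
 *		},
 *	};
 *
 *	At module init:
 *		dev_add_offload(&my_offload);
 *	At module exit:
 *		dev_remove_offload(&my_offload);
 */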
539 
540 /******************************************************************************
541 
542 		      Device Boot-time Settings Routines
543 
544 *******************************************************************************/
545 
546 /* Boot time configuration table */
547 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
548 
549 /**
550  *	netdev_boot_setup_add	- add new setup entry
551  *	@name: name of the device
552  *	@map: configured settings for the device
553  *
554  *	Adds a new setup entry to the dev_boot_setup list.  The function
555  *	returns 0 on error and 1 on success.  This is a generic routine for
556  *	all netdevices.
557  */
558 static int netdev_boot_setup_add(char *name, struct ifmap *map)
559 {
560 	struct netdev_boot_setup *s;
561 	int i;
562 
563 	s = dev_boot_setup;
564 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
566 			memset(s[i].name, 0, sizeof(s[i].name));
567 			strlcpy(s[i].name, name, IFNAMSIZ);
568 			memcpy(&s[i].map, map, sizeof(s[i].map));
569 			break;
570 		}
571 	}
572 
573 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
574 }
575 
576 /**
577  *	netdev_boot_setup_check	- check boot time settings
578  *	@dev: the netdevice
579  *
580  * 	Check boot time settings for the device.
581  *	Any settings found are applied to the device, to be used
582  *	later during device probing.
583  *	Returns 0 if no settings are found, 1 if they are.
584  */
585 int netdev_boot_setup_check(struct net_device *dev)
586 {
587 	struct netdev_boot_setup *s = dev_boot_setup;
588 	int i;
589 
590 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
591 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
592 		    !strcmp(dev->name, s[i].name)) {
593 			dev->irq 	= s[i].map.irq;
594 			dev->base_addr 	= s[i].map.base_addr;
595 			dev->mem_start 	= s[i].map.mem_start;
596 			dev->mem_end 	= s[i].map.mem_end;
597 			return 1;
598 		}
599 	}
600 	return 0;
601 }
602 EXPORT_SYMBOL(netdev_boot_setup_check);
603 
604 
605 /**
606  *	netdev_boot_base	- get address from boot time settings
607  *	@prefix: prefix for network device
608  *	@unit: id for network device
609  *
610  * 	Check boot time settings for the base address of the device.
611  *	Any settings found are applied to the device, to be used
612  *	later during device probing.
613  *	Returns 0 if no settings are found.
614  */
615 unsigned long netdev_boot_base(const char *prefix, int unit)
616 {
617 	const struct netdev_boot_setup *s = dev_boot_setup;
618 	char name[IFNAMSIZ];
619 	int i;
620 
621 	sprintf(name, "%s%d", prefix, unit);
622 
623 	/*
624 	 * If device already registered then return base of 1
625 	 * to indicate not to probe for this interface
626 	 */
627 	if (__dev_get_by_name(&init_net, name))
628 		return 1;
629 
630 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
631 		if (!strcmp(name, s[i].name))
632 			return s[i].map.base_addr;
633 	return 0;
634 }
635 
636 /*
637  * Saves the settings configured at boot time for any netdevice.
638  */
639 int __init netdev_boot_setup(char *str)
640 {
641 	int ints[5];
642 	struct ifmap map;
643 
644 	str = get_options(str, ARRAY_SIZE(ints), ints);
645 	if (!str || !*str)
646 		return 0;
647 
648 	/* Save settings */
649 	memset(&map, 0, sizeof(map));
650 	if (ints[0] > 0)
651 		map.irq = ints[1];
652 	if (ints[0] > 1)
653 		map.base_addr = ints[2];
654 	if (ints[0] > 2)
655 		map.mem_start = ints[3];
656 	if (ints[0] > 3)
657 		map.mem_end = ints[4];
658 
659 	/* Add new entry to the list */
660 	return netdev_boot_setup_add(str, &map);
661 }
662 
663 __setup("netdev=", netdev_boot_setup);
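/*
 * The "netdev=" parameter registered above takes up to four integers
 * followed by a device name, for example (a hypothetical legacy ISA
 * setup):
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * which records irq=9 and base_addr=0x300 for "eth0" so that
 * netdev_boot_setup_check() can apply them when the device is probed.
 */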
664 
665 /*******************************************************************************
666 
667 			    Device Interface Subroutines
668 
669 *******************************************************************************/
670 
671 /**
672  *	dev_get_iflink	- get 'iflink' value of an interface
673  *	@dev: targeted interface
674  *
675  *	Indicates the ifindex the interface is linked to.
676  *	Physical interfaces have the same 'ifindex' and 'iflink' values.
677  */
678 
679 int dev_get_iflink(const struct net_device *dev)
680 {
681 	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
682 		return dev->netdev_ops->ndo_get_iflink(dev);
683 
684 	return dev->ifindex;
685 }
686 EXPORT_SYMBOL(dev_get_iflink);
687 
688 /**
689  *	dev_fill_metadata_dst - Retrieve tunnel egress information.
690  *	@dev: targeted interface
691  *	@skb: The packet.
692  *
693  *	For better visibility of tunnel traffic, OVS needs to retrieve
694  *	egress tunnel information for a packet. This API allows the
695  *	caller to get that info.
696  */
697 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
698 {
699 	struct ip_tunnel_info *info;
700 
701 	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
702 		return -EINVAL;
703 
704 	info = skb_tunnel_info_unclone(skb);
705 	if (!info)
706 		return -ENOMEM;
707 	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
708 		return -EINVAL;
709 
710 	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
711 }
712 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
713 
714 /**
715  *	__dev_get_by_name	- find a device by its name
716  *	@net: the applicable net namespace
717  *	@name: name to find
718  *
719  *	Find an interface by name. Must be called under the RTNL semaphore
720  *	or @dev_base_lock. If the name is found, a pointer to the device
721  *	is returned. If the name is not found, %NULL is returned. The
722  *	reference counters are not incremented, so the caller must be
723  *	careful with locks.
724  */
725 
726 struct net_device *__dev_get_by_name(struct net *net, const char *name)
727 {
728 	struct net_device *dev;
729 	struct hlist_head *head = dev_name_hash(net, name);
730 
731 	hlist_for_each_entry(dev, head, name_hlist)
732 		if (!strncmp(dev->name, name, IFNAMSIZ))
733 			return dev;
734 
735 	return NULL;
736 }
737 EXPORT_SYMBOL(__dev_get_by_name);
738 
739 /**
740  *	dev_get_by_name_rcu	- find a device by its name
741  *	@net: the applicable net namespace
742  *	@name: name to find
743  *
744  *	Find an interface by name.
745  *	If the name is found a pointer to the device is returned.
746  * 	If the name is not found then %NULL is returned.
747  *	The reference counters are not incremented so the caller must be
748  *	careful with locks. The caller must hold RCU lock.
749  */
750 
751 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
752 {
753 	struct net_device *dev;
754 	struct hlist_head *head = dev_name_hash(net, name);
755 
756 	hlist_for_each_entry_rcu(dev, head, name_hlist)
757 		if (!strncmp(dev->name, name, IFNAMSIZ))
758 			return dev;
759 
760 	return NULL;
761 }
762 EXPORT_SYMBOL(dev_get_by_name_rcu);
763 
764 /**
765  *	dev_get_by_name		- find a device by its name
766  *	@net: the applicable net namespace
767  *	@name: name to find
768  *
769  *	Find an interface by name. This can be called from any
770  *	context and does its own locking. The returned handle has
771  *	the usage count incremented and the caller must use dev_put() to
772  *	release it when it is no longer needed. %NULL is returned if no
773  *	matching device is found.
774  */
775 
776 struct net_device *dev_get_by_name(struct net *net, const char *name)
777 {
778 	struct net_device *dev;
779 
780 	rcu_read_lock();
781 	dev = dev_get_by_name_rcu(net, name);
782 	if (dev)
783 		dev_hold(dev);
784 	rcu_read_unlock();
785 	return dev;
786 }
787 EXPORT_SYMBOL(dev_get_by_name);
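/*
 * Locking sketch for the by-name lookups above (illustrative only):
 * dev_get_by_name() takes a reference that the caller must drop with
 * dev_put(), while dev_get_by_name_rcu() returns a pointer that is only
 * valid inside the RCU read-side critical section.
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		... use dev; the reference keeps it alive ...
 *		dev_put(dev);
 *	}
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev) {
 *		... use dev here only; do not sleep, do not cache it ...
 *	}
 *	rcu_read_unlock();
 */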
788 
789 /**
790  *	__dev_get_by_index - find a device by its ifindex
791  *	@net: the applicable net namespace
792  *	@ifindex: index of device
793  *
794  *	Search for an interface by index. Returns a pointer to the device
795  *	if found, or %NULL if it is not. The device has not
796  *	had its reference counter increased, so the caller must be careful
797  *	about locking. The caller must hold either the RTNL semaphore
798  *	or @dev_base_lock.
799  */
800 
801 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
802 {
803 	struct net_device *dev;
804 	struct hlist_head *head = dev_index_hash(net, ifindex);
805 
806 	hlist_for_each_entry(dev, head, index_hlist)
807 		if (dev->ifindex == ifindex)
808 			return dev;
809 
810 	return NULL;
811 }
812 EXPORT_SYMBOL(__dev_get_by_index);
813 
814 /**
815  *	dev_get_by_index_rcu - find a device by its ifindex
816  *	@net: the applicable net namespace
817  *	@ifindex: index of device
818  *
819  *	Search for an interface by index. Returns a pointer to the device
820  *	if found, or %NULL if it is not. The device has not
821  *	had its reference counter increased, so the caller must be careful
822  *	about locking. The caller must hold RCU lock.
823  */
824 
825 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
826 {
827 	struct net_device *dev;
828 	struct hlist_head *head = dev_index_hash(net, ifindex);
829 
830 	hlist_for_each_entry_rcu(dev, head, index_hlist)
831 		if (dev->ifindex == ifindex)
832 			return dev;
833 
834 	return NULL;
835 }
836 EXPORT_SYMBOL(dev_get_by_index_rcu);
837 
838 
839 /**
840  *	dev_get_by_index - find a device by its ifindex
841  *	@net: the applicable net namespace
842  *	@ifindex: index of device
843  *
844  *	Search for an interface by index. Returns a pointer to the device
845  *	if found, or NULL if it is not. The device returned has
846  *	had a reference added and the pointer is safe until the user calls
847  *	dev_put to indicate they have finished with it.
848  */
849 
850 struct net_device *dev_get_by_index(struct net *net, int ifindex)
851 {
852 	struct net_device *dev;
853 
854 	rcu_read_lock();
855 	dev = dev_get_by_index_rcu(net, ifindex);
856 	if (dev)
857 		dev_hold(dev);
858 	rcu_read_unlock();
859 	return dev;
860 }
861 EXPORT_SYMBOL(dev_get_by_index);
862 
863 /**
864  *	netdev_get_name - get a netdevice name, knowing its ifindex.
865  *	@net: network namespace
866  *	@name: a pointer to the buffer where the name will be stored.
867  *	@ifindex: the ifindex of the interface to get the name from.
868  *
869  *	The use of raw_seqcount_begin() and cond_resched() before
870  *	retrying is required as we want to give the writers a chance
871  *	to complete when CONFIG_PREEMPT is not set.
872  */
873 int netdev_get_name(struct net *net, char *name, int ifindex)
874 {
875 	struct net_device *dev;
876 	unsigned int seq;
877 
878 retry:
879 	seq = raw_seqcount_begin(&devnet_rename_seq);
880 	rcu_read_lock();
881 	dev = dev_get_by_index_rcu(net, ifindex);
882 	if (!dev) {
883 		rcu_read_unlock();
884 		return -ENODEV;
885 	}
886 
887 	strcpy(name, dev->name);
888 	rcu_read_unlock();
889 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
890 		cond_resched();
891 		goto retry;
892 	}
893 
894 	return 0;
895 }
896 
897 /**
898  *	dev_getbyhwaddr_rcu - find a device by its hardware address
899  *	@net: the applicable net namespace
900  *	@type: media type of device
901  *	@ha: hardware address
902  *
903  *	Search for an interface by MAC address. Returns a pointer to the
904  *	device if found, or NULL if it is not.
905  *	The caller must hold RCU or RTNL.
906  *	The returned device has not had its ref count increased,
907  *	and the caller must therefore be careful about locking.
908  *
909  */
910 
911 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
912 				       const char *ha)
913 {
914 	struct net_device *dev;
915 
916 	for_each_netdev_rcu(net, dev)
917 		if (dev->type == type &&
918 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
919 			return dev;
920 
921 	return NULL;
922 }
923 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
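/*
 * Illustrative caller of dev_getbyhwaddr_rcu() (hypothetical variables):
 * the RCU read lock keeps the returned pointer valid; no reference is
 * taken.
 *
 *	static const char mac[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(&init_net, ARPHRD_ETHER, mac);
 *	if (dev)
 *		netdev_info(dev, "matched by hardware address\n");
 *	rcu_read_unlock();
 */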
924 
925 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
926 {
927 	struct net_device *dev;
928 
929 	ASSERT_RTNL();
930 	for_each_netdev(net, dev)
931 		if (dev->type == type)
932 			return dev;
933 
934 	return NULL;
935 }
936 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
937 
938 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
939 {
940 	struct net_device *dev, *ret = NULL;
941 
942 	rcu_read_lock();
943 	for_each_netdev_rcu(net, dev)
944 		if (dev->type == type) {
945 			dev_hold(dev);
946 			ret = dev;
947 			break;
948 		}
949 	rcu_read_unlock();
950 	return ret;
951 }
952 EXPORT_SYMBOL(dev_getfirstbyhwtype);
953 
954 /**
955  *	__dev_get_by_flags - find any device with given flags
956  *	@net: the applicable net namespace
957  *	@if_flags: IFF_* values
958  *	@mask: bitmask of bits in if_flags to check
959  *
960  *	Search for any interface with the given flags. Returns a pointer to
961  *	the first matching device, or NULL if none is found. Must be called
962  *	inside rtnl_lock(), and the result's refcount is unchanged.
963  */
964 
965 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
966 				      unsigned short mask)
967 {
968 	struct net_device *dev, *ret;
969 
970 	ASSERT_RTNL();
971 
972 	ret = NULL;
973 	for_each_netdev(net, dev) {
974 		if (((dev->flags ^ if_flags) & mask) == 0) {
975 			ret = dev;
976 			break;
977 		}
978 	}
979 	return ret;
980 }
981 EXPORT_SYMBOL(__dev_get_by_flags);
982 
983 /**
984  *	dev_valid_name - check if name is okay for network device
985  *	@name: name string
986  *
987  *	Network device names need to be valid file names
988  *	to allow sysfs to work.  We also disallow any kind of
989  *	whitespace.
990  */
991 bool dev_valid_name(const char *name)
992 {
993 	if (*name == '\0')
994 		return false;
995 	if (strlen(name) >= IFNAMSIZ)
996 		return false;
997 	if (!strcmp(name, ".") || !strcmp(name, ".."))
998 		return false;
999 
1000 	while (*name) {
1001 		if (*name == '/' || *name == ':' || isspace(*name))
1002 			return false;
1003 		name++;
1004 	}
1005 	return true;
1006 }
1007 EXPORT_SYMBOL(dev_valid_name);
1008 
1009 /**
1010  *	__dev_alloc_name - allocate a name for a device
1011  *	@net: network namespace to allocate the device name in
1012  *	@name: name format string
1013  *	@buf:  scratch buffer and result name string
1014  *
1015  *	Passed a format string - eg "lt%d" - it will try to find a suitable
1016  *	id. It scans the list of devices to build up a free map, then chooses
1017  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1018  *	while allocating the name and adding the device in order to avoid
1019  *	duplicates.
1020  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021  *	Returns the number of the unit assigned or a negative errno code.
1022  */
1023 
1024 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025 {
1026 	int i = 0;
1027 	const char *p;
1028 	const int max_netdevices = 8*PAGE_SIZE;
1029 	unsigned long *inuse;
1030 	struct net_device *d;
1031 
1032 	p = strnchr(name, IFNAMSIZ-1, '%');
1033 	if (p) {
1034 		/*
1035 		 * Verify the string as this thing may have come from
1036 		 * the user.  There must be either one "%d" and no other "%"
1037 		 * characters.
1038 		 */
1039 		if (p[1] != 'd' || strchr(p + 2, '%'))
1040 			return -EINVAL;
1041 
1042 		/* Use one page as a bit array of possible slots */
1043 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044 		if (!inuse)
1045 			return -ENOMEM;
1046 
1047 		for_each_netdev(net, d) {
1048 			if (!sscanf(d->name, name, &i))
1049 				continue;
1050 			if (i < 0 || i >= max_netdevices)
1051 				continue;
1052 
1053 			/*  avoid cases where sscanf is not exact inverse of printf */
1054 			snprintf(buf, IFNAMSIZ, name, i);
1055 			if (!strncmp(buf, d->name, IFNAMSIZ))
1056 				set_bit(i, inuse);
1057 		}
1058 
1059 		i = find_first_zero_bit(inuse, max_netdevices);
1060 		free_page((unsigned long) inuse);
1061 	}
1062 
1063 	if (buf != name)
1064 		snprintf(buf, IFNAMSIZ, name, i);
1065 	if (!__dev_get_by_name(net, buf))
1066 		return i;
1067 
1068 	/* It is possible to run out of possible slots
1069 	 * when the name is long and there isn't enough space left
1070 	 * for the digits, or if all bits are used.
1071 	 */
1072 	return -ENFILE;
1073 }
1074 
1075 /**
1076  *	dev_alloc_name - allocate a name for a device
1077  *	@dev: device
1078  *	@name: name format string
1079  *
1080  *	Passed a format string - eg "lt%d" - it will try to find a suitable
1081  *	id. It scans the list of devices to build up a free map, then chooses
1082  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1083  *	while allocating the name and adding the device in order to avoid
1084  *	duplicates.
1085  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086  *	Returns the number of the unit assigned or a negative errno code.
1087  */
1088 
1089 int dev_alloc_name(struct net_device *dev, const char *name)
1090 {
1091 	char buf[IFNAMSIZ];
1092 	struct net *net;
1093 	int ret;
1094 
1095 	BUG_ON(!dev_net(dev));
1096 	net = dev_net(dev);
1097 	ret = __dev_alloc_name(net, name, buf);
1098 	if (ret >= 0)
1099 		strlcpy(dev->name, buf, IFNAMSIZ);
1100 	return ret;
1101 }
1102 EXPORT_SYMBOL(dev_alloc_name);
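/*
 * Illustrative use of dev_alloc_name() (not taken from this file):
 * drivers that register devices with a wildcard name pick the next free
 * unit, e.g.
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		goto out;
 *
 * On success dev->name is e.g. "dummy0" and err holds the unit number.
 */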
1103 
1104 static int dev_alloc_name_ns(struct net *net,
1105 			     struct net_device *dev,
1106 			     const char *name)
1107 {
1108 	char buf[IFNAMSIZ];
1109 	int ret;
1110 
1111 	ret = __dev_alloc_name(net, name, buf);
1112 	if (ret >= 0)
1113 		strlcpy(dev->name, buf, IFNAMSIZ);
1114 	return ret;
1115 }
1116 
1117 static int dev_get_valid_name(struct net *net,
1118 			      struct net_device *dev,
1119 			      const char *name)
1120 {
1121 	BUG_ON(!net);
1122 
1123 	if (!dev_valid_name(name))
1124 		return -EINVAL;
1125 
1126 	if (strchr(name, '%'))
1127 		return dev_alloc_name_ns(net, dev, name);
1128 	else if (__dev_get_by_name(net, name))
1129 		return -EEXIST;
1130 	else if (dev->name != name)
1131 		strlcpy(dev->name, name, IFNAMSIZ);
1132 
1133 	return 0;
1134 }
1135 
1136 /**
1137  *	dev_change_name - change name of a device
1138  *	@dev: device
1139  *	@newname: name (or format string) must be at least IFNAMSIZ
1140  *
1141  *	Change the name of a device. A format string such as "eth%d"
1142  *	can be passed for wildcarding.
1143  */
1144 int dev_change_name(struct net_device *dev, const char *newname)
1145 {
1146 	unsigned char old_assign_type;
1147 	char oldname[IFNAMSIZ];
1148 	int err = 0;
1149 	int ret;
1150 	struct net *net;
1151 
1152 	ASSERT_RTNL();
1153 	BUG_ON(!dev_net(dev));
1154 
1155 	net = dev_net(dev);
1156 	if (dev->flags & IFF_UP)
1157 		return -EBUSY;
1158 
1159 	write_seqcount_begin(&devnet_rename_seq);
1160 
1161 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162 		write_seqcount_end(&devnet_rename_seq);
1163 		return 0;
1164 	}
1165 
1166 	memcpy(oldname, dev->name, IFNAMSIZ);
1167 
1168 	err = dev_get_valid_name(net, dev, newname);
1169 	if (err < 0) {
1170 		write_seqcount_end(&devnet_rename_seq);
1171 		return err;
1172 	}
1173 
1174 	if (oldname[0] && !strchr(oldname, '%'))
1175 		netdev_info(dev, "renamed from %s\n", oldname);
1176 
1177 	old_assign_type = dev->name_assign_type;
1178 	dev->name_assign_type = NET_NAME_RENAMED;
1179 
1180 rollback:
1181 	ret = device_rename(&dev->dev, dev->name);
1182 	if (ret) {
1183 		memcpy(dev->name, oldname, IFNAMSIZ);
1184 		dev->name_assign_type = old_assign_type;
1185 		write_seqcount_end(&devnet_rename_seq);
1186 		return ret;
1187 	}
1188 
1189 	write_seqcount_end(&devnet_rename_seq);
1190 
1191 	netdev_adjacent_rename_links(dev, oldname);
1192 
1193 	write_lock_bh(&dev_base_lock);
1194 	hlist_del_rcu(&dev->name_hlist);
1195 	write_unlock_bh(&dev_base_lock);
1196 
1197 	synchronize_rcu();
1198 
1199 	write_lock_bh(&dev_base_lock);
1200 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201 	write_unlock_bh(&dev_base_lock);
1202 
1203 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204 	ret = notifier_to_errno(ret);
1205 
1206 	if (ret) {
1207 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1208 		if (err >= 0) {
1209 			err = ret;
1210 			write_seqcount_begin(&devnet_rename_seq);
1211 			memcpy(dev->name, oldname, IFNAMSIZ);
1212 			memcpy(oldname, newname, IFNAMSIZ);
1213 			dev->name_assign_type = old_assign_type;
1214 			old_assign_type = NET_NAME_RENAMED;
1215 			goto rollback;
1216 		} else {
1217 			pr_err("%s: name change rollback failed: %d\n",
1218 			       dev->name, ret);
1219 		}
1220 	}
1221 
1222 	return err;
1223 }
1224 
1225 /**
1226  *	dev_set_alias - change ifalias of a device
1227  *	@dev: device
1228  *	@alias: name up to IFALIASZ
1229  *	@len: limit of bytes to copy from info
1230  *
1231  *	Set ifalias for a device.
1232  */
1233 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234 {
1235 	char *new_ifalias;
1236 
1237 	ASSERT_RTNL();
1238 
1239 	if (len >= IFALIASZ)
1240 		return -EINVAL;
1241 
1242 	if (!len) {
1243 		kfree(dev->ifalias);
1244 		dev->ifalias = NULL;
1245 		return 0;
1246 	}
1247 
1248 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249 	if (!new_ifalias)
1250 		return -ENOMEM;
1251 	dev->ifalias = new_ifalias;
1252 
1253 	strlcpy(dev->ifalias, alias, len+1);
1254 	return len;
1255 }
1256 
1257 
1258 /**
1259  *	netdev_features_change - device changes features
1260  *	@dev: device to cause notification
1261  *
1262  *	Called to indicate a device has changed features.
1263  */
1264 void netdev_features_change(struct net_device *dev)
1265 {
1266 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267 }
1268 EXPORT_SYMBOL(netdev_features_change);
1269 
1270 /**
1271  *	netdev_state_change - device changes state
1272  *	@dev: device to cause notification
1273  *
1274  *	Called to indicate a device has changed state. This function calls
1275  *	the notifier chains for netdev_chain and sends a NEWLINK message
1276  *	to the routing socket.
1277  */
1278 void netdev_state_change(struct net_device *dev)
1279 {
1280 	if (dev->flags & IFF_UP) {
1281 		struct netdev_notifier_change_info change_info;
1282 
1283 		change_info.flags_changed = 0;
1284 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285 					      &change_info.info);
1286 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287 	}
1288 }
1289 EXPORT_SYMBOL(netdev_state_change);
1290 
1291 /**
1292  * 	netdev_notify_peers - notify network peers about existence of @dev
1293  * 	@dev: network device
1294  *
1295  * Generate traffic such that interested network peers are aware of
1296  * @dev, such as by generating a gratuitous ARP. This may be used when
1297  * a device wants to inform the rest of the network about some sort of
1298  * reconfiguration such as a failover event or virtual machine
1299  * migration.
1300  */
1301 void netdev_notify_peers(struct net_device *dev)
1302 {
1303 	rtnl_lock();
1304 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305 	rtnl_unlock();
1306 }
1307 EXPORT_SYMBOL(netdev_notify_peers);
1308 
1309 static int __dev_open(struct net_device *dev)
1310 {
1311 	const struct net_device_ops *ops = dev->netdev_ops;
1312 	int ret;
1313 
1314 	ASSERT_RTNL();
1315 
1316 	if (!netif_device_present(dev))
1317 		return -ENODEV;
1318 
1319 	/* Block netpoll from trying to do any rx path servicing.
1320 	 * If we don't do this there is a chance ndo_poll_controller
1321 	 * or ndo_poll may be running while we open the device
1322 	 */
1323 	netpoll_poll_disable(dev);
1324 
1325 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326 	ret = notifier_to_errno(ret);
1327 	if (ret)
1328 		return ret;
1329 
1330 	set_bit(__LINK_STATE_START, &dev->state);
1331 
1332 	if (ops->ndo_validate_addr)
1333 		ret = ops->ndo_validate_addr(dev);
1334 
1335 	if (!ret && ops->ndo_open)
1336 		ret = ops->ndo_open(dev);
1337 
1338 	netpoll_poll_enable(dev);
1339 
1340 	if (ret)
1341 		clear_bit(__LINK_STATE_START, &dev->state);
1342 	else {
1343 		dev->flags |= IFF_UP;
1344 		dev_set_rx_mode(dev);
1345 		dev_activate(dev);
1346 		add_device_randomness(dev->dev_addr, dev->addr_len);
1347 	}
1348 
1349 	return ret;
1350 }
1351 
1352 /**
1353  *	dev_open	- prepare an interface for use.
1354  *	@dev:	device to open
1355  *
1356  *	Takes a device from down to up state. The device's private open
1357  *	function is invoked and then the multicast lists are loaded. Finally
1358  *	the device is moved into the up state and a %NETDEV_UP message is
1359  *	sent to the netdev notifier chain.
1360  *
1361  *	Calling this function on an active interface is a nop. On a failure
1362  *	a negative errno code is returned.
1363  */
1364 int dev_open(struct net_device *dev)
1365 {
1366 	int ret;
1367 
1368 	if (dev->flags & IFF_UP)
1369 		return 0;
1370 
1371 	ret = __dev_open(dev);
1372 	if (ret < 0)
1373 		return ret;
1374 
1375 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376 	call_netdevice_notifiers(NETDEV_UP, dev);
1377 
1378 	return ret;
1379 }
1380 EXPORT_SYMBOL(dev_open);
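/*
 * Callers outside this file bring an interface up under RTNL; a minimal
 * sketch (error handling elided):
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 *
 * __dev_open() asserts RTNL, so holding the lock is mandatory rather
 * than optional.
 */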
1381 
1382 static int __dev_close_many(struct list_head *head)
1383 {
1384 	struct net_device *dev;
1385 
1386 	ASSERT_RTNL();
1387 	might_sleep();
1388 
1389 	list_for_each_entry(dev, head, close_list) {
1390 		/* Temporarily disable netpoll until the interface is down */
1391 		netpoll_poll_disable(dev);
1392 
1393 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394 
1395 		clear_bit(__LINK_STATE_START, &dev->state);
1396 
1397 		/* Synchronize to the scheduled poll. We cannot touch the poll list,
1398 		 * it can even be on a different CPU. So just clear netif_running().
1399 		 *
1400 		 * dev->stop() will invoke napi_disable() on all of its
1401 		 * napi_struct instances on this device.
1402 		 */
1403 		smp_mb__after_atomic(); /* Commit netif_running(). */
1404 	}
1405 
1406 	dev_deactivate_many(head);
1407 
1408 	list_for_each_entry(dev, head, close_list) {
1409 		const struct net_device_ops *ops = dev->netdev_ops;
1410 
1411 		/*
1412 		 *	Call the device specific close. This cannot fail.
1413 		 *	It is only done if the device is UP.
1414 		 *
1415 		 *	We allow it to be called even after a DETACH hot-plug
1416 		 *	event.
1417 		 */
1418 		if (ops->ndo_stop)
1419 			ops->ndo_stop(dev);
1420 
1421 		dev->flags &= ~IFF_UP;
1422 		netpoll_poll_enable(dev);
1423 	}
1424 
1425 	return 0;
1426 }
1427 
1428 static int __dev_close(struct net_device *dev)
1429 {
1430 	int retval;
1431 	LIST_HEAD(single);
1432 
1433 	list_add(&dev->close_list, &single);
1434 	retval = __dev_close_many(&single);
1435 	list_del(&single);
1436 
1437 	return retval;
1438 }
1439 
1440 int dev_close_many(struct list_head *head, bool unlink)
1441 {
1442 	struct net_device *dev, *tmp;
1443 
1444 	/* Remove the devices that don't need to be closed */
1445 	list_for_each_entry_safe(dev, tmp, head, close_list)
1446 		if (!(dev->flags & IFF_UP))
1447 			list_del_init(&dev->close_list);
1448 
1449 	__dev_close_many(head);
1450 
1451 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1452 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1454 		if (unlink)
1455 			list_del_init(&dev->close_list);
1456 	}
1457 
1458 	return 0;
1459 }
1460 EXPORT_SYMBOL(dev_close_many);
1461 
1462 /**
1463  *	dev_close - shutdown an interface.
1464  *	@dev: device to shutdown
1465  *
1466  *	This function moves an active device into down state. A
1467  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469  *	chain.
1470  */
1471 int dev_close(struct net_device *dev)
1472 {
1473 	if (dev->flags & IFF_UP) {
1474 		LIST_HEAD(single);
1475 
1476 		list_add(&dev->close_list, &single);
1477 		dev_close_many(&single, true);
1478 		list_del(&single);
1479 	}
1480 	return 0;
1481 }
1482 EXPORT_SYMBOL(dev_close);
1483 
1484 
1485 /**
1486  *	dev_disable_lro - disable Large Receive Offload on a device
1487  *	@dev: device
1488  *
1489  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1490  *	called under RTNL.  This is needed if received packets may be
1491  *	forwarded to another interface.
1492  */
1493 void dev_disable_lro(struct net_device *dev)
1494 {
1495 	struct net_device *lower_dev;
1496 	struct list_head *iter;
1497 
1498 	dev->wanted_features &= ~NETIF_F_LRO;
1499 	netdev_update_features(dev);
1500 
1501 	if (unlikely(dev->features & NETIF_F_LRO))
1502 		netdev_WARN(dev, "failed to disable LRO!\n");
1503 
1504 	netdev_for_each_lower_dev(dev, lower_dev, iter)
1505 		dev_disable_lro(lower_dev);
1506 }
1507 EXPORT_SYMBOL(dev_disable_lro);
1508 
1509 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510 				   struct net_device *dev)
1511 {
1512 	struct netdev_notifier_info info;
1513 
1514 	netdev_notifier_info_init(&info, dev);
1515 	return nb->notifier_call(nb, val, &info);
1516 }
1517 
1518 static int dev_boot_phase = 1;
1519 
1520 /**
1521  *	register_netdevice_notifier - register a network notifier block
1522  *	@nb: notifier
1523  *
1524  *	Register a notifier to be called when network device events occur.
1525  *	The notifier passed is linked into the kernel structures and must
1526  *	not be reused until it has been unregistered. A negative errno code
1527  *	is returned on a failure.
1528  *
1529  * 	When registered, all registration and up events are replayed
1530  *	to the new notifier to allow it to have a race-free
1531  *	view of the network device list.
1532  */
1533 
1534 int register_netdevice_notifier(struct notifier_block *nb)
1535 {
1536 	struct net_device *dev;
1537 	struct net_device *last;
1538 	struct net *net;
1539 	int err;
1540 
1541 	rtnl_lock();
1542 	err = raw_notifier_chain_register(&netdev_chain, nb);
1543 	if (err)
1544 		goto unlock;
1545 	if (dev_boot_phase)
1546 		goto unlock;
1547 	for_each_net(net) {
1548 		for_each_netdev(net, dev) {
1549 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550 			err = notifier_to_errno(err);
1551 			if (err)
1552 				goto rollback;
1553 
1554 			if (!(dev->flags & IFF_UP))
1555 				continue;
1556 
1557 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1558 		}
1559 	}
1560 
1561 unlock:
1562 	rtnl_unlock();
1563 	return err;
1564 
1565 rollback:
1566 	last = dev;
1567 	for_each_net(net) {
1568 		for_each_netdev(net, dev) {
1569 			if (dev == last)
1570 				goto outroll;
1571 
1572 			if (dev->flags & IFF_UP) {
1573 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574 							dev);
1575 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576 			}
1577 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578 		}
1579 	}
1580 
1581 outroll:
1582 	raw_notifier_chain_unregister(&netdev_chain, nb);
1583 	goto unlock;
1584 }
1585 EXPORT_SYMBOL(register_netdevice_notifier);
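/*
 * Illustrative notifier user (hypothetical names): a subsystem watching
 * device state registers a notifier_block and recovers the device with
 * netdev_notifier_info_to_dev().
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			netdev_info(dev, "is now up\n");
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&my_netdev_nb);
 */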
1586 
1587 /**
1588  *	unregister_netdevice_notifier - unregister a network notifier block
1589  *	@nb: notifier
1590  *
1591  *	Unregister a notifier previously registered by
1592  *	register_netdevice_notifier(). The notifier is unlinked from the
1593  *	kernel structures and may then be reused. A negative errno code
1594  *	is returned on a failure.
1595  *
1596  * 	After unregistering, unregister and down device events are synthesized
1597  *	for all devices on the device list and sent to the removed notifier to remove
1598  *	the need for special case cleanup code.
1599  */
1600 
1601 int unregister_netdevice_notifier(struct notifier_block *nb)
1602 {
1603 	struct net_device *dev;
1604 	struct net *net;
1605 	int err;
1606 
1607 	rtnl_lock();
1608 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609 	if (err)
1610 		goto unlock;
1611 
1612 	for_each_net(net) {
1613 		for_each_netdev(net, dev) {
1614 			if (dev->flags & IFF_UP) {
1615 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616 							dev);
1617 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618 			}
1619 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620 		}
1621 	}
1622 unlock:
1623 	rtnl_unlock();
1624 	return err;
1625 }
1626 EXPORT_SYMBOL(unregister_netdevice_notifier);
1627 
1628 /**
1629  *	call_netdevice_notifiers_info - call all network notifier blocks
1630  *	@val: value passed unmodified to notifier function
1631  *	@dev: net_device pointer passed unmodified to notifier function
1632  *	@info: notifier information data
1633  *
1634  *	Call all network notifier blocks.  Parameters and return value
1635  *	are as for raw_notifier_call_chain().
1636  */
1637 
1638 static int call_netdevice_notifiers_info(unsigned long val,
1639 					 struct net_device *dev,
1640 					 struct netdev_notifier_info *info)
1641 {
1642 	ASSERT_RTNL();
1643 	netdev_notifier_info_init(info, dev);
1644 	return raw_notifier_call_chain(&netdev_chain, val, info);
1645 }
1646 
1647 /**
1648  *	call_netdevice_notifiers - call all network notifier blocks
1649  *      @val: value passed unmodified to notifier function
1650  *      @dev: net_device pointer passed unmodified to notifier function
1651  *
1652  *	Call all network notifier blocks.  Parameters and return value
1653  *	are as for raw_notifier_call_chain().
1654  */
1655 
1656 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657 {
1658 	struct netdev_notifier_info info;
1659 
1660 	return call_netdevice_notifiers_info(val, dev, &info);
1661 }
1662 EXPORT_SYMBOL(call_netdevice_notifiers);
1663 
1664 #ifdef CONFIG_NET_INGRESS
1665 static struct static_key ingress_needed __read_mostly;
1666 
1667 void net_inc_ingress_queue(void)
1668 {
1669 	static_key_slow_inc(&ingress_needed);
1670 }
1671 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1672 
1673 void net_dec_ingress_queue(void)
1674 {
1675 	static_key_slow_dec(&ingress_needed);
1676 }
1677 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1678 #endif
1679 
1680 #ifdef CONFIG_NET_EGRESS
1681 static struct static_key egress_needed __read_mostly;
1682 
1683 void net_inc_egress_queue(void)
1684 {
1685 	static_key_slow_inc(&egress_needed);
1686 }
1687 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1688 
1689 void net_dec_egress_queue(void)
1690 {
1691 	static_key_slow_dec(&egress_needed);
1692 }
1693 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1694 #endif
1695 
1696 static struct static_key netstamp_needed __read_mostly;
1697 #ifdef HAVE_JUMP_LABEL
1698 static atomic_t netstamp_needed_deferred;
1699 static void netstamp_clear(struct work_struct *work)
1700 {
1701 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1702 
1703 	while (deferred--)
1704 		static_key_slow_dec(&netstamp_needed);
1705 }
1706 static DECLARE_WORK(netstamp_work, netstamp_clear);
1707 #endif
1708 
1709 void net_enable_timestamp(void)
1710 {
1711 	static_key_slow_inc(&netstamp_needed);
1712 }
1713 EXPORT_SYMBOL(net_enable_timestamp);
1714 
1715 void net_disable_timestamp(void)
1716 {
1717 #ifdef HAVE_JUMP_LABEL
1718 	/* net_disable_timestamp() can be called from non process context */
1719 	atomic_inc(&netstamp_needed_deferred);
1720 	schedule_work(&netstamp_work);
1721 #else
1722 	static_key_slow_dec(&netstamp_needed);
1723 #endif
1724 }
1725 EXPORT_SYMBOL(net_disable_timestamp);
1726 
1727 static inline void net_timestamp_set(struct sk_buff *skb)
1728 {
1729 	skb->tstamp = 0;
1730 	if (static_key_false(&netstamp_needed))
1731 		__net_timestamp(skb);
1732 }
1733 
1734 #define net_timestamp_check(COND, SKB)			\
1735 	if (static_key_false(&netstamp_needed)) {		\
1736 		if ((COND) && !(SKB)->tstamp)	\
1737 			__net_timestamp(SKB);		\
1738 	}						\
1739 
1740 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1741 {
1742 	unsigned int len;
1743 
1744 	if (!(dev->flags & IFF_UP))
1745 		return false;
1746 
1747 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1748 	if (skb->len <= len)
1749 		return true;
1750 
1751 	/* if TSO is enabled, we don't care about the length as the packet
1752 	 * could be forwarded without being segmented beforehand
1753 	 */
1754 	if (skb_is_gso(skb))
1755 		return true;
1756 
1757 	return false;
1758 }
1759 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1760 
1761 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1762 {
1763 	int ret = ____dev_forward_skb(dev, skb);
1764 
1765 	if (likely(!ret)) {
1766 		skb->protocol = eth_type_trans(skb, dev);
1767 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1768 	}
1769 
1770 	return ret;
1771 }
1772 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1773 
1774 /**
1775  * dev_forward_skb - loopback an skb to another netif
1776  *
1777  * @dev: destination network device
1778  * @skb: buffer to forward
1779  *
1780  * return values:
1781  *	NET_RX_SUCCESS	(no congestion)
1782  *	NET_RX_DROP     (packet was dropped, but freed)
1783  *
1784  * dev_forward_skb can be used for injecting an skb from the
1785  * start_xmit function of one device into the receive queue
1786  * of another device.
1787  *
1788  * The receiving device may be in another namespace, so
1789  * we have to clear all information in the skb that could
1790  * impact namespace isolation.
1791  */
1792 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1793 {
1794 	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1795 }
1796 EXPORT_SYMBOL_GPL(dev_forward_skb);
1797 
1798 static inline int deliver_skb(struct sk_buff *skb,
1799 			      struct packet_type *pt_prev,
1800 			      struct net_device *orig_dev)
1801 {
1802 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1803 		return -ENOMEM;
1804 	atomic_inc(&skb->users);
1805 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1806 }
1807 
1808 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1809 					  struct packet_type **pt,
1810 					  struct net_device *orig_dev,
1811 					  __be16 type,
1812 					  struct list_head *ptype_list)
1813 {
1814 	struct packet_type *ptype, *pt_prev = *pt;
1815 
1816 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1817 		if (ptype->type != type)
1818 			continue;
1819 		if (pt_prev)
1820 			deliver_skb(skb, pt_prev, orig_dev);
1821 		pt_prev = ptype;
1822 	}
1823 	*pt = pt_prev;
1824 }
1825 
1826 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1827 {
1828 	if (!ptype->af_packet_priv || !skb->sk)
1829 		return false;
1830 
1831 	if (ptype->id_match)
1832 		return ptype->id_match(ptype, skb->sk);
1833 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1834 		return true;
1835 
1836 	return false;
1837 }
1838 
1839 /*
1840  *	Support routine. Sends outgoing frames to any network
1841  *	taps currently in use.
1842  */
1843 
1844 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1845 {
1846 	struct packet_type *ptype;
1847 	struct sk_buff *skb2 = NULL;
1848 	struct packet_type *pt_prev = NULL;
1849 	struct list_head *ptype_list = &ptype_all;
1850 
1851 	rcu_read_lock();
1852 again:
1853 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1854 		/* Never send packets back to the socket
1855 		 * they originated from - MvS (miquels@drinkel.ow.org)
1856 		 */
1857 		if (skb_loop_sk(ptype, skb))
1858 			continue;
1859 
1860 		if (pt_prev) {
1861 			deliver_skb(skb2, pt_prev, skb->dev);
1862 			pt_prev = ptype;
1863 			continue;
1864 		}
1865 
1866 		/* need to clone skb, done only once */
1867 		skb2 = skb_clone(skb, GFP_ATOMIC);
1868 		if (!skb2)
1869 			goto out_unlock;
1870 
1871 		net_timestamp_set(skb2);
1872 
1873 		/* The network header should already be
1874 		 * set correctly by the sender, so the check below is
1875 		 * just protection against buggy protocols.
1876 		 */
1877 		skb_reset_mac_header(skb2);
1878 
1879 		if (skb_network_header(skb2) < skb2->data ||
1880 		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1881 			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1882 					     ntohs(skb2->protocol),
1883 					     dev->name);
1884 			skb_reset_network_header(skb2);
1885 		}
1886 
1887 		skb2->transport_header = skb2->network_header;
1888 		skb2->pkt_type = PACKET_OUTGOING;
1889 		pt_prev = ptype;
1890 	}
1891 
1892 	if (ptype_list == &ptype_all) {
1893 		ptype_list = &dev->ptype_all;
1894 		goto again;
1895 	}
1896 out_unlock:
1897 	if (pt_prev)
1898 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1899 	rcu_read_unlock();
1900 }
1901 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1902 
1903 /**
1904  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1905  * @dev: Network device
1906  * @txq: number of queues available
1907  *
1908  * If real_num_tx_queues is changed the tc mappings may no longer be
1909  * valid. To resolve this, verify that each tc mapping remains valid
1910  * and, if not, zero the mapping. With no priorities mapping to an
1911  * offset/count pair it will no longer be used. In the worst case, if
1912  * TC0 is invalid, nothing can be done, so priority mappings are
1913  * disabled entirely. It is expected that drivers will fix this mapping
1914  * if they can before calling netif_set_real_num_tx_queues.
1915  */
1916 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1917 {
1918 	int i;
1919 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1920 
1921 	/* If TC0 is invalidated disable TC mapping */
1922 	if (tc->offset + tc->count > txq) {
1923 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1924 		dev->num_tc = 0;
1925 		return;
1926 	}
1927 
1928 	/* Invalidated prio to tc mappings set to TC0 */
1929 	for (i = 1; i < TC_BITMASK + 1; i++) {
1930 		int q = netdev_get_prio_tc_map(dev, i);
1931 
1932 		tc = &dev->tc_to_txq[q];
1933 		if (tc->offset + tc->count > txq) {
1934 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1935 				i, q);
1936 			netdev_set_prio_tc_map(dev, i, 0);
1937 		}
1938 	}
1939 }
1940 
1941 int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1942 {
1943 	if (dev->num_tc) {
1944 		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1945 		int i;
1946 
1947 		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1948 			if ((txq - tc->offset) < tc->count)
1949 				return i;
1950 		}
1951 
1952 		return -1;
1953 	}
1954 
1955 	return 0;
1956 }
1957 
1958 #ifdef CONFIG_XPS
1959 static DEFINE_MUTEX(xps_map_mutex);
1960 #define xmap_dereference(P)		\
1961 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1962 
1963 static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
1964 			     int tci, u16 index)
1965 {
1966 	struct xps_map *map = NULL;
1967 	int pos;
1968 
1969 	if (dev_maps)
1970 		map = xmap_dereference(dev_maps->cpu_map[tci]);
1971 	if (!map)
1972 		return false;
1973 
1974 	for (pos = map->len; pos--;) {
1975 		if (map->queues[pos] != index)
1976 			continue;
1977 
1978 		if (map->len > 1) {
1979 			map->queues[pos] = map->queues[--map->len];
1980 			break;
1981 		}
1982 
1983 		RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
1984 		kfree_rcu(map, rcu);
1985 		return false;
1986 	}
1987 
1988 	return true;
1989 }
1990 
1991 static bool remove_xps_queue_cpu(struct net_device *dev,
1992 				 struct xps_dev_maps *dev_maps,
1993 				 int cpu, u16 offset, u16 count)
1994 {
1995 	int num_tc = dev->num_tc ? : 1;
1996 	bool active = false;
1997 	int tci;
1998 
1999 	for (tci = cpu * num_tc; num_tc--; tci++) {
2000 		int i, j;
2001 
2002 		for (i = count, j = offset; i--; j++) {
2003 			if (!remove_xps_queue(dev_maps, cpu, j))
2004 				break;
2005 		}
2006 
2007 		active |= i < 0;
2008 	}
2009 
2010 	return active;
2011 }
2012 
2013 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2014 				   u16 count)
2015 {
2016 	struct xps_dev_maps *dev_maps;
2017 	int cpu, i;
2018 	bool active = false;
2019 
2020 	mutex_lock(&xps_map_mutex);
2021 	dev_maps = xmap_dereference(dev->xps_maps);
2022 
2023 	if (!dev_maps)
2024 		goto out_no_maps;
2025 
2026 	for_each_possible_cpu(cpu)
2027 		active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2028 					       offset, count);
2029 
2030 	if (!active) {
2031 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2032 		kfree_rcu(dev_maps, rcu);
2033 	}
2034 
2035 	for (i = offset + (count - 1); count--; i--)
2036 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2037 					     NUMA_NO_NODE);
2038 
2039 out_no_maps:
2040 	mutex_unlock(&xps_map_mutex);
2041 }
2042 
2043 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2044 {
2045 	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2046 }
2047 
2048 static struct xps_map *expand_xps_map(struct xps_map *map,
2049 				      int cpu, u16 index)
2050 {
2051 	struct xps_map *new_map;
2052 	int alloc_len = XPS_MIN_MAP_ALLOC;
2053 	int i, pos;
2054 
2055 	for (pos = 0; map && pos < map->len; pos++) {
2056 		if (map->queues[pos] != index)
2057 			continue;
2058 		return map;
2059 	}
2060 
2061 	/* Need to add queue to this CPU's existing map */
2062 	if (map) {
2063 		if (pos < map->alloc_len)
2064 			return map;
2065 
2066 		alloc_len = map->alloc_len * 2;
2067 	}
2068 
2069 	/* Need to allocate new map to store queue on this CPU's map */
2070 	/* Need to allocate a new map to store the queue in this CPU's map */
2071 			       cpu_to_node(cpu));
2072 	if (!new_map)
2073 		return NULL;
2074 
2075 	for (i = 0; i < pos; i++)
2076 		new_map->queues[i] = map->queues[i];
2077 	new_map->alloc_len = alloc_len;
2078 	new_map->len = pos;
2079 
2080 	return new_map;
2081 }
2082 
2083 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2084 			u16 index)
2085 {
2086 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2087 	int i, cpu, tci, numa_node_id = -2;
2088 	int maps_sz, num_tc = 1, tc = 0;
2089 	struct xps_map *map, *new_map;
2090 	bool active = false;
2091 
2092 	if (dev->num_tc) {
2093 		num_tc = dev->num_tc;
2094 		tc = netdev_txq_to_tc(dev, index);
2095 		if (tc < 0)
2096 			return -EINVAL;
2097 	}
2098 
2099 	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
2100 	if (maps_sz < L1_CACHE_BYTES)
2101 		maps_sz = L1_CACHE_BYTES;
2102 
2103 	mutex_lock(&xps_map_mutex);
2104 
2105 	dev_maps = xmap_dereference(dev->xps_maps);
2106 
2107 	/* allocate memory for queue storage */
2108 	for_each_cpu_and(cpu, cpu_online_mask, mask) {
2109 		if (!new_dev_maps)
2110 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2111 		if (!new_dev_maps) {
2112 			mutex_unlock(&xps_map_mutex);
2113 			return -ENOMEM;
2114 		}
2115 
2116 		tci = cpu * num_tc + tc;
2117 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
2118 				 NULL;
2119 
2120 		map = expand_xps_map(map, cpu, index);
2121 		if (!map)
2122 			goto error;
2123 
2124 		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2125 	}
2126 
2127 	if (!new_dev_maps)
2128 		goto out_no_new_maps;
2129 
2130 	for_each_possible_cpu(cpu) {
2131 		/* copy maps belonging to foreign traffic classes */
2132 		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
2133 			/* fill in the new device map from the old device map */
2134 			map = xmap_dereference(dev_maps->cpu_map[tci]);
2135 			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2136 		}
2137 
2138 		/* We need to explicitly update tci as the previous loop
2139 		 * could break out early if dev_maps is NULL.
2140 		 */
2141 		tci = cpu * num_tc + tc;
2142 
2143 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2144 			/* add queue to CPU maps */
2145 			int pos = 0;
2146 
2147 			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2148 			while ((pos < map->len) && (map->queues[pos] != index))
2149 				pos++;
2150 
2151 			if (pos == map->len)
2152 				map->queues[map->len++] = index;
2153 #ifdef CONFIG_NUMA
2154 			if (numa_node_id == -2)
2155 				numa_node_id = cpu_to_node(cpu);
2156 			else if (numa_node_id != cpu_to_node(cpu))
2157 				numa_node_id = -1;
2158 #endif
2159 		} else if (dev_maps) {
2160 			/* fill in the new device map from the old device map */
2161 			map = xmap_dereference(dev_maps->cpu_map[tci]);
2162 			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2163 		}
2164 
2165 		/* copy maps belonging to foreign traffic classes */
2166 		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2167 			/* fill in the new device map from the old device map */
2168 			map = xmap_dereference(dev_maps->cpu_map[tci]);
2169 			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
2170 		}
2171 	}
2172 
2173 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2174 
2175 	/* Cleanup old maps */
2176 	if (!dev_maps)
2177 		goto out_no_old_maps;
2178 
2179 	for_each_possible_cpu(cpu) {
2180 		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2181 			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2182 			map = xmap_dereference(dev_maps->cpu_map[tci]);
2183 			if (map && map != new_map)
2184 				kfree_rcu(map, rcu);
2185 		}
2186 	}
2187 
2188 	kfree_rcu(dev_maps, rcu);
2189 
2190 out_no_old_maps:
2191 	dev_maps = new_dev_maps;
2192 	active = true;
2193 
2194 out_no_new_maps:
2195 	/* update Tx queue numa node */
2196 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2197 				     (numa_node_id >= 0) ? numa_node_id :
2198 				     NUMA_NO_NODE);
2199 
2200 	if (!dev_maps)
2201 		goto out_no_maps;
2202 
2203 	/* remove the queue from unused CPUs */
2204 	for_each_possible_cpu(cpu) {
2205 		for (i = tc, tci = cpu * num_tc; i--; tci++)
2206 			active |= remove_xps_queue(dev_maps, tci, index);
2207 		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
2208 			active |= remove_xps_queue(dev_maps, tci, index);
2209 		for (i = num_tc - tc, tci++; --i; tci++)
2210 			active |= remove_xps_queue(dev_maps, tci, index);
2211 	}
2212 
2213 	/* free map if not active */
2214 	if (!active) {
2215 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2216 		kfree_rcu(dev_maps, rcu);
2217 	}
2218 
2219 out_no_maps:
2220 	mutex_unlock(&xps_map_mutex);
2221 
2222 	return 0;
2223 error:
2224 	/* remove any maps that we added */
2225 	for_each_possible_cpu(cpu) {
2226 		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
2227 			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
2228 			map = dev_maps ?
2229 			      xmap_dereference(dev_maps->cpu_map[tci]) :
2230 			      NULL;
2231 			if (new_map && new_map != map)
2232 				kfree(new_map);
2233 		}
2234 	}
2235 
2236 	mutex_unlock(&xps_map_mutex);
2237 
2238 	kfree(new_dev_maps);
2239 	return -ENOMEM;
2240 }
2241 EXPORT_SYMBOL(netif_set_xps_queue);
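
/*
 * Illustrative sketch (hypothetical driver code, not from this file):
 * spread a multiqueue device's TX queues across the online CPUs with
 * XPS, one queue per CPU, as far as real_num_tx_queues allows.
 */
static void example_setup_xps(struct net_device *dev)
{
	int cpu;
	u16 queue = 0;

	for_each_online_cpu(cpu) {
		if (queue >= dev->real_num_tx_queues)
			break;
		/* netif_set_xps_queue() copies the mask, so a shared mask is fine */
		netif_set_xps_queue(dev, cpumask_of(cpu), queue++);
	}
}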
2242 
2243 #endif
2244 void netdev_reset_tc(struct net_device *dev)
2245 {
2246 #ifdef CONFIG_XPS
2247 	netif_reset_xps_queues_gt(dev, 0);
2248 #endif
2249 	dev->num_tc = 0;
2250 	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2251 	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2252 }
2253 EXPORT_SYMBOL(netdev_reset_tc);
2254 
2255 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2256 {
2257 	if (tc >= dev->num_tc)
2258 		return -EINVAL;
2259 
2260 #ifdef CONFIG_XPS
2261 	netif_reset_xps_queues(dev, offset, count);
2262 #endif
2263 	dev->tc_to_txq[tc].count = count;
2264 	dev->tc_to_txq[tc].offset = offset;
2265 	return 0;
2266 }
2267 EXPORT_SYMBOL(netdev_set_tc_queue);
2268 
2269 int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2270 {
2271 	if (num_tc > TC_MAX_QUEUE)
2272 		return -EINVAL;
2273 
2274 #ifdef CONFIG_XPS
2275 	netif_reset_xps_queues_gt(dev, 0);
2276 #endif
2277 	dev->num_tc = num_tc;
2278 	return 0;
2279 }
2280 EXPORT_SYMBOL(netdev_set_num_tc);
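
/*
 * Illustrative sketch (assumed queue layout, not taken from a driver):
 * split an 8-queue device into two traffic classes, TC0 on queues 0-3
 * and TC1 on queues 4-7, and map priorities 0-3 to TC0 and 4-7 to TC1.
 */
static int example_setup_two_tcs(struct net_device *dev)
{
	int prio, err;

	err = netdev_set_num_tc(dev, 2);
	if (err)
		return err;

	netdev_set_tc_queue(dev, 0, 4, 0);	/* TC0: 4 queues at offset 0 */
	netdev_set_tc_queue(dev, 1, 4, 4);	/* TC1: 4 queues at offset 4 */

	for (prio = 0; prio <= TC_BITMASK; prio++)
		netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);

	return 0;
}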
2281 
2282 /*
2283  * Routine to help set real_num_tx_queues. To avoid leaving skbs mapped to
2284  * queues greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2285  */
2286 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2287 {
2288 	int rc;
2289 
2290 	if (txq < 1 || txq > dev->num_tx_queues)
2291 		return -EINVAL;
2292 
2293 	if (dev->reg_state == NETREG_REGISTERED ||
2294 	    dev->reg_state == NETREG_UNREGISTERING) {
2295 		ASSERT_RTNL();
2296 
2297 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2298 						  txq);
2299 		if (rc)
2300 			return rc;
2301 
2302 		if (dev->num_tc)
2303 			netif_setup_tc(dev, txq);
2304 
2305 		if (txq < dev->real_num_tx_queues) {
2306 			qdisc_reset_all_tx_gt(dev, txq);
2307 #ifdef CONFIG_XPS
2308 			netif_reset_xps_queues_gt(dev, txq);
2309 #endif
2310 		}
2311 	}
2312 
2313 	dev->real_num_tx_queues = txq;
2314 	return 0;
2315 }
2316 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
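
/*
 * Illustrative sketch (hypothetical driver helper): resize the active
 * TX and RX queue sets together when the usable channel count changes.
 * Once the device is registered both calls must run under RTNL.
 */
static int example_set_channels(struct net_device *dev, unsigned int count)
{
	int err;

	ASSERT_RTNL();

	err = netif_set_real_num_tx_queues(dev, count);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, count);
}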
2317 
2318 #ifdef CONFIG_SYSFS
2319 /**
2320  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2321  *	@dev: Network device
2322  *	@rxq: Actual number of RX queues
2323  *
2324  *	This must be called either with the rtnl_lock held or before
2325  *	registration of the net device.  Returns 0 on success, or a
2326  *	negative error code.  If called before registration, it always
2327  *	succeeds.
2328  */
2329 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2330 {
2331 	int rc;
2332 
2333 	if (rxq < 1 || rxq > dev->num_rx_queues)
2334 		return -EINVAL;
2335 
2336 	if (dev->reg_state == NETREG_REGISTERED) {
2337 		ASSERT_RTNL();
2338 
2339 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2340 						  rxq);
2341 		if (rc)
2342 			return rc;
2343 	}
2344 
2345 	dev->real_num_rx_queues = rxq;
2346 	return 0;
2347 }
2348 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2349 #endif
2350 
2351 /**
2352  * netif_get_num_default_rss_queues - default number of RSS queues
2353  *
2354  * This routine should set an upper limit on the number of RSS queues
2355  * used by default by multiqueue devices.
2356  */
2357 int netif_get_num_default_rss_queues(void)
2358 {
2359 	return is_kdump_kernel() ?
2360 		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2361 }
2362 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
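
/*
 * Illustrative sketch: a probe path would typically cap its channel
 * count by the RSS default above and by the (assumed, driver-specific)
 * limit the hardware exposes.
 */
static unsigned int example_pick_queue_count(unsigned int hw_max_queues)
{
	return min_t(unsigned int, hw_max_queues,
		     netif_get_num_default_rss_queues());
}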
2363 
2364 static void __netif_reschedule(struct Qdisc *q)
2365 {
2366 	struct softnet_data *sd;
2367 	unsigned long flags;
2368 
2369 	local_irq_save(flags);
2370 	sd = this_cpu_ptr(&softnet_data);
2371 	q->next_sched = NULL;
2372 	*sd->output_queue_tailp = q;
2373 	sd->output_queue_tailp = &q->next_sched;
2374 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2375 	local_irq_restore(flags);
2376 }
2377 
2378 void __netif_schedule(struct Qdisc *q)
2379 {
2380 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2381 		__netif_reschedule(q);
2382 }
2383 EXPORT_SYMBOL(__netif_schedule);
2384 
2385 struct dev_kfree_skb_cb {
2386 	enum skb_free_reason reason;
2387 };
2388 
2389 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2390 {
2391 	return (struct dev_kfree_skb_cb *)skb->cb;
2392 }
2393 
2394 void netif_schedule_queue(struct netdev_queue *txq)
2395 {
2396 	rcu_read_lock();
2397 	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2398 		struct Qdisc *q = rcu_dereference(txq->qdisc);
2399 
2400 		__netif_schedule(q);
2401 	}
2402 	rcu_read_unlock();
2403 }
2404 EXPORT_SYMBOL(netif_schedule_queue);
2405 
2406 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2407 {
2408 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2409 		struct Qdisc *q;
2410 
2411 		rcu_read_lock();
2412 		q = rcu_dereference(dev_queue->qdisc);
2413 		__netif_schedule(q);
2414 		rcu_read_unlock();
2415 	}
2416 }
2417 EXPORT_SYMBOL(netif_tx_wake_queue);
2418 
2419 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2420 {
2421 	unsigned long flags;
2422 
2423 	if (likely(atomic_read(&skb->users) == 1)) {
2424 		smp_rmb();
2425 		atomic_set(&skb->users, 0);
2426 	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2427 		return;
2428 	}
2429 	get_kfree_skb_cb(skb)->reason = reason;
2430 	local_irq_save(flags);
2431 	skb->next = __this_cpu_read(softnet_data.completion_queue);
2432 	__this_cpu_write(softnet_data.completion_queue, skb);
2433 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2434 	local_irq_restore(flags);
2435 }
2436 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2437 
2438 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2439 {
2440 	if (in_irq() || irqs_disabled())
2441 		__dev_kfree_skb_irq(skb, reason);
2442 	else
2443 		dev_kfree_skb(skb);
2444 }
2445 EXPORT_SYMBOL(__dev_kfree_skb_any);
2446 
2447 
2448 /**
2449  * netif_device_detach - mark device as removed
2450  * @dev: network device
2451  *
2452  * Mark device as removed from system and therefore no longer available.
2453  */
2454 void netif_device_detach(struct net_device *dev)
2455 {
2456 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2457 	    netif_running(dev)) {
2458 		netif_tx_stop_all_queues(dev);
2459 	}
2460 }
2461 EXPORT_SYMBOL(netif_device_detach);
2462 
2463 /**
2464  * netif_device_attach - mark device as attached
2465  * @dev: network device
2466  *
2467  * Mark device as attached from system and restart if needed.
2468  */
2469 void netif_device_attach(struct net_device *dev)
2470 {
2471 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2472 	    netif_running(dev)) {
2473 		netif_tx_wake_all_queues(dev);
2474 		__netdev_watchdog_up(dev);
2475 	}
2476 }
2477 EXPORT_SYMBOL(netif_device_attach);
2478 
2479 /*
2480  * Returns a Tx hash based on the given packet descriptor and a Tx queues'
2481  * number to be used as a distribution range.
2482  */
2483 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2484 		  unsigned int num_tx_queues)
2485 {
2486 	u32 hash;
2487 	u16 qoffset = 0;
2488 	u16 qcount = num_tx_queues;
2489 
2490 	if (skb_rx_queue_recorded(skb)) {
2491 		hash = skb_get_rx_queue(skb);
2492 		while (unlikely(hash >= num_tx_queues))
2493 			hash -= num_tx_queues;
2494 		return hash;
2495 	}
2496 
2497 	if (dev->num_tc) {
2498 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2499 		qoffset = dev->tc_to_txq[tc].offset;
2500 		qcount = dev->tc_to_txq[tc].count;
2501 	}
2502 
2503 	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2504 }
2505 EXPORT_SYMBOL(__skb_tx_hash);
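
/*
 * Worked example (illustrative numbers): with dev->num_tc set and the
 * skb's priority mapping to a class whose tc_to_txq entry is
 * { .offset = 8, .count = 4 }, reciprocal_scale(skb_get_hash(skb), 4)
 * yields a value in [0, 3], so the returned queue index lands in 8..11.
 */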
2506 
2507 static void skb_warn_bad_offload(const struct sk_buff *skb)
2508 {
2509 	static const netdev_features_t null_features;
2510 	struct net_device *dev = skb->dev;
2511 	const char *name = "";
2512 
2513 	if (!net_ratelimit())
2514 		return;
2515 
2516 	if (dev) {
2517 		if (dev->dev.parent)
2518 			name = dev_driver_string(dev->dev.parent);
2519 		else
2520 			name = netdev_name(dev);
2521 	}
2522 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2523 	     "gso_type=%d ip_summed=%d\n",
2524 	     name, dev ? &dev->features : &null_features,
2525 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2526 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2527 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2528 }
2529 
2530 /*
2531  * Invalidate hardware checksum when packet is to be mangled, and
2532  * complete checksum manually on outgoing path.
2533  */
2534 int skb_checksum_help(struct sk_buff *skb)
2535 {
2536 	__wsum csum;
2537 	int ret = 0, offset;
2538 
2539 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2540 		goto out_set_summed;
2541 
2542 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2543 		skb_warn_bad_offload(skb);
2544 		return -EINVAL;
2545 	}
2546 
2547 	/* Before computing a checksum, we should make sure no frag could
2548 	 * be modified by an external entity: otherwise the checksum could be wrong.
2549 	 */
2550 	if (skb_has_shared_frag(skb)) {
2551 		ret = __skb_linearize(skb);
2552 		if (ret)
2553 			goto out;
2554 	}
2555 
2556 	offset = skb_checksum_start_offset(skb);
2557 	BUG_ON(offset >= skb_headlen(skb));
2558 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2559 
2560 	offset += skb->csum_offset;
2561 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2562 
2563 	if (skb_cloned(skb) &&
2564 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2565 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2566 		if (ret)
2567 			goto out;
2568 	}
2569 
2570 	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2571 out_set_summed:
2572 	skb->ip_summed = CHECKSUM_NONE;
2573 out:
2574 	return ret;
2575 }
2576 EXPORT_SYMBOL(skb_checksum_help);
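
/*
 * Illustrative sketch (hypothetical driver path): a device that cannot
 * checksum a given protocol resolves the CHECKSUM_PARTIAL request in
 * software before handing the frame to the hardware.
 */
static int example_tx_csum_fallback(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
		return -EIO;	/* caller is expected to drop the skb */

	return 0;
}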
2577 
2578 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2579 {
2580 	__be16 type = skb->protocol;
2581 
2582 	/* Tunnel gso handlers can set protocol to ethernet. */
2583 	if (type == htons(ETH_P_TEB)) {
2584 		struct ethhdr *eth;
2585 
2586 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2587 			return 0;
2588 
2589 		eth = (struct ethhdr *)skb_mac_header(skb);
2590 		type = eth->h_proto;
2591 	}
2592 
2593 	return __vlan_get_protocol(skb, type, depth);
2594 }
2595 
2596 /**
2597  *	skb_mac_gso_segment - mac layer segmentation handler.
2598  *	@skb: buffer to segment
2599  *	@features: features for the output path (see dev->features)
2600  */
2601 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2602 				    netdev_features_t features)
2603 {
2604 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2605 	struct packet_offload *ptype;
2606 	int vlan_depth = skb->mac_len;
2607 	__be16 type = skb_network_protocol(skb, &vlan_depth);
2608 
2609 	if (unlikely(!type))
2610 		return ERR_PTR(-EINVAL);
2611 
2612 	__skb_pull(skb, vlan_depth);
2613 
2614 	rcu_read_lock();
2615 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2616 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2617 			segs = ptype->callbacks.gso_segment(skb, features);
2618 			break;
2619 		}
2620 	}
2621 	rcu_read_unlock();
2622 
2623 	__skb_push(skb, skb->data - skb_mac_header(skb));
2624 
2625 	return segs;
2626 }
2627 EXPORT_SYMBOL(skb_mac_gso_segment);
2628 
2629 
2630 /* Open vSwitch calls this on the rx path, so we need a different check.
2631  */
2632 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2633 {
2634 	if (tx_path)
2635 		return skb->ip_summed != CHECKSUM_PARTIAL &&
2636 		       skb->ip_summed != CHECKSUM_NONE;
2637 
2638 	return skb->ip_summed == CHECKSUM_NONE;
2639 }
2640 
2641 /**
2642  *	__skb_gso_segment - Perform segmentation on skb.
2643  *	@skb: buffer to segment
2644  *	@features: features for the output path (see dev->features)
2645  *	@tx_path: whether it is called in TX path
2646  *
2647  *	This function segments the given skb and returns a list of segments.
2648  *
2649  *	It may return NULL if the skb requires no segmentation.  This is
2650  *	only possible when GSO is used for verifying header integrity.
2651  *
2652  *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2653  */
2654 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2655 				  netdev_features_t features, bool tx_path)
2656 {
2657 	struct sk_buff *segs;
2658 
2659 	if (unlikely(skb_needs_check(skb, tx_path))) {
2660 		int err;
2661 
2662 		/* We're going to init ->check field in TCP or UDP header */
2663 		err = skb_cow_head(skb, 0);
2664 		if (err < 0)
2665 			return ERR_PTR(err);
2666 	}
2667 
2668 	/* Only report GSO partial support if it will enable us to
2669 	 * support segmentation on this frame without needing additional
2670 	 * work.
2671 	 */
2672 	if (features & NETIF_F_GSO_PARTIAL) {
2673 		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2674 		struct net_device *dev = skb->dev;
2675 
2676 		partial_features |= dev->features & dev->gso_partial_features;
2677 		if (!skb_gso_ok(skb, features | partial_features))
2678 			features &= ~NETIF_F_GSO_PARTIAL;
2679 	}
2680 
2681 	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2682 		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2683 
2684 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2685 	SKB_GSO_CB(skb)->encap_level = 0;
2686 
2687 	skb_reset_mac_header(skb);
2688 	skb_reset_mac_len(skb);
2689 
2690 	segs = skb_mac_gso_segment(skb, features);
2691 
2692 	if (unlikely(skb_needs_check(skb, tx_path)))
2693 		skb_warn_bad_offload(skb);
2694 
2695 	return segs;
2696 }
2697 EXPORT_SYMBOL(__skb_gso_segment);
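
/*
 * Illustrative sketch (hypothetical helper): software GSO fallback as a
 * driver might implement it when the hardware refuses a particular
 * gso_type - segment the skb, then hand the resulting list to an
 * assumed per-frame transmit routine, xmit_frame().
 */
static int example_gso_fallback(struct sk_buff *skb, struct net_device *dev,
				int (*xmit_frame)(struct sk_buff *,
						  struct net_device *))
{
	struct sk_buff *segs, *next;

	segs = skb_gso_segment(skb, dev->features & ~NETIF_F_GSO_MASK);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return xmit_frame(skb, dev);	/* no segmentation was needed */

	consume_skb(skb);
	do {
		next = segs->next;
		segs->next = NULL;
		xmit_frame(segs, dev);
		segs = next;
	} while (segs);

	return 0;
}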
2698 
2699 /* Take action when hardware reception checksum errors are detected. */
2700 #ifdef CONFIG_BUG
2701 void netdev_rx_csum_fault(struct net_device *dev)
2702 {
2703 	if (net_ratelimit()) {
2704 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2705 		dump_stack();
2706 	}
2707 }
2708 EXPORT_SYMBOL(netdev_rx_csum_fault);
2709 #endif
2710 
2711 /* Actually, we should eliminate this check as soon as we know that:
2712  * 1. An IOMMU is present and is able to map all of the memory.
2713  * 2. No high memory really exists on this machine.
2714  */
2715 
2716 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2717 {
2718 #ifdef CONFIG_HIGHMEM
2719 	int i;
2720 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2721 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2722 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2723 			if (PageHighMem(skb_frag_page(frag)))
2724 				return 1;
2725 		}
2726 	}
2727 
2728 	if (PCI_DMA_BUS_IS_PHYS) {
2729 		struct device *pdev = dev->dev.parent;
2730 
2731 		if (!pdev)
2732 			return 0;
2733 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2734 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2735 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2736 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2737 				return 1;
2738 		}
2739 	}
2740 #endif
2741 	return 0;
2742 }
2743 
2744 /* For an MPLS offload request, verify we are testing hardware MPLS features
2745  * instead of the standard features for the netdev.
2746  */
2747 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2748 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2749 					   netdev_features_t features,
2750 					   __be16 type)
2751 {
2752 	if (eth_p_mpls(type))
2753 		features &= skb->dev->mpls_features;
2754 
2755 	return features;
2756 }
2757 #else
2758 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2759 					   netdev_features_t features,
2760 					   __be16 type)
2761 {
2762 	return features;
2763 }
2764 #endif
2765 
2766 static netdev_features_t harmonize_features(struct sk_buff *skb,
2767 	netdev_features_t features)
2768 {
2769 	int tmp;
2770 	__be16 type;
2771 
2772 	type = skb_network_protocol(skb, &tmp);
2773 	features = net_mpls_features(skb, features, type);
2774 
2775 	if (skb->ip_summed != CHECKSUM_NONE &&
2776 	    !can_checksum_protocol(features, type)) {
2777 		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2778 	}
2779 	if (illegal_highdma(skb->dev, skb))
2780 		features &= ~NETIF_F_SG;
2781 
2782 	return features;
2783 }
2784 
2785 netdev_features_t passthru_features_check(struct sk_buff *skb,
2786 					  struct net_device *dev,
2787 					  netdev_features_t features)
2788 {
2789 	return features;
2790 }
2791 EXPORT_SYMBOL(passthru_features_check);
2792 
2793 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2794 					     struct net_device *dev,
2795 					     netdev_features_t features)
2796 {
2797 	return vlan_features_check(skb, features);
2798 }
2799 
2800 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2801 					    struct net_device *dev,
2802 					    netdev_features_t features)
2803 {
2804 	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2805 
2806 	if (gso_segs > dev->gso_max_segs)
2807 		return features & ~NETIF_F_GSO_MASK;
2808 
2809 	/* Support for GSO partial features requires software
2810 	 * intervention before we can actually process the packets,
2811 	 * so we need to strip support for any partial features now;
2812 	 * they can be pulled back in after we have partially
2813 	 * segmented the frame.
2814 	 */
2815 	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2816 		features &= ~dev->gso_partial_features;
2817 
2818 	/* Make sure to clear the IPv4 ID mangling feature if the
2819 	 * IPv4 header has the potential to be fragmented.
2820 	 */
2821 	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2822 		struct iphdr *iph = skb->encapsulation ?
2823 				    inner_ip_hdr(skb) : ip_hdr(skb);
2824 
2825 		if (!(iph->frag_off & htons(IP_DF)))
2826 			features &= ~NETIF_F_TSO_MANGLEID;
2827 	}
2828 
2829 	return features;
2830 }
2831 
2832 netdev_features_t netif_skb_features(struct sk_buff *skb)
2833 {
2834 	struct net_device *dev = skb->dev;
2835 	netdev_features_t features = dev->features;
2836 
2837 	if (skb_is_gso(skb))
2838 		features = gso_features_check(skb, dev, features);
2839 
2840 	/* For an encapsulation offload request, verify we are testing
2841 	 * hardware encapsulation features instead of the standard
2842 	 * features for the netdev.
2843 	 */
2844 	if (skb->encapsulation)
2845 		features &= dev->hw_enc_features;
2846 
2847 	if (skb_vlan_tagged(skb))
2848 		features = netdev_intersect_features(features,
2849 						     dev->vlan_features |
2850 						     NETIF_F_HW_VLAN_CTAG_TX |
2851 						     NETIF_F_HW_VLAN_STAG_TX);
2852 
2853 	if (dev->netdev_ops->ndo_features_check)
2854 		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2855 								features);
2856 	else
2857 		features &= dflt_features_check(skb, dev, features);
2858 
2859 	return harmonize_features(skb, features);
2860 }
2861 EXPORT_SYMBOL(netif_skb_features);
2862 
2863 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2864 		    struct netdev_queue *txq, bool more)
2865 {
2866 	unsigned int len;
2867 	int rc;
2868 
2869 	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2870 		dev_queue_xmit_nit(skb, dev);
2871 
2872 	len = skb->len;
2873 	trace_net_dev_start_xmit(skb, dev);
2874 	rc = netdev_start_xmit(skb, dev, txq, more);
2875 	trace_net_dev_xmit(skb, rc, dev, len);
2876 
2877 	return rc;
2878 }
2879 
2880 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2881 				    struct netdev_queue *txq, int *ret)
2882 {
2883 	struct sk_buff *skb = first;
2884 	int rc = NETDEV_TX_OK;
2885 
2886 	while (skb) {
2887 		struct sk_buff *next = skb->next;
2888 
2889 		skb->next = NULL;
2890 		rc = xmit_one(skb, dev, txq, next != NULL);
2891 		if (unlikely(!dev_xmit_complete(rc))) {
2892 			skb->next = next;
2893 			goto out;
2894 		}
2895 
2896 		skb = next;
2897 		if (netif_xmit_stopped(txq) && skb) {
2898 			rc = NETDEV_TX_BUSY;
2899 			break;
2900 		}
2901 	}
2902 
2903 out:
2904 	*ret = rc;
2905 	return skb;
2906 }
2907 
2908 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2909 					  netdev_features_t features)
2910 {
2911 	if (skb_vlan_tag_present(skb) &&
2912 	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2913 		skb = __vlan_hwaccel_push_inside(skb);
2914 	return skb;
2915 }
2916 
2917 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2918 {
2919 	netdev_features_t features;
2920 
2921 	features = netif_skb_features(skb);
2922 	skb = validate_xmit_vlan(skb, features);
2923 	if (unlikely(!skb))
2924 		goto out_null;
2925 
2926 	if (netif_needs_gso(skb, features)) {
2927 		struct sk_buff *segs;
2928 
2929 		segs = skb_gso_segment(skb, features);
2930 		if (IS_ERR(segs)) {
2931 			goto out_kfree_skb;
2932 		} else if (segs) {
2933 			consume_skb(skb);
2934 			skb = segs;
2935 		}
2936 	} else {
2937 		if (skb_needs_linearize(skb, features) &&
2938 		    __skb_linearize(skb))
2939 			goto out_kfree_skb;
2940 
2941 		/* If packet is not checksummed and device does not
2942 		 * support checksumming for this protocol, complete
2943 		 * checksumming here.
2944 		 */
2945 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2946 			if (skb->encapsulation)
2947 				skb_set_inner_transport_header(skb,
2948 							       skb_checksum_start_offset(skb));
2949 			else
2950 				skb_set_transport_header(skb,
2951 							 skb_checksum_start_offset(skb));
2952 			if (!(features & NETIF_F_CSUM_MASK) &&
2953 			    skb_checksum_help(skb))
2954 				goto out_kfree_skb;
2955 		}
2956 	}
2957 
2958 	return skb;
2959 
2960 out_kfree_skb:
2961 	kfree_skb(skb);
2962 out_null:
2963 	atomic_long_inc(&dev->tx_dropped);
2964 	return NULL;
2965 }
2966 
2967 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2968 {
2969 	struct sk_buff *next, *head = NULL, *tail;
2970 
2971 	for (; skb != NULL; skb = next) {
2972 		next = skb->next;
2973 		skb->next = NULL;
2974 
2975 		/* in case skb won't be segmented, point to itself */
2976 		skb->prev = skb;
2977 
2978 		skb = validate_xmit_skb(skb, dev);
2979 		if (!skb)
2980 			continue;
2981 
2982 		if (!head)
2983 			head = skb;
2984 		else
2985 			tail->next = skb;
2986 		/* If skb was segmented, skb->prev points to
2987 		 * the last segment. If not, it still contains skb.
2988 		 */
2989 		tail = skb->prev;
2990 	}
2991 	return head;
2992 }
2993 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
2994 
2995 static void qdisc_pkt_len_init(struct sk_buff *skb)
2996 {
2997 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2998 
2999 	qdisc_skb_cb(skb)->pkt_len = skb->len;
3000 
3001 	/* To get a more precise estimate of the bytes sent on the wire,
3002 	 * we add the header size of every segment to pkt_len
3003 	 */
3004 	if (shinfo->gso_size)  {
3005 		unsigned int hdr_len;
3006 		u16 gso_segs = shinfo->gso_segs;
3007 
3008 		/* mac layer + network layer */
3009 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3010 
3011 		/* + transport layer */
3012 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3013 			hdr_len += tcp_hdrlen(skb);
3014 		else
3015 			hdr_len += sizeof(struct udphdr);
3016 
3017 		if (shinfo->gso_type & SKB_GSO_DODGY)
3018 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3019 						shinfo->gso_size);
3020 
3021 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3022 	}
3023 }
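
/*
 * Worked example (illustrative numbers): a TSO skb with skb->len = 14546,
 * gso_size = 1448, gso_segs = 10 and hdr_len = 66 (Ethernet + IPv4 + TCP
 * with timestamps) is accounted as pkt_len = 14546 + (10 - 1) * 66 = 15140
 * bytes, i.e. roughly what the ten resulting frames occupy on the wire.
 */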
3024 
3025 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3026 				 struct net_device *dev,
3027 				 struct netdev_queue *txq)
3028 {
3029 	spinlock_t *root_lock = qdisc_lock(q);
3030 	struct sk_buff *to_free = NULL;
3031 	bool contended;
3032 	int rc;
3033 
3034 	qdisc_calculate_pkt_len(skb, q);
3035 	/*
3036 	 * Heuristic to force contended enqueues to serialize on a
3037 	 * separate lock before trying to get the qdisc main lock.
3038 	 * This permits the qdisc->running owner to get the lock more
3039 	 * often and dequeue packets faster.
3040 	 */
3041 	contended = qdisc_is_running(q);
3042 	if (unlikely(contended))
3043 		spin_lock(&q->busylock);
3044 
3045 	spin_lock(root_lock);
3046 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3047 		__qdisc_drop(skb, &to_free);
3048 		rc = NET_XMIT_DROP;
3049 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3050 		   qdisc_run_begin(q)) {
3051 		/*
3052 		 * This is a work-conserving queue; there are no old skbs
3053 		 * waiting to be sent out; and the qdisc is not running -
3054 		 * xmit the skb directly.
3055 		 */
3056 
3057 		qdisc_bstats_update(q, skb);
3058 
3059 		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3060 			if (unlikely(contended)) {
3061 				spin_unlock(&q->busylock);
3062 				contended = false;
3063 			}
3064 			__qdisc_run(q);
3065 		} else
3066 			qdisc_run_end(q);
3067 
3068 		rc = NET_XMIT_SUCCESS;
3069 	} else {
3070 		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3071 		if (qdisc_run_begin(q)) {
3072 			if (unlikely(contended)) {
3073 				spin_unlock(&q->busylock);
3074 				contended = false;
3075 			}
3076 			__qdisc_run(q);
3077 		}
3078 	}
3079 	spin_unlock(root_lock);
3080 	if (unlikely(to_free))
3081 		kfree_skb_list(to_free);
3082 	if (unlikely(contended))
3083 		spin_unlock(&q->busylock);
3084 	return rc;
3085 }
3086 
3087 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3088 static void skb_update_prio(struct sk_buff *skb)
3089 {
3090 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3091 
3092 	if (!skb->priority && skb->sk && map) {
3093 		unsigned int prioidx =
3094 			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3095 
3096 		if (prioidx < map->priomap_len)
3097 			skb->priority = map->priomap[prioidx];
3098 	}
3099 }
3100 #else
3101 #define skb_update_prio(skb)
3102 #endif
3103 
3104 DEFINE_PER_CPU(int, xmit_recursion);
3105 EXPORT_SYMBOL(xmit_recursion);
3106 
3107 /**
3108  *	dev_loopback_xmit - loop back @skb
3109  *	@net: network namespace this loopback is happening in
3110  *	@sk:  sk needed to be a netfilter okfn
3111  *	@skb: buffer to transmit
3112  */
3113 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3114 {
3115 	skb_reset_mac_header(skb);
3116 	__skb_pull(skb, skb_network_offset(skb));
3117 	skb->pkt_type = PACKET_LOOPBACK;
3118 	skb->ip_summed = CHECKSUM_UNNECESSARY;
3119 	WARN_ON(!skb_dst(skb));
3120 	skb_dst_force(skb);
3121 	netif_rx_ni(skb);
3122 	return 0;
3123 }
3124 EXPORT_SYMBOL(dev_loopback_xmit);
3125 
3126 #ifdef CONFIG_NET_EGRESS
3127 static struct sk_buff *
3128 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3129 {
3130 	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3131 	struct tcf_result cl_res;
3132 
3133 	if (!cl)
3134 		return skb;
3135 
3136 	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3137 	qdisc_bstats_cpu_update(cl->q, skb);
3138 
3139 	switch (tc_classify(skb, cl, &cl_res, false)) {
3140 	case TC_ACT_OK:
3141 	case TC_ACT_RECLASSIFY:
3142 		skb->tc_index = TC_H_MIN(cl_res.classid);
3143 		break;
3144 	case TC_ACT_SHOT:
3145 		qdisc_qstats_cpu_drop(cl->q);
3146 		*ret = NET_XMIT_DROP;
3147 		kfree_skb(skb);
3148 		return NULL;
3149 	case TC_ACT_STOLEN:
3150 	case TC_ACT_QUEUED:
3151 		*ret = NET_XMIT_SUCCESS;
3152 		consume_skb(skb);
3153 		return NULL;
3154 	case TC_ACT_REDIRECT:
3155 		/* No need to push/pop skb's mac_header here on egress! */
3156 		skb_do_redirect(skb);
3157 		*ret = NET_XMIT_SUCCESS;
3158 		return NULL;
3159 	default:
3160 		break;
3161 	}
3162 
3163 	return skb;
3164 }
3165 #endif /* CONFIG_NET_EGRESS */
3166 
3167 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3168 {
3169 #ifdef CONFIG_XPS
3170 	struct xps_dev_maps *dev_maps;
3171 	struct xps_map *map;
3172 	int queue_index = -1;
3173 
3174 	rcu_read_lock();
3175 	dev_maps = rcu_dereference(dev->xps_maps);
3176 	if (dev_maps) {
3177 		unsigned int tci = skb->sender_cpu - 1;
3178 
3179 		if (dev->num_tc) {
3180 			tci *= dev->num_tc;
3181 			tci += netdev_get_prio_tc_map(dev, skb->priority);
3182 		}
3183 
3184 		map = rcu_dereference(dev_maps->cpu_map[tci]);
3185 		if (map) {
3186 			if (map->len == 1)
3187 				queue_index = map->queues[0];
3188 			else
3189 				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3190 									   map->len)];
3191 			if (unlikely(queue_index >= dev->real_num_tx_queues))
3192 				queue_index = -1;
3193 		}
3194 	}
3195 	rcu_read_unlock();
3196 
3197 	return queue_index;
3198 #else
3199 	return -1;
3200 #endif
3201 }
3202 
3203 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3204 {
3205 	struct sock *sk = skb->sk;
3206 	int queue_index = sk_tx_queue_get(sk);
3207 
3208 	if (queue_index < 0 || skb->ooo_okay ||
3209 	    queue_index >= dev->real_num_tx_queues) {
3210 		int new_index = get_xps_queue(dev, skb);
3211 		if (new_index < 0)
3212 			new_index = skb_tx_hash(dev, skb);
3213 
3214 		if (queue_index != new_index && sk &&
3215 		    sk_fullsock(sk) &&
3216 		    rcu_access_pointer(sk->sk_dst_cache))
3217 			sk_tx_queue_set(sk, new_index);
3218 
3219 		queue_index = new_index;
3220 	}
3221 
3222 	return queue_index;
3223 }
3224 
3225 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3226 				    struct sk_buff *skb,
3227 				    void *accel_priv)
3228 {
3229 	int queue_index = 0;
3230 
3231 #ifdef CONFIG_XPS
3232 	u32 sender_cpu = skb->sender_cpu - 1;
3233 
3234 	if (sender_cpu >= (u32)NR_CPUS)
3235 		skb->sender_cpu = raw_smp_processor_id() + 1;
3236 #endif
3237 
3238 	if (dev->real_num_tx_queues != 1) {
3239 		const struct net_device_ops *ops = dev->netdev_ops;
3240 		if (ops->ndo_select_queue)
3241 			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3242 							    __netdev_pick_tx);
3243 		else
3244 			queue_index = __netdev_pick_tx(dev, skb);
3245 
3246 		if (!accel_priv)
3247 			queue_index = netdev_cap_txqueue(dev, queue_index);
3248 	}
3249 
3250 	skb_set_queue_mapping(skb, queue_index);
3251 	return netdev_get_tx_queue(dev, queue_index);
3252 }
3253 
3254 /**
3255  *	__dev_queue_xmit - transmit a buffer
3256  *	@skb: buffer to transmit
3257  *	@accel_priv: private data used for L2 forwarding offload
3258  *
3259  *	Queue a buffer for transmission to a network device. The caller must
3260  *	have set the device and priority and built the buffer before calling
3261  *	this function. The function can be called from an interrupt.
3262  *
3263  *	A negative errno code is returned on a failure. A success does not
3264  *	guarantee the frame will be transmitted as it may be dropped due
3265  *	to congestion or traffic shaping.
3266  *
3267  * -----------------------------------------------------------------------------------
3268  *      I notice this method can also return errors from the queue disciplines,
3269  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3270  *      be positive.
3271  *
3272  *      Regardless of the return value, the skb is consumed, so it is currently
3273  *      difficult to retry a send to this method.  (You can bump the ref count
3274  *      before sending to hold a reference for retry if you are careful.)
3275  *
3276  *      When calling this method, interrupts MUST be enabled.  This is because
3277  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3278  *          --BLG
3279  */
3280 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3281 {
3282 	struct net_device *dev = skb->dev;
3283 	struct netdev_queue *txq;
3284 	struct Qdisc *q;
3285 	int rc = -ENOMEM;
3286 
3287 	skb_reset_mac_header(skb);
3288 
3289 	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3290 		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3291 
3292 	/* Disable soft irqs for various locks below. Also
3293 	 * stops preemption for RCU.
3294 	 */
3295 	rcu_read_lock_bh();
3296 
3297 	skb_update_prio(skb);
3298 
3299 	qdisc_pkt_len_init(skb);
3300 #ifdef CONFIG_NET_CLS_ACT
3301 	skb->tc_at_ingress = 0;
3302 # ifdef CONFIG_NET_EGRESS
3303 	if (static_key_false(&egress_needed)) {
3304 		skb = sch_handle_egress(skb, &rc, dev);
3305 		if (!skb)
3306 			goto out;
3307 	}
3308 # endif
3309 #endif
3310 	/* If device/qdisc don't need skb->dst, release it right now while
3311 	 * it's still hot in this CPU's cache.
3312 	 */
3313 	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3314 		skb_dst_drop(skb);
3315 	else
3316 		skb_dst_force(skb);
3317 
3318 	txq = netdev_pick_tx(dev, skb, accel_priv);
3319 	q = rcu_dereference_bh(txq->qdisc);
3320 
3321 	trace_net_dev_queue(skb);
3322 	if (q->enqueue) {
3323 		rc = __dev_xmit_skb(skb, q, dev, txq);
3324 		goto out;
3325 	}
3326 
3327 	/* The device has no queue. This is the common case for software
3328 	   devices: loopback and all sorts of tunnels...
3329 
3330 	   Really, it is unlikely that netif_tx_lock protection is necessary
3331 	   here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
3332 	   counters.)
3333 	   However, it is possible that they rely on the protection
3334 	   we provide here.
3335 
3336 	   Check this and shoot the lock; it is not prone to deadlocks.
3337 	   Or shoot the noqueue qdisc instead, which is even simpler 8)
3338 	 */
3339 	if (dev->flags & IFF_UP) {
3340 		int cpu = smp_processor_id(); /* ok because BHs are off */
3341 
3342 		if (txq->xmit_lock_owner != cpu) {
3343 			if (unlikely(__this_cpu_read(xmit_recursion) >
3344 				     XMIT_RECURSION_LIMIT))
3345 				goto recursion_alert;
3346 
3347 			skb = validate_xmit_skb(skb, dev);
3348 			if (!skb)
3349 				goto out;
3350 
3351 			HARD_TX_LOCK(dev, txq, cpu);
3352 
3353 			if (!netif_xmit_stopped(txq)) {
3354 				__this_cpu_inc(xmit_recursion);
3355 				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3356 				__this_cpu_dec(xmit_recursion);
3357 				if (dev_xmit_complete(rc)) {
3358 					HARD_TX_UNLOCK(dev, txq);
3359 					goto out;
3360 				}
3361 			}
3362 			HARD_TX_UNLOCK(dev, txq);
3363 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3364 					     dev->name);
3365 		} else {
3366 			/* Recursion is detected! It is possible,
3367 			 * unfortunately
3368 			 */
3369 recursion_alert:
3370 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3371 					     dev->name);
3372 		}
3373 	}
3374 
3375 	rc = -ENETDOWN;
3376 	rcu_read_unlock_bh();
3377 
3378 	atomic_long_inc(&dev->tx_dropped);
3379 	kfree_skb_list(skb);
3380 	return rc;
3381 out:
3382 	rcu_read_unlock_bh();
3383 	return rc;
3384 }
3385 
3386 int dev_queue_xmit(struct sk_buff *skb)
3387 {
3388 	return __dev_queue_xmit(skb, NULL);
3389 }
3390 EXPORT_SYMBOL(dev_queue_xmit);
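
/*
 * Illustrative sketch: the usual pattern for handing a locally built
 * frame to the stack for transmission.  The device is chosen and the
 * link-layer header filled in before queueing; dev_hard_header() is
 * called here with assumed destination/source addresses from the caller.
 */
static int example_send_frame(struct net_device *dev, struct sk_buff *skb,
			      const void *daddr, const void *saddr)
{
	skb->dev = dev;

	if (dev_hard_header(skb, dev, ntohs(skb->protocol),
			    daddr, saddr, skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* May return positive NET_XMIT_* codes, as the comment above notes */
	return dev_queue_xmit(skb);
}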
3391 
3392 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3393 {
3394 	return __dev_queue_xmit(skb, accel_priv);
3395 }
3396 EXPORT_SYMBOL(dev_queue_xmit_accel);
3397 
3398 
3399 /*=======================================================================
3400 			Receiver routines
3401   =======================================================================*/
3402 
3403 int netdev_max_backlog __read_mostly = 1000;
3404 EXPORT_SYMBOL(netdev_max_backlog);
3405 
3406 int netdev_tstamp_prequeue __read_mostly = 1;
3407 int netdev_budget __read_mostly = 300;
3408 int weight_p __read_mostly = 64;           /* old backlog weight */
3409 int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
3410 int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
3411 int dev_rx_weight __read_mostly = 64;
3412 int dev_tx_weight __read_mostly = 64;
3413 
3414 /* Called with irq disabled */
3415 static inline void ____napi_schedule(struct softnet_data *sd,
3416 				     struct napi_struct *napi)
3417 {
3418 	list_add_tail(&napi->poll_list, &sd->poll_list);
3419 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3420 }
3421 
3422 #ifdef CONFIG_RPS
3423 
3424 /* One global table that all flow-based protocols share. */
3425 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3426 EXPORT_SYMBOL(rps_sock_flow_table);
3427 u32 rps_cpu_mask __read_mostly;
3428 EXPORT_SYMBOL(rps_cpu_mask);
3429 
3430 struct static_key rps_needed __read_mostly;
3431 EXPORT_SYMBOL(rps_needed);
3432 struct static_key rfs_needed __read_mostly;
3433 EXPORT_SYMBOL(rfs_needed);
3434 
3435 static struct rps_dev_flow *
3436 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3437 	    struct rps_dev_flow *rflow, u16 next_cpu)
3438 {
3439 	if (next_cpu < nr_cpu_ids) {
3440 #ifdef CONFIG_RFS_ACCEL
3441 		struct netdev_rx_queue *rxqueue;
3442 		struct rps_dev_flow_table *flow_table;
3443 		struct rps_dev_flow *old_rflow;
3444 		u32 flow_id;
3445 		u16 rxq_index;
3446 		int rc;
3447 
3448 		/* Should we steer this flow to a different hardware queue? */
3449 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3450 		    !(dev->features & NETIF_F_NTUPLE))
3451 			goto out;
3452 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3453 		if (rxq_index == skb_get_rx_queue(skb))
3454 			goto out;
3455 
3456 		rxqueue = dev->_rx + rxq_index;
3457 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3458 		if (!flow_table)
3459 			goto out;
3460 		flow_id = skb_get_hash(skb) & flow_table->mask;
3461 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3462 							rxq_index, flow_id);
3463 		if (rc < 0)
3464 			goto out;
3465 		old_rflow = rflow;
3466 		rflow = &flow_table->flows[flow_id];
3467 		rflow->filter = rc;
3468 		if (old_rflow->filter == rflow->filter)
3469 			old_rflow->filter = RPS_NO_FILTER;
3470 	out:
3471 #endif
3472 		rflow->last_qtail =
3473 			per_cpu(softnet_data, next_cpu).input_queue_head;
3474 	}
3475 
3476 	rflow->cpu = next_cpu;
3477 	return rflow;
3478 }
3479 
3480 /*
3481  * get_rps_cpu is called from netif_receive_skb and returns the target
3482  * CPU from the RPS map of the receiving queue for a given skb.
3483  * rcu_read_lock must be held on entry.
3484  */
3485 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3486 		       struct rps_dev_flow **rflowp)
3487 {
3488 	const struct rps_sock_flow_table *sock_flow_table;
3489 	struct netdev_rx_queue *rxqueue = dev->_rx;
3490 	struct rps_dev_flow_table *flow_table;
3491 	struct rps_map *map;
3492 	int cpu = -1;
3493 	u32 tcpu;
3494 	u32 hash;
3495 
3496 	if (skb_rx_queue_recorded(skb)) {
3497 		u16 index = skb_get_rx_queue(skb);
3498 
3499 		if (unlikely(index >= dev->real_num_rx_queues)) {
3500 			WARN_ONCE(dev->real_num_rx_queues > 1,
3501 				  "%s received packet on queue %u, but number "
3502 				  "of RX queues is %u\n",
3503 				  dev->name, index, dev->real_num_rx_queues);
3504 			goto done;
3505 		}
3506 		rxqueue += index;
3507 	}
3508 
3509 	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3510 
3511 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3512 	map = rcu_dereference(rxqueue->rps_map);
3513 	if (!flow_table && !map)
3514 		goto done;
3515 
3516 	skb_reset_network_header(skb);
3517 	hash = skb_get_hash(skb);
3518 	if (!hash)
3519 		goto done;
3520 
3521 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3522 	if (flow_table && sock_flow_table) {
3523 		struct rps_dev_flow *rflow;
3524 		u32 next_cpu;
3525 		u32 ident;
3526 
3527 		/* First check into global flow table if there is a match */
3528 		/* First, check the global flow table for a match */
3529 		if ((ident ^ hash) & ~rps_cpu_mask)
3530 			goto try_rps;
3531 
3532 		next_cpu = ident & rps_cpu_mask;
3533 
3534 		/* OK, now we know there is a match,
3535 		 * we can look at the local (per receive queue) flow table
3536 		 */
3537 		rflow = &flow_table->flows[hash & flow_table->mask];
3538 		tcpu = rflow->cpu;
3539 
3540 		/*
3541 		 * If the desired CPU (where last recvmsg was done) is
3542 		 * different from current CPU (one in the rx-queue flow
3543 		 * table entry), switch if one of the following holds:
3544 		 *   - Current CPU is unset (>= nr_cpu_ids).
3545 		 *   - Current CPU is offline.
3546 		 *   - The current CPU's queue tail has advanced beyond the
3547 		 *     last packet that was enqueued using this table entry.
3548 		 *     This guarantees that all previous packets for the flow
3549 		 *     have been dequeued, thus preserving in order delivery.
3550 		 */
3551 		if (unlikely(tcpu != next_cpu) &&
3552 		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3553 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3554 		      rflow->last_qtail)) >= 0)) {
3555 			tcpu = next_cpu;
3556 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3557 		}
3558 
3559 		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3560 			*rflowp = rflow;
3561 			cpu = tcpu;
3562 			goto done;
3563 		}
3564 	}
3565 
3566 try_rps:
3567 
3568 	if (map) {
3569 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3570 		if (cpu_online(tcpu)) {
3571 			cpu = tcpu;
3572 			goto done;
3573 		}
3574 	}
3575 
3576 done:
3577 	return cpu;
3578 }
3579 
3580 #ifdef CONFIG_RFS_ACCEL
3581 
3582 /**
3583  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3584  * @dev: Device on which the filter was set
3585  * @rxq_index: RX queue index
3586  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3587  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3588  *
3589  * Drivers that implement ndo_rx_flow_steer() should periodically call
3590  * this function for each installed filter and remove the filters for
3591  * which it returns %true.
3592  */
3593 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3594 			 u32 flow_id, u16 filter_id)
3595 {
3596 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3597 	struct rps_dev_flow_table *flow_table;
3598 	struct rps_dev_flow *rflow;
3599 	bool expire = true;
3600 	unsigned int cpu;
3601 
3602 	rcu_read_lock();
3603 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3604 	if (flow_table && flow_id <= flow_table->mask) {
3605 		rflow = &flow_table->flows[flow_id];
3606 		cpu = ACCESS_ONCE(rflow->cpu);
3607 		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3608 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3609 			   rflow->last_qtail) <
3610 		     (int)(10 * flow_table->mask)))
3611 			expire = false;
3612 	}
3613 	rcu_read_unlock();
3614 	return expire;
3615 }
3616 EXPORT_SYMBOL(rps_may_expire_flow);
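
/*
 * Illustrative sketch (assumed driver-private bookkeeping): a driver
 * implementing ndo_rx_flow_steer() typically remembers each installed
 * filter and scans its table periodically, removing the entries that
 * rps_may_expire_flow() reports as stale.  struct example_filter and
 * remove_hw_filter() are assumptions made only for this sketch.
 */
struct example_filter {
	u32 flow_id;
	u16 rxq_index;
	u16 filter_id;
	bool in_use;
};

static void example_expire_filters(struct net_device *dev,
				   struct example_filter *tbl, int n,
				   void (*remove_hw_filter)(struct net_device *,
							    u16 filter_id))
{
	int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].in_use)
			continue;

		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id)) {
			remove_hw_filter(dev, tbl[i].filter_id);
			tbl[i].in_use = false;
		}
	}
}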
3617 
3618 #endif /* CONFIG_RFS_ACCEL */
3619 
3620 /* Called from hardirq (IPI) context */
3621 static void rps_trigger_softirq(void *data)
3622 {
3623 	struct softnet_data *sd = data;
3624 
3625 	____napi_schedule(sd, &sd->backlog);
3626 	sd->received_rps++;
3627 }
3628 
3629 #endif /* CONFIG_RPS */
3630 
3631 /*
3632  * Check if this softnet_data structure belongs to another CPU.
3633  * If yes, queue it to our IPI list and return 1.
3634  * If no, return 0.
3635  */
3636 static int rps_ipi_queued(struct softnet_data *sd)
3637 {
3638 #ifdef CONFIG_RPS
3639 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3640 
3641 	if (sd != mysd) {
3642 		sd->rps_ipi_next = mysd->rps_ipi_list;
3643 		mysd->rps_ipi_list = sd;
3644 
3645 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3646 		return 1;
3647 	}
3648 #endif /* CONFIG_RPS */
3649 	return 0;
3650 }
3651 
3652 #ifdef CONFIG_NET_FLOW_LIMIT
3653 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3654 #endif
3655 
3656 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3657 {
3658 #ifdef CONFIG_NET_FLOW_LIMIT
3659 	struct sd_flow_limit *fl;
3660 	struct softnet_data *sd;
3661 	unsigned int old_flow, new_flow;
3662 
3663 	if (qlen < (netdev_max_backlog >> 1))
3664 		return false;
3665 
3666 	sd = this_cpu_ptr(&softnet_data);
3667 
3668 	rcu_read_lock();
3669 	fl = rcu_dereference(sd->flow_limit);
3670 	if (fl) {
3671 		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3672 		old_flow = fl->history[fl->history_head];
3673 		fl->history[fl->history_head] = new_flow;
3674 
3675 		fl->history_head++;
3676 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3677 
3678 		if (likely(fl->buckets[old_flow]))
3679 			fl->buckets[old_flow]--;
3680 
3681 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3682 			fl->count++;
3683 			rcu_read_unlock();
3684 			return true;
3685 		}
3686 	}
3687 	rcu_read_unlock();
3688 #endif
3689 	return false;
3690 }
3691 
3692 /*
3693  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3694  * queue (may be a remote CPU queue).
3695  */
3696 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3697 			      unsigned int *qtail)
3698 {
3699 	struct softnet_data *sd;
3700 	unsigned long flags;
3701 	unsigned int qlen;
3702 
3703 	sd = &per_cpu(softnet_data, cpu);
3704 
3705 	local_irq_save(flags);
3706 
3707 	rps_lock(sd);
3708 	if (!netif_running(skb->dev))
3709 		goto drop;
3710 	qlen = skb_queue_len(&sd->input_pkt_queue);
3711 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3712 		if (qlen) {
3713 enqueue:
3714 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3715 			input_queue_tail_incr_save(sd, qtail);
3716 			rps_unlock(sd);
3717 			local_irq_restore(flags);
3718 			return NET_RX_SUCCESS;
3719 		}
3720 
3721 		/* Schedule NAPI for the backlog device.
3722 		 * We can use a non-atomic operation since we own the queue lock.
3723 		 */
3724 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3725 			if (!rps_ipi_queued(sd))
3726 				____napi_schedule(sd, &sd->backlog);
3727 		}
3728 		goto enqueue;
3729 	}
3730 
3731 drop:
3732 	sd->dropped++;
3733 	rps_unlock(sd);
3734 
3735 	local_irq_restore(flags);
3736 
3737 	atomic_long_inc(&skb->dev->rx_dropped);
3738 	kfree_skb(skb);
3739 	return NET_RX_DROP;
3740 }
3741 
3742 static int netif_rx_internal(struct sk_buff *skb)
3743 {
3744 	int ret;
3745 
3746 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3747 
3748 	trace_netif_rx(skb);
3749 #ifdef CONFIG_RPS
3750 	if (static_key_false(&rps_needed)) {
3751 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3752 		int cpu;
3753 
3754 		preempt_disable();
3755 		rcu_read_lock();
3756 
3757 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3758 		if (cpu < 0)
3759 			cpu = smp_processor_id();
3760 
3761 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3762 
3763 		rcu_read_unlock();
3764 		preempt_enable();
3765 	} else
3766 #endif
3767 	{
3768 		unsigned int qtail;
3769 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3770 		put_cpu();
3771 	}
3772 	return ret;
3773 }
3774 
3775 /**
3776  *	netif_rx	-	post buffer to the network code
3777  *	@skb: buffer to post
3778  *
3779  *	This function receives a packet from a device driver and queues it for
3780  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3781  *	may be dropped during processing for congestion control or by the
3782  *	protocol layers.
3783  *
3784  *	return values:
3785  *	NET_RX_SUCCESS	(no congestion)
3786  *	NET_RX_DROP     (packet was dropped)
3787  *
3788  */
3789 
3790 int netif_rx(struct sk_buff *skb)
3791 {
3792 	trace_netif_rx_entry(skb);
3793 
3794 	return netif_rx_internal(skb);
3795 }
3796 EXPORT_SYMBOL(netif_rx);
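
/* Illustrative sketch (not built as part of this file): how a simple,
 * non-NAPI driver might feed a received frame to netif_rx() from its
 * interrupt handler.  The example_* helpers and the frame length source
 * are hypothetical driver details.
 */
#if 0
static irqreturn_t example_rx_interrupt(int irq, void *dev_id)
{
	struct net_device *dev = dev_id;
	struct sk_buff *skb;
	unsigned int len = example_read_rx_len(dev);	/* hypothetical */

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return IRQ_HANDLED;
	}

	example_copy_rx_data(dev, skb_put(skb, len));	/* hypothetical */
	skb->protocol = eth_type_trans(skb, dev);

	/* Queue on the per-CPU backlog; NET_RX_DROP only reports congestion. */
	netif_rx(skb);

	return IRQ_HANDLED;
}
#endif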
3797 
3798 int netif_rx_ni(struct sk_buff *skb)
3799 {
3800 	int err;
3801 
3802 	trace_netif_rx_ni_entry(skb);
3803 
3804 	preempt_disable();
3805 	err = netif_rx_internal(skb);
3806 	if (local_softirq_pending())
3807 		do_softirq();
3808 	preempt_enable();
3809 
3810 	return err;
3811 }
3812 EXPORT_SYMBOL(netif_rx_ni);
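
/* Illustrative sketch (not built as part of this file): a tun-style virtual
 * device injecting a frame built from a userspace write(), which runs in
 * process context and therefore uses netif_rx_ni() so that pending softirqs
 * are run before returning.  example_build_skb_from_iter() is hypothetical.
 */
#if 0
static ssize_t example_chr_write(struct net_device *dev, struct iov_iter *from)
{
	struct sk_buff *skb;
	size_t len;

	skb = example_build_skb_from_iter(dev, from);	/* hypothetical */
	if (IS_ERR(skb))
		return PTR_ERR(skb);

	len = skb->len;
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx_ni(skb);	/* skb may be freed once this returns */

	return len;
}
#endif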
3813 
3814 static __latent_entropy void net_tx_action(struct softirq_action *h)
3815 {
3816 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3817 
3818 	if (sd->completion_queue) {
3819 		struct sk_buff *clist;
3820 
3821 		local_irq_disable();
3822 		clist = sd->completion_queue;
3823 		sd->completion_queue = NULL;
3824 		local_irq_enable();
3825 
3826 		while (clist) {
3827 			struct sk_buff *skb = clist;
3828 			clist = clist->next;
3829 
3830 			WARN_ON(atomic_read(&skb->users));
3831 			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3832 				trace_consume_skb(skb);
3833 			else
3834 				trace_kfree_skb(skb, net_tx_action);
3835 
3836 			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3837 				__kfree_skb(skb);
3838 			else
3839 				__kfree_skb_defer(skb);
3840 		}
3841 
3842 		__kfree_skb_flush();
3843 	}
3844 
3845 	if (sd->output_queue) {
3846 		struct Qdisc *head;
3847 
3848 		local_irq_disable();
3849 		head = sd->output_queue;
3850 		sd->output_queue = NULL;
3851 		sd->output_queue_tailp = &sd->output_queue;
3852 		local_irq_enable();
3853 
3854 		while (head) {
3855 			struct Qdisc *q = head;
3856 			spinlock_t *root_lock;
3857 
3858 			head = head->next_sched;
3859 
3860 			root_lock = qdisc_lock(q);
3861 			spin_lock(root_lock);
3862 			/* We need to make sure head->next_sched is read
3863 			 * before clearing __QDISC_STATE_SCHED
3864 			 */
3865 			smp_mb__before_atomic();
3866 			clear_bit(__QDISC_STATE_SCHED, &q->state);
3867 			qdisc_run(q);
3868 			spin_unlock(root_lock);
3869 		}
3870 	}
3871 }
3872 
3873 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3874 /* This hook is defined here for ATM LANE */
3875 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3876 			     unsigned char *addr) __read_mostly;
3877 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3878 #endif
3879 
3880 static inline struct sk_buff *
3881 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3882 		   struct net_device *orig_dev)
3883 {
3884 #ifdef CONFIG_NET_CLS_ACT
3885 	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3886 	struct tcf_result cl_res;
3887 
3888 	/* If there's at least one ingress qdisc present somewhere (which is
3889 	 * why we got here via the enabled static key), devices that are
3890 	 * not configured with an ingress qdisc will bail
3891 	 * out here.
3892 	 */
3893 	if (!cl)
3894 		return skb;
3895 	if (*pt_prev) {
3896 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3897 		*pt_prev = NULL;
3898 	}
3899 
3900 	qdisc_skb_cb(skb)->pkt_len = skb->len;
3901 	skb->tc_at_ingress = 1;
3902 	qdisc_bstats_cpu_update(cl->q, skb);
3903 
3904 	switch (tc_classify(skb, cl, &cl_res, false)) {
3905 	case TC_ACT_OK:
3906 	case TC_ACT_RECLASSIFY:
3907 		skb->tc_index = TC_H_MIN(cl_res.classid);
3908 		break;
3909 	case TC_ACT_SHOT:
3910 		qdisc_qstats_cpu_drop(cl->q);
3911 		kfree_skb(skb);
3912 		return NULL;
3913 	case TC_ACT_STOLEN:
3914 	case TC_ACT_QUEUED:
3915 		consume_skb(skb);
3916 		return NULL;
3917 	case TC_ACT_REDIRECT:
3918 		/* skb_mac_header check was done by cls/act_bpf, so
3919 		 * we can safely push the L2 header back before
3920 		 * redirecting to another netdev
3921 		 */
3922 		__skb_push(skb, skb->mac_len);
3923 		skb_do_redirect(skb);
3924 		return NULL;
3925 	default:
3926 		break;
3927 	}
3928 #endif /* CONFIG_NET_CLS_ACT */
3929 	return skb;
3930 }
3931 
3932 /**
3933  *	netdev_is_rx_handler_busy - check if receive handler is registered
3934  *	@dev: device to check
3935  *
3936  *	Check if a receive handler is already registered for a given device.
3937  *	Return true if there is one.
3938  *
3939  *	The caller must hold the rtnl_mutex.
3940  */
3941 bool netdev_is_rx_handler_busy(struct net_device *dev)
3942 {
3943 	ASSERT_RTNL();
3944 	return dev && rtnl_dereference(dev->rx_handler);
3945 }
3946 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3947 
3948 /**
3949  *	netdev_rx_handler_register - register receive handler
3950  *	@dev: device to register a handler for
3951  *	@rx_handler: receive handler to register
3952  *	@rx_handler_data: data pointer that is used by rx handler
3953  *
3954  *	Register a receive handler for a device. This handler will then be
3955  *	called from __netif_receive_skb. A negative errno code is returned
3956  *	on a failure.
3957  *
3958  *	The caller must hold the rtnl_mutex.
3959  *
3960  *	For a general description of rx_handler, see enum rx_handler_result.
3961  */
3962 int netdev_rx_handler_register(struct net_device *dev,
3963 			       rx_handler_func_t *rx_handler,
3964 			       void *rx_handler_data)
3965 {
3966 	if (netdev_is_rx_handler_busy(dev))
3967 		return -EBUSY;
3968 
3969 	/* Note: rx_handler_data must be set before rx_handler */
3970 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3971 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3972 
3973 	return 0;
3974 }
3975 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
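
/* Illustrative sketch (not built as part of this file): how an upper device
 * (bridge/macvlan style) might claim frames arriving on a lower device.
 * "struct example_port" and the helper that decides whether the frame
 * belongs to the port are hypothetical.
 */
#if 0
struct example_port {
	struct net_device *upper_dev;
};

static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct example_port *port =
		rcu_dereference(skb->dev->rx_handler_data);

	if (!example_port_owns_frame(port, skb))	/* hypothetical */
		return RX_HANDLER_PASS;		/* normal delivery continues */

	skb->dev = port->upper_dev;
	return RX_HANDLER_ANOTHER;	/* re-run RX processing on the upper dev */
}

static int example_port_attach(struct net_device *lower,
			       struct example_port *port)
{
	ASSERT_RTNL();
	return netdev_rx_handler_register(lower, example_handle_frame, port);
	/* teardown later uses netdev_rx_handler_unregister(lower) */
}
#endif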
3976 
3977 /**
3978  *	netdev_rx_handler_unregister - unregister receive handler
3979  *	@dev: device to unregister a handler from
3980  *
3981  *	Unregister a receive handler from a device.
3982  *
3983  *	The caller must hold the rtnl_mutex.
3984  */
3985 void netdev_rx_handler_unregister(struct net_device *dev)
3986 {
3987 
3988 	ASSERT_RTNL();
3989 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3990 	/* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
3991 	 * section is guaranteed to see a non-NULL rx_handler_data
3992 	 * as well.
3993 	 */
3994 	synchronize_net();
3995 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3996 }
3997 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3998 
3999 /*
4000  * Limit the use of PFMEMALLOC reserves to those protocols that implement
4001  * the special handling of PFMEMALLOC skbs.
4002  */
4003 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4004 {
4005 	switch (skb->protocol) {
4006 	case htons(ETH_P_ARP):
4007 	case htons(ETH_P_IP):
4008 	case htons(ETH_P_IPV6):
4009 	case htons(ETH_P_8021Q):
4010 	case htons(ETH_P_8021AD):
4011 		return true;
4012 	default:
4013 		return false;
4014 	}
4015 }
4016 
4017 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4018 			     int *ret, struct net_device *orig_dev)
4019 {
4020 #ifdef CONFIG_NETFILTER_INGRESS
4021 	if (nf_hook_ingress_active(skb)) {
4022 		int ingress_retval;
4023 
4024 		if (*pt_prev) {
4025 			*ret = deliver_skb(skb, *pt_prev, orig_dev);
4026 			*pt_prev = NULL;
4027 		}
4028 
4029 		rcu_read_lock();
4030 		ingress_retval = nf_hook_ingress(skb);
4031 		rcu_read_unlock();
4032 		return ingress_retval;
4033 	}
4034 #endif /* CONFIG_NETFILTER_INGRESS */
4035 	return 0;
4036 }
4037 
4038 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4039 {
4040 	struct packet_type *ptype, *pt_prev;
4041 	rx_handler_func_t *rx_handler;
4042 	struct net_device *orig_dev;
4043 	bool deliver_exact = false;
4044 	int ret = NET_RX_DROP;
4045 	__be16 type;
4046 
4047 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
4048 
4049 	trace_netif_receive_skb(skb);
4050 
4051 	orig_dev = skb->dev;
4052 
4053 	skb_reset_network_header(skb);
4054 	if (!skb_transport_header_was_set(skb))
4055 		skb_reset_transport_header(skb);
4056 	skb_reset_mac_len(skb);
4057 
4058 	pt_prev = NULL;
4059 
4060 another_round:
4061 	skb->skb_iif = skb->dev->ifindex;
4062 
4063 	__this_cpu_inc(softnet_data.processed);
4064 
4065 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4066 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4067 		skb = skb_vlan_untag(skb);
4068 		if (unlikely(!skb))
4069 			goto out;
4070 	}
4071 
4072 	if (skb_skip_tc_classify(skb))
4073 		goto skip_classify;
4074 
4075 	if (pfmemalloc)
4076 		goto skip_taps;
4077 
4078 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
4079 		if (pt_prev)
4080 			ret = deliver_skb(skb, pt_prev, orig_dev);
4081 		pt_prev = ptype;
4082 	}
4083 
4084 	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4085 		if (pt_prev)
4086 			ret = deliver_skb(skb, pt_prev, orig_dev);
4087 		pt_prev = ptype;
4088 	}
4089 
4090 skip_taps:
4091 #ifdef CONFIG_NET_INGRESS
4092 	if (static_key_false(&ingress_needed)) {
4093 		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4094 		if (!skb)
4095 			goto out;
4096 
4097 		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4098 			goto out;
4099 	}
4100 #endif
4101 	skb_reset_tc(skb);
4102 skip_classify:
4103 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4104 		goto drop;
4105 
4106 	if (skb_vlan_tag_present(skb)) {
4107 		if (pt_prev) {
4108 			ret = deliver_skb(skb, pt_prev, orig_dev);
4109 			pt_prev = NULL;
4110 		}
4111 		if (vlan_do_receive(&skb))
4112 			goto another_round;
4113 		else if (unlikely(!skb))
4114 			goto out;
4115 	}
4116 
4117 	rx_handler = rcu_dereference(skb->dev->rx_handler);
4118 	if (rx_handler) {
4119 		if (pt_prev) {
4120 			ret = deliver_skb(skb, pt_prev, orig_dev);
4121 			pt_prev = NULL;
4122 		}
4123 		switch (rx_handler(&skb)) {
4124 		case RX_HANDLER_CONSUMED:
4125 			ret = NET_RX_SUCCESS;
4126 			goto out;
4127 		case RX_HANDLER_ANOTHER:
4128 			goto another_round;
4129 		case RX_HANDLER_EXACT:
4130 			deliver_exact = true;
4131 		case RX_HANDLER_PASS:
4132 			break;
4133 		default:
4134 			BUG();
4135 		}
4136 	}
4137 
4138 	if (unlikely(skb_vlan_tag_present(skb))) {
4139 		if (skb_vlan_tag_get_id(skb))
4140 			skb->pkt_type = PACKET_OTHERHOST;
4141 		/* Note: we might in the future use prio bits
4142 		 * and set skb->priority like in vlan_do_receive()
4143 		 * For the time being, just ignore Priority Code Point
4144 		 */
4145 		skb->vlan_tci = 0;
4146 	}
4147 
4148 	type = skb->protocol;
4149 
4150 	/* deliver only exact match when indicated */
4151 	if (likely(!deliver_exact)) {
4152 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4153 				       &ptype_base[ntohs(type) &
4154 						   PTYPE_HASH_MASK]);
4155 	}
4156 
4157 	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4158 			       &orig_dev->ptype_specific);
4159 
4160 	if (unlikely(skb->dev != orig_dev)) {
4161 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4162 				       &skb->dev->ptype_specific);
4163 	}
4164 
4165 	if (pt_prev) {
4166 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4167 			goto drop;
4168 		else
4169 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4170 	} else {
4171 drop:
4172 		if (!deliver_exact)
4173 			atomic_long_inc(&skb->dev->rx_dropped);
4174 		else
4175 			atomic_long_inc(&skb->dev->rx_nohandler);
4176 		kfree_skb(skb);
4177 		/* Jamal, now you will not be able to escape explaining
4178 		 * to me how you were going to use this. :-)
4179 		 */
4180 		ret = NET_RX_DROP;
4181 	}
4182 
4183 out:
4184 	return ret;
4185 }
4186 
4187 static int __netif_receive_skb(struct sk_buff *skb)
4188 {
4189 	int ret;
4190 
4191 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4192 		unsigned long pflags = current->flags;
4193 
4194 		/*
4195 		 * PFMEMALLOC skbs are special, they should
4196 		 * - be delivered to SOCK_MEMALLOC sockets only
4197 		 * - stay away from userspace
4198 		 * - have bounded memory usage
4199 		 *
4200 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4201 		 * context down to all allocation sites.
4202 		 */
4203 		current->flags |= PF_MEMALLOC;
4204 		ret = __netif_receive_skb_core(skb, true);
4205 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
4206 	} else
4207 		ret = __netif_receive_skb_core(skb, false);
4208 
4209 	return ret;
4210 }
4211 
4212 static int netif_receive_skb_internal(struct sk_buff *skb)
4213 {
4214 	int ret;
4215 
4216 	net_timestamp_check(netdev_tstamp_prequeue, skb);
4217 
4218 	if (skb_defer_rx_timestamp(skb))
4219 		return NET_RX_SUCCESS;
4220 
4221 	rcu_read_lock();
4222 
4223 #ifdef CONFIG_RPS
4224 	if (static_key_false(&rps_needed)) {
4225 		struct rps_dev_flow voidflow, *rflow = &voidflow;
4226 		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4227 
4228 		if (cpu >= 0) {
4229 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4230 			rcu_read_unlock();
4231 			return ret;
4232 		}
4233 	}
4234 #endif
4235 	ret = __netif_receive_skb(skb);
4236 	rcu_read_unlock();
4237 	return ret;
4238 }
4239 
4240 /**
4241  *	netif_receive_skb - process receive buffer from network
4242  *	@skb: buffer to process
4243  *
4244  *	netif_receive_skb() is the main receive data processing function.
4245  *	It always succeeds. The buffer may be dropped during processing
4246  *	for congestion control or by the protocol layers.
4247  *
4248  *	This function may only be called from softirq context and interrupts
4249  *	should be enabled.
4250  *
4251  *	Return values (usually ignored):
4252  *	NET_RX_SUCCESS: no congestion
4253  *	NET_RX_DROP: packet was dropped
4254  */
4255 int netif_receive_skb(struct sk_buff *skb)
4256 {
4257 	trace_netif_receive_skb_entry(skb);
4258 
4259 	return netif_receive_skb_internal(skb);
4260 }
4261 EXPORT_SYMBOL(netif_receive_skb);
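
/* Illustrative sketch (not built as part of this file): delivering a frame
 * synchronously from a NAPI poll handler (softirq context, interrupts
 * enabled) without going through GRO.  example_next_completed_skb() is a
 * hypothetical descriptor-ring helper.
 */
#if 0
static int example_poll_rx_one(struct napi_struct *napi)
{
	struct sk_buff *skb = example_next_completed_skb(napi);	/* hypothetical */

	if (!skb)
		return 0;

	skb->protocol = eth_type_trans(skb, napi->dev);
	netif_receive_skb(skb);	/* fully processed before this call returns */

	return 1;
}
#endif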
4262 
4263 DEFINE_PER_CPU(struct work_struct, flush_works);
4264 
4265 /* Network device is going away, flush any packets still pending */
4266 static void flush_backlog(struct work_struct *work)
4267 {
4268 	struct sk_buff *skb, *tmp;
4269 	struct softnet_data *sd;
4270 
4271 	local_bh_disable();
4272 	sd = this_cpu_ptr(&softnet_data);
4273 
4274 	local_irq_disable();
4275 	rps_lock(sd);
4276 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4277 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4278 			__skb_unlink(skb, &sd->input_pkt_queue);
4279 			kfree_skb(skb);
4280 			input_queue_head_incr(sd);
4281 		}
4282 	}
4283 	rps_unlock(sd);
4284 	local_irq_enable();
4285 
4286 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4287 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4288 			__skb_unlink(skb, &sd->process_queue);
4289 			kfree_skb(skb);
4290 			input_queue_head_incr(sd);
4291 		}
4292 	}
4293 	local_bh_enable();
4294 }
4295 
4296 static void flush_all_backlogs(void)
4297 {
4298 	unsigned int cpu;
4299 
4300 	get_online_cpus();
4301 
4302 	for_each_online_cpu(cpu)
4303 		queue_work_on(cpu, system_highpri_wq,
4304 			      per_cpu_ptr(&flush_works, cpu));
4305 
4306 	for_each_online_cpu(cpu)
4307 		flush_work(per_cpu_ptr(&flush_works, cpu));
4308 
4309 	put_online_cpus();
4310 }
4311 
4312 static int napi_gro_complete(struct sk_buff *skb)
4313 {
4314 	struct packet_offload *ptype;
4315 	__be16 type = skb->protocol;
4316 	struct list_head *head = &offload_base;
4317 	int err = -ENOENT;
4318 
4319 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4320 
4321 	if (NAPI_GRO_CB(skb)->count == 1) {
4322 		skb_shinfo(skb)->gso_size = 0;
4323 		goto out;
4324 	}
4325 
4326 	rcu_read_lock();
4327 	list_for_each_entry_rcu(ptype, head, list) {
4328 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4329 			continue;
4330 
4331 		err = ptype->callbacks.gro_complete(skb, 0);
4332 		break;
4333 	}
4334 	rcu_read_unlock();
4335 
4336 	if (err) {
4337 		WARN_ON(&ptype->list == head);
4338 		kfree_skb(skb);
4339 		return NET_RX_SUCCESS;
4340 	}
4341 
4342 out:
4343 	return netif_receive_skb_internal(skb);
4344 }
4345 
4346 /* napi->gro_list contains packets ordered by age, with the
4347  * youngest packets at the head of it.
4348  * Complete skbs in reverse order to reduce latencies.
4349  */
4350 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4351 {
4352 	struct sk_buff *skb, *prev = NULL;
4353 
4354 	/* scan list and build reverse chain */
4355 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4356 		skb->prev = prev;
4357 		prev = skb;
4358 	}
4359 
4360 	for (skb = prev; skb; skb = prev) {
4361 		skb->next = NULL;
4362 
4363 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4364 			return;
4365 
4366 		prev = skb->prev;
4367 		napi_gro_complete(skb);
4368 		napi->gro_count--;
4369 	}
4370 
4371 	napi->gro_list = NULL;
4372 }
4373 EXPORT_SYMBOL(napi_gro_flush);
4374 
4375 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4376 {
4377 	struct sk_buff *p;
4378 	unsigned int maclen = skb->dev->hard_header_len;
4379 	u32 hash = skb_get_hash_raw(skb);
4380 
4381 	for (p = napi->gro_list; p; p = p->next) {
4382 		unsigned long diffs;
4383 
4384 		NAPI_GRO_CB(p)->flush = 0;
4385 
4386 		if (hash != skb_get_hash_raw(p)) {
4387 			NAPI_GRO_CB(p)->same_flow = 0;
4388 			continue;
4389 		}
4390 
4391 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4392 		diffs |= p->vlan_tci ^ skb->vlan_tci;
4393 		diffs |= skb_metadata_dst_cmp(p, skb);
4394 		if (maclen == ETH_HLEN)
4395 			diffs |= compare_ether_header(skb_mac_header(p),
4396 						      skb_mac_header(skb));
4397 		else if (!diffs)
4398 			diffs = memcmp(skb_mac_header(p),
4399 				       skb_mac_header(skb),
4400 				       maclen);
4401 		NAPI_GRO_CB(p)->same_flow = !diffs;
4402 	}
4403 }
4404 
4405 static void skb_gro_reset_offset(struct sk_buff *skb)
4406 {
4407 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4408 	const skb_frag_t *frag0 = &pinfo->frags[0];
4409 
4410 	NAPI_GRO_CB(skb)->data_offset = 0;
4411 	NAPI_GRO_CB(skb)->frag0 = NULL;
4412 	NAPI_GRO_CB(skb)->frag0_len = 0;
4413 
4414 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4415 	    pinfo->nr_frags &&
4416 	    !PageHighMem(skb_frag_page(frag0))) {
4417 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4418 		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4419 						    skb_frag_size(frag0),
4420 						    skb->end - skb->tail);
4421 	}
4422 }
4423 
4424 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4425 {
4426 	struct skb_shared_info *pinfo = skb_shinfo(skb);
4427 
4428 	BUG_ON(skb->end - skb->tail < grow);
4429 
4430 	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4431 
4432 	skb->data_len -= grow;
4433 	skb->tail += grow;
4434 
4435 	pinfo->frags[0].page_offset += grow;
4436 	skb_frag_size_sub(&pinfo->frags[0], grow);
4437 
4438 	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4439 		skb_frag_unref(skb, 0);
4440 		memmove(pinfo->frags, pinfo->frags + 1,
4441 			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4442 	}
4443 }
4444 
4445 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4446 {
4447 	struct sk_buff **pp = NULL;
4448 	struct packet_offload *ptype;
4449 	__be16 type = skb->protocol;
4450 	struct list_head *head = &offload_base;
4451 	int same_flow;
4452 	enum gro_result ret;
4453 	int grow;
4454 
4455 	if (!(skb->dev->features & NETIF_F_GRO))
4456 		goto normal;
4457 
4458 	if (skb->csum_bad)
4459 		goto normal;
4460 
4461 	gro_list_prepare(napi, skb);
4462 
4463 	rcu_read_lock();
4464 	list_for_each_entry_rcu(ptype, head, list) {
4465 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4466 			continue;
4467 
4468 		skb_set_network_header(skb, skb_gro_offset(skb));
4469 		skb_reset_mac_len(skb);
4470 		NAPI_GRO_CB(skb)->same_flow = 0;
4471 		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4472 		NAPI_GRO_CB(skb)->free = 0;
4473 		NAPI_GRO_CB(skb)->encap_mark = 0;
4474 		NAPI_GRO_CB(skb)->recursion_counter = 0;
4475 		NAPI_GRO_CB(skb)->is_fou = 0;
4476 		NAPI_GRO_CB(skb)->is_atomic = 1;
4477 		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4478 
4479 		/* Setup for GRO checksum validation */
4480 		switch (skb->ip_summed) {
4481 		case CHECKSUM_COMPLETE:
4482 			NAPI_GRO_CB(skb)->csum = skb->csum;
4483 			NAPI_GRO_CB(skb)->csum_valid = 1;
4484 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4485 			break;
4486 		case CHECKSUM_UNNECESSARY:
4487 			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4488 			NAPI_GRO_CB(skb)->csum_valid = 0;
4489 			break;
4490 		default:
4491 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4492 			NAPI_GRO_CB(skb)->csum_valid = 0;
4493 		}
4494 
4495 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4496 		break;
4497 	}
4498 	rcu_read_unlock();
4499 
4500 	if (&ptype->list == head)
4501 		goto normal;
4502 
4503 	same_flow = NAPI_GRO_CB(skb)->same_flow;
4504 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4505 
4506 	if (pp) {
4507 		struct sk_buff *nskb = *pp;
4508 
4509 		*pp = nskb->next;
4510 		nskb->next = NULL;
4511 		napi_gro_complete(nskb);
4512 		napi->gro_count--;
4513 	}
4514 
4515 	if (same_flow)
4516 		goto ok;
4517 
4518 	if (NAPI_GRO_CB(skb)->flush)
4519 		goto normal;
4520 
4521 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4522 		struct sk_buff *nskb = napi->gro_list;
4523 
4524 		/* locate the end of the list to select the 'oldest' flow */
4525 		while (nskb->next) {
4526 			pp = &nskb->next;
4527 			nskb = *pp;
4528 		}
4529 		*pp = NULL;
4530 		nskb->next = NULL;
4531 		napi_gro_complete(nskb);
4532 	} else {
4533 		napi->gro_count++;
4534 	}
4535 	NAPI_GRO_CB(skb)->count = 1;
4536 	NAPI_GRO_CB(skb)->age = jiffies;
4537 	NAPI_GRO_CB(skb)->last = skb;
4538 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4539 	skb->next = napi->gro_list;
4540 	napi->gro_list = skb;
4541 	ret = GRO_HELD;
4542 
4543 pull:
4544 	grow = skb_gro_offset(skb) - skb_headlen(skb);
4545 	if (grow > 0)
4546 		gro_pull_from_frag0(skb, grow);
4547 ok:
4548 	return ret;
4549 
4550 normal:
4551 	ret = GRO_NORMAL;
4552 	goto pull;
4553 }
4554 
4555 struct packet_offload *gro_find_receive_by_type(__be16 type)
4556 {
4557 	struct list_head *offload_head = &offload_base;
4558 	struct packet_offload *ptype;
4559 
4560 	list_for_each_entry_rcu(ptype, offload_head, list) {
4561 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4562 			continue;
4563 		return ptype;
4564 	}
4565 	return NULL;
4566 }
4567 EXPORT_SYMBOL(gro_find_receive_by_type);
4568 
4569 struct packet_offload *gro_find_complete_by_type(__be16 type)
4570 {
4571 	struct list_head *offload_head = &offload_base;
4572 	struct packet_offload *ptype;
4573 
4574 	list_for_each_entry_rcu(ptype, offload_head, list) {
4575 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4576 			continue;
4577 		return ptype;
4578 	}
4579 	return NULL;
4580 }
4581 EXPORT_SYMBOL(gro_find_complete_by_type);
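
/* Illustrative sketch (not built as part of this file): how an
 * encapsulation's gro_complete callback might use
 * gro_find_complete_by_type() to hand off to the inner protocol's offload
 * (gro_find_receive_by_type() plays the matching role on the receive side).
 * The fixed 4-byte tunnel header and the hard-coded inner protocol are
 * assumptions made only for the sketch.
 */
#if 0
static int example_tnl_gro_complete(struct sk_buff *skb, int nhoff)
{
	struct packet_offload *ptype;
	__be16 inner_proto = htons(ETH_P_IP);	/* assumption */
	int err = -ENOENT;

	rcu_read_lock();
	ptype = gro_find_complete_by_type(inner_proto);
	if (ptype)
		err = ptype->callbacks.gro_complete(skb, nhoff + 4);
	rcu_read_unlock();

	return err;
}
#endif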
4582 
4583 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4584 {
4585 	switch (ret) {
4586 	case GRO_NORMAL:
4587 		if (netif_receive_skb_internal(skb))
4588 			ret = GRO_DROP;
4589 		break;
4590 
4591 	case GRO_DROP:
4592 		kfree_skb(skb);
4593 		break;
4594 
4595 	case GRO_MERGED_FREE:
4596 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4597 			skb_dst_drop(skb);
4598 			secpath_reset(skb);
4599 			kmem_cache_free(skbuff_head_cache, skb);
4600 		} else {
4601 			__kfree_skb(skb);
4602 		}
4603 		break;
4604 
4605 	case GRO_HELD:
4606 	case GRO_MERGED:
4607 		break;
4608 	}
4609 
4610 	return ret;
4611 }
4612 
4613 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4614 {
4615 	skb_mark_napi_id(skb, napi);
4616 	trace_napi_gro_receive_entry(skb);
4617 
4618 	skb_gro_reset_offset(skb);
4619 
4620 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4621 }
4622 EXPORT_SYMBOL(napi_gro_receive);
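
/* Illustrative sketch (not built as part of this file): the RX loop of a
 * NAPI driver handing completed frames to GRO.  example_next_completed_skb()
 * is a hypothetical descriptor-ring helper.
 */
#if 0
static int example_clean_rx(struct napi_struct *napi, int budget)
{
	struct sk_buff *skb;
	int done = 0;

	while (done < budget &&
	       (skb = example_next_completed_skb(napi)) != NULL) {	/* hypothetical */
		skb->protocol = eth_type_trans(skb, napi->dev);
		napi_gro_receive(napi, skb);	/* may merge, hold or deliver */
		done++;
	}

	return done;
}
#endif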
4623 
4624 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4625 {
4626 	if (unlikely(skb->pfmemalloc)) {
4627 		consume_skb(skb);
4628 		return;
4629 	}
4630 	__skb_pull(skb, skb_headlen(skb));
4631 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4632 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4633 	skb->vlan_tci = 0;
4634 	skb->dev = napi->dev;
4635 	skb->skb_iif = 0;
4636 	skb->encapsulation = 0;
4637 	skb_shinfo(skb)->gso_type = 0;
4638 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4639 	secpath_reset(skb);
4640 
4641 	napi->skb = skb;
4642 }
4643 
4644 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4645 {
4646 	struct sk_buff *skb = napi->skb;
4647 
4648 	if (!skb) {
4649 		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4650 		if (skb) {
4651 			napi->skb = skb;
4652 			skb_mark_napi_id(skb, napi);
4653 		}
4654 	}
4655 	return skb;
4656 }
4657 EXPORT_SYMBOL(napi_get_frags);
4658 
4659 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4660 				      struct sk_buff *skb,
4661 				      gro_result_t ret)
4662 {
4663 	switch (ret) {
4664 	case GRO_NORMAL:
4665 	case GRO_HELD:
4666 		__skb_push(skb, ETH_HLEN);
4667 		skb->protocol = eth_type_trans(skb, skb->dev);
4668 		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4669 			ret = GRO_DROP;
4670 		break;
4671 
4672 	case GRO_DROP:
4673 	case GRO_MERGED_FREE:
4674 		napi_reuse_skb(napi, skb);
4675 		break;
4676 
4677 	case GRO_MERGED:
4678 		break;
4679 	}
4680 
4681 	return ret;
4682 }
4683 
4684 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4685  * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4686  * we copy the Ethernet header into skb->data to have a common layout.
4687  */
4688 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4689 {
4690 	struct sk_buff *skb = napi->skb;
4691 	const struct ethhdr *eth;
4692 	unsigned int hlen = sizeof(*eth);
4693 
4694 	napi->skb = NULL;
4695 
4696 	skb_reset_mac_header(skb);
4697 	skb_gro_reset_offset(skb);
4698 
4699 	eth = skb_gro_header_fast(skb, 0);
4700 	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4701 		eth = skb_gro_header_slow(skb, hlen, 0);
4702 		if (unlikely(!eth)) {
4703 			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4704 					     __func__, napi->dev->name);
4705 			napi_reuse_skb(napi, skb);
4706 			return NULL;
4707 		}
4708 	} else {
4709 		gro_pull_from_frag0(skb, hlen);
4710 		NAPI_GRO_CB(skb)->frag0 += hlen;
4711 		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4712 	}
4713 	__skb_pull(skb, hlen);
4714 
4715 	/*
4716 	 * This works because the only protocols we care about don't require
4717 	 * special handling.
4718 	 * We'll fix it up properly in napi_frags_finish()
4719 	 */
4720 	skb->protocol = eth->h_proto;
4721 
4722 	return skb;
4723 }
4724 
4725 gro_result_t napi_gro_frags(struct napi_struct *napi)
4726 {
4727 	struct sk_buff *skb = napi_frags_skb(napi);
4728 
4729 	if (!skb)
4730 		return GRO_DROP;
4731 
4732 	trace_napi_gro_frags_entry(skb);
4733 
4734 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4735 }
4736 EXPORT_SYMBOL(napi_gro_frags);
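
/* Illustrative sketch (not built as part of this file): a driver that
 * receives into pages rather than linear skbs.  The page, offset and length
 * arguments stand in for hypothetical RX descriptor fields; PAGE_SIZE is
 * used as the truesize only for simplicity.
 */
#if 0
static void example_rx_page(struct napi_struct *napi, struct page *page,
			    unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb)) {
		put_page(page);
		return;
	}

	/* Attach the payload (including the Ethernet header) as frag 0;
	 * napi_gro_frags() pulls the header into the linear area itself.
	 */
	skb_add_rx_frag(skb, 0, page, offset, len, PAGE_SIZE);

	napi_gro_frags(napi);
}
#endif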
4737 
4738 /* Compute the checksum from gro_offset and return the folded value
4739  * after adding in any pseudo checksum.
4740  */
4741 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4742 {
4743 	__wsum wsum;
4744 	__sum16 sum;
4745 
4746 	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4747 
4748 	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4749 	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4750 	if (likely(!sum)) {
4751 		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4752 		    !skb->csum_complete_sw)
4753 			netdev_rx_csum_fault(skb->dev);
4754 	}
4755 
4756 	NAPI_GRO_CB(skb)->csum = wsum;
4757 	NAPI_GRO_CB(skb)->csum_valid = 1;
4758 
4759 	return sum;
4760 }
4761 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4762 
4763 /*
4764  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4765  * Note: called with local irq disabled, but exits with local irq enabled.
4766  */
4767 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4768 {
4769 #ifdef CONFIG_RPS
4770 	struct softnet_data *remsd = sd->rps_ipi_list;
4771 
4772 	if (remsd) {
4773 		sd->rps_ipi_list = NULL;
4774 
4775 		local_irq_enable();
4776 
4777 		/* Send pending IPI's to kick RPS processing on remote cpus. */
4778 		while (remsd) {
4779 			struct softnet_data *next = remsd->rps_ipi_next;
4780 
4781 			if (cpu_online(remsd->cpu))
4782 				smp_call_function_single_async(remsd->cpu,
4783 							   &remsd->csd);
4784 			remsd = next;
4785 		}
4786 	} else
4787 #endif
4788 		local_irq_enable();
4789 }
4790 
4791 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4792 {
4793 #ifdef CONFIG_RPS
4794 	return sd->rps_ipi_list != NULL;
4795 #else
4796 	return false;
4797 #endif
4798 }
4799 
4800 static int process_backlog(struct napi_struct *napi, int quota)
4801 {
4802 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4803 	bool again = true;
4804 	int work = 0;
4805 
4806 	/* Check if we have pending IPIs; it's better to send them now
4807 	 * rather than waiting for net_rx_action() to end.
4808 	 */
4809 	if (sd_has_rps_ipi_waiting(sd)) {
4810 		local_irq_disable();
4811 		net_rps_action_and_irq_enable(sd);
4812 	}
4813 
4814 	napi->weight = dev_rx_weight;
4815 	while (again) {
4816 		struct sk_buff *skb;
4817 
4818 		while ((skb = __skb_dequeue(&sd->process_queue))) {
4819 			rcu_read_lock();
4820 			__netif_receive_skb(skb);
4821 			rcu_read_unlock();
4822 			input_queue_head_incr(sd);
4823 			if (++work >= quota)
4824 				return work;
4825 
4826 		}
4827 
4828 		local_irq_disable();
4829 		rps_lock(sd);
4830 		if (skb_queue_empty(&sd->input_pkt_queue)) {
4831 			/*
4832 			 * Inline a custom version of __napi_complete().
4833 			 * Only the current cpu owns and manipulates this napi,
4834 			 * and NAPI_STATE_SCHED is the only possible flag set
4835 			 * on backlog.
4836 			 * We can use a plain write instead of clear_bit(),
4837 			 * and we don't need an smp_mb() memory barrier.
4838 			 */
4839 			napi->state = 0;
4840 			again = false;
4841 		} else {
4842 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
4843 						   &sd->process_queue);
4844 		}
4845 		rps_unlock(sd);
4846 		local_irq_enable();
4847 	}
4848 
4849 	return work;
4850 }
4851 
4852 /**
4853  * __napi_schedule - schedule for receive
4854  * @n: entry to schedule
4855  *
4856  * The entry's receive function will be scheduled to run.
4857  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4858  */
4859 void __napi_schedule(struct napi_struct *n)
4860 {
4861 	unsigned long flags;
4862 
4863 	local_irq_save(flags);
4864 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4865 	local_irq_restore(flags);
4866 }
4867 EXPORT_SYMBOL(__napi_schedule);
4868 
4869 /**
4870  * __napi_schedule_irqoff - schedule for receive
4871  * @n: entry to schedule
4872  *
4873  * Variant of __napi_schedule() assuming hard irqs are masked
4874  */
4875 void __napi_schedule_irqoff(struct napi_struct *n)
4876 {
4877 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4878 }
4879 EXPORT_SYMBOL(__napi_schedule_irqoff);
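
/* Illustrative sketch (not built as part of this file): a hard-IRQ handler
 * masking the device's RX interrupt and scheduling NAPI.  Because hard irqs
 * are already disabled here, the _irqoff variant may be used.
 * example_disable_rx_irq() is hypothetical.
 */
#if 0
static irqreturn_t example_msix_rx(int irq, void *data)
{
	struct napi_struct *napi = data;

	example_disable_rx_irq(napi->dev);	/* hypothetical */

	if (napi_schedule_prep(napi))
		__napi_schedule_irqoff(napi);

	return IRQ_HANDLED;
}
#endif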
4880 
4881 bool napi_complete_done(struct napi_struct *n, int work_done)
4882 {
4883 	unsigned long flags;
4884 
4885 	/*
4886 	 * 1) Don't let napi dequeue from the cpu poll list
4887 	 *    just in case it's running on a different cpu.
4888 	 * 2) If we are busy polling, do nothing here, we have
4889 	 *    the guarantee we will be called later.
4890 	 */
4891 	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4892 				 NAPIF_STATE_IN_BUSY_POLL)))
4893 		return false;
4894 
4895 	if (n->gro_list) {
4896 		unsigned long timeout = 0;
4897 
4898 		if (work_done)
4899 			timeout = n->dev->gro_flush_timeout;
4900 
4901 		if (timeout)
4902 			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4903 				      HRTIMER_MODE_REL_PINNED);
4904 		else
4905 			napi_gro_flush(n, false);
4906 	}
4907 	if (unlikely(!list_empty(&n->poll_list))) {
4908 		/* If n->poll_list is not empty, we need to mask irqs */
4909 		local_irq_save(flags);
4910 		list_del_init(&n->poll_list);
4911 		local_irq_restore(flags);
4912 	}
4913 	WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4914 	return true;
4915 }
4916 EXPORT_SYMBOL(napi_complete_done);
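
/* Illustrative sketch (not built as part of this file): the canonical tail
 * of a NAPI ->poll() callback.  Only when the budget was not exhausted is
 * the instance completed and the device interrupt re-enabled.
 * example_clean_rx() and example_enable_rx_irq() are hypothetical.
 */
#if 0
static int example_poll(struct napi_struct *napi, int budget)
{
	int work = example_clean_rx(napi, budget);	/* hypothetical RX loop */

	if (work < budget && napi_complete_done(napi, work))
		example_enable_rx_irq(napi->dev);	/* hypothetical */

	return work;
}
#endif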
4917 
4918 /* must be called under rcu_read_lock(), as we don't take a reference */
4919 static struct napi_struct *napi_by_id(unsigned int napi_id)
4920 {
4921 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4922 	struct napi_struct *napi;
4923 
4924 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4925 		if (napi->napi_id == napi_id)
4926 			return napi;
4927 
4928 	return NULL;
4929 }
4930 
4931 #if defined(CONFIG_NET_RX_BUSY_POLL)
4932 
4933 #define BUSY_POLL_BUDGET 8
4934 
4935 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
4936 {
4937 	int rc;
4938 
4939 	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
4940 
4941 	local_bh_disable();
4942 
4943 	/* All we really want here is to re-enable device interrupts.
4944 	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
4945 	 */
4946 	rc = napi->poll(napi, BUSY_POLL_BUDGET);
4947 	netpoll_poll_unlock(have_poll_lock);
4948 	if (rc == BUSY_POLL_BUDGET)
4949 		__napi_schedule(napi);
4950 	local_bh_enable();
4951 	if (local_softirq_pending())
4952 		do_softirq();
4953 }
4954 
4955 bool sk_busy_loop(struct sock *sk, int nonblock)
4956 {
4957 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4958 	int (*napi_poll)(struct napi_struct *napi, int budget);
4959 	void *have_poll_lock = NULL;
4960 	struct napi_struct *napi;
4961 	int rc;
4962 
4963 restart:
4964 	rc = false;
4965 	napi_poll = NULL;
4966 
4967 	rcu_read_lock();
4968 
4969 	napi = napi_by_id(sk->sk_napi_id);
4970 	if (!napi)
4971 		goto out;
4972 
4973 	preempt_disable();
4974 	for (;;) {
4975 		rc = 0;
4976 		local_bh_disable();
4977 		if (!napi_poll) {
4978 			unsigned long val = READ_ONCE(napi->state);
4979 
4980 			/* If multiple threads are competing for this napi,
4981 			 * we avoid dirtying napi->state as much as we can.
4982 			 */
4983 			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
4984 				   NAPIF_STATE_IN_BUSY_POLL))
4985 				goto count;
4986 			if (cmpxchg(&napi->state, val,
4987 				    val | NAPIF_STATE_IN_BUSY_POLL |
4988 					  NAPIF_STATE_SCHED) != val)
4989 				goto count;
4990 			have_poll_lock = netpoll_poll_lock(napi);
4991 			napi_poll = napi->poll;
4992 		}
4993 		rc = napi_poll(napi, BUSY_POLL_BUDGET);
4994 		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
4995 count:
4996 		if (rc > 0)
4997 			__NET_ADD_STATS(sock_net(sk),
4998 					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
4999 		local_bh_enable();
5000 
5001 		if (rc == LL_FLUSH_FAILED)
5002 			break; /* permanent failure */
5003 
5004 		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5005 		    busy_loop_timeout(end_time))
5006 			break;
5007 
5008 		if (unlikely(need_resched())) {
5009 			if (napi_poll)
5010 				busy_poll_stop(napi, have_poll_lock);
5011 			preempt_enable();
5012 			rcu_read_unlock();
5013 			cond_resched();
5014 			rc = !skb_queue_empty(&sk->sk_receive_queue);
5015 			if (rc || busy_loop_timeout(end_time))
5016 				return rc;
5017 			goto restart;
5018 		}
5019 		cpu_relax();
5020 	}
5021 	if (napi_poll)
5022 		busy_poll_stop(napi, have_poll_lock);
5023 	preempt_enable();
5024 	rc = !skb_queue_empty(&sk->sk_receive_queue);
5025 out:
5026 	rcu_read_unlock();
5027 	return rc;
5028 }
5029 EXPORT_SYMBOL(sk_busy_loop);
5030 
5031 #endif /* CONFIG_NET_RX_BUSY_POLL */
5032 
5033 static void napi_hash_add(struct napi_struct *napi)
5034 {
5035 	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5036 	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5037 		return;
5038 
5039 	spin_lock(&napi_hash_lock);
5040 
5041 	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5042 	do {
5043 		if (unlikely(++napi_gen_id < NR_CPUS + 1))
5044 			napi_gen_id = NR_CPUS + 1;
5045 	} while (napi_by_id(napi_gen_id));
5046 	napi->napi_id = napi_gen_id;
5047 
5048 	hlist_add_head_rcu(&napi->napi_hash_node,
5049 			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5050 
5051 	spin_unlock(&napi_hash_lock);
5052 }
5053 
5054 /* Warning: the caller is responsible for making sure an rcu grace period
5055  * is respected before freeing the memory containing @napi
5056  */
5057 bool napi_hash_del(struct napi_struct *napi)
5058 {
5059 	bool rcu_sync_needed = false;
5060 
5061 	spin_lock(&napi_hash_lock);
5062 
5063 	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5064 		rcu_sync_needed = true;
5065 		hlist_del_rcu(&napi->napi_hash_node);
5066 	}
5067 	spin_unlock(&napi_hash_lock);
5068 	return rcu_sync_needed;
5069 }
5070 EXPORT_SYMBOL_GPL(napi_hash_del);
5071 
5072 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5073 {
5074 	struct napi_struct *napi;
5075 
5076 	napi = container_of(timer, struct napi_struct, timer);
5077 	if (napi->gro_list)
5078 		napi_schedule(napi);
5079 
5080 	return HRTIMER_NORESTART;
5081 }
5082 
5083 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5084 		    int (*poll)(struct napi_struct *, int), int weight)
5085 {
5086 	INIT_LIST_HEAD(&napi->poll_list);
5087 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5088 	napi->timer.function = napi_watchdog;
5089 	napi->gro_count = 0;
5090 	napi->gro_list = NULL;
5091 	napi->skb = NULL;
5092 	napi->poll = poll;
5093 	if (weight > NAPI_POLL_WEIGHT)
5094 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5095 			    weight, dev->name);
5096 	napi->weight = weight;
5097 	list_add(&napi->dev_list, &dev->napi_list);
5098 	napi->dev = dev;
5099 #ifdef CONFIG_NETPOLL
5100 	napi->poll_owner = -1;
5101 #endif
5102 	set_bit(NAPI_STATE_SCHED, &napi->state);
5103 	napi_hash_add(napi);
5104 }
5105 EXPORT_SYMBOL(netif_napi_add);
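
/* Illustrative sketch (not built as part of this file): typical NAPI wiring
 * for a single-queue driver.  example_poll() is a hypothetical poll callback
 * of the usual (napi, budget) form; the split between probe and ndo_open is
 * indicated by comments only.
 */
#if 0
static void example_setup_napi(struct net_device *dev, struct napi_struct *napi)
{
	/* in the probe path, before register_netdev() */
	netif_napi_add(dev, napi, example_poll, NAPI_POLL_WEIGHT);

	/* later, in ndo_open() */
	napi_enable(napi);

	/* and in ndo_stop()/remove: napi_disable(napi); netif_napi_del(napi); */
}
#endif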
5106 
5107 void napi_disable(struct napi_struct *n)
5108 {
5109 	might_sleep();
5110 	set_bit(NAPI_STATE_DISABLE, &n->state);
5111 
5112 	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5113 		msleep(1);
5114 	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5115 		msleep(1);
5116 
5117 	hrtimer_cancel(&n->timer);
5118 
5119 	clear_bit(NAPI_STATE_DISABLE, &n->state);
5120 }
5121 EXPORT_SYMBOL(napi_disable);
5122 
5123 /* Must be called in process context */
5124 void netif_napi_del(struct napi_struct *napi)
5125 {
5126 	might_sleep();
5127 	if (napi_hash_del(napi))
5128 		synchronize_net();
5129 	list_del_init(&napi->dev_list);
5130 	napi_free_frags(napi);
5131 
5132 	kfree_skb_list(napi->gro_list);
5133 	napi->gro_list = NULL;
5134 	napi->gro_count = 0;
5135 }
5136 EXPORT_SYMBOL(netif_napi_del);
5137 
5138 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5139 {
5140 	void *have;
5141 	int work, weight;
5142 
5143 	list_del_init(&n->poll_list);
5144 
5145 	have = netpoll_poll_lock(n);
5146 
5147 	weight = n->weight;
5148 
5149 	/* This NAPI_STATE_SCHED test is for avoiding a race
5150 	 * with netpoll's poll_napi().  Only the entity which
5151 	 * obtains the lock and sees NAPI_STATE_SCHED set will
5152 	 * actually make the ->poll() call.  Therefore we avoid
5153 	 * accidentally calling ->poll() when NAPI is not scheduled.
5154 	 */
5155 	work = 0;
5156 	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5157 		work = n->poll(n, weight);
5158 		trace_napi_poll(n, work, weight);
5159 	}
5160 
5161 	WARN_ON_ONCE(work > weight);
5162 
5163 	if (likely(work < weight))
5164 		goto out_unlock;
5165 
5166 	/* Drivers must not modify the NAPI state if they
5167 	 * consume the entire weight.  In such cases this code
5168 	 * still "owns" the NAPI instance and therefore can
5169 	 * move the instance around on the list at-will.
5170 	 */
5171 	if (unlikely(napi_disable_pending(n))) {
5172 		napi_complete(n);
5173 		goto out_unlock;
5174 	}
5175 
5176 	if (n->gro_list) {
5177 		/* flush too old packets
5178 		 * If HZ < 1000, flush all packets.
5179 		 */
5180 		napi_gro_flush(n, HZ >= 1000);
5181 	}
5182 
5183 	/* Some drivers may have called napi_schedule
5184 	 * prior to exhausting their budget.
5185 	 */
5186 	if (unlikely(!list_empty(&n->poll_list))) {
5187 		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5188 			     n->dev ? n->dev->name : "backlog");
5189 		goto out_unlock;
5190 	}
5191 
5192 	list_add_tail(&n->poll_list, repoll);
5193 
5194 out_unlock:
5195 	netpoll_poll_unlock(have);
5196 
5197 	return work;
5198 }
5199 
5200 static __latent_entropy void net_rx_action(struct softirq_action *h)
5201 {
5202 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5203 	unsigned long time_limit = jiffies + 2;
5204 	int budget = netdev_budget;
5205 	LIST_HEAD(list);
5206 	LIST_HEAD(repoll);
5207 
5208 	local_irq_disable();
5209 	list_splice_init(&sd->poll_list, &list);
5210 	local_irq_enable();
5211 
5212 	for (;;) {
5213 		struct napi_struct *n;
5214 
5215 		if (list_empty(&list)) {
5216 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5217 				goto out;
5218 			break;
5219 		}
5220 
5221 		n = list_first_entry(&list, struct napi_struct, poll_list);
5222 		budget -= napi_poll(n, &repoll);
5223 
5224 		/* If softirq window is exhausted then punt.
5225 		 * Allow this to run for 2 jiffies, which will allow
5226 		 * an average latency of 1.5/HZ.
5227 		 */
5228 		if (unlikely(budget <= 0 ||
5229 			     time_after_eq(jiffies, time_limit))) {
5230 			sd->time_squeeze++;
5231 			break;
5232 		}
5233 	}
5234 
5235 	local_irq_disable();
5236 
5237 	list_splice_tail_init(&sd->poll_list, &list);
5238 	list_splice_tail(&repoll, &list);
5239 	list_splice(&list, &sd->poll_list);
5240 	if (!list_empty(&sd->poll_list))
5241 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
5242 
5243 	net_rps_action_and_irq_enable(sd);
5244 out:
5245 	__kfree_skb_flush();
5246 }
5247 
5248 struct netdev_adjacent {
5249 	struct net_device *dev;
5250 
5251 	/* upper master flag, there can only be one master device per list */
5252 	bool master;
5253 
5254 	/* counter for the number of times this device was added to us */
5255 	u16 ref_nr;
5256 
5257 	/* private field for the users */
5258 	void *private;
5259 
5260 	struct list_head list;
5261 	struct rcu_head rcu;
5262 };
5263 
5264 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5265 						 struct list_head *adj_list)
5266 {
5267 	struct netdev_adjacent *adj;
5268 
5269 	list_for_each_entry(adj, adj_list, list) {
5270 		if (adj->dev == adj_dev)
5271 			return adj;
5272 	}
5273 	return NULL;
5274 }
5275 
5276 static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
5277 {
5278 	struct net_device *dev = data;
5279 
5280 	return upper_dev == dev;
5281 }
5282 
5283 /**
5284  * netdev_has_upper_dev - Check if device is linked to an upper device
5285  * @dev: device
5286  * @upper_dev: upper device to check
5287  *
5288  * Find out if a device is linked to the specified upper device and return
5289  * true in case it is. Note that this checks the entire chain of upper
5290  * devices, not only the immediate one. The caller must hold the RTNL lock.
5291  */
5292 bool netdev_has_upper_dev(struct net_device *dev,
5293 			  struct net_device *upper_dev)
5294 {
5295 	ASSERT_RTNL();
5296 
5297 	return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5298 					     upper_dev);
5299 }
5300 EXPORT_SYMBOL(netdev_has_upper_dev);
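
/* Illustrative sketch (not built as part of this file): a driver refusing to
 * link two devices that are already stacked, to avoid creating a loop.  Both
 * net_device arguments are hypothetical.
 */
#if 0
static int example_can_link(struct net_device *upper, struct net_device *lower)
{
	ASSERT_RTNL();

	if (netdev_has_upper_dev(lower, upper) ||
	    netdev_has_upper_dev(upper, lower))
		return -EBUSY;	/* already part of the same stack */

	return 0;
}
#endif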
5301 
5302 /**
5303  * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
5304  * @dev: device
5305  * @upper_dev: upper device to check
5306  *
5307  * Find out if a device is linked to specified upper device and return true
5308  * in case it is. Note that this checks the entire upper device chain.
5309  * The caller must hold the RCU read lock.
5310  */
5311 
5312 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5313 				  struct net_device *upper_dev)
5314 {
5315 	return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5316 					       upper_dev);
5317 }
5318 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5319 
5320 /**
5321  * netdev_has_any_upper_dev - Check if device is linked to some device
5322  * @dev: device
5323  *
5324  * Find out if a device is linked to an upper device and return true in case
5325  * it is. The caller must hold the RTNL lock.
5326  */
5327 static bool netdev_has_any_upper_dev(struct net_device *dev)
5328 {
5329 	ASSERT_RTNL();
5330 
5331 	return !list_empty(&dev->adj_list.upper);
5332 }
5333 
5334 /**
5335  * netdev_master_upper_dev_get - Get master upper device
5336  * @dev: device
5337  *
5338  * Find a master upper device and return pointer to it or NULL in case
5339  * it's not there. The caller must hold the RTNL lock.
5340  */
5341 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5342 {
5343 	struct netdev_adjacent *upper;
5344 
5345 	ASSERT_RTNL();
5346 
5347 	if (list_empty(&dev->adj_list.upper))
5348 		return NULL;
5349 
5350 	upper = list_first_entry(&dev->adj_list.upper,
5351 				 struct netdev_adjacent, list);
5352 	if (likely(upper->master))
5353 		return upper->dev;
5354 	return NULL;
5355 }
5356 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5357 
5358 /**
5359  * netdev_has_any_lower_dev - Check if device is linked to some device
5360  * @dev: device
5361  *
5362  * Find out if a device is linked to a lower device and return true in case
5363  * it is. The caller must hold the RTNL lock.
5364  */
5365 static bool netdev_has_any_lower_dev(struct net_device *dev)
5366 {
5367 	ASSERT_RTNL();
5368 
5369 	return !list_empty(&dev->adj_list.lower);
5370 }
5371 
5372 void *netdev_adjacent_get_private(struct list_head *adj_list)
5373 {
5374 	struct netdev_adjacent *adj;
5375 
5376 	adj = list_entry(adj_list, struct netdev_adjacent, list);
5377 
5378 	return adj->private;
5379 }
5380 EXPORT_SYMBOL(netdev_adjacent_get_private);
5381 
5382 /**
5383  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5384  * @dev: device
5385  * @iter: list_head ** of the current position
5386  *
5387  * Gets the next device from the dev's upper list, starting from iter
5388  * position. The caller must hold RCU read lock.
5389  */
5390 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5391 						 struct list_head **iter)
5392 {
5393 	struct netdev_adjacent *upper;
5394 
5395 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5396 
5397 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5398 
5399 	if (&upper->list == &dev->adj_list.upper)
5400 		return NULL;
5401 
5402 	*iter = &upper->list;
5403 
5404 	return upper->dev;
5405 }
5406 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5407 
5408 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5409 						    struct list_head **iter)
5410 {
5411 	struct netdev_adjacent *upper;
5412 
5413 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5414 
5415 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5416 
5417 	if (&upper->list == &dev->adj_list.upper)
5418 		return NULL;
5419 
5420 	*iter = &upper->list;
5421 
5422 	return upper->dev;
5423 }
5424 
5425 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5426 				  int (*fn)(struct net_device *dev,
5427 					    void *data),
5428 				  void *data)
5429 {
5430 	struct net_device *udev;
5431 	struct list_head *iter;
5432 	int ret;
5433 
5434 	for (iter = &dev->adj_list.upper,
5435 	     udev = netdev_next_upper_dev_rcu(dev, &iter);
5436 	     udev;
5437 	     udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5438 		/* first is the upper device itself */
5439 		ret = fn(udev, data);
5440 		if (ret)
5441 			return ret;
5442 
5443 		/* then look at all of its upper devices */
5444 		ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5445 		if (ret)
5446 			return ret;
5447 	}
5448 
5449 	return 0;
5450 }
5451 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
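
/* Illustrative sketch (not built as part of this file): counting the devices
 * stacked above @dev (a device reachable via more than one path may be
 * visited more than once).  The callback matches the fn/data pair taken by
 * the walker; returning non-zero from it would stop the walk early.
 */
#if 0
static int example_count_upper(struct net_device *upper_dev, void *data)
{
	unsigned int *count = data;

	(*count)++;
	return 0;
}

static unsigned int example_upper_count(struct net_device *dev)
{
	unsigned int count = 0;

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, example_count_upper, &count);
	rcu_read_unlock();

	return count;
}
#endif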
5452 
5453 /**
5454  * netdev_lower_get_next_private - Get the next ->private from the
5455  *				   lower neighbour list
5456  * @dev: device
5457  * @iter: list_head ** of the current position
5458  *
5459  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5460  * list, starting from iter position. The caller must either hold the
5461  * RTNL lock or its own locking that guarantees that the neighbour lower
5462  * list will remain unchanged.
5463  */
5464 void *netdev_lower_get_next_private(struct net_device *dev,
5465 				    struct list_head **iter)
5466 {
5467 	struct netdev_adjacent *lower;
5468 
5469 	lower = list_entry(*iter, struct netdev_adjacent, list);
5470 
5471 	if (&lower->list == &dev->adj_list.lower)
5472 		return NULL;
5473 
5474 	*iter = lower->list.next;
5475 
5476 	return lower->private;
5477 }
5478 EXPORT_SYMBOL(netdev_lower_get_next_private);
5479 
5480 /**
5481  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5482  *				       lower neighbour list, RCU
5483  *				       variant
5484  * @dev: device
5485  * @iter: list_head ** of the current position
5486  *
5487  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5488  * list, starting from iter position. The caller must hold RCU read lock.
5489  */
5490 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5491 					struct list_head **iter)
5492 {
5493 	struct netdev_adjacent *lower;
5494 
5495 	WARN_ON_ONCE(!rcu_read_lock_held());
5496 
5497 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5498 
5499 	if (&lower->list == &dev->adj_list.lower)
5500 		return NULL;
5501 
5502 	*iter = &lower->list;
5503 
5504 	return lower->private;
5505 }
5506 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5507 
5508 /**
5509  * netdev_lower_get_next - Get the next device from the lower neighbour
5510  *                         list
5511  * @dev: device
5512  * @iter: list_head ** of the current position
5513  *
5514  * Gets the next netdev_adjacent from the dev's lower neighbour
5515  * list, starting from iter position. The caller must hold RTNL lock or
5516  * its own locking that guarantees that the neighbour lower
5517  * list will remain unchanged.
5518  */
5519 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5520 {
5521 	struct netdev_adjacent *lower;
5522 
5523 	lower = list_entry(*iter, struct netdev_adjacent, list);
5524 
5525 	if (&lower->list == &dev->adj_list.lower)
5526 		return NULL;
5527 
5528 	*iter = lower->list.next;
5529 
5530 	return lower->dev;
5531 }
5532 EXPORT_SYMBOL(netdev_lower_get_next);
5533 
5534 static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5535 						struct list_head **iter)
5536 {
5537 	struct netdev_adjacent *lower;
5538 
5539 	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5540 
5541 	if (&lower->list == &dev->adj_list.lower)
5542 		return NULL;
5543 
5544 	*iter = &lower->list;
5545 
5546 	return lower->dev;
5547 }
5548 
5549 int netdev_walk_all_lower_dev(struct net_device *dev,
5550 			      int (*fn)(struct net_device *dev,
5551 					void *data),
5552 			      void *data)
5553 {
5554 	struct net_device *ldev;
5555 	struct list_head *iter;
5556 	int ret;
5557 
5558 	for (iter = &dev->adj_list.lower,
5559 	     ldev = netdev_next_lower_dev(dev, &iter);
5560 	     ldev;
5561 	     ldev = netdev_next_lower_dev(dev, &iter)) {
5562 		/* first is the lower device itself */
5563 		ret = fn(ldev, data);
5564 		if (ret)
5565 			return ret;
5566 
5567 		/* then look at all of its lower devices */
5568 		ret = netdev_walk_all_lower_dev(ldev, fn, data);
5569 		if (ret)
5570 			return ret;
5571 	}
5572 
5573 	return 0;
5574 }
5575 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5576 
5577 static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5578 						    struct list_head **iter)
5579 {
5580 	struct netdev_adjacent *lower;
5581 
5582 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5583 	if (&lower->list == &dev->adj_list.lower)
5584 		return NULL;
5585 
5586 	*iter = &lower->list;
5587 
5588 	return lower->dev;
5589 }
5590 
5591 int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5592 				  int (*fn)(struct net_device *dev,
5593 					    void *data),
5594 				  void *data)
5595 {
5596 	struct net_device *ldev;
5597 	struct list_head *iter;
5598 	int ret;
5599 
5600 	for (iter = &dev->adj_list.lower,
5601 	     ldev = netdev_next_lower_dev_rcu(dev, &iter);
5602 	     ldev;
5603 	     ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5604 		/* first is the lower device itself */
5605 		ret = fn(ldev, data);
5606 		if (ret)
5607 			return ret;
5608 
5609 		/* then look at all of its lower devices */
5610 		ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5611 		if (ret)
5612 			return ret;
5613 	}
5614 
5615 	return 0;
5616 }
5617 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5618 
5619 /**
5620  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5621  *				       lower neighbour list, RCU
5622  *				       variant
5623  * @dev: device
5624  *
5625  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5626  * list. The caller must hold RCU read lock.
5627  */
5628 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5629 {
5630 	struct netdev_adjacent *lower;
5631 
5632 	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5633 			struct netdev_adjacent, list);
5634 	if (lower)
5635 		return lower->private;
5636 	return NULL;
5637 }
5638 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5639 
5640 /**
5641  * netdev_master_upper_dev_get_rcu - Get master upper device
5642  * @dev: device
5643  *
5644  * Find a master upper device and return pointer to it or NULL in case
5645  * it's not there. The caller must hold the RCU read lock.
5646  */
5647 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5648 {
5649 	struct netdev_adjacent *upper;
5650 
5651 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5652 				       struct netdev_adjacent, list);
5653 	if (upper && likely(upper->master))
5654 		return upper->dev;
5655 	return NULL;
5656 }
5657 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5658 
5659 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5660 			      struct net_device *adj_dev,
5661 			      struct list_head *dev_list)
5662 {
5663 	char linkname[IFNAMSIZ+7];
5664 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5665 		"upper_%s" : "lower_%s", adj_dev->name);
5666 	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5667 				 linkname);
5668 }
5669 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5670 			       char *name,
5671 			       struct list_head *dev_list)
5672 {
5673 	char linkname[IFNAMSIZ+7];
5674 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5675 		"upper_%s" : "lower_%s", name);
5676 	sysfs_remove_link(&(dev->dev.kobj), linkname);
5677 }
5678 
5679 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5680 						 struct net_device *adj_dev,
5681 						 struct list_head *dev_list)
5682 {
5683 	return (dev_list == &dev->adj_list.upper ||
5684 		dev_list == &dev->adj_list.lower) &&
5685 		net_eq(dev_net(dev), dev_net(adj_dev));
5686 }
5687 
5688 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5689 					struct net_device *adj_dev,
5690 					struct list_head *dev_list,
5691 					void *private, bool master)
5692 {
5693 	struct netdev_adjacent *adj;
5694 	int ret;
5695 
5696 	adj = __netdev_find_adj(adj_dev, dev_list);
5697 
5698 	if (adj) {
5699 		adj->ref_nr += 1;
5700 		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5701 			 dev->name, adj_dev->name, adj->ref_nr);
5702 
5703 		return 0;
5704 	}
5705 
5706 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5707 	if (!adj)
5708 		return -ENOMEM;
5709 
5710 	adj->dev = adj_dev;
5711 	adj->master = master;
5712 	adj->ref_nr = 1;
5713 	adj->private = private;
5714 	dev_hold(adj_dev);
5715 
5716 	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5717 		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5718 
5719 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5720 		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5721 		if (ret)
5722 			goto free_adj;
5723 	}
5724 
5725 	/* Ensure that master link is always the first item in list. */
5726 	if (master) {
5727 		ret = sysfs_create_link(&(dev->dev.kobj),
5728 					&(adj_dev->dev.kobj), "master");
5729 		if (ret)
5730 			goto remove_symlinks;
5731 
5732 		list_add_rcu(&adj->list, dev_list);
5733 	} else {
5734 		list_add_tail_rcu(&adj->list, dev_list);
5735 	}
5736 
5737 	return 0;
5738 
5739 remove_symlinks:
5740 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5741 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5742 free_adj:
5743 	kfree(adj);
5744 	dev_put(adj_dev);
5745 
5746 	return ret;
5747 }
5748 
5749 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5750 					 struct net_device *adj_dev,
5751 					 u16 ref_nr,
5752 					 struct list_head *dev_list)
5753 {
5754 	struct netdev_adjacent *adj;
5755 
5756 	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5757 		 dev->name, adj_dev->name, ref_nr);
5758 
5759 	adj = __netdev_find_adj(adj_dev, dev_list);
5760 
5761 	if (!adj) {
5762 		pr_err("Adjacency does not exist for device %s from %s\n",
5763 		       dev->name, adj_dev->name);
5764 		WARN_ON(1);
5765 		return;
5766 	}
5767 
5768 	if (adj->ref_nr > ref_nr) {
5769 		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5770 			 dev->name, adj_dev->name, ref_nr,
5771 			 adj->ref_nr - ref_nr);
5772 		adj->ref_nr -= ref_nr;
5773 		return;
5774 	}
5775 
5776 	if (adj->master)
5777 		sysfs_remove_link(&(dev->dev.kobj), "master");
5778 
5779 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5780 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5781 
5782 	list_del_rcu(&adj->list);
5783 	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5784 		 adj_dev->name, dev->name, adj_dev->name);
5785 	dev_put(adj_dev);
5786 	kfree_rcu(adj, rcu);
5787 }
5788 
5789 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5790 					    struct net_device *upper_dev,
5791 					    struct list_head *up_list,
5792 					    struct list_head *down_list,
5793 					    void *private, bool master)
5794 {
5795 	int ret;
5796 
5797 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5798 					   private, master);
5799 	if (ret)
5800 		return ret;
5801 
5802 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5803 					   private, false);
5804 	if (ret) {
5805 		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5806 		return ret;
5807 	}
5808 
5809 	return 0;
5810 }
5811 
5812 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5813 					       struct net_device *upper_dev,
5814 					       u16 ref_nr,
5815 					       struct list_head *up_list,
5816 					       struct list_head *down_list)
5817 {
5818 	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5819 	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5820 }
5821 
5822 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5823 						struct net_device *upper_dev,
5824 						void *private, bool master)
5825 {
5826 	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5827 						&dev->adj_list.upper,
5828 						&upper_dev->adj_list.lower,
5829 						private, master);
5830 }
5831 
5832 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5833 						   struct net_device *upper_dev)
5834 {
5835 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5836 					   &dev->adj_list.upper,
5837 					   &upper_dev->adj_list.lower);
5838 }
5839 
5840 static int __netdev_upper_dev_link(struct net_device *dev,
5841 				   struct net_device *upper_dev, bool master,
5842 				   void *upper_priv, void *upper_info)
5843 {
5844 	struct netdev_notifier_changeupper_info changeupper_info;
5845 	int ret = 0;
5846 
5847 	ASSERT_RTNL();
5848 
5849 	if (dev == upper_dev)
5850 		return -EBUSY;
5851 
5852 	/* To prevent loops, check that dev is not an upper device of upper_dev. */
5853 	if (netdev_has_upper_dev(upper_dev, dev))
5854 		return -EBUSY;
5855 
5856 	if (netdev_has_upper_dev(dev, upper_dev))
5857 		return -EEXIST;
5858 
5859 	if (master && netdev_master_upper_dev_get(dev))
5860 		return -EBUSY;
5861 
5862 	changeupper_info.upper_dev = upper_dev;
5863 	changeupper_info.master = master;
5864 	changeupper_info.linking = true;
5865 	changeupper_info.upper_info = upper_info;
5866 
5867 	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5868 					    &changeupper_info.info);
5869 	ret = notifier_to_errno(ret);
5870 	if (ret)
5871 		return ret;
5872 
5873 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5874 						   master);
5875 	if (ret)
5876 		return ret;
5877 
5878 	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5879 					    &changeupper_info.info);
5880 	ret = notifier_to_errno(ret);
5881 	if (ret)
5882 		goto rollback;
5883 
5884 	return 0;
5885 
5886 rollback:
5887 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5888 
5889 	return ret;
5890 }
5891 
5892 /**
5893  * netdev_upper_dev_link - Add a link to the upper device
5894  * @dev: device
5895  * @upper_dev: new upper device
5896  *
5897  * Adds a link to device which is upper to this one. The caller must hold
5898  * the RTNL lock. On a failure a negative errno code is returned.
5899  * On success the reference counts are adjusted and the function
5900  * returns zero.
5901  */
5902 int netdev_upper_dev_link(struct net_device *dev,
5903 			  struct net_device *upper_dev)
5904 {
5905 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5906 }
5907 EXPORT_SYMBOL(netdev_upper_dev_link);
5908 
5909 /**
5910  * netdev_master_upper_dev_link - Add a master link to the upper device
5911  * @dev: device
5912  * @upper_dev: new upper device
5913  * @upper_priv: upper device private
5914  * @upper_info: upper info to be passed down via notifier
5915  *
5916  * Adds a link to device which is upper to this one. In this case, only
5917  * one master upper device can be linked, although other non-master devices
5918  * might be linked as well. The caller must hold the RTNL lock.
5919  * On a failure a negative errno code is returned. On success the reference
5920  * counts are adjusted and the function returns zero.
5921  */
5922 int netdev_master_upper_dev_link(struct net_device *dev,
5923 				 struct net_device *upper_dev,
5924 				 void *upper_priv, void *upper_info)
5925 {
5926 	return __netdev_upper_dev_link(dev, upper_dev, true,
5927 				       upper_priv, upper_info);
5928 }
5929 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5930 
5931 /**
5932  * netdev_upper_dev_unlink - Removes a link to upper device
5933  * @dev: device
5934  * @upper_dev: upper device to unlink
5935  *
5936  * Removes a link to device which is upper to this one. The caller must hold
5937  * the RTNL lock.
5938  */
5939 void netdev_upper_dev_unlink(struct net_device *dev,
5940 			     struct net_device *upper_dev)
5941 {
5942 	struct netdev_notifier_changeupper_info changeupper_info;
5943 	ASSERT_RTNL();
5944 
5945 	changeupper_info.upper_dev = upper_dev;
5946 	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5947 	changeupper_info.linking = false;
5948 
5949 	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5950 				      &changeupper_info.info);
5951 
5952 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5953 
5954 	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5955 				      &changeupper_info.info);
5956 }
5957 EXPORT_SYMBOL(netdev_upper_dev_unlink);
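
/*
 * Example (illustrative sketch; "bond_dev" and "slave_dev" are hypothetical
 * names): a stacking driver such as a bonding-like master links a slave
 * below itself and later removes the link, both under RTNL.
 *
 *	ASSERT_RTNL();
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev, NULL, NULL);
 *	if (err)
 *		return err;
 *	...
 *	netdev_upper_dev_unlink(slave_dev, bond_dev);
 */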
5958 
5959 /**
5960  * netdev_bonding_info_change - Dispatch event about slave change
5961  * @dev: device
5962  * @bonding_info: info to dispatch
5963  *
5964  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5965  * The caller must hold the RTNL lock.
5966  */
5967 void netdev_bonding_info_change(struct net_device *dev,
5968 				struct netdev_bonding_info *bonding_info)
5969 {
5970 	struct netdev_notifier_bonding_info	info;
5971 
5972 	memcpy(&info.bonding_info, bonding_info,
5973 	       sizeof(struct netdev_bonding_info));
5974 	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5975 				      &info.info);
5976 }
5977 EXPORT_SYMBOL(netdev_bonding_info_change);
5978 
5979 static void netdev_adjacent_add_links(struct net_device *dev)
5980 {
5981 	struct netdev_adjacent *iter;
5982 
5983 	struct net *net = dev_net(dev);
5984 
5985 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5986 		if (!net_eq(net, dev_net(iter->dev)))
5987 			continue;
5988 		netdev_adjacent_sysfs_add(iter->dev, dev,
5989 					  &iter->dev->adj_list.lower);
5990 		netdev_adjacent_sysfs_add(dev, iter->dev,
5991 					  &dev->adj_list.upper);
5992 	}
5993 
5994 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5995 		if (!net_eq(net, dev_net(iter->dev)))
5996 			continue;
5997 		netdev_adjacent_sysfs_add(iter->dev, dev,
5998 					  &iter->dev->adj_list.upper);
5999 		netdev_adjacent_sysfs_add(dev, iter->dev,
6000 					  &dev->adj_list.lower);
6001 	}
6002 }
6003 
6004 static void netdev_adjacent_del_links(struct net_device *dev)
6005 {
6006 	struct netdev_adjacent *iter;
6007 
6008 	struct net *net = dev_net(dev);
6009 
6010 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6011 		if (!net_eq(net, dev_net(iter->dev)))
6012 			continue;
6013 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6014 					  &iter->dev->adj_list.lower);
6015 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6016 					  &dev->adj_list.upper);
6017 	}
6018 
6019 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6020 		if (!net_eq(net, dev_net(iter->dev)))
6021 			continue;
6022 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6023 					  &iter->dev->adj_list.upper);
6024 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6025 					  &dev->adj_list.lower);
6026 	}
6027 }
6028 
6029 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6030 {
6031 	struct netdev_adjacent *iter;
6032 
6033 	struct net *net = dev_net(dev);
6034 
6035 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6036 		if (!net_eq(net, dev_net(iter->dev)))
6037 			continue;
6038 		netdev_adjacent_sysfs_del(iter->dev, oldname,
6039 					  &iter->dev->adj_list.lower);
6040 		netdev_adjacent_sysfs_add(iter->dev, dev,
6041 					  &iter->dev->adj_list.lower);
6042 	}
6043 
6044 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6045 		if (!net_eq(net, dev_net(iter->dev)))
6046 			continue;
6047 		netdev_adjacent_sysfs_del(iter->dev, oldname,
6048 					  &iter->dev->adj_list.upper);
6049 		netdev_adjacent_sysfs_add(iter->dev, dev,
6050 					  &iter->dev->adj_list.upper);
6051 	}
6052 }
6053 
6054 void *netdev_lower_dev_get_private(struct net_device *dev,
6055 				   struct net_device *lower_dev)
6056 {
6057 	struct netdev_adjacent *lower;
6058 
6059 	if (!lower_dev)
6060 		return NULL;
6061 	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6062 	if (!lower)
6063 		return NULL;
6064 
6065 	return lower->private;
6066 }
6067 EXPORT_SYMBOL(netdev_lower_dev_get_private);
6068 
6069 
6070 int dev_get_nest_level(struct net_device *dev)
6071 {
6072 	struct net_device *lower = NULL;
6073 	struct list_head *iter;
6074 	int max_nest = -1;
6075 	int nest;
6076 
6077 	ASSERT_RTNL();
6078 
6079 	netdev_for_each_lower_dev(dev, lower, iter) {
6080 		nest = dev_get_nest_level(lower);
6081 		if (max_nest < nest)
6082 			max_nest = nest;
6083 	}
6084 
6085 	return max_nest + 1;
6086 }
6087 EXPORT_SYMBOL(dev_get_nest_level);
6088 
6089 /**
6090  * netdev_lower_state_changed - Dispatch event about lower device state change
6091  * @lower_dev: device
6092  * @lower_state_info: state to dispatch
6093  *
6094  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6095  * The caller must hold the RTNL lock.
6096  */
6097 void netdev_lower_state_changed(struct net_device *lower_dev,
6098 				void *lower_state_info)
6099 {
6100 	struct netdev_notifier_changelowerstate_info changelowerstate_info;
6101 
6102 	ASSERT_RTNL();
6103 	changelowerstate_info.lower_state_info = lower_state_info;
6104 	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6105 				      &changelowerstate_info.info);
6106 }
6107 EXPORT_SYMBOL(netdev_lower_state_changed);
6108 
6109 static void dev_change_rx_flags(struct net_device *dev, int flags)
6110 {
6111 	const struct net_device_ops *ops = dev->netdev_ops;
6112 
6113 	if (ops->ndo_change_rx_flags)
6114 		ops->ndo_change_rx_flags(dev, flags);
6115 }
6116 
6117 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6118 {
6119 	unsigned int old_flags = dev->flags;
6120 	kuid_t uid;
6121 	kgid_t gid;
6122 
6123 	ASSERT_RTNL();
6124 
6125 	dev->flags |= IFF_PROMISC;
6126 	dev->promiscuity += inc;
6127 	if (dev->promiscuity == 0) {
6128 		/*
6129 		 * Avoid overflow.
6130 		 * If inc causes an overflow, leave promisc untouched and return an error.
6131 		 */
6132 		if (inc < 0)
6133 			dev->flags &= ~IFF_PROMISC;
6134 		else {
6135 			dev->promiscuity -= inc;
6136 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6137 				dev->name);
6138 			return -EOVERFLOW;
6139 		}
6140 	}
6141 	if (dev->flags != old_flags) {
6142 		pr_info("device %s %s promiscuous mode\n",
6143 			dev->name,
6144 			dev->flags & IFF_PROMISC ? "entered" : "left");
6145 		if (audit_enabled) {
6146 			current_uid_gid(&uid, &gid);
6147 			audit_log(current->audit_context, GFP_ATOMIC,
6148 				AUDIT_ANOM_PROMISCUOUS,
6149 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6150 				dev->name, (dev->flags & IFF_PROMISC),
6151 				(old_flags & IFF_PROMISC),
6152 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6153 				from_kuid(&init_user_ns, uid),
6154 				from_kgid(&init_user_ns, gid),
6155 				audit_get_sessionid(current));
6156 		}
6157 
6158 		dev_change_rx_flags(dev, IFF_PROMISC);
6159 	}
6160 	if (notify)
6161 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6162 	return 0;
6163 }
6164 
6165 /**
6166  *	dev_set_promiscuity	- update promiscuity count on a device
6167  *	@dev: device
6168  *	@inc: modifier
6169  *
6170  *	Add or remove promiscuity from a device. While the count in the device
6171  *	remains above zero the interface remains promiscuous. Once it hits zero
6172  *	the device reverts back to normal filtering operation. A negative inc
6173  *	value is used to drop promiscuity on the device.
6174  *	Return 0 if successful or a negative errno code on error.
6175  */
6176 int dev_set_promiscuity(struct net_device *dev, int inc)
6177 {
6178 	unsigned int old_flags = dev->flags;
6179 	int err;
6180 
6181 	err = __dev_set_promiscuity(dev, inc, true);
6182 	if (err < 0)
6183 		return err;
6184 	if (dev->flags != old_flags)
6185 		dev_set_rx_mode(dev);
6186 	return err;
6187 }
6188 EXPORT_SYMBOL(dev_set_promiscuity);
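
/*
 * Example (illustrative sketch; "dev" and "err" are a caller's locals):
 * a packet-capture style user takes one promiscuity reference for as long
 * as it is active and drops it again when done; both calls are made under
 * RTNL.
 *
 *	err = dev_set_promiscuity(dev, 1);
 *	if (err)
 *		return err;
 *	...
 *	dev_set_promiscuity(dev, -1);
 */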
6189 
6190 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6191 {
6192 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6193 
6194 	ASSERT_RTNL();
6195 
6196 	dev->flags |= IFF_ALLMULTI;
6197 	dev->allmulti += inc;
6198 	if (dev->allmulti == 0) {
6199 		/*
6200 		 * Avoid overflow.
6201 		 * If inc causes an overflow, leave allmulti untouched and return an error.
6202 		 */
6203 		if (inc < 0)
6204 			dev->flags &= ~IFF_ALLMULTI;
6205 		else {
6206 			dev->allmulti -= inc;
6207 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6208 				dev->name);
6209 			return -EOVERFLOW;
6210 		}
6211 	}
6212 	if (dev->flags ^ old_flags) {
6213 		dev_change_rx_flags(dev, IFF_ALLMULTI);
6214 		dev_set_rx_mode(dev);
6215 		if (notify)
6216 			__dev_notify_flags(dev, old_flags,
6217 					   dev->gflags ^ old_gflags);
6218 	}
6219 	return 0;
6220 }
6221 
6222 /**
6223  *	dev_set_allmulti	- update allmulti count on a device
6224  *	@dev: device
6225  *	@inc: modifier
6226  *
6227  *	Add or remove reception of all multicast frames to a device. While the
6228  *	count in the device remains above zero the interface remains listening
6229  *	to all multicast frames. Once it hits zero the device reverts back to normal
6230  *	filtering operation. A negative @inc value is used to drop the counter
6231  *	when releasing a resource needing all multicasts.
6232  *	Return 0 if successful or a negative errno code on error.
6233  */
6234 
6235 int dev_set_allmulti(struct net_device *dev, int inc)
6236 {
6237 	return __dev_set_allmulti(dev, inc, true);
6238 }
6239 EXPORT_SYMBOL(dev_set_allmulti);
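
/*
 * Example (illustrative sketch; "dev" and "err" are a caller's locals):
 * a protocol that needs every multicast frame holds one allmulti reference
 * for its lifetime, under RTNL.
 *
 *	err = dev_set_allmulti(dev, 1);
 *	if (err)
 *		return err;
 *	...
 *	dev_set_allmulti(dev, -1);
 */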
6240 
6241 /*
6242  *	Upload unicast and multicast address lists to device and
6243  *	configure RX filtering. When the device doesn't support unicast
6244  *	filtering it is put in promiscuous mode while unicast addresses
6245  *	are present.
6246  */
6247 void __dev_set_rx_mode(struct net_device *dev)
6248 {
6249 	const struct net_device_ops *ops = dev->netdev_ops;
6250 
6251 	/* dev_open will call this function so the list will stay sane. */
6252 	if (!(dev->flags&IFF_UP))
6253 		return;
6254 
6255 	if (!netif_device_present(dev))
6256 		return;
6257 
6258 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6259 		/* Unicast address changes may only happen under the rtnl,
6260 		 * therefore calling __dev_set_promiscuity here is safe.
6261 		 */
6262 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6263 			__dev_set_promiscuity(dev, 1, false);
6264 			dev->uc_promisc = true;
6265 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6266 			__dev_set_promiscuity(dev, -1, false);
6267 			dev->uc_promisc = false;
6268 		}
6269 	}
6270 
6271 	if (ops->ndo_set_rx_mode)
6272 		ops->ndo_set_rx_mode(dev);
6273 }
6274 
6275 void dev_set_rx_mode(struct net_device *dev)
6276 {
6277 	netif_addr_lock_bh(dev);
6278 	__dev_set_rx_mode(dev);
6279 	netif_addr_unlock_bh(dev);
6280 }
6281 
6282 /**
6283  *	dev_get_flags - get flags reported to userspace
6284  *	@dev: device
6285  *
6286  *	Get the combination of flag bits exported through APIs to userspace.
6287  */
6288 unsigned int dev_get_flags(const struct net_device *dev)
6289 {
6290 	unsigned int flags;
6291 
6292 	flags = (dev->flags & ~(IFF_PROMISC |
6293 				IFF_ALLMULTI |
6294 				IFF_RUNNING |
6295 				IFF_LOWER_UP |
6296 				IFF_DORMANT)) |
6297 		(dev->gflags & (IFF_PROMISC |
6298 				IFF_ALLMULTI));
6299 
6300 	if (netif_running(dev)) {
6301 		if (netif_oper_up(dev))
6302 			flags |= IFF_RUNNING;
6303 		if (netif_carrier_ok(dev))
6304 			flags |= IFF_LOWER_UP;
6305 		if (netif_dormant(dev))
6306 			flags |= IFF_DORMANT;
6307 	}
6308 
6309 	return flags;
6310 }
6311 EXPORT_SYMBOL(dev_get_flags);
6312 
6313 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6314 {
6315 	unsigned int old_flags = dev->flags;
6316 	int ret;
6317 
6318 	ASSERT_RTNL();
6319 
6320 	/*
6321 	 *	Set the flags on our device.
6322 	 */
6323 
6324 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6325 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6326 			       IFF_AUTOMEDIA)) |
6327 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6328 				    IFF_ALLMULTI));
6329 
6330 	/*
6331 	 *	Load in the correct multicast list now the flags have changed.
6332 	 */
6333 
6334 	if ((old_flags ^ flags) & IFF_MULTICAST)
6335 		dev_change_rx_flags(dev, IFF_MULTICAST);
6336 
6337 	dev_set_rx_mode(dev);
6338 
6339 	/*
6340 	 *	Have we downed the interface? We handle IFF_UP ourselves
6341 	 *	according to user attempts to set it, rather than blindly
6342 	 *	setting it.
6343 	 */
6344 
6345 	ret = 0;
6346 	if ((old_flags ^ flags) & IFF_UP)
6347 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6348 
6349 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6350 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6351 		unsigned int old_flags = dev->flags;
6352 
6353 		dev->gflags ^= IFF_PROMISC;
6354 
6355 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6356 			if (dev->flags != old_flags)
6357 				dev_set_rx_mode(dev);
6358 	}
6359 
6360 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6361 	   is important. Some (broken) drivers set IFF_PROMISC when
6362 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
6363 	 */
6364 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6365 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6366 
6367 		dev->gflags ^= IFF_ALLMULTI;
6368 		__dev_set_allmulti(dev, inc, false);
6369 	}
6370 
6371 	return ret;
6372 }
6373 
6374 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6375 			unsigned int gchanges)
6376 {
6377 	unsigned int changes = dev->flags ^ old_flags;
6378 
6379 	if (gchanges)
6380 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6381 
6382 	if (changes & IFF_UP) {
6383 		if (dev->flags & IFF_UP)
6384 			call_netdevice_notifiers(NETDEV_UP, dev);
6385 		else
6386 			call_netdevice_notifiers(NETDEV_DOWN, dev);
6387 	}
6388 
6389 	if (dev->flags & IFF_UP &&
6390 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6391 		struct netdev_notifier_change_info change_info;
6392 
6393 		change_info.flags_changed = changes;
6394 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6395 					      &change_info.info);
6396 	}
6397 }
6398 
6399 /**
6400  *	dev_change_flags - change device settings
6401  *	@dev: device
6402  *	@flags: device state flags
6403  *
6404  *	Change settings on a device based on the supplied state flags. The
6405  *	flags are in the userspace exported format.
6406  */
6407 int dev_change_flags(struct net_device *dev, unsigned int flags)
6408 {
6409 	int ret;
6410 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6411 
6412 	ret = __dev_change_flags(dev, flags);
6413 	if (ret < 0)
6414 		return ret;
6415 
6416 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6417 	__dev_notify_flags(dev, old_flags, changes);
6418 	return ret;
6419 }
6420 EXPORT_SYMBOL(dev_change_flags);
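
/*
 * Example (illustrative sketch; "dev" and "err" are a caller's locals):
 * bringing an interface administratively up from kernel code by OR-ing
 * IFF_UP into the userspace-visible flags, with RTNL held.
 *
 *	unsigned int flags = dev_get_flags(dev);
 *
 *	err = dev_change_flags(dev, flags | IFF_UP);
 */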
6421 
6422 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6423 {
6424 	const struct net_device_ops *ops = dev->netdev_ops;
6425 
6426 	if (ops->ndo_change_mtu)
6427 		return ops->ndo_change_mtu(dev, new_mtu);
6428 
6429 	dev->mtu = new_mtu;
6430 	return 0;
6431 }
6432 
6433 /**
6434  *	dev_set_mtu - Change maximum transfer unit
6435  *	@dev: device
6436  *	@new_mtu: new transfer unit
6437  *
6438  *	Change the maximum transfer size of the network device.
6439  */
6440 int dev_set_mtu(struct net_device *dev, int new_mtu)
6441 {
6442 	int err, orig_mtu;
6443 
6444 	if (new_mtu == dev->mtu)
6445 		return 0;
6446 
6447 	/* MTU must be positive, and in range */
6448 	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6449 		net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6450 				    dev->name, new_mtu, dev->min_mtu);
6451 		return -EINVAL;
6452 	}
6453 
6454 	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6455 		net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6456 				    dev->name, new_mtu, dev->max_mtu);
6457 		return -EINVAL;
6458 	}
6459 
6460 	if (!netif_device_present(dev))
6461 		return -ENODEV;
6462 
6463 	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6464 	err = notifier_to_errno(err);
6465 	if (err)
6466 		return err;
6467 
6468 	orig_mtu = dev->mtu;
6469 	err = __dev_set_mtu(dev, new_mtu);
6470 
6471 	if (!err) {
6472 		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6473 		err = notifier_to_errno(err);
6474 		if (err) {
6475 			/* setting mtu back and notifying everyone again,
6476 			 * so that they have a chance to revert changes.
6477 			 */
6478 			__dev_set_mtu(dev, orig_mtu);
6479 			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6480 		}
6481 	}
6482 	return err;
6483 }
6484 EXPORT_SYMBOL(dev_set_mtu);
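
/*
 * Example (illustrative sketch; "new_mtu" is a hypothetical value chosen by
 * the caller): changing a device's MTU under RTNL and reporting a failure.
 *
 *	err = dev_set_mtu(dev, new_mtu);
 *	if (err)
 *		netdev_warn(dev, "failed to set MTU to %d: %d\n",
 *			    new_mtu, err);
 */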
6485 
6486 /**
6487  *	dev_set_group - Change group this device belongs to
6488  *	@dev: device
6489  *	@new_group: group this device should belong to
6490  */
6491 void dev_set_group(struct net_device *dev, int new_group)
6492 {
6493 	dev->group = new_group;
6494 }
6495 EXPORT_SYMBOL(dev_set_group);
6496 
6497 /**
6498  *	dev_set_mac_address - Change Media Access Control Address
6499  *	@dev: device
6500  *	@sa: new address
6501  *
6502  *	Change the hardware (MAC) address of the device
6503  */
6504 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6505 {
6506 	const struct net_device_ops *ops = dev->netdev_ops;
6507 	int err;
6508 
6509 	if (!ops->ndo_set_mac_address)
6510 		return -EOPNOTSUPP;
6511 	if (sa->sa_family != dev->type)
6512 		return -EINVAL;
6513 	if (!netif_device_present(dev))
6514 		return -ENODEV;
6515 	err = ops->ndo_set_mac_address(dev, sa);
6516 	if (err)
6517 		return err;
6518 	dev->addr_assign_type = NET_ADDR_SET;
6519 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6520 	add_device_randomness(dev->dev_addr, dev->addr_len);
6521 	return 0;
6522 }
6523 EXPORT_SYMBOL(dev_set_mac_address);
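
/*
 * Example (illustrative sketch; "new_addr" is a hypothetical buffer holding
 * dev->addr_len bytes): setting a hardware address under RTNL. The address
 * family in the sockaddr must match dev->type.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_addr, dev->addr_len);
 *	err = dev_set_mac_address(dev, &sa);
 */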
6524 
6525 /**
6526  *	dev_change_carrier - Change device carrier
6527  *	@dev: device
6528  *	@new_carrier: new value
6529  *
6530  *	Change device carrier
6531  */
6532 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6533 {
6534 	const struct net_device_ops *ops = dev->netdev_ops;
6535 
6536 	if (!ops->ndo_change_carrier)
6537 		return -EOPNOTSUPP;
6538 	if (!netif_device_present(dev))
6539 		return -ENODEV;
6540 	return ops->ndo_change_carrier(dev, new_carrier);
6541 }
6542 EXPORT_SYMBOL(dev_change_carrier);
6543 
6544 /**
6545  *	dev_get_phys_port_id - Get device physical port ID
6546  *	@dev: device
6547  *	@ppid: port ID
6548  *
6549  *	Get device physical port ID
6550  */
6551 int dev_get_phys_port_id(struct net_device *dev,
6552 			 struct netdev_phys_item_id *ppid)
6553 {
6554 	const struct net_device_ops *ops = dev->netdev_ops;
6555 
6556 	if (!ops->ndo_get_phys_port_id)
6557 		return -EOPNOTSUPP;
6558 	return ops->ndo_get_phys_port_id(dev, ppid);
6559 }
6560 EXPORT_SYMBOL(dev_get_phys_port_id);
6561 
6562 /**
6563  *	dev_get_phys_port_name - Get device physical port name
6564  *	@dev: device
6565  *	@name: port name
6566  *	@len: limit of bytes to copy to name
6567  *
6568  *	Get device physical port name
6569  */
6570 int dev_get_phys_port_name(struct net_device *dev,
6571 			   char *name, size_t len)
6572 {
6573 	const struct net_device_ops *ops = dev->netdev_ops;
6574 
6575 	if (!ops->ndo_get_phys_port_name)
6576 		return -EOPNOTSUPP;
6577 	return ops->ndo_get_phys_port_name(dev, name, len);
6578 }
6579 EXPORT_SYMBOL(dev_get_phys_port_name);
6580 
6581 /**
6582  *	dev_change_proto_down - update protocol port state information
6583  *	@dev: device
6584  *	@proto_down: new value
6585  *
6586  *	This info can be used by switch drivers to set the phys state of the
6587  *	port.
6588  */
6589 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6590 {
6591 	const struct net_device_ops *ops = dev->netdev_ops;
6592 
6593 	if (!ops->ndo_change_proto_down)
6594 		return -EOPNOTSUPP;
6595 	if (!netif_device_present(dev))
6596 		return -ENODEV;
6597 	return ops->ndo_change_proto_down(dev, proto_down);
6598 }
6599 EXPORT_SYMBOL(dev_change_proto_down);
6600 
6601 /**
6602  *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
6603  *	@dev: device
6604  *	@fd: new program fd or negative value to clear
6605  *	@flags: xdp-related flags
6606  *
6607  *	Set or clear a bpf program for a device
6608  */
6609 int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
6610 {
6611 	const struct net_device_ops *ops = dev->netdev_ops;
6612 	struct bpf_prog *prog = NULL;
6613 	struct netdev_xdp xdp;
6614 	int err;
6615 
6616 	ASSERT_RTNL();
6617 
6618 	if (!ops->ndo_xdp)
6619 		return -EOPNOTSUPP;
6620 	if (fd >= 0) {
6621 		if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6622 			memset(&xdp, 0, sizeof(xdp));
6623 			xdp.command = XDP_QUERY_PROG;
6624 
6625 			err = ops->ndo_xdp(dev, &xdp);
6626 			if (err < 0)
6627 				return err;
6628 			if (xdp.prog_attached)
6629 				return -EBUSY;
6630 		}
6631 
6632 		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6633 		if (IS_ERR(prog))
6634 			return PTR_ERR(prog);
6635 	}
6636 
6637 	memset(&xdp, 0, sizeof(xdp));
6638 	xdp.command = XDP_SETUP_PROG;
6639 	xdp.prog = prog;
6640 
6641 	err = ops->ndo_xdp(dev, &xdp);
6642 	if (err < 0 && prog)
6643 		bpf_prog_put(prog);
6644 
6645 	return err;
6646 }
6647 EXPORT_SYMBOL(dev_change_xdp_fd);
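
/*
 * Example (illustrative sketch; "prog_fd" is a hypothetical program fd
 * received from userspace): install an XDP program only if none is attached
 * yet, and later detach it by passing a negative fd.
 *
 *	rtnl_lock();
 *	err = dev_change_xdp_fd(dev, prog_fd, XDP_FLAGS_UPDATE_IF_NOEXIST);
 *	...
 *	err = dev_change_xdp_fd(dev, -1, 0);
 *	rtnl_unlock();
 */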
6648 
6649 /**
6650  *	dev_new_index	-	allocate an ifindex
6651  *	@net: the applicable net namespace
6652  *
6653  *	Returns a suitable unique value for a new device interface
6654  *	number.  The caller must hold the rtnl semaphore or the
6655  *	dev_base_lock to be sure it remains unique.
6656  */
6657 static int dev_new_index(struct net *net)
6658 {
6659 	int ifindex = net->ifindex;
6660 	for (;;) {
6661 		if (++ifindex <= 0)
6662 			ifindex = 1;
6663 		if (!__dev_get_by_index(net, ifindex))
6664 			return net->ifindex = ifindex;
6665 	}
6666 }
6667 
6668 /* Delayed registration/unregistration */
6669 static LIST_HEAD(net_todo_list);
6670 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6671 
6672 static void net_set_todo(struct net_device *dev)
6673 {
6674 	list_add_tail(&dev->todo_list, &net_todo_list);
6675 	dev_net(dev)->dev_unreg_count++;
6676 }
6677 
6678 static void rollback_registered_many(struct list_head *head)
6679 {
6680 	struct net_device *dev, *tmp;
6681 	LIST_HEAD(close_head);
6682 
6683 	BUG_ON(dev_boot_phase);
6684 	ASSERT_RTNL();
6685 
6686 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6687 		/* Some devices call us without ever having been
6688 		 * registered, to unwind a failed initialization.
6689 		 * Remove those devices and proceed with the remaining.
6690 		 */
6691 		if (dev->reg_state == NETREG_UNINITIALIZED) {
6692 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6693 				 dev->name, dev);
6694 
6695 			WARN_ON(1);
6696 			list_del(&dev->unreg_list);
6697 			continue;
6698 		}
6699 		dev->dismantle = true;
6700 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
6701 	}
6702 
6703 	/* If device is running, close it first. */
6704 	list_for_each_entry(dev, head, unreg_list)
6705 		list_add_tail(&dev->close_list, &close_head);
6706 	dev_close_many(&close_head, true);
6707 
6708 	list_for_each_entry(dev, head, unreg_list) {
6709 		/* And unlink it from device chain. */
6710 		unlist_netdevice(dev);
6711 
6712 		dev->reg_state = NETREG_UNREGISTERING;
6713 	}
6714 	flush_all_backlogs();
6715 
6716 	synchronize_net();
6717 
6718 	list_for_each_entry(dev, head, unreg_list) {
6719 		struct sk_buff *skb = NULL;
6720 
6721 		/* Shutdown queueing discipline. */
6722 		dev_shutdown(dev);
6723 
6724 
6725 		/* Notify protocols that we are about to destroy
6726 		   this device. They should clean up all of their state.
6727 		*/
6728 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6729 
6730 		if (!dev->rtnl_link_ops ||
6731 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6732 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6733 						     GFP_KERNEL);
6734 
6735 		/*
6736 		 *	Flush the unicast and multicast chains
6737 		 */
6738 		dev_uc_flush(dev);
6739 		dev_mc_flush(dev);
6740 
6741 		if (dev->netdev_ops->ndo_uninit)
6742 			dev->netdev_ops->ndo_uninit(dev);
6743 
6744 		if (skb)
6745 			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6746 
6747 		/* Notifier chain MUST detach us from all upper devices. */
6748 		WARN_ON(netdev_has_any_upper_dev(dev));
6749 		WARN_ON(netdev_has_any_lower_dev(dev));
6750 
6751 		/* Remove entries from kobject tree */
6752 		netdev_unregister_kobject(dev);
6753 #ifdef CONFIG_XPS
6754 		/* Remove XPS queueing entries */
6755 		netif_reset_xps_queues_gt(dev, 0);
6756 #endif
6757 	}
6758 
6759 	synchronize_net();
6760 
6761 	list_for_each_entry(dev, head, unreg_list)
6762 		dev_put(dev);
6763 }
6764 
6765 static void rollback_registered(struct net_device *dev)
6766 {
6767 	LIST_HEAD(single);
6768 
6769 	list_add(&dev->unreg_list, &single);
6770 	rollback_registered_many(&single);
6771 	list_del(&single);
6772 }
6773 
6774 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6775 	struct net_device *upper, netdev_features_t features)
6776 {
6777 	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6778 	netdev_features_t feature;
6779 	int feature_bit;
6780 
6781 	for_each_netdev_feature(&upper_disables, feature_bit) {
6782 		feature = __NETIF_F_BIT(feature_bit);
6783 		if (!(upper->wanted_features & feature)
6784 		    && (features & feature)) {
6785 			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6786 				   &feature, upper->name);
6787 			features &= ~feature;
6788 		}
6789 	}
6790 
6791 	return features;
6792 }
6793 
6794 static void netdev_sync_lower_features(struct net_device *upper,
6795 	struct net_device *lower, netdev_features_t features)
6796 {
6797 	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6798 	netdev_features_t feature;
6799 	int feature_bit;
6800 
6801 	for_each_netdev_feature(&upper_disables, feature_bit) {
6802 		feature = __NETIF_F_BIT(feature_bit);
6803 		if (!(features & feature) && (lower->features & feature)) {
6804 			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6805 				   &feature, lower->name);
6806 			lower->wanted_features &= ~feature;
6807 			netdev_update_features(lower);
6808 
6809 			if (unlikely(lower->features & feature))
6810 				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6811 					    &feature, lower->name);
6812 		}
6813 	}
6814 }
6815 
6816 static netdev_features_t netdev_fix_features(struct net_device *dev,
6817 	netdev_features_t features)
6818 {
6819 	/* Fix illegal checksum combinations */
6820 	if ((features & NETIF_F_HW_CSUM) &&
6821 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6822 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6823 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6824 	}
6825 
6826 	/* TSO requires that SG is present as well. */
6827 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6828 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6829 		features &= ~NETIF_F_ALL_TSO;
6830 	}
6831 
6832 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6833 					!(features & NETIF_F_IP_CSUM)) {
6834 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6835 		features &= ~NETIF_F_TSO;
6836 		features &= ~NETIF_F_TSO_ECN;
6837 	}
6838 
6839 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6840 					 !(features & NETIF_F_IPV6_CSUM)) {
6841 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6842 		features &= ~NETIF_F_TSO6;
6843 	}
6844 
6845 	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6846 	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6847 		features &= ~NETIF_F_TSO_MANGLEID;
6848 
6849 	/* TSO ECN requires that TSO is present as well. */
6850 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6851 		features &= ~NETIF_F_TSO_ECN;
6852 
6853 	/* Software GSO depends on SG. */
6854 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6855 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6856 		features &= ~NETIF_F_GSO;
6857 	}
6858 
6859 	/* UFO needs SG and checksumming */
6860 	if (features & NETIF_F_UFO) {
6861 		/* maybe split UFO into V4 and V6? */
6862 		if (!(features & NETIF_F_HW_CSUM) &&
6863 		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6864 		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6865 			netdev_dbg(dev,
6866 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6867 			features &= ~NETIF_F_UFO;
6868 		}
6869 
6870 		if (!(features & NETIF_F_SG)) {
6871 			netdev_dbg(dev,
6872 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6873 			features &= ~NETIF_F_UFO;
6874 		}
6875 	}
6876 
6877 	/* GSO partial features require GSO partial be set */
6878 	if ((features & dev->gso_partial_features) &&
6879 	    !(features & NETIF_F_GSO_PARTIAL)) {
6880 		netdev_dbg(dev,
6881 			   "Dropping partially supported GSO features since no GSO partial.\n");
6882 		features &= ~dev->gso_partial_features;
6883 	}
6884 
6885 	return features;
6886 }
6887 
6888 int __netdev_update_features(struct net_device *dev)
6889 {
6890 	struct net_device *upper, *lower;
6891 	netdev_features_t features;
6892 	struct list_head *iter;
6893 	int err = -1;
6894 
6895 	ASSERT_RTNL();
6896 
6897 	features = netdev_get_wanted_features(dev);
6898 
6899 	if (dev->netdev_ops->ndo_fix_features)
6900 		features = dev->netdev_ops->ndo_fix_features(dev, features);
6901 
6902 	/* driver might be less strict about feature dependencies */
6903 	features = netdev_fix_features(dev, features);
6904 
6905 	/* some features can't be enabled if they're off on an upper device */
6906 	netdev_for_each_upper_dev_rcu(dev, upper, iter)
6907 		features = netdev_sync_upper_features(dev, upper, features);
6908 
6909 	if (dev->features == features)
6910 		goto sync_lower;
6911 
6912 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6913 		&dev->features, &features);
6914 
6915 	if (dev->netdev_ops->ndo_set_features)
6916 		err = dev->netdev_ops->ndo_set_features(dev, features);
6917 	else
6918 		err = 0;
6919 
6920 	if (unlikely(err < 0)) {
6921 		netdev_err(dev,
6922 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
6923 			err, &features, &dev->features);
6924 		/* return non-0 since some features might have changed and
6925 		 * it's better to fire a spurious notification than miss it
6926 		 */
6927 		return -1;
6928 	}
6929 
6930 sync_lower:
6931 	/* some features must be disabled on lower devices when disabled
6932 	 * on an upper device (think: bonding master or bridge)
6933 	 */
6934 	netdev_for_each_lower_dev(dev, lower, iter)
6935 		netdev_sync_lower_features(dev, lower, features);
6936 
6937 	if (!err)
6938 		dev->features = features;
6939 
6940 	return err < 0 ? 0 : 1;
6941 }
6942 
6943 /**
6944  *	netdev_update_features - recalculate device features
6945  *	@dev: the device to check
6946  *
6947  *	Recalculate dev->features set and send notifications if it
6948  *	has changed. Should be called after driver or hardware dependent
6949  *	conditions might have changed that influence the features.
6950  */
6951 void netdev_update_features(struct net_device *dev)
6952 {
6953 	if (__netdev_update_features(dev))
6954 		netdev_features_change(dev);
6955 }
6956 EXPORT_SYMBOL(netdev_update_features);
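
/*
 * Example (illustrative sketch): a driver that has just discovered an extra
 * hardware capability advertises it and lets the core recompute the active
 * feature set, under RTNL.
 *
 *	dev->hw_features |= NETIF_F_RXCSUM;
 *	netdev_update_features(dev);
 */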
6957 
6958 /**
6959  *	netdev_change_features - recalculate device features
6960  *	@dev: the device to check
6961  *
6962  *	Recalculate dev->features set and send notifications even
6963  *	if they have not changed. Should be called instead of
6964  *	netdev_update_features() if also dev->vlan_features might
6965  *	have changed to allow the changes to be propagated to stacked
6966  *	VLAN devices.
6967  */
6968 void netdev_change_features(struct net_device *dev)
6969 {
6970 	__netdev_update_features(dev);
6971 	netdev_features_change(dev);
6972 }
6973 EXPORT_SYMBOL(netdev_change_features);
6974 
6975 /**
6976  *	netif_stacked_transfer_operstate -	transfer operstate
6977  *	@rootdev: the root or lower level device to transfer state from
6978  *	@dev: the device to transfer operstate to
6979  *
6980  *	Transfer operational state from root to device. This is normally
6981  *	called when a stacking relationship exists between the root
6982  *	device and the device (a leaf device).
6983  */
6984 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6985 					struct net_device *dev)
6986 {
6987 	if (rootdev->operstate == IF_OPER_DORMANT)
6988 		netif_dormant_on(dev);
6989 	else
6990 		netif_dormant_off(dev);
6991 
6992 	if (netif_carrier_ok(rootdev)) {
6993 		if (!netif_carrier_ok(dev))
6994 			netif_carrier_on(dev);
6995 	} else {
6996 		if (netif_carrier_ok(dev))
6997 			netif_carrier_off(dev);
6998 	}
6999 }
7000 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7001 
7002 #ifdef CONFIG_SYSFS
7003 static int netif_alloc_rx_queues(struct net_device *dev)
7004 {
7005 	unsigned int i, count = dev->num_rx_queues;
7006 	struct netdev_rx_queue *rx;
7007 	size_t sz = count * sizeof(*rx);
7008 
7009 	BUG_ON(count < 1);
7010 
7011 	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7012 	if (!rx) {
7013 		rx = vzalloc(sz);
7014 		if (!rx)
7015 			return -ENOMEM;
7016 	}
7017 	dev->_rx = rx;
7018 
7019 	for (i = 0; i < count; i++)
7020 		rx[i].dev = dev;
7021 	return 0;
7022 }
7023 #endif
7024 
7025 static void netdev_init_one_queue(struct net_device *dev,
7026 				  struct netdev_queue *queue, void *_unused)
7027 {
7028 	/* Initialize queue lock */
7029 	spin_lock_init(&queue->_xmit_lock);
7030 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7031 	queue->xmit_lock_owner = -1;
7032 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7033 	queue->dev = dev;
7034 #ifdef CONFIG_BQL
7035 	dql_init(&queue->dql, HZ);
7036 #endif
7037 }
7038 
7039 static void netif_free_tx_queues(struct net_device *dev)
7040 {
7041 	kvfree(dev->_tx);
7042 }
7043 
7044 static int netif_alloc_netdev_queues(struct net_device *dev)
7045 {
7046 	unsigned int count = dev->num_tx_queues;
7047 	struct netdev_queue *tx;
7048 	size_t sz = count * sizeof(*tx);
7049 
7050 	if (count < 1 || count > 0xffff)
7051 		return -EINVAL;
7052 
7053 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7054 	if (!tx) {
7055 		tx = vzalloc(sz);
7056 		if (!tx)
7057 			return -ENOMEM;
7058 	}
7059 	dev->_tx = tx;
7060 
7061 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7062 	spin_lock_init(&dev->tx_global_lock);
7063 
7064 	return 0;
7065 }
7066 
7067 void netif_tx_stop_all_queues(struct net_device *dev)
7068 {
7069 	unsigned int i;
7070 
7071 	for (i = 0; i < dev->num_tx_queues; i++) {
7072 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7073 		netif_tx_stop_queue(txq);
7074 	}
7075 }
7076 EXPORT_SYMBOL(netif_tx_stop_all_queues);
7077 
7078 /**
7079  *	register_netdevice	- register a network device
7080  *	@dev: device to register
7081  *
7082  *	Take a completed network device structure and add it to the kernel
7083  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7084  *	chain. 0 is returned on success. A negative errno code is returned
7085  *	on a failure to set up the device, or if the name is a duplicate.
7086  *
7087  *	Callers must hold the rtnl semaphore. You may want
7088  *	register_netdev() instead of this.
7089  *
7090  *	BUGS:
7091  *	The locking appears insufficient to guarantee two parallel registers
7092  *	will not get the same name.
7093  */
7094 
7095 int register_netdevice(struct net_device *dev)
7096 {
7097 	int ret;
7098 	struct net *net = dev_net(dev);
7099 
7100 	BUG_ON(dev_boot_phase);
7101 	ASSERT_RTNL();
7102 
7103 	might_sleep();
7104 
7105 	/* When net_device's are persistent, this will be fatal. */
7106 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7107 	BUG_ON(!net);
7108 
7109 	spin_lock_init(&dev->addr_list_lock);
7110 	netdev_set_addr_lockdep_class(dev);
7111 
7112 	ret = dev_get_valid_name(net, dev, dev->name);
7113 	if (ret < 0)
7114 		goto out;
7115 
7116 	/* Init, if this function is available */
7117 	if (dev->netdev_ops->ndo_init) {
7118 		ret = dev->netdev_ops->ndo_init(dev);
7119 		if (ret) {
7120 			if (ret > 0)
7121 				ret = -EIO;
7122 			goto out;
7123 		}
7124 	}
7125 
7126 	if (((dev->hw_features | dev->features) &
7127 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
7128 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7129 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7130 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7131 		ret = -EINVAL;
7132 		goto err_uninit;
7133 	}
7134 
7135 	ret = -EBUSY;
7136 	if (!dev->ifindex)
7137 		dev->ifindex = dev_new_index(net);
7138 	else if (__dev_get_by_index(net, dev->ifindex))
7139 		goto err_uninit;
7140 
7141 	/* Transfer changeable features to wanted_features and enable
7142 	 * software offloads (GSO and GRO).
7143 	 */
7144 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
7145 	dev->features |= NETIF_F_SOFT_FEATURES;
7146 	dev->wanted_features = dev->features & dev->hw_features;
7147 
7148 	if (!(dev->flags & IFF_LOOPBACK))
7149 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
7150 
7151 	/* If IPv4 TCP segmentation offload is supported we should also
7152 	 * allow the device to enable segmenting the frame with the option
7153 	 * of ignoring a static IP ID value.  This doesn't enable the
7154 	 * feature itself but allows the user to enable it later.
7155 	 */
7156 	if (dev->hw_features & NETIF_F_TSO)
7157 		dev->hw_features |= NETIF_F_TSO_MANGLEID;
7158 	if (dev->vlan_features & NETIF_F_TSO)
7159 		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7160 	if (dev->mpls_features & NETIF_F_TSO)
7161 		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7162 	if (dev->hw_enc_features & NETIF_F_TSO)
7163 		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7164 
7165 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7166 	 */
7167 	dev->vlan_features |= NETIF_F_HIGHDMA;
7168 
7169 	/* Make NETIF_F_SG inheritable to tunnel devices.
7170 	 */
7171 	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7172 
7173 	/* Make NETIF_F_SG inheritable to MPLS.
7174 	 */
7175 	dev->mpls_features |= NETIF_F_SG;
7176 
7177 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7178 	ret = notifier_to_errno(ret);
7179 	if (ret)
7180 		goto err_uninit;
7181 
7182 	ret = netdev_register_kobject(dev);
7183 	if (ret)
7184 		goto err_uninit;
7185 	dev->reg_state = NETREG_REGISTERED;
7186 
7187 	__netdev_update_features(dev);
7188 
7189 	/*
7190 	 *	Default initial state at registration is that the
7191 	 *	device is present.
7192 	 */
7193 
7194 	set_bit(__LINK_STATE_PRESENT, &dev->state);
7195 
7196 	linkwatch_init_dev(dev);
7197 
7198 	dev_init_scheduler(dev);
7199 	dev_hold(dev);
7200 	list_netdevice(dev);
7201 	add_device_randomness(dev->dev_addr, dev->addr_len);
7202 
7203 	/* If the device has permanent device address, driver should
7204 	 * set dev_addr and also addr_assign_type should be set to
7205 	 * NET_ADDR_PERM (default value).
7206 	 */
7207 	if (dev->addr_assign_type == NET_ADDR_PERM)
7208 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7209 
7210 	/* Notify protocols, that a new device appeared. */
7211 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7212 	ret = notifier_to_errno(ret);
7213 	if (ret) {
7214 		rollback_registered(dev);
7215 		dev->reg_state = NETREG_UNREGISTERED;
7216 	}
7217 	/*
7218 	 *	Prevent userspace races by waiting until the network
7219 	 *	device is fully set up before sending notifications.
7220 	 */
7221 	if (!dev->rtnl_link_ops ||
7222 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7223 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7224 
7225 out:
7226 	return ret;
7227 
7228 err_uninit:
7229 	if (dev->netdev_ops->ndo_uninit)
7230 		dev->netdev_ops->ndo_uninit(dev);
7231 	goto out;
7232 }
7233 EXPORT_SYMBOL(register_netdevice);
7234 
7235 /**
7236  *	init_dummy_netdev	- init a dummy network device for NAPI
7237  *	@dev: device to init
7238  *
7239  *	This takes a network device structure and initializes the minimum
7240  *	amount of fields so it can be used to schedule NAPI polls without
7241  *	registering a full blown interface. This is to be used by drivers
7242  *	that need to tie several hardware interfaces to a single NAPI
7243  *	poll scheduler due to HW limitations.
7244  */
7245 int init_dummy_netdev(struct net_device *dev)
7246 {
7247 	/* Clear everything. Note we don't initialize spinlocks
7248 	 * as they aren't supposed to be taken by any of the
7249 	 * NAPI code and this dummy netdev is supposed to be
7250 	 * only ever used for NAPI polls.
7251 	 */
7252 	memset(dev, 0, sizeof(struct net_device));
7253 
7254 	/* make sure we BUG if trying to hit standard
7255 	 * register/unregister code path
7256 	 */
7257 	dev->reg_state = NETREG_DUMMY;
7258 
7259 	/* NAPI wants this */
7260 	INIT_LIST_HEAD(&dev->napi_list);
7261 
7262 	/* a dummy interface is started by default */
7263 	set_bit(__LINK_STATE_PRESENT, &dev->state);
7264 	set_bit(__LINK_STATE_START, &dev->state);
7265 
7266 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
7267 	 * because users of this 'device' don't need to change
7268 	 * its refcount.
7269 	 */
7270 
7271 	return 0;
7272 }
7273 EXPORT_SYMBOL_GPL(init_dummy_netdev);
7274 
7275 
7276 /**
7277  *	register_netdev	- register a network device
7278  *	@dev: device to register
7279  *
7280  *	Take a completed network device structure and add it to the kernel
7281  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7282  *	chain. 0 is returned on success. A negative errno code is returned
7283  *	on a failure to set up the device, or if the name is a duplicate.
7284  *
7285  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
7286  *	and expands the device name if you passed a format string to
7287  *	alloc_netdev.
7288  */
7289 int register_netdev(struct net_device *dev)
7290 {
7291 	int err;
7292 
7293 	rtnl_lock();
7294 	err = register_netdevice(dev);
7295 	rtnl_unlock();
7296 	return err;
7297 }
7298 EXPORT_SYMBOL(register_netdev);
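
/*
 * Example (illustrative sketch; "example_priv" and "example_netdev_ops" are
 * hypothetical driver names): the usual driver-side sequence of allocating,
 * configuring and registering an Ethernet device.
 *
 *	dev = alloc_etherdev(sizeof(struct example_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &example_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */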
7299 
7300 int netdev_refcnt_read(const struct net_device *dev)
7301 {
7302 	int i, refcnt = 0;
7303 
7304 	for_each_possible_cpu(i)
7305 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7306 	return refcnt;
7307 }
7308 EXPORT_SYMBOL(netdev_refcnt_read);
7309 
7310 /**
7311  * netdev_wait_allrefs - wait until all references are gone.
7312  * @dev: target net_device
7313  *
7314  * This is called when unregistering network devices.
7315  *
7316  * Any protocol or device that holds a reference should register
7317  * for netdevice notification, and clean up and put back the
7318  * reference if they receive an UNREGISTER event.
7319  * We can get stuck here if buggy protocols don't correctly
7320  * call dev_put.
7321  */
7322 static void netdev_wait_allrefs(struct net_device *dev)
7323 {
7324 	unsigned long rebroadcast_time, warning_time;
7325 	int refcnt;
7326 
7327 	linkwatch_forget_dev(dev);
7328 
7329 	rebroadcast_time = warning_time = jiffies;
7330 	refcnt = netdev_refcnt_read(dev);
7331 
7332 	while (refcnt != 0) {
7333 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7334 			rtnl_lock();
7335 
7336 			/* Rebroadcast unregister notification */
7337 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7338 
7339 			__rtnl_unlock();
7340 			rcu_barrier();
7341 			rtnl_lock();
7342 
7343 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7344 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7345 				     &dev->state)) {
7346 				/* We must not have linkwatch events
7347 				 * pending on unregister. If this
7348 				 * happens, we simply run the queue
7349 				 * unscheduled, resulting in a noop
7350 				 * for this device.
7351 				 */
7352 				linkwatch_run_queue();
7353 			}
7354 
7355 			__rtnl_unlock();
7356 
7357 			rebroadcast_time = jiffies;
7358 		}
7359 
7360 		msleep(250);
7361 
7362 		refcnt = netdev_refcnt_read(dev);
7363 
7364 		if (time_after(jiffies, warning_time + 10 * HZ)) {
7365 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7366 				 dev->name, refcnt);
7367 			warning_time = jiffies;
7368 		}
7369 	}
7370 }
7371 
7372 /* The sequence is:
7373  *
7374  *	rtnl_lock();
7375  *	...
7376  *	register_netdevice(x1);
7377  *	register_netdevice(x2);
7378  *	...
7379  *	unregister_netdevice(y1);
7380  *	unregister_netdevice(y2);
7381  *      ...
7382  *	rtnl_unlock();
7383  *	free_netdev(y1);
7384  *	free_netdev(y2);
7385  *
7386  * We are invoked by rtnl_unlock().
7387  * This allows us to deal with problems:
7388  * 1) We can delete sysfs objects which invoke hotplug
7389  *    without deadlocking with linkwatch via keventd.
7390  * 2) Since we run with the RTNL semaphore not held, we can sleep
7391  *    safely in order to wait for the netdev refcnt to drop to zero.
7392  *
7393  * We must not return until all unregister events added during
7394  * the interval the lock was held have been completed.
7395  */
7396 void netdev_run_todo(void)
7397 {
7398 	struct list_head list;
7399 
7400 	/* Snapshot list, allow later requests */
7401 	list_replace_init(&net_todo_list, &list);
7402 
7403 	__rtnl_unlock();
7404 
7405 
7406 	/* Wait for rcu callbacks to finish before next phase */
7407 	if (!list_empty(&list))
7408 		rcu_barrier();
7409 
7410 	while (!list_empty(&list)) {
7411 		struct net_device *dev
7412 			= list_first_entry(&list, struct net_device, todo_list);
7413 		list_del(&dev->todo_list);
7414 
7415 		rtnl_lock();
7416 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7417 		__rtnl_unlock();
7418 
7419 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7420 			pr_err("network todo '%s' but state %d\n",
7421 			       dev->name, dev->reg_state);
7422 			dump_stack();
7423 			continue;
7424 		}
7425 
7426 		dev->reg_state = NETREG_UNREGISTERED;
7427 
7428 		netdev_wait_allrefs(dev);
7429 
7430 		/* paranoia */
7431 		BUG_ON(netdev_refcnt_read(dev));
7432 		BUG_ON(!list_empty(&dev->ptype_all));
7433 		BUG_ON(!list_empty(&dev->ptype_specific));
7434 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
7435 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7436 		WARN_ON(dev->dn_ptr);
7437 
7438 		if (dev->destructor)
7439 			dev->destructor(dev);
7440 
7441 		/* Report a network device has been unregistered */
7442 		rtnl_lock();
7443 		dev_net(dev)->dev_unreg_count--;
7444 		__rtnl_unlock();
7445 		wake_up(&netdev_unregistering_wq);
7446 
7447 		/* Free network device */
7448 		kobject_put(&dev->dev.kobj);
7449 	}
7450 }
7451 
7452 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7453  * all the same fields in the same order as net_device_stats, with only
7454  * the type differing, but rtnl_link_stats64 may have additional fields
7455  * at the end for newer counters.
7456  */
7457 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7458 			     const struct net_device_stats *netdev_stats)
7459 {
7460 #if BITS_PER_LONG == 64
7461 	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7462 	memcpy(stats64, netdev_stats, sizeof(*stats64));
7463 	/* zero out counters that only exist in rtnl_link_stats64 */
7464 	memset((char *)stats64 + sizeof(*netdev_stats), 0,
7465 	       sizeof(*stats64) - sizeof(*netdev_stats));
7466 #else
7467 	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7468 	const unsigned long *src = (const unsigned long *)netdev_stats;
7469 	u64 *dst = (u64 *)stats64;
7470 
7471 	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7472 	for (i = 0; i < n; i++)
7473 		dst[i] = src[i];
7474 	/* zero out counters that only exist in rtnl_link_stats64 */
7475 	memset((char *)stats64 + n * sizeof(u64), 0,
7476 	       sizeof(*stats64) - n * sizeof(u64));
7477 #endif
7478 }
7479 EXPORT_SYMBOL(netdev_stats_to_stats64);
7480 
7481 /**
7482  *	dev_get_stats	- get network device statistics
7483  *	@dev: device to get statistics from
7484  *	@storage: place to store stats
7485  *
7486  *	Get network statistics from device. Return @storage.
7487  *	The device driver may provide its own method by setting
7488  *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
7489  *	otherwise the internal statistics structure is used.
7490  */
7491 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7492 					struct rtnl_link_stats64 *storage)
7493 {
7494 	const struct net_device_ops *ops = dev->netdev_ops;
7495 
7496 	if (ops->ndo_get_stats64) {
7497 		memset(storage, 0, sizeof(*storage));
7498 		ops->ndo_get_stats64(dev, storage);
7499 	} else if (ops->ndo_get_stats) {
7500 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7501 	} else {
7502 		netdev_stats_to_stats64(storage, &dev->stats);
7503 	}
7504 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7505 	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7506 	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7507 	return storage;
7508 }
7509 EXPORT_SYMBOL(dev_get_stats);
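
/* Editorial example, not part of dev.c: a minimal sketch of a caller that
 * only wants aggregate counters, assuming it holds a reference (or the rtnl
 * lock) that keeps @dev alive.  "demo_total_packets" is a made-up name.
 */
static u64 demo_total_packets(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);		/* fills the caller's storage */
	return stats.rx_packets + stats.tx_packets;
}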
7510 
7511 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7512 {
7513 	struct netdev_queue *queue = dev_ingress_queue(dev);
7514 
7515 #ifdef CONFIG_NET_CLS_ACT
7516 	if (queue)
7517 		return queue;
7518 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7519 	if (!queue)
7520 		return NULL;
7521 	netdev_init_one_queue(dev, queue, NULL);
7522 	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7523 	queue->qdisc_sleeping = &noop_qdisc;
7524 	rcu_assign_pointer(dev->ingress_queue, queue);
7525 #endif
7526 	return queue;
7527 }
7528 
7529 static const struct ethtool_ops default_ethtool_ops;
7530 
7531 void netdev_set_default_ethtool_ops(struct net_device *dev,
7532 				    const struct ethtool_ops *ops)
7533 {
7534 	if (dev->ethtool_ops == &default_ethtool_ops)
7535 		dev->ethtool_ops = ops;
7536 }
7537 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
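
/* Editorial example, not part of dev.c: a sketch of how a bus or library
 * layer might install fallback ethtool operations for the devices it
 * creates.  A driver that assigns its own dev->ethtool_ops is left
 * untouched.  "demo_default_ethtool_ops" is a made-up name;
 * ethtool_op_get_link() is the generic helper from the ethtool core.
 */
static const struct ethtool_ops demo_default_ethtool_ops = {
	.get_link	= ethtool_op_get_link,
};

static void demo_install_default_ethtool(struct net_device *dev)
{
	/* only takes effect while dev->ethtool_ops is still the default */
	netdev_set_default_ethtool_ops(dev, &demo_default_ethtool_ops);
}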
7538 
7539 void netdev_freemem(struct net_device *dev)
7540 {
7541 	char *addr = (char *)dev - dev->padded;
7542 
7543 	kvfree(addr);
7544 }
7545 
7546 /**
7547  *	alloc_netdev_mqs - allocate network device
7548  *	@sizeof_priv:		size of private data to allocate space for
7549  *	@name:			device name format string
7550  *	@name_assign_type: 	origin of device name
7551  *	@setup:			callback to initialize device
7552  *	@txqs:			the number of TX subqueues to allocate
7553  *	@rxqs:			the number of RX subqueues to allocate
7554  *
7555  *	Allocates a struct net_device with private data area for driver use
7556  *	and performs basic initialization.  Also allocates subqueue structs
7557  *	for each queue on the device.
7558  */
7559 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7560 		unsigned char name_assign_type,
7561 		void (*setup)(struct net_device *),
7562 		unsigned int txqs, unsigned int rxqs)
7563 {
7564 	struct net_device *dev;
7565 	size_t alloc_size;
7566 	struct net_device *p;
7567 
7568 	BUG_ON(strlen(name) >= sizeof(dev->name));
7569 
7570 	if (txqs < 1) {
7571 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7572 		return NULL;
7573 	}
7574 
7575 #ifdef CONFIG_SYSFS
7576 	if (rxqs < 1) {
7577 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7578 		return NULL;
7579 	}
7580 #endif
7581 
7582 	alloc_size = sizeof(struct net_device);
7583 	if (sizeof_priv) {
7584 		/* ensure 32-byte alignment of private area */
7585 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7586 		alloc_size += sizeof_priv;
7587 	}
7588 	/* ensure 32-byte alignment of whole construct */
7589 	alloc_size += NETDEV_ALIGN - 1;
7590 
7591 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7592 	if (!p)
7593 		p = vzalloc(alloc_size);
7594 	if (!p)
7595 		return NULL;
7596 
7597 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
7598 	dev->padded = (char *)dev - (char *)p;
7599 
7600 	dev->pcpu_refcnt = alloc_percpu(int);
7601 	if (!dev->pcpu_refcnt)
7602 		goto free_dev;
7603 
7604 	if (dev_addr_init(dev))
7605 		goto free_pcpu;
7606 
7607 	dev_mc_init(dev);
7608 	dev_uc_init(dev);
7609 
7610 	dev_net_set(dev, &init_net);
7611 
7612 	dev->gso_max_size = GSO_MAX_SIZE;
7613 	dev->gso_max_segs = GSO_MAX_SEGS;
7614 
7615 	INIT_LIST_HEAD(&dev->napi_list);
7616 	INIT_LIST_HEAD(&dev->unreg_list);
7617 	INIT_LIST_HEAD(&dev->close_list);
7618 	INIT_LIST_HEAD(&dev->link_watch_list);
7619 	INIT_LIST_HEAD(&dev->adj_list.upper);
7620 	INIT_LIST_HEAD(&dev->adj_list.lower);
7621 	INIT_LIST_HEAD(&dev->ptype_all);
7622 	INIT_LIST_HEAD(&dev->ptype_specific);
7623 #ifdef CONFIG_NET_SCHED
7624 	hash_init(dev->qdisc_hash);
7625 #endif
7626 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7627 	setup(dev);
7628 
7629 	if (!dev->tx_queue_len) {
7630 		dev->priv_flags |= IFF_NO_QUEUE;
7631 		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7632 	}
7633 
7634 	dev->num_tx_queues = txqs;
7635 	dev->real_num_tx_queues = txqs;
7636 	if (netif_alloc_netdev_queues(dev))
7637 		goto free_all;
7638 
7639 #ifdef CONFIG_SYSFS
7640 	dev->num_rx_queues = rxqs;
7641 	dev->real_num_rx_queues = rxqs;
7642 	if (netif_alloc_rx_queues(dev))
7643 		goto free_all;
7644 #endif
7645 
7646 	strcpy(dev->name, name);
7647 	dev->name_assign_type = name_assign_type;
7648 	dev->group = INIT_NETDEV_GROUP;
7649 	if (!dev->ethtool_ops)
7650 		dev->ethtool_ops = &default_ethtool_ops;
7651 
7652 	nf_hook_ingress_init(dev);
7653 
7654 	return dev;
7655 
7656 free_all:
7657 	free_netdev(dev);
7658 	return NULL;
7659 
7660 free_pcpu:
7661 	free_percpu(dev->pcpu_refcnt);
7662 free_dev:
7663 	netdev_freemem(dev);
7664 	return NULL;
7665 }
7666 EXPORT_SYMBOL(alloc_netdev_mqs);
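
/* Editorial example, not part of dev.c: a sketch of allocating a multiqueue
 * Ethernet-style device with a private area.  ether_setup() (from
 * <linux/etherdevice.h>) fills in the Ethernet defaults; the "demo%d" name
 * pattern is resolved when the device is later registered.  The queue
 * counts and all "demo_" names are made up.
 */
struct demo_mq_priv {
	int demo_id;			/* hypothetical driver state */
};

static struct net_device *demo_alloc_mq(void)
{
	struct net_device *dev;
	struct demo_mq_priv *priv;

	dev = alloc_netdev_mqs(sizeof(*priv), "demo%d", NET_NAME_UNKNOWN,
			       ether_setup, 8, 8);
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);	/* private area follows the struct */
	priv->demo_id = 0;
	return dev;
}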
7667 
7668 /**
7669  *	free_netdev - free network device
7670  *	@dev: device
7671  *
7672  *	This function does the last stage of destroying an allocated device
7673  * 	interface. The reference to the device object is released.
7674  *	If this is the last reference then it will be freed.
7675  *	Must be called in process context.
7676  */
7677 void free_netdev(struct net_device *dev)
7678 {
7679 	struct napi_struct *p, *n;
7680 
7681 	might_sleep();
7682 	netif_free_tx_queues(dev);
7683 #ifdef CONFIG_SYSFS
7684 	kvfree(dev->_rx);
7685 #endif
7686 
7687 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7688 
7689 	/* Flush device addresses */
7690 	dev_addr_flush(dev);
7691 
7692 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7693 		netif_napi_del(p);
7694 
7695 	free_percpu(dev->pcpu_refcnt);
7696 	dev->pcpu_refcnt = NULL;
7697 
7698 	/*  Compatibility with error handling in drivers */
7699 	if (dev->reg_state == NETREG_UNINITIALIZED) {
7700 		netdev_freemem(dev);
7701 		return;
7702 	}
7703 
7704 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7705 	dev->reg_state = NETREG_RELEASED;
7706 
7707 	/* will free via device release */
7708 	put_device(&dev->dev);
7709 }
7710 EXPORT_SYMBOL(free_netdev);
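
/* Editorial example, not part of dev.c: the usual error path in a driver
 * probe routine.  If register_netdev() fails the device is still in
 * NETREG_UNINITIALIZED state, so free_netdev() releases the memory
 * directly; after a successful registration the device must be
 * unregistered first.  "demo_probe_one" is a made-up name.
 */
static int demo_probe_one(struct net_device *dev)
{
	int err;

	err = register_netdev(dev);
	if (err) {
		free_netdev(dev);	/* never registered: plain free */
		return err;
	}
	return 0;
}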
7711 
7712 /**
7713  *	synchronize_net -  Synchronize with packet receive processing
7714  *
7715  *	Wait for packets currently being received to be done.
7716  *	Does not block later packets from starting.
7717  */
7718 void synchronize_net(void)
7719 {
7720 	might_sleep();
7721 	if (rtnl_is_locked())
7722 		synchronize_rcu_expedited();
7723 	else
7724 		synchronize_rcu();
7725 }
7726 EXPORT_SYMBOL(synchronize_net);
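
/* Editorial example, not part of dev.c: the classic unpublish-then-free
 * pattern this helper supports.  Once the hook is unlinked, new packets can
 * no longer find it; synchronize_net() then waits for receives already in
 * flight before the memory is released.  "demo_rx_hook" and its list are
 * hypothetical.
 */
struct demo_rx_hook {
	struct list_head list;
	void (*func)(struct sk_buff *skb);
};

static void demo_remove_rx_hook(struct demo_rx_hook *hook)
{
	list_del_rcu(&hook->list);	/* stop new lookups from seeing it */
	synchronize_net();		/* wait out receivers already running */
	kfree(hook);
}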
7727 
7728 /**
7729  *	unregister_netdevice_queue - remove device from the kernel
7730  *	@dev: device
7731  *	@head: list
7732  *
7733  *	This function shuts down a device interface and removes it
7734  *	from the kernel tables.
7735  *	If @head is not NULL, the device is queued to be unregistered later.
7736  *
7737  *	Callers must hold the rtnl semaphore.  You may want
7738  *	unregister_netdev() instead of this.
7739  */
7740 
7741 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7742 {
7743 	ASSERT_RTNL();
7744 
7745 	if (head) {
7746 		list_move_tail(&dev->unreg_list, head);
7747 	} else {
7748 		rollback_registered(dev);
7749 		/* Finish processing unregister after unlock */
7750 		net_set_todo(dev);
7751 	}
7752 }
7753 EXPORT_SYMBOL(unregister_netdevice_queue);
7754 
7755 /**
7756  *	unregister_netdevice_many - unregister many devices
7757  *	@head: list of devices
7758  *
7759  *  Note: As most callers use a stack-allocated list_head,
7760  *  we force a list_del() to make sure the stack won't be corrupted later.
7761  */
7762 void unregister_netdevice_many(struct list_head *head)
7763 {
7764 	struct net_device *dev;
7765 
7766 	if (!list_empty(head)) {
7767 		rollback_registered_many(head);
7768 		list_for_each_entry(dev, head, unreg_list)
7769 			net_set_todo(dev);
7770 		list_del(head);
7771 	}
7772 }
7773 EXPORT_SYMBOL(unregister_netdevice_many);
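
/* Editorial example, not part of dev.c: a sketch of batching several
 * unregistrations so the expensive synchronization is paid once for the
 * whole group.  The stack-allocated list head is emptied by
 * unregister_netdevice_many(), as noted above; the final rtnl_unlock()
 * runs the todo list.  "demo_unregister_group" is a made-up name.
 */
static void demo_unregister_group(struct net_device *devs[], int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}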
7774 
7775 /**
7776  *	unregister_netdev - remove device from the kernel
7777  *	@dev: device
7778  *
7779  *	This function shuts down a device interface and removes it
7780  *	from the kernel tables.
7781  *
7782  *	This is just a wrapper for unregister_netdevice that takes
7783  *	the rtnl semaphore.  In general you want to use this and not
7784  *	unregister_netdevice.
7785  */
7786 void unregister_netdev(struct net_device *dev)
7787 {
7788 	rtnl_lock();
7789 	unregister_netdevice(dev);
7790 	rtnl_unlock();
7791 }
7792 EXPORT_SYMBOL(unregister_netdev);
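
/* Editorial example, not part of dev.c: the usual driver teardown pair.
 * unregister_netdev() takes and releases the rtnl lock itself;
 * free_netdev() then drops the final reference.  "demo_dev" is a
 * hypothetical module-global device pointer.
 */
static struct net_device *demo_dev;

static void __exit demo_cleanup(void)
{
	unregister_netdev(demo_dev);
	free_netdev(demo_dev);
}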
7793 
7794 /**
7795  *	dev_change_net_namespace - move device to a different network namespace
7796  *	@dev: device
7797  *	@net: network namespace
7798  *	@pat: If not NULL name pattern to try if the current device name
7799  *	      is already taken in the destination network namespace.
7800  *
7801  *	This function shuts down a device interface and moves it
7802  *	to a new network namespace. On success 0 is returned, on
7803  *	failure a negative errno code is returned.
7804  *
7805  *	Callers must hold the rtnl semaphore.
7806  */
7807 
7808 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7809 {
7810 	int err;
7811 
7812 	ASSERT_RTNL();
7813 
7814 	/* Don't allow namespace local devices to be moved. */
7815 	err = -EINVAL;
7816 	if (dev->features & NETIF_F_NETNS_LOCAL)
7817 		goto out;
7818 
7819 	/* Ensure the device has been registered */
7820 	if (dev->reg_state != NETREG_REGISTERED)
7821 		goto out;
7822 
7823 	/* Get out if there is nothing to do */
7824 	err = 0;
7825 	if (net_eq(dev_net(dev), net))
7826 		goto out;
7827 
7828 	/* Pick the destination device name, and ensure
7829 	 * we can use it in the destination network namespace.
7830 	 */
7831 	err = -EEXIST;
7832 	if (__dev_get_by_name(net, dev->name)) {
7833 		/* We get here if we can't use the current device name */
7834 		if (!pat)
7835 			goto out;
7836 		if (dev_get_valid_name(net, dev, pat) < 0)
7837 			goto out;
7838 	}
7839 
7840 	/*
7841 	 * And now a mini version of register_netdevice and unregister_netdevice.
7842 	 */
7843 
7844 	/* If device is running close it first. */
7845 	dev_close(dev);
7846 
7847 	/* And unlink it from device chain */
7848 	err = -ENODEV;
7849 	unlist_netdevice(dev);
7850 
7851 	synchronize_net();
7852 
7853 	/* Shutdown queueing discipline. */
7854 	dev_shutdown(dev);
7855 
7856 	/* Notify protocols that we are about to destroy
7857 	   this device. They should clean up all of their state.
7858 
7859 	   Note that dev->reg_state stays at NETREG_REGISTERED.
7860 	   This is intentional: it lets 8021q and macvlan know that
7861 	   the device is just moving, so they can keep their slaves up.
7862 	*/
7863 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7864 	rcu_barrier();
7865 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7866 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7867 
7868 	/*
7869 	 *	Flush the unicast and multicast chains
7870 	 */
7871 	dev_uc_flush(dev);
7872 	dev_mc_flush(dev);
7873 
7874 	/* Send a netdev-removed uevent to the old namespace */
7875 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7876 	netdev_adjacent_del_links(dev);
7877 
7878 	/* Actually switch the network namespace */
7879 	dev_net_set(dev, net);
7880 
7881 	/* If there is an ifindex conflict assign a new one */
7882 	if (__dev_get_by_index(net, dev->ifindex))
7883 		dev->ifindex = dev_new_index(net);
7884 
7885 	/* Send a netdev-add uevent to the new namespace */
7886 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7887 	netdev_adjacent_add_links(dev);
7888 
7889 	/* Fixup kobjects */
7890 	err = device_rename(&dev->dev, dev->name);
7891 	WARN_ON(err);
7892 
7893 	/* Add the device back in the hashes */
7894 	list_netdevice(dev);
7895 
7896 	/* Notify protocols, that a new device appeared. */
7897 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7898 
7899 	/*
7900 	 *	Prevent userspace races by waiting until the network
7901 	 *	device is fully setup before sending notifications.
7902 	 */
7903 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7904 
7905 	synchronize_net();
7906 	err = 0;
7907 out:
7908 	return err;
7909 }
7910 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
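
/* Editorial example, not part of dev.c: moving a device into another
 * namespace under the rtnl lock.  The "eth%d" pattern is only consulted if
 * the current name is already taken in @net.  "demo_move_dev" is a made-up
 * name.
 */
static int demo_move_dev(struct net_device *dev, struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "eth%d");
	rtnl_unlock();
	return err;
}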
7911 
7912 static int dev_cpu_dead(unsigned int oldcpu)
7913 {
7914 	struct sk_buff **list_skb;
7915 	struct sk_buff *skb;
7916 	unsigned int cpu;
7917 	struct softnet_data *sd, *oldsd;
7918 
7919 	local_irq_disable();
7920 	cpu = smp_processor_id();
7921 	sd = &per_cpu(softnet_data, cpu);
7922 	oldsd = &per_cpu(softnet_data, oldcpu);
7923 
7924 	/* Find end of our completion_queue. */
7925 	list_skb = &sd->completion_queue;
7926 	while (*list_skb)
7927 		list_skb = &(*list_skb)->next;
7928 	/* Append completion queue from offline CPU. */
7929 	*list_skb = oldsd->completion_queue;
7930 	oldsd->completion_queue = NULL;
7931 
7932 	/* Append output queue from offline CPU. */
7933 	if (oldsd->output_queue) {
7934 		*sd->output_queue_tailp = oldsd->output_queue;
7935 		sd->output_queue_tailp = oldsd->output_queue_tailp;
7936 		oldsd->output_queue = NULL;
7937 		oldsd->output_queue_tailp = &oldsd->output_queue;
7938 	}
7939 	/* Append NAPI poll list from offline CPU, with one exception:
7940 	 * process_backlog() must be called by the CPU owning the percpu backlog.
7941 	 * We properly handle process_queue & input_pkt_queue later.
7942 	 */
7943 	while (!list_empty(&oldsd->poll_list)) {
7944 		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7945 							    struct napi_struct,
7946 							    poll_list);
7947 
7948 		list_del_init(&napi->poll_list);
7949 		if (napi->poll == process_backlog)
7950 			napi->state = 0;
7951 		else
7952 			____napi_schedule(sd, napi);
7953 	}
7954 
7955 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
7956 	local_irq_enable();
7957 
7958 	/* Process offline CPU's input_pkt_queue */
7959 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7960 		netif_rx_ni(skb);
7961 		input_queue_head_incr(oldsd);
7962 	}
7963 	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7964 		netif_rx_ni(skb);
7965 		input_queue_head_incr(oldsd);
7966 	}
7967 
7968 	return 0;
7969 }
7970 
7971 /**
7972  *	netdev_increment_features - increment feature set by one
7973  *	@all: current feature set
7974  *	@one: new feature set
7975  *	@mask: mask feature set
7976  *
7977  *	Computes a new feature set after adding a device with feature set
7978  *	@one to the master device with current feature set @all.  Will not
7979  *	enable anything that is off in @mask. Returns the new feature set.
7980  */
7981 netdev_features_t netdev_increment_features(netdev_features_t all,
7982 	netdev_features_t one, netdev_features_t mask)
7983 {
7984 	if (mask & NETIF_F_HW_CSUM)
7985 		mask |= NETIF_F_CSUM_MASK;
7986 	mask |= NETIF_F_VLAN_CHALLENGED;
7987 
7988 	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7989 	all &= one | ~NETIF_F_ALL_FOR_ALL;
7990 
7991 	/* If one device supports hw checksumming, set for all. */
7992 	if (all & NETIF_F_HW_CSUM)
7993 		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7994 
7995 	return all;
7996 }
7997 EXPORT_SYMBOL(netdev_increment_features);
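
/* Editorial example, not part of dev.c: a sketch of how an aggregating
 * device (bond/team-like) might fold the feature sets of its lower devices
 * into one with the helper above.  The starting set, the mask and the
 * "demo_lower" list are all hypothetical.
 */
struct demo_lower {
	struct net_device *dev;
	struct list_head list;
};

static netdev_features_t demo_compute_features(struct list_head *lowers,
					       netdev_features_t mask)
{
	netdev_features_t all = mask;
	struct demo_lower *l;

	list_for_each_entry(l, lowers, list)
		all = netdev_increment_features(all, l->dev->features, mask);
	return all;
}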
7998 
7999 static struct hlist_head * __net_init netdev_create_hash(void)
8000 {
8001 	int i;
8002 	struct hlist_head *hash;
8003 
8004 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8005 	if (hash != NULL)
8006 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
8007 			INIT_HLIST_HEAD(&hash[i]);
8008 
8009 	return hash;
8010 }
8011 
8012 /* Initialize per network namespace state */
8013 static int __net_init netdev_init(struct net *net)
8014 {
8015 	if (net != &init_net)
8016 		INIT_LIST_HEAD(&net->dev_base_head);
8017 
8018 	net->dev_name_head = netdev_create_hash();
8019 	if (net->dev_name_head == NULL)
8020 		goto err_name;
8021 
8022 	net->dev_index_head = netdev_create_hash();
8023 	if (net->dev_index_head == NULL)
8024 		goto err_idx;
8025 
8026 	return 0;
8027 
8028 err_idx:
8029 	kfree(net->dev_name_head);
8030 err_name:
8031 	return -ENOMEM;
8032 }
8033 
8034 /**
8035  *	netdev_drivername - network driver for the device
8036  *	@dev: network device
8037  *
8038  *	Determine network driver for device.
8039  */
8040 const char *netdev_drivername(const struct net_device *dev)
8041 {
8042 	const struct device_driver *driver;
8043 	const struct device *parent;
8044 	const char *empty = "";
8045 
8046 	parent = dev->dev.parent;
8047 	if (!parent)
8048 		return empty;
8049 
8050 	driver = parent->driver;
8051 	if (driver && driver->name)
8052 		return driver->name;
8053 	return empty;
8054 }
8055 
8056 static void __netdev_printk(const char *level, const struct net_device *dev,
8057 			    struct va_format *vaf)
8058 {
8059 	if (dev && dev->dev.parent) {
8060 		dev_printk_emit(level[1] - '0',
8061 				dev->dev.parent,
8062 				"%s %s %s%s: %pV",
8063 				dev_driver_string(dev->dev.parent),
8064 				dev_name(dev->dev.parent),
8065 				netdev_name(dev), netdev_reg_state(dev),
8066 				vaf);
8067 	} else if (dev) {
8068 		printk("%s%s%s: %pV",
8069 		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
8070 	} else {
8071 		printk("%s(NULL net_device): %pV", level, vaf);
8072 	}
8073 }
8074 
8075 void netdev_printk(const char *level, const struct net_device *dev,
8076 		   const char *format, ...)
8077 {
8078 	struct va_format vaf;
8079 	va_list args;
8080 
8081 	va_start(args, format);
8082 
8083 	vaf.fmt = format;
8084 	vaf.va = &args;
8085 
8086 	__netdev_printk(level, dev, &vaf);
8087 
8088 	va_end(args);
8089 }
8090 EXPORT_SYMBOL(netdev_printk);
8091 
8092 #define define_netdev_printk_level(func, level)			\
8093 void func(const struct net_device *dev, const char *fmt, ...)	\
8094 {								\
8095 	struct va_format vaf;					\
8096 	va_list args;						\
8097 								\
8098 	va_start(args, fmt);					\
8099 								\
8100 	vaf.fmt = fmt;						\
8101 	vaf.va = &args;						\
8102 								\
8103 	__netdev_printk(level, dev, &vaf);			\
8104 								\
8105 	va_end(args);						\
8106 }								\
8107 EXPORT_SYMBOL(func);
8108 
8109 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8110 define_netdev_printk_level(netdev_alert, KERN_ALERT);
8111 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8112 define_netdev_printk_level(netdev_err, KERN_ERR);
8113 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8114 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8115 define_netdev_printk_level(netdev_info, KERN_INFO);
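
/* Editorial example, not part of dev.c: drivers use these helpers instead
 * of raw printk() so every message is prefixed with the driver, bus and
 * interface names.  "demo_open" is a made-up ndo_open implementation.
 */
static int demo_open(struct net_device *dev)
{
	netdev_info(dev, "interface is up\n");
	if (!netif_carrier_ok(dev))
		netdev_warn(dev, "no carrier at open time\n");
	return 0;
}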
8116 
8117 static void __net_exit netdev_exit(struct net *net)
8118 {
8119 	kfree(net->dev_name_head);
8120 	kfree(net->dev_index_head);
8121 }
8122 
8123 static struct pernet_operations __net_initdata netdev_net_ops = {
8124 	.init = netdev_init,
8125 	.exit = netdev_exit,
8126 };
8127 
8128 static void __net_exit default_device_exit(struct net *net)
8129 {
8130 	struct net_device *dev, *aux;
8131 	/*
8132 	 * Push all migratable network devices back to the
8133 	 * initial network namespace
8134 	 */
8135 	rtnl_lock();
8136 	for_each_netdev_safe(net, dev, aux) {
8137 		int err;
8138 		char fb_name[IFNAMSIZ];
8139 
8140 		/* Ignore unmovable devices (e.g. loopback) */
8141 		if (dev->features & NETIF_F_NETNS_LOCAL)
8142 			continue;
8143 
8144 		/* Leave virtual devices for the generic cleanup */
8145 		if (dev->rtnl_link_ops)
8146 			continue;
8147 
8148 		/* Push remaining network devices to init_net */
8149 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8150 		err = dev_change_net_namespace(dev, &init_net, fb_name);
8151 		if (err) {
8152 			pr_emerg("%s: failed to move %s to init_net: %d\n",
8153 				 __func__, dev->name, err);
8154 			BUG();
8155 		}
8156 	}
8157 	rtnl_unlock();
8158 }
8159 
8160 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8161 {
8162 	/* Return with the rtnl_lock held when there are no network
8163 	 * devices unregistering in any network namespace in net_list.
8164 	 */
8165 	struct net *net;
8166 	bool unregistering;
8167 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
8168 
8169 	add_wait_queue(&netdev_unregistering_wq, &wait);
8170 	for (;;) {
8171 		unregistering = false;
8172 		rtnl_lock();
8173 		list_for_each_entry(net, net_list, exit_list) {
8174 			if (net->dev_unreg_count > 0) {
8175 				unregistering = true;
8176 				break;
8177 			}
8178 		}
8179 		if (!unregistering)
8180 			break;
8181 		__rtnl_unlock();
8182 
8183 		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8184 	}
8185 	remove_wait_queue(&netdev_unregistering_wq, &wait);
8186 }
8187 
8188 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8189 {
8190 	/* At exit all network devices must be removed from a network
8191 	 * namespace.  Do this in the reverse order of registration.
8192 	 * Do this across as many network namespaces as possible to
8193 	 * improve batching efficiency.
8194 	 */
8195 	struct net_device *dev;
8196 	struct net *net;
8197 	LIST_HEAD(dev_kill_list);
8198 
8199 	/* To prevent network device cleanup code from dereferencing
8200 	 * loopback devices or network devices that have been freed,
8201 	 * wait here for all pending unregistrations to complete
8202 	 * before unregistering the loopback device and allowing the
8203 	 * network namespace to be freed.
8204 	 *
8205 	 * The netdev todo list containing all network device
8206 	 * unregistrations that happen in default_device_exit_batch
8207 	 * will run in the rtnl_unlock() at the end of
8208 	 * default_device_exit_batch.
8209 	 */
8210 	rtnl_lock_unregistering(net_list);
8211 	list_for_each_entry(net, net_list, exit_list) {
8212 		for_each_netdev_reverse(net, dev) {
8213 			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8214 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8215 			else
8216 				unregister_netdevice_queue(dev, &dev_kill_list);
8217 		}
8218 	}
8219 	unregister_netdevice_many(&dev_kill_list);
8220 	rtnl_unlock();
8221 }
8222 
8223 static struct pernet_operations __net_initdata default_device_ops = {
8224 	.exit = default_device_exit,
8225 	.exit_batch = default_device_exit_batch,
8226 };
8227 
8228 /*
8229  *	Initialize the DEV module. At boot time this walks the device list and
8230  *	unhooks any devices that fail to initialise (normally hardware not
8231  *	present) and leaves us with a valid list of present and active devices.
8232  *
8233  */
8234 
8235 /*
8236  *       This is called single-threaded during boot, so there is no need
8237  *       to take the rtnl semaphore.
8238  */
8239 static int __init net_dev_init(void)
8240 {
8241 	int i, rc = -ENOMEM;
8242 
8243 	BUG_ON(!dev_boot_phase);
8244 
8245 	if (dev_proc_init())
8246 		goto out;
8247 
8248 	if (netdev_kobject_init())
8249 		goto out;
8250 
8251 	INIT_LIST_HEAD(&ptype_all);
8252 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
8253 		INIT_LIST_HEAD(&ptype_base[i]);
8254 
8255 	INIT_LIST_HEAD(&offload_base);
8256 
8257 	if (register_pernet_subsys(&netdev_net_ops))
8258 		goto out;
8259 
8260 	/*
8261 	 *	Initialise the packet receive queues.
8262 	 */
8263 
8264 	for_each_possible_cpu(i) {
8265 		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8266 		struct softnet_data *sd = &per_cpu(softnet_data, i);
8267 
8268 		INIT_WORK(flush, flush_backlog);
8269 
8270 		skb_queue_head_init(&sd->input_pkt_queue);
8271 		skb_queue_head_init(&sd->process_queue);
8272 		INIT_LIST_HEAD(&sd->poll_list);
8273 		sd->output_queue_tailp = &sd->output_queue;
8274 #ifdef CONFIG_RPS
8275 		sd->csd.func = rps_trigger_softirq;
8276 		sd->csd.info = sd;
8277 		sd->cpu = i;
8278 #endif
8279 
8280 		sd->backlog.poll = process_backlog;
8281 		sd->backlog.weight = weight_p;
8282 	}
8283 
8284 	dev_boot_phase = 0;
8285 
8286 	/* The loopback device is special: if any other network device
8287 	 * is present in a network namespace, the loopback device must
8288 	 * be present too. Since we now dynamically allocate and free the
8289 	 * loopback device, ensure this invariant is maintained by
8290 	 * keeping the loopback device as the first device on the
8291 	 * list of network devices.  This makes the loopback device
8292 	 * the first device that appears and the last network device
8293 	 * that disappears.
8294 	 */
8295 	if (register_pernet_device(&loopback_net_ops))
8296 		goto out;
8297 
8298 	if (register_pernet_device(&default_device_ops))
8299 		goto out;
8300 
8301 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8302 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8303 
8304 	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8305 				       NULL, dev_cpu_dead);
8306 	WARN_ON(rc < 0);
8307 	dst_subsys_init();
8308 	rc = 0;
8309 out:
8310 	return rc;
8311 }
8312 
8313 subsys_initcall(net_dev_init);
8314