xref: /linux/net/core/dev.c (revision 4f58e6dceb0e44ca8f21568ed81e1df24e55964c)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <linux/bpf.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <net/busy_poll.h>
101 #include <linux/rtnetlink.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/dst_metadata.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/if_arp.h>
121 #include <linux/if_vlan.h>
122 #include <linux/ip.h>
123 #include <net/ip.h>
124 #include <net/mpls.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/static_key.h>
136 #include <linux/hashtable.h>
137 #include <linux/vmalloc.h>
138 #include <linux/if_macvlan.h>
139 #include <linux/errqueue.h>
140 #include <linux/hrtimer.h>
141 #include <linux/netfilter_ingress.h>
142 #include <linux/crash_dump.h>
143 
144 #include "net-sysfs.h"
145 
146 /* Instead of increasing this, you should create a hash table. */
147 #define MAX_GRO_SKBS 8
148 
149 /* This should be increased if a protocol with a bigger head is added. */
150 #define GRO_MAX_HEAD (MAX_HEADER + 128)
151 
152 static DEFINE_SPINLOCK(ptype_lock);
153 static DEFINE_SPINLOCK(offload_lock);
154 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
155 struct list_head ptype_all __read_mostly;	/* Taps */
156 static struct list_head offload_base __read_mostly;
157 
158 static int netif_rx_internal(struct sk_buff *skb);
159 static int call_netdevice_notifiers_info(unsigned long val,
160 					 struct net_device *dev,
161 					 struct netdev_notifier_info *info);
162 
163 /*
164  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
165  * semaphore.
166  *
167  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
168  *
169  * Writers must hold the rtnl semaphore while they loop through the
170  * dev_base_head list, and hold dev_base_lock for writing when they do the
171  * actual updates.  This allows pure readers to access the list even
172  * while a writer is preparing to update it.
173  *
174  * To put it another way, dev_base_lock is held for writing only to
175  * protect against pure readers; the rtnl semaphore provides the
176  * protection against other writers.
177  *
178  * See, for example usages, register_netdevice() and
179  * unregister_netdevice(), which must be called with the rtnl
180  * semaphore held.
181  */
182 DEFINE_RWLOCK(dev_base_lock);
183 EXPORT_SYMBOL(dev_base_lock);
184 
185 /* protects napi_hash addition/deletion and napi_gen_id */
186 static DEFINE_SPINLOCK(napi_hash_lock);
187 
188 static unsigned int napi_gen_id = NR_CPUS;
189 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
190 
191 static seqcount_t devnet_rename_seq;
192 
193 static inline void dev_base_seq_inc(struct net *net)
194 {
195 	while (++net->dev_base_seq == 0);
196 }
197 
198 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
199 {
200 	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
201 
202 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203 }
204 
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208 }
209 
210 static inline void rps_lock(struct softnet_data *sd)
211 {
212 #ifdef CONFIG_RPS
213 	spin_lock(&sd->input_pkt_queue.lock);
214 #endif
215 }
216 
217 static inline void rps_unlock(struct softnet_data *sd)
218 {
219 #ifdef CONFIG_RPS
220 	spin_unlock(&sd->input_pkt_queue.lock);
221 #endif
222 }
223 
224 /* Device list insertion */
225 static void list_netdevice(struct net_device *dev)
226 {
227 	struct net *net = dev_net(dev);
228 
229 	ASSERT_RTNL();
230 
231 	write_lock_bh(&dev_base_lock);
232 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234 	hlist_add_head_rcu(&dev->index_hlist,
235 			   dev_index_hash(net, dev->ifindex));
236 	write_unlock_bh(&dev_base_lock);
237 
238 	dev_base_seq_inc(net);
239 }
240 
241 /* Device list removal
242  * caller must respect a RCU grace period before freeing/reusing dev
243  */
244 static void unlist_netdevice(struct net_device *dev)
245 {
246 	ASSERT_RTNL();
247 
248 	/* Unlink dev from the device chain */
249 	write_lock_bh(&dev_base_lock);
250 	list_del_rcu(&dev->dev_list);
251 	hlist_del_rcu(&dev->name_hlist);
252 	hlist_del_rcu(&dev->index_hlist);
253 	write_unlock_bh(&dev_base_lock);
254 
255 	dev_base_seq_inc(dev_net(dev));
256 }
257 
258 /*
259  *	Our notifier list
260  */
261 
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263 
264 /*
265  *	Device drivers call our routines to queue packets here. We empty the
266  *	queue in the local softnet handler.
267  */
268 
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271 
272 #ifdef CONFIG_LOCKDEP
273 /*
274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275  * according to dev->type
276  */
277 static const unsigned short netdev_lock_type[] =
278 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
291 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
292 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
293 
294 static const char *const netdev_lock_name[] =
295 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
308 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
309 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
310 
311 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
312 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
313 
314 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315 {
316 	int i;
317 
318 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319 		if (netdev_lock_type[i] == dev_type)
320 			return i;
321 	/* the last key is used by default */
322 	return ARRAY_SIZE(netdev_lock_type) - 1;
323 }
324 
325 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326 						 unsigned short dev_type)
327 {
328 	int i;
329 
330 	i = netdev_lock_pos(dev_type);
331 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332 				   netdev_lock_name[i]);
333 }
334 
335 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336 {
337 	int i;
338 
339 	i = netdev_lock_pos(dev->type);
340 	lockdep_set_class_and_name(&dev->addr_list_lock,
341 				   &netdev_addr_lock_key[i],
342 				   netdev_lock_name[i]);
343 }
344 #else
345 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346 						 unsigned short dev_type)
347 {
348 }
349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
350 {
351 }
352 #endif
353 
354 /*******************************************************************************
355 
356 		Protocol management and registration routines
357 
358 *******************************************************************************/
359 
360 /*
361  *	Add a protocol ID to the list. Now that the input handler is
362  *	smarter we can dispense with all the messy stuff that used to be
363  *	here.
364  *
365  *	BEWARE!!! Protocol handlers, mangling input packets,
366  *	MUST BE last in hash buckets and checking protocol handlers
367  *	MUST start from promiscuous ptype_all chain in net_bh.
368  *	It is true now, do not change it.
369  *	Explanation follows: if protocol handler, mangling packet, will
370  *	be the first on list, it is not able to sense, that packet
371  *	is cloned and should be copied-on-write, so that it will
372  *	change it and subsequent readers will get broken packet.
373  *							--ANK (980803)
374  */
375 
376 static inline struct list_head *ptype_head(const struct packet_type *pt)
377 {
378 	if (pt->type == htons(ETH_P_ALL))
379 		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
380 	else
381 		return pt->dev ? &pt->dev->ptype_specific :
382 				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383 }
384 
385 /**
386  *	dev_add_pack - add packet handler
387  *	@pt: packet type declaration
388  *
389  *	Add a protocol handler to the networking stack. The passed &packet_type
390  *	is linked into kernel lists and may not be freed until it has been
391  *	removed from the kernel lists.
392  *
393  *	This call does not sleep therefore it can not
394  *	guarantee all CPU's that are in middle of receiving packets
395  *	will see the new packet type (until the next received packet).
396  */
397 
398 void dev_add_pack(struct packet_type *pt)
399 {
400 	struct list_head *head = ptype_head(pt);
401 
402 	spin_lock(&ptype_lock);
403 	list_add_rcu(&pt->list, head);
404 	spin_unlock(&ptype_lock);
405 }
406 EXPORT_SYMBOL(dev_add_pack);
407 
408 /**
409  *	__dev_remove_pack	 - remove packet handler
410  *	@pt: packet type declaration
411  *
412  *	Remove a protocol handler that was previously added to the kernel
413  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
414  *	from the kernel lists and can be freed or reused once this function
415  *	returns.
416  *
417  *      The packet type might still be in use by receivers
418  *	and must not be freed until after all the CPU's have gone
419  *	through a quiescent state.
420  */
421 void __dev_remove_pack(struct packet_type *pt)
422 {
423 	struct list_head *head = ptype_head(pt);
424 	struct packet_type *pt1;
425 
426 	spin_lock(&ptype_lock);
427 
428 	list_for_each_entry(pt1, head, list) {
429 		if (pt == pt1) {
430 			list_del_rcu(&pt->list);
431 			goto out;
432 		}
433 	}
434 
435 	pr_warn("dev_remove_pack: %p not found\n", pt);
436 out:
437 	spin_unlock(&ptype_lock);
438 }
439 EXPORT_SYMBOL(__dev_remove_pack);
440 
441 /**
442  *	dev_remove_pack	 - remove packet handler
443  *	@pt: packet type declaration
444  *
445  *	Remove a protocol handler that was previously added to the kernel
446  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
447  *	from the kernel lists and can be freed or reused once this function
448  *	returns.
449  *
450  *	This call sleeps to guarantee that no CPU is looking at the packet
451  *	type after return.
452  */
453 void dev_remove_pack(struct packet_type *pt)
454 {
455 	__dev_remove_pack(pt);
456 
457 	synchronize_net();
458 }
459 EXPORT_SYMBOL(dev_remove_pack);
460 
461 
462 /**
463  *	dev_add_offload - register offload handlers
464  *	@po: protocol offload declaration
465  *
466  *	Add protocol offload handlers to the networking stack. The passed
467  *	&proto_offload is linked into kernel lists and may not be freed until
468  *	it has been removed from the kernel lists.
469  *
470  *	This call does not sleep therefore it can not
471  *	guarantee all CPU's that are in middle of receiving packets
472  *	will see the new offload handlers (until the next received packet).
473  */
474 void dev_add_offload(struct packet_offload *po)
475 {
476 	struct packet_offload *elem;
477 
478 	spin_lock(&offload_lock);
479 	list_for_each_entry(elem, &offload_base, list) {
480 		if (po->priority < elem->priority)
481 			break;
482 	}
483 	list_add_rcu(&po->list, elem->list.prev);
484 	spin_unlock(&offload_lock);
485 }
486 EXPORT_SYMBOL(dev_add_offload);
487 
488 /**
489  *	__dev_remove_offload	 - remove offload handler
490  *	@po: packet offload declaration
491  *
492  *	Remove a protocol offload handler that was previously added to the
493  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
494  *	is removed from the kernel lists and can be freed or reused once this
495  *	function returns.
496  *
497  *      The packet type might still be in use by receivers
498  *	and must not be freed until after all the CPU's have gone
499  *	through a quiescent state.
500  */
501 static void __dev_remove_offload(struct packet_offload *po)
502 {
503 	struct list_head *head = &offload_base;
504 	struct packet_offload *po1;
505 
506 	spin_lock(&offload_lock);
507 
508 	list_for_each_entry(po1, head, list) {
509 		if (po == po1) {
510 			list_del_rcu(&po->list);
511 			goto out;
512 		}
513 	}
514 
515 	pr_warn("dev_remove_offload: %p not found\n", po);
516 out:
517 	spin_unlock(&offload_lock);
518 }
519 
520 /**
521  *	dev_remove_offload	 - remove packet offload handler
522  *	@po: packet offload declaration
523  *
524  *	Remove a packet offload handler that was previously added to the kernel
525  *	offload handlers by dev_add_offload(). The passed &offload_type is
526  *	removed from the kernel lists and can be freed or reused once this
527  *	function returns.
528  *
529  *	This call sleeps to guarantee that no CPU is looking at the packet
530  *	type after return.
531  */
532 void dev_remove_offload(struct packet_offload *po)
533 {
534 	__dev_remove_offload(po);
535 
536 	synchronize_net();
537 }
538 EXPORT_SYMBOL(dev_remove_offload);
539 
540 /******************************************************************************
541 
542 		      Device Boot-time Settings Routines
543 
544 *******************************************************************************/
545 
546 /* Boot time configuration table */
547 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
548 
549 /**
550  *	netdev_boot_setup_add	- add new setup entry
551  *	@name: name of the device
552  *	@map: configured settings for the device
553  *
554  *	Adds new setup entry to the dev_boot_setup list.  The function
555  *	returns 0 on error and 1 on success.  This is a generic routine to
556  *	all netdevices.
557  */
558 static int netdev_boot_setup_add(char *name, struct ifmap *map)
559 {
560 	struct netdev_boot_setup *s;
561 	int i;
562 
563 	s = dev_boot_setup;
564 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
566 			memset(s[i].name, 0, sizeof(s[i].name));
567 			strlcpy(s[i].name, name, IFNAMSIZ);
568 			memcpy(&s[i].map, map, sizeof(s[i].map));
569 			break;
570 		}
571 	}
572 
573 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
574 }
575 
576 /**
577  *	netdev_boot_setup_check	- check boot time settings
578  *	@dev: the netdevice
579  *
580  * 	Check boot time settings for the device.
581  *	The found settings are set for the device to be used
582  *	later in the device probing.
583  *	Returns 0 if no settings found, 1 if they are.
584  */
585 int netdev_boot_setup_check(struct net_device *dev)
586 {
587 	struct netdev_boot_setup *s = dev_boot_setup;
588 	int i;
589 
590 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
591 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
592 		    !strcmp(dev->name, s[i].name)) {
593 			dev->irq 	= s[i].map.irq;
594 			dev->base_addr 	= s[i].map.base_addr;
595 			dev->mem_start 	= s[i].map.mem_start;
596 			dev->mem_end 	= s[i].map.mem_end;
597 			return 1;
598 		}
599 	}
600 	return 0;
601 }
602 EXPORT_SYMBOL(netdev_boot_setup_check);
603 
604 
605 /**
606  *	netdev_boot_base	- get address from boot time settings
607  *	@prefix: prefix for network device
608  *	@unit: id for network device
609  *
610  * 	Check boot time settings for the base address of device.
611  *	The found settings are set for the device to be used
612  *	later in the device probing.
613  *	Returns 0 if no settings found.
614  */
615 unsigned long netdev_boot_base(const char *prefix, int unit)
616 {
617 	const struct netdev_boot_setup *s = dev_boot_setup;
618 	char name[IFNAMSIZ];
619 	int i;
620 
621 	sprintf(name, "%s%d", prefix, unit);
622 
623 	/*
624 	 * If device already registered then return base of 1
625 	 * to indicate not to probe for this interface
626 	 */
627 	if (__dev_get_by_name(&init_net, name))
628 		return 1;
629 
630 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
631 		if (!strcmp(name, s[i].name))
632 			return s[i].map.base_addr;
633 	return 0;
634 }
635 
636 /*
637  * Saves at boot time configured settings for any netdevice.
638  */
639 int __init netdev_boot_setup(char *str)
640 {
641 	int ints[5];
642 	struct ifmap map;
643 
644 	str = get_options(str, ARRAY_SIZE(ints), ints);
645 	if (!str || !*str)
646 		return 0;
647 
648 	/* Save settings */
649 	memset(&map, 0, sizeof(map));
650 	if (ints[0] > 0)
651 		map.irq = ints[1];
652 	if (ints[0] > 1)
653 		map.base_addr = ints[2];
654 	if (ints[0] > 2)
655 		map.mem_start = ints[3];
656 	if (ints[0] > 3)
657 		map.mem_end = ints[4];
658 
659 	/* Add new entry to the list */
660 	return netdev_boot_setup_add(str, &map);
661 }
662 
663 __setup("netdev=", netdev_boot_setup);
664 
665 /*******************************************************************************
666 
667 			    Device Interface Subroutines
668 
669 *******************************************************************************/
670 
671 /**
672  *	dev_get_iflink	- get 'iflink' value of a interface
673  *	@dev: targeted interface
674  *
675  *	Indicates the ifindex the interface is linked to.
676  *	Physical interfaces have the same 'ifindex' and 'iflink' values.
677  */
678 
679 int dev_get_iflink(const struct net_device *dev)
680 {
681 	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
682 		return dev->netdev_ops->ndo_get_iflink(dev);
683 
684 	return dev->ifindex;
685 }
686 EXPORT_SYMBOL(dev_get_iflink);
687 
688 /**
689  *	dev_fill_metadata_dst - Retrieve tunnel egress information.
690  *	@dev: targeted interface
691  *	@skb: The packet.
692  *
693  *	For better visibility of tunnel traffic OVS needs to retrieve
694  *	egress tunnel information for a packet. Following API allows
695  *	user to get this info.
696  */
697 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
698 {
699 	struct ip_tunnel_info *info;
700 
701 	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
702 		return -EINVAL;
703 
704 	info = skb_tunnel_info_unclone(skb);
705 	if (!info)
706 		return -ENOMEM;
707 	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
708 		return -EINVAL;
709 
710 	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
711 }
712 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
713 
714 /**
715  *	__dev_get_by_name	- find a device by its name
716  *	@net: the applicable net namespace
717  *	@name: name to find
718  *
719  *	Find an interface by name. Must be called under RTNL semaphore
720  *	or @dev_base_lock. If the name is found a pointer to the device
721  *	is returned. If the name is not found then %NULL is returned. The
722  *	reference counters are not incremented so the caller must be
723  *	careful with locks.
724  */
725 
726 struct net_device *__dev_get_by_name(struct net *net, const char *name)
727 {
728 	struct net_device *dev;
729 	struct hlist_head *head = dev_name_hash(net, name);
730 
731 	hlist_for_each_entry(dev, head, name_hlist)
732 		if (!strncmp(dev->name, name, IFNAMSIZ))
733 			return dev;
734 
735 	return NULL;
736 }
737 EXPORT_SYMBOL(__dev_get_by_name);
738 
739 /**
740  *	dev_get_by_name_rcu	- find a device by its name
741  *	@net: the applicable net namespace
742  *	@name: name to find
743  *
744  *	Find an interface by name.
745  *	If the name is found a pointer to the device is returned.
746  * 	If the name is not found then %NULL is returned.
747  *	The reference counters are not incremented so the caller must be
748  *	careful with locks. The caller must hold RCU lock.
749  */
750 
751 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
752 {
753 	struct net_device *dev;
754 	struct hlist_head *head = dev_name_hash(net, name);
755 
756 	hlist_for_each_entry_rcu(dev, head, name_hlist)
757 		if (!strncmp(dev->name, name, IFNAMSIZ))
758 			return dev;
759 
760 	return NULL;
761 }
762 EXPORT_SYMBOL(dev_get_by_name_rcu);
763 
764 /**
765  *	dev_get_by_name		- find a device by its name
766  *	@net: the applicable net namespace
767  *	@name: name to find
768  *
769  *	Find an interface by name. This can be called from any
770  *	context and does its own locking. The returned handle has
771  *	the usage count incremented and the caller must use dev_put() to
772  *	release it when it is no longer needed. %NULL is returned if no
773  *	matching device is found.
774  */
775 
776 struct net_device *dev_get_by_name(struct net *net, const char *name)
777 {
778 	struct net_device *dev;
779 
780 	rcu_read_lock();
781 	dev = dev_get_by_name_rcu(net, name);
782 	if (dev)
783 		dev_hold(dev);
784 	rcu_read_unlock();
785 	return dev;
786 }
787 EXPORT_SYMBOL(dev_get_by_name);
788 
789 /**
790  *	__dev_get_by_index - find a device by its ifindex
791  *	@net: the applicable net namespace
792  *	@ifindex: index of device
793  *
794  *	Search for an interface by index. Returns %NULL if the device
795  *	is not found or a pointer to the device. The device has not
796  *	had its reference counter increased so the caller must be careful
797  *	about locking. The caller must hold either the RTNL semaphore
798  *	or @dev_base_lock.
799  */
800 
801 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
802 {
803 	struct net_device *dev;
804 	struct hlist_head *head = dev_index_hash(net, ifindex);
805 
806 	hlist_for_each_entry(dev, head, index_hlist)
807 		if (dev->ifindex == ifindex)
808 			return dev;
809 
810 	return NULL;
811 }
812 EXPORT_SYMBOL(__dev_get_by_index);
813 
814 /**
815  *	dev_get_by_index_rcu - find a device by its ifindex
816  *	@net: the applicable net namespace
817  *	@ifindex: index of device
818  *
819  *	Search for an interface by index. Returns %NULL if the device
820  *	is not found or a pointer to the device. The device has not
821  *	had its reference counter increased so the caller must be careful
822  *	about locking. The caller must hold RCU lock.
823  */
824 
825 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
826 {
827 	struct net_device *dev;
828 	struct hlist_head *head = dev_index_hash(net, ifindex);
829 
830 	hlist_for_each_entry_rcu(dev, head, index_hlist)
831 		if (dev->ifindex == ifindex)
832 			return dev;
833 
834 	return NULL;
835 }
836 EXPORT_SYMBOL(dev_get_by_index_rcu);
837 
838 
839 /**
840  *	dev_get_by_index - find a device by its ifindex
841  *	@net: the applicable net namespace
842  *	@ifindex: index of device
843  *
844  *	Search for an interface by index. Returns NULL if the device
845  *	is not found or a pointer to the device. The device returned has
846  *	had a reference added and the pointer is safe until the user calls
847  *	dev_put to indicate they have finished with it.
848  */
849 
850 struct net_device *dev_get_by_index(struct net *net, int ifindex)
851 {
852 	struct net_device *dev;
853 
854 	rcu_read_lock();
855 	dev = dev_get_by_index_rcu(net, ifindex);
856 	if (dev)
857 		dev_hold(dev);
858 	rcu_read_unlock();
859 	return dev;
860 }
861 EXPORT_SYMBOL(dev_get_by_index);
862 
863 /**
864  *	netdev_get_name - get a netdevice name, knowing its ifindex.
865  *	@net: network namespace
866  *	@name: a pointer to the buffer where the name will be stored.
867  *	@ifindex: the ifindex of the interface to get the name from.
868  *
869  *	The use of raw_seqcount_begin() and cond_resched() before
870  *	retrying is required as we want to give the writers a chance
871  *	to complete when CONFIG_PREEMPT is not set.
872  */
873 int netdev_get_name(struct net *net, char *name, int ifindex)
874 {
875 	struct net_device *dev;
876 	unsigned int seq;
877 
878 retry:
879 	seq = raw_seqcount_begin(&devnet_rename_seq);
880 	rcu_read_lock();
881 	dev = dev_get_by_index_rcu(net, ifindex);
882 	if (!dev) {
883 		rcu_read_unlock();
884 		return -ENODEV;
885 	}
886 
887 	strcpy(name, dev->name);
888 	rcu_read_unlock();
889 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
890 		cond_resched();
891 		goto retry;
892 	}
893 
894 	return 0;
895 }
896 
897 /**
898  *	dev_getbyhwaddr_rcu - find a device by its hardware address
899  *	@net: the applicable net namespace
900  *	@type: media type of device
901  *	@ha: hardware address
902  *
903  *	Search for an interface by MAC address. Returns NULL if the device
904  *	is not found or a pointer to the device.
905  *	The caller must hold RCU or RTNL.
906  *	The returned device has not had its ref count increased
907  *	and the caller must therefore be careful about locking
908  *
909  */
910 
911 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
912 				       const char *ha)
913 {
914 	struct net_device *dev;
915 
916 	for_each_netdev_rcu(net, dev)
917 		if (dev->type == type &&
918 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
919 			return dev;
920 
921 	return NULL;
922 }
923 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
924 
925 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
926 {
927 	struct net_device *dev;
928 
929 	ASSERT_RTNL();
930 	for_each_netdev(net, dev)
931 		if (dev->type == type)
932 			return dev;
933 
934 	return NULL;
935 }
936 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
937 
938 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
939 {
940 	struct net_device *dev, *ret = NULL;
941 
942 	rcu_read_lock();
943 	for_each_netdev_rcu(net, dev)
944 		if (dev->type == type) {
945 			dev_hold(dev);
946 			ret = dev;
947 			break;
948 		}
949 	rcu_read_unlock();
950 	return ret;
951 }
952 EXPORT_SYMBOL(dev_getfirstbyhwtype);
953 
954 /**
955  *	__dev_get_by_flags - find any device with given flags
956  *	@net: the applicable net namespace
957  *	@if_flags: IFF_* values
958  *	@mask: bitmask of bits in if_flags to check
959  *
960  *	Search for any interface with the given flags. Returns NULL if a device
961  *	is not found or a pointer to the device. Must be called inside
962  *	rtnl_lock(), and result refcount is unchanged.
963  */
964 
965 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
966 				      unsigned short mask)
967 {
968 	struct net_device *dev, *ret;
969 
970 	ASSERT_RTNL();
971 
972 	ret = NULL;
973 	for_each_netdev(net, dev) {
974 		if (((dev->flags ^ if_flags) & mask) == 0) {
975 			ret = dev;
976 			break;
977 		}
978 	}
979 	return ret;
980 }
981 EXPORT_SYMBOL(__dev_get_by_flags);
982 
983 /**
984  *	dev_valid_name - check if name is okay for network device
985  *	@name: name string
986  *
987  *	Network device names need to be valid file names to
988  *	to allow sysfs to work.  We also disallow any kind of
989  *	whitespace.
990  */
991 bool dev_valid_name(const char *name)
992 {
993 	if (*name == '\0')
994 		return false;
995 	if (strlen(name) >= IFNAMSIZ)
996 		return false;
997 	if (!strcmp(name, ".") || !strcmp(name, ".."))
998 		return false;
999 
1000 	while (*name) {
1001 		if (*name == '/' || *name == ':' || isspace(*name))
1002 			return false;
1003 		name++;
1004 	}
1005 	return true;
1006 }
1007 EXPORT_SYMBOL(dev_valid_name);
1008 
1009 /**
1010  *	__dev_alloc_name - allocate a name for a device
1011  *	@net: network namespace to allocate the device name in
1012  *	@name: name format string
1013  *	@buf:  scratch buffer and result name string
1014  *
1015  *	Passed a format string - eg "lt%d" it will try and find a suitable
1016  *	id. It scans list of devices to build up a free map, then chooses
1017  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1018  *	while allocating the name and adding the device in order to avoid
1019  *	duplicates.
1020  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021  *	Returns the number of the unit assigned or a negative errno code.
1022  */
1023 
1024 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025 {
1026 	int i = 0;
1027 	const char *p;
1028 	const int max_netdevices = 8*PAGE_SIZE;
1029 	unsigned long *inuse;
1030 	struct net_device *d;
1031 
1032 	p = strnchr(name, IFNAMSIZ-1, '%');
1033 	if (p) {
1034 		/*
1035 		 * Verify the string as this thing may have come from
1036 		 * the user.  There must be either one "%d" and no other "%"
1037 		 * characters.
1038 		 */
1039 		if (p[1] != 'd' || strchr(p + 2, '%'))
1040 			return -EINVAL;
1041 
1042 		/* Use one page as a bit array of possible slots */
1043 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044 		if (!inuse)
1045 			return -ENOMEM;
1046 
1047 		for_each_netdev(net, d) {
1048 			if (!sscanf(d->name, name, &i))
1049 				continue;
1050 			if (i < 0 || i >= max_netdevices)
1051 				continue;
1052 
1053 			/*  avoid cases where sscanf is not exact inverse of printf */
1054 			snprintf(buf, IFNAMSIZ, name, i);
1055 			if (!strncmp(buf, d->name, IFNAMSIZ))
1056 				set_bit(i, inuse);
1057 		}
1058 
1059 		i = find_first_zero_bit(inuse, max_netdevices);
1060 		free_page((unsigned long) inuse);
1061 	}
1062 
1063 	if (buf != name)
1064 		snprintf(buf, IFNAMSIZ, name, i);
1065 	if (!__dev_get_by_name(net, buf))
1066 		return i;
1067 
1068 	/* It is possible to run out of possible slots
1069 	 * when the name is long and there isn't enough space left
1070 	 * for the digits, or if all bits are used.
1071 	 */
1072 	return -ENFILE;
1073 }
1074 
1075 /**
1076  *	dev_alloc_name - allocate a name for a device
1077  *	@dev: device
1078  *	@name: name format string
1079  *
1080  *	Passed a format string - eg "lt%d" it will try and find a suitable
1081  *	id. It scans list of devices to build up a free map, then chooses
1082  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1083  *	while allocating the name and adding the device in order to avoid
1084  *	duplicates.
1085  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086  *	Returns the number of the unit assigned or a negative errno code.
1087  */
1088 
1089 int dev_alloc_name(struct net_device *dev, const char *name)
1090 {
1091 	char buf[IFNAMSIZ];
1092 	struct net *net;
1093 	int ret;
1094 
1095 	BUG_ON(!dev_net(dev));
1096 	net = dev_net(dev);
1097 	ret = __dev_alloc_name(net, name, buf);
1098 	if (ret >= 0)
1099 		strlcpy(dev->name, buf, IFNAMSIZ);
1100 	return ret;
1101 }
1102 EXPORT_SYMBOL(dev_alloc_name);
1103 
1104 static int dev_alloc_name_ns(struct net *net,
1105 			     struct net_device *dev,
1106 			     const char *name)
1107 {
1108 	char buf[IFNAMSIZ];
1109 	int ret;
1110 
1111 	ret = __dev_alloc_name(net, name, buf);
1112 	if (ret >= 0)
1113 		strlcpy(dev->name, buf, IFNAMSIZ);
1114 	return ret;
1115 }
1116 
1117 static int dev_get_valid_name(struct net *net,
1118 			      struct net_device *dev,
1119 			      const char *name)
1120 {
1121 	BUG_ON(!net);
1122 
1123 	if (!dev_valid_name(name))
1124 		return -EINVAL;
1125 
1126 	if (strchr(name, '%'))
1127 		return dev_alloc_name_ns(net, dev, name);
1128 	else if (__dev_get_by_name(net, name))
1129 		return -EEXIST;
1130 	else if (dev->name != name)
1131 		strlcpy(dev->name, name, IFNAMSIZ);
1132 
1133 	return 0;
1134 }
1135 
1136 /**
1137  *	dev_change_name - change name of a device
1138  *	@dev: device
1139  *	@newname: name (or format string) must be at least IFNAMSIZ
1140  *
1141  *	Change name of a device, can pass format strings "eth%d".
1142  *	for wildcarding.
1143  */
1144 int dev_change_name(struct net_device *dev, const char *newname)
1145 {
1146 	unsigned char old_assign_type;
1147 	char oldname[IFNAMSIZ];
1148 	int err = 0;
1149 	int ret;
1150 	struct net *net;
1151 
1152 	ASSERT_RTNL();
1153 	BUG_ON(!dev_net(dev));
1154 
1155 	net = dev_net(dev);
1156 	if (dev->flags & IFF_UP)
1157 		return -EBUSY;
1158 
1159 	write_seqcount_begin(&devnet_rename_seq);
1160 
1161 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162 		write_seqcount_end(&devnet_rename_seq);
1163 		return 0;
1164 	}
1165 
1166 	memcpy(oldname, dev->name, IFNAMSIZ);
1167 
1168 	err = dev_get_valid_name(net, dev, newname);
1169 	if (err < 0) {
1170 		write_seqcount_end(&devnet_rename_seq);
1171 		return err;
1172 	}
1173 
1174 	if (oldname[0] && !strchr(oldname, '%'))
1175 		netdev_info(dev, "renamed from %s\n", oldname);
1176 
1177 	old_assign_type = dev->name_assign_type;
1178 	dev->name_assign_type = NET_NAME_RENAMED;
1179 
1180 rollback:
1181 	ret = device_rename(&dev->dev, dev->name);
1182 	if (ret) {
1183 		memcpy(dev->name, oldname, IFNAMSIZ);
1184 		dev->name_assign_type = old_assign_type;
1185 		write_seqcount_end(&devnet_rename_seq);
1186 		return ret;
1187 	}
1188 
1189 	write_seqcount_end(&devnet_rename_seq);
1190 
1191 	netdev_adjacent_rename_links(dev, oldname);
1192 
1193 	write_lock_bh(&dev_base_lock);
1194 	hlist_del_rcu(&dev->name_hlist);
1195 	write_unlock_bh(&dev_base_lock);
1196 
1197 	synchronize_rcu();
1198 
1199 	write_lock_bh(&dev_base_lock);
1200 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201 	write_unlock_bh(&dev_base_lock);
1202 
1203 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204 	ret = notifier_to_errno(ret);
1205 
1206 	if (ret) {
1207 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1208 		if (err >= 0) {
1209 			err = ret;
1210 			write_seqcount_begin(&devnet_rename_seq);
1211 			memcpy(dev->name, oldname, IFNAMSIZ);
1212 			memcpy(oldname, newname, IFNAMSIZ);
1213 			dev->name_assign_type = old_assign_type;
1214 			old_assign_type = NET_NAME_RENAMED;
1215 			goto rollback;
1216 		} else {
1217 			pr_err("%s: name change rollback failed: %d\n",
1218 			       dev->name, ret);
1219 		}
1220 	}
1221 
1222 	return err;
1223 }
1224 
1225 /**
1226  *	dev_set_alias - change ifalias of a device
1227  *	@dev: device
1228  *	@alias: name up to IFALIASZ
1229  *	@len: limit of bytes to copy from info
1230  *
1231  *	Set ifalias for a device,
1232  */
1233 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234 {
1235 	char *new_ifalias;
1236 
1237 	ASSERT_RTNL();
1238 
1239 	if (len >= IFALIASZ)
1240 		return -EINVAL;
1241 
1242 	if (!len) {
1243 		kfree(dev->ifalias);
1244 		dev->ifalias = NULL;
1245 		return 0;
1246 	}
1247 
1248 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249 	if (!new_ifalias)
1250 		return -ENOMEM;
1251 	dev->ifalias = new_ifalias;
1252 
1253 	strlcpy(dev->ifalias, alias, len+1);
1254 	return len;
1255 }
1256 
1257 
1258 /**
1259  *	netdev_features_change - device changes features
1260  *	@dev: device to cause notification
1261  *
1262  *	Called to indicate a device has changed features.
1263  */
1264 void netdev_features_change(struct net_device *dev)
1265 {
1266 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267 }
1268 EXPORT_SYMBOL(netdev_features_change);
1269 
1270 /**
1271  *	netdev_state_change - device changes state
1272  *	@dev: device to cause notification
1273  *
1274  *	Called to indicate a device has changed state. This function calls
1275  *	the notifier chains for netdev_chain and sends a NEWLINK message
1276  *	to the routing socket.
1277  */
1278 void netdev_state_change(struct net_device *dev)
1279 {
1280 	if (dev->flags & IFF_UP) {
1281 		struct netdev_notifier_change_info change_info;
1282 
1283 		change_info.flags_changed = 0;
1284 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285 					      &change_info.info);
1286 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287 	}
1288 }
1289 EXPORT_SYMBOL(netdev_state_change);
1290 
1291 /**
1292  * 	netdev_notify_peers - notify network peers about existence of @dev
1293  * 	@dev: network device
1294  *
1295  * Generate traffic such that interested network peers are aware of
1296  * @dev, such as by generating a gratuitous ARP. This may be used when
1297  * a device wants to inform the rest of the network about some sort of
1298  * reconfiguration such as a failover event or virtual machine
1299  * migration.
1300  */
1301 void netdev_notify_peers(struct net_device *dev)
1302 {
1303 	rtnl_lock();
1304 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305 	rtnl_unlock();
1306 }
1307 EXPORT_SYMBOL(netdev_notify_peers);
1308 
1309 static int __dev_open(struct net_device *dev)
1310 {
1311 	const struct net_device_ops *ops = dev->netdev_ops;
1312 	int ret;
1313 
1314 	ASSERT_RTNL();
1315 
1316 	if (!netif_device_present(dev))
1317 		return -ENODEV;
1318 
1319 	/* Block netpoll from trying to do any rx path servicing.
1320 	 * If we don't do this there is a chance ndo_poll_controller
1321 	 * or ndo_poll may be running while we open the device
1322 	 */
1323 	netpoll_poll_disable(dev);
1324 
1325 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326 	ret = notifier_to_errno(ret);
1327 	if (ret)
1328 		return ret;
1329 
1330 	set_bit(__LINK_STATE_START, &dev->state);
1331 
1332 	if (ops->ndo_validate_addr)
1333 		ret = ops->ndo_validate_addr(dev);
1334 
1335 	if (!ret && ops->ndo_open)
1336 		ret = ops->ndo_open(dev);
1337 
1338 	netpoll_poll_enable(dev);
1339 
1340 	if (ret)
1341 		clear_bit(__LINK_STATE_START, &dev->state);
1342 	else {
1343 		dev->flags |= IFF_UP;
1344 		dev_set_rx_mode(dev);
1345 		dev_activate(dev);
1346 		add_device_randomness(dev->dev_addr, dev->addr_len);
1347 	}
1348 
1349 	return ret;
1350 }
1351 
1352 /**
1353  *	dev_open	- prepare an interface for use.
1354  *	@dev:	device to open
1355  *
1356  *	Takes a device from down to up state. The device's private open
1357  *	function is invoked and then the multicast lists are loaded. Finally
1358  *	the device is moved into the up state and a %NETDEV_UP message is
1359  *	sent to the netdev notifier chain.
1360  *
1361  *	Calling this function on an active interface is a nop. On a failure
1362  *	a negative errno code is returned.
1363  */
1364 int dev_open(struct net_device *dev)
1365 {
1366 	int ret;
1367 
1368 	if (dev->flags & IFF_UP)
1369 		return 0;
1370 
1371 	ret = __dev_open(dev);
1372 	if (ret < 0)
1373 		return ret;
1374 
1375 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376 	call_netdevice_notifiers(NETDEV_UP, dev);
1377 
1378 	return ret;
1379 }
1380 EXPORT_SYMBOL(dev_open);
1381 
1382 static int __dev_close_many(struct list_head *head)
1383 {
1384 	struct net_device *dev;
1385 
1386 	ASSERT_RTNL();
1387 	might_sleep();
1388 
1389 	list_for_each_entry(dev, head, close_list) {
1390 		/* Temporarily disable netpoll until the interface is down */
1391 		netpoll_poll_disable(dev);
1392 
1393 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394 
1395 		clear_bit(__LINK_STATE_START, &dev->state);
1396 
1397 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1398 		 * can be even on different cpu. So just clear netif_running().
1399 		 *
1400 		 * dev->stop() will invoke napi_disable() on all of it's
1401 		 * napi_struct instances on this device.
1402 		 */
1403 		smp_mb__after_atomic(); /* Commit netif_running(). */
1404 	}
1405 
1406 	dev_deactivate_many(head);
1407 
1408 	list_for_each_entry(dev, head, close_list) {
1409 		const struct net_device_ops *ops = dev->netdev_ops;
1410 
1411 		/*
1412 		 *	Call the device specific close. This cannot fail.
1413 		 *	Only if device is UP
1414 		 *
1415 		 *	We allow it to be called even after a DETACH hot-plug
1416 		 *	event.
1417 		 */
1418 		if (ops->ndo_stop)
1419 			ops->ndo_stop(dev);
1420 
1421 		dev->flags &= ~IFF_UP;
1422 		netpoll_poll_enable(dev);
1423 	}
1424 
1425 	return 0;
1426 }
1427 
1428 static int __dev_close(struct net_device *dev)
1429 {
1430 	int retval;
1431 	LIST_HEAD(single);
1432 
1433 	list_add(&dev->close_list, &single);
1434 	retval = __dev_close_many(&single);
1435 	list_del(&single);
1436 
1437 	return retval;
1438 }
1439 
1440 int dev_close_many(struct list_head *head, bool unlink)
1441 {
1442 	struct net_device *dev, *tmp;
1443 
1444 	/* Remove the devices that don't need to be closed */
1445 	list_for_each_entry_safe(dev, tmp, head, close_list)
1446 		if (!(dev->flags & IFF_UP))
1447 			list_del_init(&dev->close_list);
1448 
1449 	__dev_close_many(head);
1450 
1451 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1452 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1454 		if (unlink)
1455 			list_del_init(&dev->close_list);
1456 	}
1457 
1458 	return 0;
1459 }
1460 EXPORT_SYMBOL(dev_close_many);
1461 
1462 /**
1463  *	dev_close - shutdown an interface.
1464  *	@dev: device to shutdown
1465  *
1466  *	This function moves an active device into down state. A
1467  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469  *	chain.
1470  */
1471 int dev_close(struct net_device *dev)
1472 {
1473 	if (dev->flags & IFF_UP) {
1474 		LIST_HEAD(single);
1475 
1476 		list_add(&dev->close_list, &single);
1477 		dev_close_many(&single, true);
1478 		list_del(&single);
1479 	}
1480 	return 0;
1481 }
1482 EXPORT_SYMBOL(dev_close);
1483 
1484 
1485 /**
1486  *	dev_disable_lro - disable Large Receive Offload on a device
1487  *	@dev: device
1488  *
1489  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1490  *	called under RTNL.  This is needed if received packets may be
1491  *	forwarded to another interface.
1492  */
1493 void dev_disable_lro(struct net_device *dev)
1494 {
1495 	struct net_device *lower_dev;
1496 	struct list_head *iter;
1497 
1498 	dev->wanted_features &= ~NETIF_F_LRO;
1499 	netdev_update_features(dev);
1500 
1501 	if (unlikely(dev->features & NETIF_F_LRO))
1502 		netdev_WARN(dev, "failed to disable LRO!\n");
1503 
1504 	netdev_for_each_lower_dev(dev, lower_dev, iter)
1505 		dev_disable_lro(lower_dev);
1506 }
1507 EXPORT_SYMBOL(dev_disable_lro);
1508 
1509 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510 				   struct net_device *dev)
1511 {
1512 	struct netdev_notifier_info info;
1513 
1514 	netdev_notifier_info_init(&info, dev);
1515 	return nb->notifier_call(nb, val, &info);
1516 }
1517 
1518 static int dev_boot_phase = 1;
1519 
1520 /**
1521  *	register_netdevice_notifier - register a network notifier block
1522  *	@nb: notifier
1523  *
1524  *	Register a notifier to be called when network device events occur.
1525  *	The notifier passed is linked into the kernel structures and must
1526  *	not be reused until it has been unregistered. A negative errno code
1527  *	is returned on a failure.
1528  *
1529  * 	When registered all registration and up events are replayed
1530  *	to the new notifier to allow device to have a race free
1531  *	view of the network device list.
1532  */
1533 
1534 int register_netdevice_notifier(struct notifier_block *nb)
1535 {
1536 	struct net_device *dev;
1537 	struct net_device *last;
1538 	struct net *net;
1539 	int err;
1540 
1541 	rtnl_lock();
1542 	err = raw_notifier_chain_register(&netdev_chain, nb);
1543 	if (err)
1544 		goto unlock;
1545 	if (dev_boot_phase)
1546 		goto unlock;
1547 	for_each_net(net) {
1548 		for_each_netdev(net, dev) {
1549 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550 			err = notifier_to_errno(err);
1551 			if (err)
1552 				goto rollback;
1553 
1554 			if (!(dev->flags & IFF_UP))
1555 				continue;
1556 
1557 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1558 		}
1559 	}
1560 
1561 unlock:
1562 	rtnl_unlock();
1563 	return err;
1564 
1565 rollback:
1566 	last = dev;
1567 	for_each_net(net) {
1568 		for_each_netdev(net, dev) {
1569 			if (dev == last)
1570 				goto outroll;
1571 
1572 			if (dev->flags & IFF_UP) {
1573 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574 							dev);
1575 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576 			}
1577 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578 		}
1579 	}
1580 
1581 outroll:
1582 	raw_notifier_chain_unregister(&netdev_chain, nb);
1583 	goto unlock;
1584 }
1585 EXPORT_SYMBOL(register_netdevice_notifier);
1586 
1587 /**
1588  *	unregister_netdevice_notifier - unregister a network notifier block
1589  *	@nb: notifier
1590  *
1591  *	Unregister a notifier previously registered by
1592  *	register_netdevice_notifier(). The notifier is unlinked into the
1593  *	kernel structures and may then be reused. A negative errno code
1594  *	is returned on a failure.
1595  *
1596  * 	After unregistering unregister and down device events are synthesized
1597  *	for all devices on the device list to the removed notifier to remove
1598  *	the need for special case cleanup code.
1599  */
1600 
1601 int unregister_netdevice_notifier(struct notifier_block *nb)
1602 {
1603 	struct net_device *dev;
1604 	struct net *net;
1605 	int err;
1606 
1607 	rtnl_lock();
1608 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609 	if (err)
1610 		goto unlock;
1611 
1612 	for_each_net(net) {
1613 		for_each_netdev(net, dev) {
1614 			if (dev->flags & IFF_UP) {
1615 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616 							dev);
1617 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618 			}
1619 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620 		}
1621 	}
1622 unlock:
1623 	rtnl_unlock();
1624 	return err;
1625 }
1626 EXPORT_SYMBOL(unregister_netdevice_notifier);
1627 
1628 /**
1629  *	call_netdevice_notifiers_info - call all network notifier blocks
1630  *	@val: value passed unmodified to notifier function
1631  *	@dev: net_device pointer passed unmodified to notifier function
1632  *	@info: notifier information data
1633  *
1634  *	Call all network notifier blocks.  Parameters and return value
1635  *	are as for raw_notifier_call_chain().
1636  */
1637 
1638 static int call_netdevice_notifiers_info(unsigned long val,
1639 					 struct net_device *dev,
1640 					 struct netdev_notifier_info *info)
1641 {
1642 	ASSERT_RTNL();
1643 	netdev_notifier_info_init(info, dev);
1644 	return raw_notifier_call_chain(&netdev_chain, val, info);
1645 }
1646 
1647 /**
1648  *	call_netdevice_notifiers - call all network notifier blocks
1649  *      @val: value passed unmodified to notifier function
1650  *      @dev: net_device pointer passed unmodified to notifier function
1651  *
1652  *	Call all network notifier blocks.  Parameters and return value
1653  *	are as for raw_notifier_call_chain().
1654  */
1655 
1656 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657 {
1658 	struct netdev_notifier_info info;
1659 
1660 	return call_netdevice_notifiers_info(val, dev, &info);
1661 }
1662 EXPORT_SYMBOL(call_netdevice_notifiers);
1663 
1664 #ifdef CONFIG_NET_INGRESS
1665 static struct static_key ingress_needed __read_mostly;
1666 
1667 void net_inc_ingress_queue(void)
1668 {
1669 	static_key_slow_inc(&ingress_needed);
1670 }
1671 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1672 
1673 void net_dec_ingress_queue(void)
1674 {
1675 	static_key_slow_dec(&ingress_needed);
1676 }
1677 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1678 #endif
1679 
1680 #ifdef CONFIG_NET_EGRESS
1681 static struct static_key egress_needed __read_mostly;
1682 
1683 void net_inc_egress_queue(void)
1684 {
1685 	static_key_slow_inc(&egress_needed);
1686 }
1687 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1688 
1689 void net_dec_egress_queue(void)
1690 {
1691 	static_key_slow_dec(&egress_needed);
1692 }
1693 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1694 #endif
1695 
1696 static struct static_key netstamp_needed __read_mostly;
1697 #ifdef HAVE_JUMP_LABEL
1698 /* We are not allowed to call static_key_slow_dec() from irq context
1699  * If net_disable_timestamp() is called from irq context, defer the
1700  * static_key_slow_dec() calls.
1701  */
1702 static atomic_t netstamp_needed_deferred;
1703 #endif
1704 
1705 void net_enable_timestamp(void)
1706 {
1707 #ifdef HAVE_JUMP_LABEL
1708 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1709 
1710 	if (deferred) {
1711 		while (--deferred)
1712 			static_key_slow_dec(&netstamp_needed);
1713 		return;
1714 	}
1715 #endif
1716 	static_key_slow_inc(&netstamp_needed);
1717 }
1718 EXPORT_SYMBOL(net_enable_timestamp);
1719 
1720 void net_disable_timestamp(void)
1721 {
1722 #ifdef HAVE_JUMP_LABEL
1723 	if (in_interrupt()) {
1724 		atomic_inc(&netstamp_needed_deferred);
1725 		return;
1726 	}
1727 #endif
1728 	static_key_slow_dec(&netstamp_needed);
1729 }
1730 EXPORT_SYMBOL(net_disable_timestamp);
1731 
1732 static inline void net_timestamp_set(struct sk_buff *skb)
1733 {
1734 	skb->tstamp.tv64 = 0;
1735 	if (static_key_false(&netstamp_needed))
1736 		__net_timestamp(skb);
1737 }
1738 
1739 #define net_timestamp_check(COND, SKB)			\
1740 	if (static_key_false(&netstamp_needed)) {		\
1741 		if ((COND) && !(SKB)->tstamp.tv64)	\
1742 			__net_timestamp(SKB);		\
1743 	}						\
1744 
1745 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1746 {
1747 	unsigned int len;
1748 
1749 	if (!(dev->flags & IFF_UP))
1750 		return false;
1751 
1752 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1753 	if (skb->len <= len)
1754 		return true;
1755 
1756 	/* if TSO is enabled, we don't care about the length as the packet
1757 	 * could be forwarded without being segmented before
1758 	 */
1759 	if (skb_is_gso(skb))
1760 		return true;
1761 
1762 	return false;
1763 }
1764 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1765 
1766 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1767 {
1768 	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1769 	    unlikely(!is_skb_forwardable(dev, skb))) {
1770 		atomic_long_inc(&dev->rx_dropped);
1771 		kfree_skb(skb);
1772 		return NET_RX_DROP;
1773 	}
1774 
1775 	skb_scrub_packet(skb, true);
1776 	skb->priority = 0;
1777 	skb->protocol = eth_type_trans(skb, dev);
1778 	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1779 
1780 	return 0;
1781 }
1782 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1783 
1784 /**
1785  * dev_forward_skb - loopback an skb to another netif
1786  *
1787  * @dev: destination network device
1788  * @skb: buffer to forward
1789  *
1790  * return values:
1791  *	NET_RX_SUCCESS	(no congestion)
1792  *	NET_RX_DROP     (packet was dropped, but freed)
1793  *
1794  * dev_forward_skb can be used for injecting an skb from the
1795  * start_xmit function of one device into the receive queue
1796  * of another device.
1797  *
1798  * The receiving device may be in another namespace, so
1799  * we have to clear all information in the skb that could
1800  * impact namespace isolation.
1801  */
1802 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1803 {
1804 	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1805 }
1806 EXPORT_SYMBOL_GPL(dev_forward_skb);
1807 
1808 static inline int deliver_skb(struct sk_buff *skb,
1809 			      struct packet_type *pt_prev,
1810 			      struct net_device *orig_dev)
1811 {
1812 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1813 		return -ENOMEM;
1814 	atomic_inc(&skb->users);
1815 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1816 }
1817 
1818 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1819 					  struct packet_type **pt,
1820 					  struct net_device *orig_dev,
1821 					  __be16 type,
1822 					  struct list_head *ptype_list)
1823 {
1824 	struct packet_type *ptype, *pt_prev = *pt;
1825 
1826 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1827 		if (ptype->type != type)
1828 			continue;
1829 		if (pt_prev)
1830 			deliver_skb(skb, pt_prev, orig_dev);
1831 		pt_prev = ptype;
1832 	}
1833 	*pt = pt_prev;
1834 }
1835 
1836 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1837 {
1838 	if (!ptype->af_packet_priv || !skb->sk)
1839 		return false;
1840 
1841 	if (ptype->id_match)
1842 		return ptype->id_match(ptype, skb->sk);
1843 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1844 		return true;
1845 
1846 	return false;
1847 }
1848 
1849 /*
1850  *	Support routine. Sends outgoing frames to any network
1851  *	taps currently in use.
1852  */
1853 
1854 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1855 {
1856 	struct packet_type *ptype;
1857 	struct sk_buff *skb2 = NULL;
1858 	struct packet_type *pt_prev = NULL;
1859 	struct list_head *ptype_list = &ptype_all;
1860 
1861 	rcu_read_lock();
1862 again:
1863 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1864 		/* Never send packets back to the socket
1865 		 * they originated from - MvS (miquels@drinkel.ow.org)
1866 		 */
1867 		if (skb_loop_sk(ptype, skb))
1868 			continue;
1869 
1870 		if (pt_prev) {
1871 			deliver_skb(skb2, pt_prev, skb->dev);
1872 			pt_prev = ptype;
1873 			continue;
1874 		}
1875 
1876 		/* need to clone skb, done only once */
1877 		skb2 = skb_clone(skb, GFP_ATOMIC);
1878 		if (!skb2)
1879 			goto out_unlock;
1880 
1881 		net_timestamp_set(skb2);
1882 
1883 		/* skb->nh should be correctly
1884 		 * set by sender, so that the second statement is
1885 		 * just protection against buggy protocols.
1886 		 */
1887 		skb_reset_mac_header(skb2);
1888 
1889 		if (skb_network_header(skb2) < skb2->data ||
1890 		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1891 			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1892 					     ntohs(skb2->protocol),
1893 					     dev->name);
1894 			skb_reset_network_header(skb2);
1895 		}
1896 
1897 		skb2->transport_header = skb2->network_header;
1898 		skb2->pkt_type = PACKET_OUTGOING;
1899 		pt_prev = ptype;
1900 	}
1901 
1902 	if (ptype_list == &ptype_all) {
1903 		ptype_list = &dev->ptype_all;
1904 		goto again;
1905 	}
1906 out_unlock:
1907 	if (pt_prev)
1908 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1909 	rcu_read_unlock();
1910 }
1911 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1912 
1913 /**
1914  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1915  * @dev: Network device
1916  * @txq: number of queues available
1917  *
1918  * If real_num_tx_queues is changed the tc mappings may no longer be
1919  * valid. To resolve this verify the tc mapping remains valid and if
1920  * not NULL the mapping. With no priorities mapping to this
1921  * offset/count pair it will no longer be used. In the worst case TC0
1922  * is invalid nothing can be done so disable priority mappings. If is
1923  * expected that drivers will fix this mapping if they can before
1924  * calling netif_set_real_num_tx_queues.
1925  */
1926 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1927 {
1928 	int i;
1929 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1930 
1931 	/* If TC0 is invalidated disable TC mapping */
1932 	if (tc->offset + tc->count > txq) {
1933 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1934 		dev->num_tc = 0;
1935 		return;
1936 	}
1937 
1938 	/* Invalidated prio to tc mappings set to TC0 */
1939 	for (i = 1; i < TC_BITMASK + 1; i++) {
1940 		int q = netdev_get_prio_tc_map(dev, i);
1941 
1942 		tc = &dev->tc_to_txq[q];
1943 		if (tc->offset + tc->count > txq) {
1944 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1945 				i, q);
1946 			netdev_set_prio_tc_map(dev, i, 0);
1947 		}
1948 	}
1949 }
1950 
1951 #ifdef CONFIG_XPS
1952 static DEFINE_MUTEX(xps_map_mutex);
1953 #define xmap_dereference(P)		\
1954 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1955 
1956 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1957 					int cpu, u16 index)
1958 {
1959 	struct xps_map *map = NULL;
1960 	int pos;
1961 
1962 	if (dev_maps)
1963 		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1964 
1965 	for (pos = 0; map && pos < map->len; pos++) {
1966 		if (map->queues[pos] == index) {
1967 			if (map->len > 1) {
1968 				map->queues[pos] = map->queues[--map->len];
1969 			} else {
1970 				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1971 				kfree_rcu(map, rcu);
1972 				map = NULL;
1973 			}
1974 			break;
1975 		}
1976 	}
1977 
1978 	return map;
1979 }
1980 
1981 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1982 {
1983 	struct xps_dev_maps *dev_maps;
1984 	int cpu, i;
1985 	bool active = false;
1986 
1987 	mutex_lock(&xps_map_mutex);
1988 	dev_maps = xmap_dereference(dev->xps_maps);
1989 
1990 	if (!dev_maps)
1991 		goto out_no_maps;
1992 
1993 	for_each_possible_cpu(cpu) {
1994 		for (i = index; i < dev->num_tx_queues; i++) {
1995 			if (!remove_xps_queue(dev_maps, cpu, i))
1996 				break;
1997 		}
1998 		if (i == dev->num_tx_queues)
1999 			active = true;
2000 	}
2001 
2002 	if (!active) {
2003 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2004 		kfree_rcu(dev_maps, rcu);
2005 	}
2006 
2007 	for (i = index; i < dev->num_tx_queues; i++)
2008 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2009 					     NUMA_NO_NODE);
2010 
2011 out_no_maps:
2012 	mutex_unlock(&xps_map_mutex);
2013 }
2014 
2015 static struct xps_map *expand_xps_map(struct xps_map *map,
2016 				      int cpu, u16 index)
2017 {
2018 	struct xps_map *new_map;
2019 	int alloc_len = XPS_MIN_MAP_ALLOC;
2020 	int i, pos;
2021 
2022 	for (pos = 0; map && pos < map->len; pos++) {
2023 		if (map->queues[pos] != index)
2024 			continue;
2025 		return map;
2026 	}
2027 
2028 	/* Need to add queue to this CPU's existing map */
2029 	if (map) {
2030 		if (pos < map->alloc_len)
2031 			return map;
2032 
2033 		alloc_len = map->alloc_len * 2;
2034 	}
2035 
2036 	/* Need to allocate new map to store queue on this CPU's map */
2037 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2038 			       cpu_to_node(cpu));
2039 	if (!new_map)
2040 		return NULL;
2041 
2042 	for (i = 0; i < pos; i++)
2043 		new_map->queues[i] = map->queues[i];
2044 	new_map->alloc_len = alloc_len;
2045 	new_map->len = pos;
2046 
2047 	return new_map;
2048 }
2049 
2050 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2051 			u16 index)
2052 {
2053 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2054 	struct xps_map *map, *new_map;
2055 	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2056 	int cpu, numa_node_id = -2;
2057 	bool active = false;
2058 
2059 	mutex_lock(&xps_map_mutex);
2060 
2061 	dev_maps = xmap_dereference(dev->xps_maps);
2062 
2063 	/* allocate memory for queue storage */
2064 	for_each_online_cpu(cpu) {
2065 		if (!cpumask_test_cpu(cpu, mask))
2066 			continue;
2067 
2068 		if (!new_dev_maps)
2069 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2070 		if (!new_dev_maps) {
2071 			mutex_unlock(&xps_map_mutex);
2072 			return -ENOMEM;
2073 		}
2074 
2075 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2076 				 NULL;
2077 
2078 		map = expand_xps_map(map, cpu, index);
2079 		if (!map)
2080 			goto error;
2081 
2082 		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2083 	}
2084 
2085 	if (!new_dev_maps)
2086 		goto out_no_new_maps;
2087 
2088 	for_each_possible_cpu(cpu) {
2089 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2090 			/* add queue to CPU maps */
2091 			int pos = 0;
2092 
2093 			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2094 			while ((pos < map->len) && (map->queues[pos] != index))
2095 				pos++;
2096 
2097 			if (pos == map->len)
2098 				map->queues[map->len++] = index;
2099 #ifdef CONFIG_NUMA
2100 			if (numa_node_id == -2)
2101 				numa_node_id = cpu_to_node(cpu);
2102 			else if (numa_node_id != cpu_to_node(cpu))
2103 				numa_node_id = -1;
2104 #endif
2105 		} else if (dev_maps) {
2106 			/* fill in the new device map from the old device map */
2107 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2108 			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2109 		}
2110 
2111 	}
2112 
2113 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2114 
2115 	/* Cleanup old maps */
2116 	if (dev_maps) {
2117 		for_each_possible_cpu(cpu) {
2118 			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2119 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2120 			if (map && map != new_map)
2121 				kfree_rcu(map, rcu);
2122 		}
2123 
2124 		kfree_rcu(dev_maps, rcu);
2125 	}
2126 
2127 	dev_maps = new_dev_maps;
2128 	active = true;
2129 
2130 out_no_new_maps:
2131 	/* update Tx queue numa node */
2132 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2133 				     (numa_node_id >= 0) ? numa_node_id :
2134 				     NUMA_NO_NODE);
2135 
2136 	if (!dev_maps)
2137 		goto out_no_maps;
2138 
2139 	/* removes queue from unused CPUs */
2140 	for_each_possible_cpu(cpu) {
2141 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2142 			continue;
2143 
2144 		if (remove_xps_queue(dev_maps, cpu, index))
2145 			active = true;
2146 	}
2147 
2148 	/* free map if not active */
2149 	if (!active) {
2150 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2151 		kfree_rcu(dev_maps, rcu);
2152 	}
2153 
2154 out_no_maps:
2155 	mutex_unlock(&xps_map_mutex);
2156 
2157 	return 0;
2158 error:
2159 	/* remove any maps that we added */
2160 	for_each_possible_cpu(cpu) {
2161 		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2162 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2163 				 NULL;
2164 		if (new_map && new_map != map)
2165 			kfree(new_map);
2166 	}
2167 
2168 	mutex_unlock(&xps_map_mutex);
2169 
2170 	kfree(new_dev_maps);
2171 	return -ENOMEM;
2172 }
2173 EXPORT_SYMBOL(netif_set_xps_queue);
2174 
2175 #endif
2176 /*
2177  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2178  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2179  */
2180 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2181 {
2182 	int rc;
2183 
2184 	if (txq < 1 || txq > dev->num_tx_queues)
2185 		return -EINVAL;
2186 
2187 	if (dev->reg_state == NETREG_REGISTERED ||
2188 	    dev->reg_state == NETREG_UNREGISTERING) {
2189 		ASSERT_RTNL();
2190 
2191 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2192 						  txq);
2193 		if (rc)
2194 			return rc;
2195 
2196 		if (dev->num_tc)
2197 			netif_setup_tc(dev, txq);
2198 
2199 		if (txq < dev->real_num_tx_queues) {
2200 			qdisc_reset_all_tx_gt(dev, txq);
2201 #ifdef CONFIG_XPS
2202 			netif_reset_xps_queues_gt(dev, txq);
2203 #endif
2204 		}
2205 	}
2206 
2207 	dev->real_num_tx_queues = txq;
2208 	return 0;
2209 }
2210 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2211 
2212 #ifdef CONFIG_SYSFS
2213 /**
2214  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2215  *	@dev: Network device
2216  *	@rxq: Actual number of RX queues
2217  *
2218  *	This must be called either with the rtnl_lock held or before
2219  *	registration of the net device.  Returns 0 on success, or a
2220  *	negative error code.  If called before registration, it always
2221  *	succeeds.
2222  */
2223 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2224 {
2225 	int rc;
2226 
2227 	if (rxq < 1 || rxq > dev->num_rx_queues)
2228 		return -EINVAL;
2229 
2230 	if (dev->reg_state == NETREG_REGISTERED) {
2231 		ASSERT_RTNL();
2232 
2233 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2234 						  rxq);
2235 		if (rc)
2236 			return rc;
2237 	}
2238 
2239 	dev->real_num_rx_queues = rxq;
2240 	return 0;
2241 }
2242 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2243 #endif
2244 
2245 /**
2246  * netif_get_num_default_rss_queues - default number of RSS queues
2247  *
2248  * This routine should set an upper limit on the number of RSS queues
2249  * used by default by multiqueue devices.
2250  */
2251 int netif_get_num_default_rss_queues(void)
2252 {
2253 	return is_kdump_kernel() ?
2254 		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2255 }
2256 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2257 
2258 static void __netif_reschedule(struct Qdisc *q)
2259 {
2260 	struct softnet_data *sd;
2261 	unsigned long flags;
2262 
2263 	local_irq_save(flags);
2264 	sd = this_cpu_ptr(&softnet_data);
2265 	q->next_sched = NULL;
2266 	*sd->output_queue_tailp = q;
2267 	sd->output_queue_tailp = &q->next_sched;
2268 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2269 	local_irq_restore(flags);
2270 }
2271 
2272 void __netif_schedule(struct Qdisc *q)
2273 {
2274 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2275 		__netif_reschedule(q);
2276 }
2277 EXPORT_SYMBOL(__netif_schedule);
2278 
2279 struct dev_kfree_skb_cb {
2280 	enum skb_free_reason reason;
2281 };
2282 
2283 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2284 {
2285 	return (struct dev_kfree_skb_cb *)skb->cb;
2286 }
2287 
2288 void netif_schedule_queue(struct netdev_queue *txq)
2289 {
2290 	rcu_read_lock();
2291 	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2292 		struct Qdisc *q = rcu_dereference(txq->qdisc);
2293 
2294 		__netif_schedule(q);
2295 	}
2296 	rcu_read_unlock();
2297 }
2298 EXPORT_SYMBOL(netif_schedule_queue);
2299 
2300 /**
2301  *	netif_wake_subqueue - allow sending packets on subqueue
2302  *	@dev: network device
2303  *	@queue_index: sub queue index
2304  *
2305  * Resume individual transmit queue of a device with multiple transmit queues.
2306  */
2307 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2308 {
2309 	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2310 
2311 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2312 		struct Qdisc *q;
2313 
2314 		rcu_read_lock();
2315 		q = rcu_dereference(txq->qdisc);
2316 		__netif_schedule(q);
2317 		rcu_read_unlock();
2318 	}
2319 }
2320 EXPORT_SYMBOL(netif_wake_subqueue);
2321 
2322 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2323 {
2324 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2325 		struct Qdisc *q;
2326 
2327 		rcu_read_lock();
2328 		q = rcu_dereference(dev_queue->qdisc);
2329 		__netif_schedule(q);
2330 		rcu_read_unlock();
2331 	}
2332 }
2333 EXPORT_SYMBOL(netif_tx_wake_queue);
2334 
2335 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2336 {
2337 	unsigned long flags;
2338 
2339 	if (likely(atomic_read(&skb->users) == 1)) {
2340 		smp_rmb();
2341 		atomic_set(&skb->users, 0);
2342 	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2343 		return;
2344 	}
2345 	get_kfree_skb_cb(skb)->reason = reason;
2346 	local_irq_save(flags);
2347 	skb->next = __this_cpu_read(softnet_data.completion_queue);
2348 	__this_cpu_write(softnet_data.completion_queue, skb);
2349 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2350 	local_irq_restore(flags);
2351 }
2352 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2353 
2354 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2355 {
2356 	if (in_irq() || irqs_disabled())
2357 		__dev_kfree_skb_irq(skb, reason);
2358 	else
2359 		dev_kfree_skb(skb);
2360 }
2361 EXPORT_SYMBOL(__dev_kfree_skb_any);
2362 
2363 
2364 /**
2365  * netif_device_detach - mark device as removed
2366  * @dev: network device
2367  *
2368  * Mark device as removed from system and therefore no longer available.
2369  */
2370 void netif_device_detach(struct net_device *dev)
2371 {
2372 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2373 	    netif_running(dev)) {
2374 		netif_tx_stop_all_queues(dev);
2375 	}
2376 }
2377 EXPORT_SYMBOL(netif_device_detach);
2378 
2379 /**
2380  * netif_device_attach - mark device as attached
2381  * @dev: network device
2382  *
2383  * Mark device as attached from system and restart if needed.
2384  */
2385 void netif_device_attach(struct net_device *dev)
2386 {
2387 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2388 	    netif_running(dev)) {
2389 		netif_tx_wake_all_queues(dev);
2390 		__netdev_watchdog_up(dev);
2391 	}
2392 }
2393 EXPORT_SYMBOL(netif_device_attach);
2394 
2395 /*
2396  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2397  * to be used as a distribution range.
2398  */
2399 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2400 		  unsigned int num_tx_queues)
2401 {
2402 	u32 hash;
2403 	u16 qoffset = 0;
2404 	u16 qcount = num_tx_queues;
2405 
2406 	if (skb_rx_queue_recorded(skb)) {
2407 		hash = skb_get_rx_queue(skb);
2408 		while (unlikely(hash >= num_tx_queues))
2409 			hash -= num_tx_queues;
2410 		return hash;
2411 	}
2412 
2413 	if (dev->num_tc) {
2414 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2415 		qoffset = dev->tc_to_txq[tc].offset;
2416 		qcount = dev->tc_to_txq[tc].count;
2417 	}
2418 
2419 	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2420 }
2421 EXPORT_SYMBOL(__skb_tx_hash);
2422 
2423 static void skb_warn_bad_offload(const struct sk_buff *skb)
2424 {
2425 	static const netdev_features_t null_features;
2426 	struct net_device *dev = skb->dev;
2427 	const char *name = "";
2428 
2429 	if (!net_ratelimit())
2430 		return;
2431 
2432 	if (dev) {
2433 		if (dev->dev.parent)
2434 			name = dev_driver_string(dev->dev.parent);
2435 		else
2436 			name = netdev_name(dev);
2437 	}
2438 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2439 	     "gso_type=%d ip_summed=%d\n",
2440 	     name, dev ? &dev->features : &null_features,
2441 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2442 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2443 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2444 }
2445 
2446 /*
2447  * Invalidate hardware checksum when packet is to be mangled, and
2448  * complete checksum manually on outgoing path.
2449  */
2450 int skb_checksum_help(struct sk_buff *skb)
2451 {
2452 	__wsum csum;
2453 	int ret = 0, offset;
2454 
2455 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2456 		goto out_set_summed;
2457 
2458 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2459 		skb_warn_bad_offload(skb);
2460 		return -EINVAL;
2461 	}
2462 
2463 	/* Before computing a checksum, we should make sure no frag could
2464 	 * be modified by an external entity : checksum could be wrong.
2465 	 */
2466 	if (skb_has_shared_frag(skb)) {
2467 		ret = __skb_linearize(skb);
2468 		if (ret)
2469 			goto out;
2470 	}
2471 
2472 	offset = skb_checksum_start_offset(skb);
2473 	BUG_ON(offset >= skb_headlen(skb));
2474 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2475 
2476 	offset += skb->csum_offset;
2477 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2478 
2479 	if (skb_cloned(skb) &&
2480 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2481 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2482 		if (ret)
2483 			goto out;
2484 	}
2485 
2486 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2487 out_set_summed:
2488 	skb->ip_summed = CHECKSUM_NONE;
2489 out:
2490 	return ret;
2491 }
2492 EXPORT_SYMBOL(skb_checksum_help);
2493 
2494 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2495 {
2496 	__be16 type = skb->protocol;
2497 
2498 	/* Tunnel gso handlers can set protocol to ethernet. */
2499 	if (type == htons(ETH_P_TEB)) {
2500 		struct ethhdr *eth;
2501 
2502 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2503 			return 0;
2504 
2505 		eth = (struct ethhdr *)skb_mac_header(skb);
2506 		type = eth->h_proto;
2507 	}
2508 
2509 	return __vlan_get_protocol(skb, type, depth);
2510 }
2511 
2512 /**
2513  *	skb_mac_gso_segment - mac layer segmentation handler.
2514  *	@skb: buffer to segment
2515  *	@features: features for the output path (see dev->features)
2516  */
2517 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2518 				    netdev_features_t features)
2519 {
2520 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2521 	struct packet_offload *ptype;
2522 	int vlan_depth = skb->mac_len;
2523 	__be16 type = skb_network_protocol(skb, &vlan_depth);
2524 
2525 	if (unlikely(!type))
2526 		return ERR_PTR(-EINVAL);
2527 
2528 	__skb_pull(skb, vlan_depth);
2529 
2530 	rcu_read_lock();
2531 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2532 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2533 			segs = ptype->callbacks.gso_segment(skb, features);
2534 			break;
2535 		}
2536 	}
2537 	rcu_read_unlock();
2538 
2539 	__skb_push(skb, skb->data - skb_mac_header(skb));
2540 
2541 	return segs;
2542 }
2543 EXPORT_SYMBOL(skb_mac_gso_segment);
2544 
2545 
2546 /* openvswitch calls this on rx path, so we need a different check.
2547  */
2548 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2549 {
2550 	if (tx_path)
2551 		return skb->ip_summed != CHECKSUM_PARTIAL;
2552 	else
2553 		return skb->ip_summed == CHECKSUM_NONE;
2554 }
2555 
2556 /**
2557  *	__skb_gso_segment - Perform segmentation on skb.
2558  *	@skb: buffer to segment
2559  *	@features: features for the output path (see dev->features)
2560  *	@tx_path: whether it is called in TX path
2561  *
2562  *	This function segments the given skb and returns a list of segments.
2563  *
2564  *	It may return NULL if the skb requires no segmentation.  This is
2565  *	only possible when GSO is used for verifying header integrity.
2566  *
2567  *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2568  */
2569 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2570 				  netdev_features_t features, bool tx_path)
2571 {
2572 	if (unlikely(skb_needs_check(skb, tx_path))) {
2573 		int err;
2574 
2575 		skb_warn_bad_offload(skb);
2576 
2577 		err = skb_cow_head(skb, 0);
2578 		if (err < 0)
2579 			return ERR_PTR(err);
2580 	}
2581 
2582 	/* Only report GSO partial support if it will enable us to
2583 	 * support segmentation on this frame without needing additional
2584 	 * work.
2585 	 */
2586 	if (features & NETIF_F_GSO_PARTIAL) {
2587 		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2588 		struct net_device *dev = skb->dev;
2589 
2590 		partial_features |= dev->features & dev->gso_partial_features;
2591 		if (!skb_gso_ok(skb, features | partial_features))
2592 			features &= ~NETIF_F_GSO_PARTIAL;
2593 	}
2594 
2595 	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2596 		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2597 
2598 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2599 	SKB_GSO_CB(skb)->encap_level = 0;
2600 
2601 	skb_reset_mac_header(skb);
2602 	skb_reset_mac_len(skb);
2603 
2604 	return skb_mac_gso_segment(skb, features);
2605 }
2606 EXPORT_SYMBOL(__skb_gso_segment);
2607 
2608 /* Take action when hardware reception checksum errors are detected. */
2609 #ifdef CONFIG_BUG
2610 void netdev_rx_csum_fault(struct net_device *dev)
2611 {
2612 	if (net_ratelimit()) {
2613 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2614 		dump_stack();
2615 	}
2616 }
2617 EXPORT_SYMBOL(netdev_rx_csum_fault);
2618 #endif
2619 
2620 /* Actually, we should eliminate this check as soon as we know, that:
2621  * 1. IOMMU is present and allows to map all the memory.
2622  * 2. No high memory really exists on this machine.
2623  */
2624 
2625 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2626 {
2627 #ifdef CONFIG_HIGHMEM
2628 	int i;
2629 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2630 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2631 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2632 			if (PageHighMem(skb_frag_page(frag)))
2633 				return 1;
2634 		}
2635 	}
2636 
2637 	if (PCI_DMA_BUS_IS_PHYS) {
2638 		struct device *pdev = dev->dev.parent;
2639 
2640 		if (!pdev)
2641 			return 0;
2642 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2643 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2644 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2645 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2646 				return 1;
2647 		}
2648 	}
2649 #endif
2650 	return 0;
2651 }
2652 
2653 /* If MPLS offload request, verify we are testing hardware MPLS features
2654  * instead of standard features for the netdev.
2655  */
2656 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2657 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2658 					   netdev_features_t features,
2659 					   __be16 type)
2660 {
2661 	if (eth_p_mpls(type))
2662 		features &= skb->dev->mpls_features;
2663 
2664 	return features;
2665 }
2666 #else
2667 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2668 					   netdev_features_t features,
2669 					   __be16 type)
2670 {
2671 	return features;
2672 }
2673 #endif
2674 
2675 static netdev_features_t harmonize_features(struct sk_buff *skb,
2676 	netdev_features_t features)
2677 {
2678 	int tmp;
2679 	__be16 type;
2680 
2681 	type = skb_network_protocol(skb, &tmp);
2682 	features = net_mpls_features(skb, features, type);
2683 
2684 	if (skb->ip_summed != CHECKSUM_NONE &&
2685 	    !can_checksum_protocol(features, type)) {
2686 		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2687 	} else if (illegal_highdma(skb->dev, skb)) {
2688 		features &= ~NETIF_F_SG;
2689 	}
2690 
2691 	return features;
2692 }
2693 
2694 netdev_features_t passthru_features_check(struct sk_buff *skb,
2695 					  struct net_device *dev,
2696 					  netdev_features_t features)
2697 {
2698 	return features;
2699 }
2700 EXPORT_SYMBOL(passthru_features_check);
2701 
2702 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2703 					     struct net_device *dev,
2704 					     netdev_features_t features)
2705 {
2706 	return vlan_features_check(skb, features);
2707 }
2708 
2709 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2710 					    struct net_device *dev,
2711 					    netdev_features_t features)
2712 {
2713 	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2714 
2715 	if (gso_segs > dev->gso_max_segs)
2716 		return features & ~NETIF_F_GSO_MASK;
2717 
2718 	/* Support for GSO partial features requires software
2719 	 * intervention before we can actually process the packets
2720 	 * so we need to strip support for any partial features now
2721 	 * and we can pull them back in after we have partially
2722 	 * segmented the frame.
2723 	 */
2724 	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2725 		features &= ~dev->gso_partial_features;
2726 
2727 	/* Make sure to clear the IPv4 ID mangling feature if the
2728 	 * IPv4 header has the potential to be fragmented.
2729 	 */
2730 	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2731 		struct iphdr *iph = skb->encapsulation ?
2732 				    inner_ip_hdr(skb) : ip_hdr(skb);
2733 
2734 		if (!(iph->frag_off & htons(IP_DF)))
2735 			features &= ~NETIF_F_TSO_MANGLEID;
2736 	}
2737 
2738 	return features;
2739 }
2740 
2741 netdev_features_t netif_skb_features(struct sk_buff *skb)
2742 {
2743 	struct net_device *dev = skb->dev;
2744 	netdev_features_t features = dev->features;
2745 
2746 	if (skb_is_gso(skb))
2747 		features = gso_features_check(skb, dev, features);
2748 
2749 	/* If encapsulation offload request, verify we are testing
2750 	 * hardware encapsulation features instead of standard
2751 	 * features for the netdev
2752 	 */
2753 	if (skb->encapsulation)
2754 		features &= dev->hw_enc_features;
2755 
2756 	if (skb_vlan_tagged(skb))
2757 		features = netdev_intersect_features(features,
2758 						     dev->vlan_features |
2759 						     NETIF_F_HW_VLAN_CTAG_TX |
2760 						     NETIF_F_HW_VLAN_STAG_TX);
2761 
2762 	if (dev->netdev_ops->ndo_features_check)
2763 		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2764 								features);
2765 	else
2766 		features &= dflt_features_check(skb, dev, features);
2767 
2768 	return harmonize_features(skb, features);
2769 }
2770 EXPORT_SYMBOL(netif_skb_features);
2771 
2772 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2773 		    struct netdev_queue *txq, bool more)
2774 {
2775 	unsigned int len;
2776 	int rc;
2777 
2778 	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2779 		dev_queue_xmit_nit(skb, dev);
2780 
2781 	len = skb->len;
2782 	trace_net_dev_start_xmit(skb, dev);
2783 	rc = netdev_start_xmit(skb, dev, txq, more);
2784 	trace_net_dev_xmit(skb, rc, dev, len);
2785 
2786 	return rc;
2787 }
2788 
2789 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2790 				    struct netdev_queue *txq, int *ret)
2791 {
2792 	struct sk_buff *skb = first;
2793 	int rc = NETDEV_TX_OK;
2794 
2795 	while (skb) {
2796 		struct sk_buff *next = skb->next;
2797 
2798 		skb->next = NULL;
2799 		rc = xmit_one(skb, dev, txq, next != NULL);
2800 		if (unlikely(!dev_xmit_complete(rc))) {
2801 			skb->next = next;
2802 			goto out;
2803 		}
2804 
2805 		skb = next;
2806 		if (netif_xmit_stopped(txq) && skb) {
2807 			rc = NETDEV_TX_BUSY;
2808 			break;
2809 		}
2810 	}
2811 
2812 out:
2813 	*ret = rc;
2814 	return skb;
2815 }
2816 
2817 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2818 					  netdev_features_t features)
2819 {
2820 	if (skb_vlan_tag_present(skb) &&
2821 	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2822 		skb = __vlan_hwaccel_push_inside(skb);
2823 	return skb;
2824 }
2825 
2826 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2827 {
2828 	netdev_features_t features;
2829 
2830 	features = netif_skb_features(skb);
2831 	skb = validate_xmit_vlan(skb, features);
2832 	if (unlikely(!skb))
2833 		goto out_null;
2834 
2835 	if (netif_needs_gso(skb, features)) {
2836 		struct sk_buff *segs;
2837 
2838 		segs = skb_gso_segment(skb, features);
2839 		if (IS_ERR(segs)) {
2840 			goto out_kfree_skb;
2841 		} else if (segs) {
2842 			consume_skb(skb);
2843 			skb = segs;
2844 		}
2845 	} else {
2846 		if (skb_needs_linearize(skb, features) &&
2847 		    __skb_linearize(skb))
2848 			goto out_kfree_skb;
2849 
2850 		/* If packet is not checksummed and device does not
2851 		 * support checksumming for this protocol, complete
2852 		 * checksumming here.
2853 		 */
2854 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2855 			if (skb->encapsulation)
2856 				skb_set_inner_transport_header(skb,
2857 							       skb_checksum_start_offset(skb));
2858 			else
2859 				skb_set_transport_header(skb,
2860 							 skb_checksum_start_offset(skb));
2861 			if (!(features & NETIF_F_CSUM_MASK) &&
2862 			    skb_checksum_help(skb))
2863 				goto out_kfree_skb;
2864 		}
2865 	}
2866 
2867 	return skb;
2868 
2869 out_kfree_skb:
2870 	kfree_skb(skb);
2871 out_null:
2872 	atomic_long_inc(&dev->tx_dropped);
2873 	return NULL;
2874 }
2875 
2876 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2877 {
2878 	struct sk_buff *next, *head = NULL, *tail;
2879 
2880 	for (; skb != NULL; skb = next) {
2881 		next = skb->next;
2882 		skb->next = NULL;
2883 
2884 		/* in case skb wont be segmented, point to itself */
2885 		skb->prev = skb;
2886 
2887 		skb = validate_xmit_skb(skb, dev);
2888 		if (!skb)
2889 			continue;
2890 
2891 		if (!head)
2892 			head = skb;
2893 		else
2894 			tail->next = skb;
2895 		/* If skb was segmented, skb->prev points to
2896 		 * the last segment. If not, it still contains skb.
2897 		 */
2898 		tail = skb->prev;
2899 	}
2900 	return head;
2901 }
2902 
2903 static void qdisc_pkt_len_init(struct sk_buff *skb)
2904 {
2905 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2906 
2907 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2908 
2909 	/* To get more precise estimation of bytes sent on wire,
2910 	 * we add to pkt_len the headers size of all segments
2911 	 */
2912 	if (shinfo->gso_size)  {
2913 		unsigned int hdr_len;
2914 		u16 gso_segs = shinfo->gso_segs;
2915 
2916 		/* mac layer + network layer */
2917 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2918 
2919 		/* + transport layer */
2920 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2921 			hdr_len += tcp_hdrlen(skb);
2922 		else
2923 			hdr_len += sizeof(struct udphdr);
2924 
2925 		if (shinfo->gso_type & SKB_GSO_DODGY)
2926 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2927 						shinfo->gso_size);
2928 
2929 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2930 	}
2931 }
2932 
2933 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2934 				 struct net_device *dev,
2935 				 struct netdev_queue *txq)
2936 {
2937 	spinlock_t *root_lock = qdisc_lock(q);
2938 	struct sk_buff *to_free = NULL;
2939 	bool contended;
2940 	int rc;
2941 
2942 	qdisc_calculate_pkt_len(skb, q);
2943 	/*
2944 	 * Heuristic to force contended enqueues to serialize on a
2945 	 * separate lock before trying to get qdisc main lock.
2946 	 * This permits qdisc->running owner to get the lock more
2947 	 * often and dequeue packets faster.
2948 	 */
2949 	contended = qdisc_is_running(q);
2950 	if (unlikely(contended))
2951 		spin_lock(&q->busylock);
2952 
2953 	spin_lock(root_lock);
2954 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2955 		__qdisc_drop(skb, &to_free);
2956 		rc = NET_XMIT_DROP;
2957 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2958 		   qdisc_run_begin(q)) {
2959 		/*
2960 		 * This is a work-conserving queue; there are no old skbs
2961 		 * waiting to be sent out; and the qdisc is not running -
2962 		 * xmit the skb directly.
2963 		 */
2964 
2965 		qdisc_bstats_update(q, skb);
2966 
2967 		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2968 			if (unlikely(contended)) {
2969 				spin_unlock(&q->busylock);
2970 				contended = false;
2971 			}
2972 			__qdisc_run(q);
2973 		} else
2974 			qdisc_run_end(q);
2975 
2976 		rc = NET_XMIT_SUCCESS;
2977 	} else {
2978 		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
2979 		if (qdisc_run_begin(q)) {
2980 			if (unlikely(contended)) {
2981 				spin_unlock(&q->busylock);
2982 				contended = false;
2983 			}
2984 			__qdisc_run(q);
2985 		}
2986 	}
2987 	spin_unlock(root_lock);
2988 	if (unlikely(to_free))
2989 		kfree_skb_list(to_free);
2990 	if (unlikely(contended))
2991 		spin_unlock(&q->busylock);
2992 	return rc;
2993 }
2994 
2995 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2996 static void skb_update_prio(struct sk_buff *skb)
2997 {
2998 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2999 
3000 	if (!skb->priority && skb->sk && map) {
3001 		unsigned int prioidx =
3002 			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3003 
3004 		if (prioidx < map->priomap_len)
3005 			skb->priority = map->priomap[prioidx];
3006 	}
3007 }
3008 #else
3009 #define skb_update_prio(skb)
3010 #endif
3011 
3012 DEFINE_PER_CPU(int, xmit_recursion);
3013 EXPORT_SYMBOL(xmit_recursion);
3014 
3015 /**
3016  *	dev_loopback_xmit - loop back @skb
3017  *	@net: network namespace this loopback is happening in
3018  *	@sk:  sk needed to be a netfilter okfn
3019  *	@skb: buffer to transmit
3020  */
3021 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3022 {
3023 	skb_reset_mac_header(skb);
3024 	__skb_pull(skb, skb_network_offset(skb));
3025 	skb->pkt_type = PACKET_LOOPBACK;
3026 	skb->ip_summed = CHECKSUM_UNNECESSARY;
3027 	WARN_ON(!skb_dst(skb));
3028 	skb_dst_force(skb);
3029 	netif_rx_ni(skb);
3030 	return 0;
3031 }
3032 EXPORT_SYMBOL(dev_loopback_xmit);
3033 
3034 #ifdef CONFIG_NET_EGRESS
3035 static struct sk_buff *
3036 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3037 {
3038 	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3039 	struct tcf_result cl_res;
3040 
3041 	if (!cl)
3042 		return skb;
3043 
3044 	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3045 	 * earlier by the caller.
3046 	 */
3047 	qdisc_bstats_cpu_update(cl->q, skb);
3048 
3049 	switch (tc_classify(skb, cl, &cl_res, false)) {
3050 	case TC_ACT_OK:
3051 	case TC_ACT_RECLASSIFY:
3052 		skb->tc_index = TC_H_MIN(cl_res.classid);
3053 		break;
3054 	case TC_ACT_SHOT:
3055 		qdisc_qstats_cpu_drop(cl->q);
3056 		*ret = NET_XMIT_DROP;
3057 		kfree_skb(skb);
3058 		return NULL;
3059 	case TC_ACT_STOLEN:
3060 	case TC_ACT_QUEUED:
3061 		*ret = NET_XMIT_SUCCESS;
3062 		consume_skb(skb);
3063 		return NULL;
3064 	case TC_ACT_REDIRECT:
3065 		/* No need to push/pop skb's mac_header here on egress! */
3066 		skb_do_redirect(skb);
3067 		*ret = NET_XMIT_SUCCESS;
3068 		return NULL;
3069 	default:
3070 		break;
3071 	}
3072 
3073 	return skb;
3074 }
3075 #endif /* CONFIG_NET_EGRESS */
3076 
3077 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3078 {
3079 #ifdef CONFIG_XPS
3080 	struct xps_dev_maps *dev_maps;
3081 	struct xps_map *map;
3082 	int queue_index = -1;
3083 
3084 	rcu_read_lock();
3085 	dev_maps = rcu_dereference(dev->xps_maps);
3086 	if (dev_maps) {
3087 		map = rcu_dereference(
3088 		    dev_maps->cpu_map[skb->sender_cpu - 1]);
3089 		if (map) {
3090 			if (map->len == 1)
3091 				queue_index = map->queues[0];
3092 			else
3093 				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3094 									   map->len)];
3095 			if (unlikely(queue_index >= dev->real_num_tx_queues))
3096 				queue_index = -1;
3097 		}
3098 	}
3099 	rcu_read_unlock();
3100 
3101 	return queue_index;
3102 #else
3103 	return -1;
3104 #endif
3105 }
3106 
3107 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3108 {
3109 	struct sock *sk = skb->sk;
3110 	int queue_index = sk_tx_queue_get(sk);
3111 
3112 	if (queue_index < 0 || skb->ooo_okay ||
3113 	    queue_index >= dev->real_num_tx_queues) {
3114 		int new_index = get_xps_queue(dev, skb);
3115 		if (new_index < 0)
3116 			new_index = skb_tx_hash(dev, skb);
3117 
3118 		if (queue_index != new_index && sk &&
3119 		    sk_fullsock(sk) &&
3120 		    rcu_access_pointer(sk->sk_dst_cache))
3121 			sk_tx_queue_set(sk, new_index);
3122 
3123 		queue_index = new_index;
3124 	}
3125 
3126 	return queue_index;
3127 }
3128 
3129 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3130 				    struct sk_buff *skb,
3131 				    void *accel_priv)
3132 {
3133 	int queue_index = 0;
3134 
3135 #ifdef CONFIG_XPS
3136 	u32 sender_cpu = skb->sender_cpu - 1;
3137 
3138 	if (sender_cpu >= (u32)NR_CPUS)
3139 		skb->sender_cpu = raw_smp_processor_id() + 1;
3140 #endif
3141 
3142 	if (dev->real_num_tx_queues != 1) {
3143 		const struct net_device_ops *ops = dev->netdev_ops;
3144 		if (ops->ndo_select_queue)
3145 			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3146 							    __netdev_pick_tx);
3147 		else
3148 			queue_index = __netdev_pick_tx(dev, skb);
3149 
3150 		if (!accel_priv)
3151 			queue_index = netdev_cap_txqueue(dev, queue_index);
3152 	}
3153 
3154 	skb_set_queue_mapping(skb, queue_index);
3155 	return netdev_get_tx_queue(dev, queue_index);
3156 }
3157 
3158 /**
3159  *	__dev_queue_xmit - transmit a buffer
3160  *	@skb: buffer to transmit
3161  *	@accel_priv: private data used for L2 forwarding offload
3162  *
3163  *	Queue a buffer for transmission to a network device. The caller must
3164  *	have set the device and priority and built the buffer before calling
3165  *	this function. The function can be called from an interrupt.
3166  *
3167  *	A negative errno code is returned on a failure. A success does not
3168  *	guarantee the frame will be transmitted as it may be dropped due
3169  *	to congestion or traffic shaping.
3170  *
3171  * -----------------------------------------------------------------------------------
3172  *      I notice this method can also return errors from the queue disciplines,
3173  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3174  *      be positive.
3175  *
3176  *      Regardless of the return value, the skb is consumed, so it is currently
3177  *      difficult to retry a send to this method.  (You can bump the ref count
3178  *      before sending to hold a reference for retry if you are careful.)
3179  *
3180  *      When calling this method, interrupts MUST be enabled.  This is because
3181  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3182  *          --BLG
3183  */
3184 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3185 {
3186 	struct net_device *dev = skb->dev;
3187 	struct netdev_queue *txq;
3188 	struct Qdisc *q;
3189 	int rc = -ENOMEM;
3190 
3191 	skb_reset_mac_header(skb);
3192 
3193 	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3194 		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3195 
3196 	/* Disable soft irqs for various locks below. Also
3197 	 * stops preemption for RCU.
3198 	 */
3199 	rcu_read_lock_bh();
3200 
3201 	skb_update_prio(skb);
3202 
3203 	qdisc_pkt_len_init(skb);
3204 #ifdef CONFIG_NET_CLS_ACT
3205 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3206 # ifdef CONFIG_NET_EGRESS
3207 	if (static_key_false(&egress_needed)) {
3208 		skb = sch_handle_egress(skb, &rc, dev);
3209 		if (!skb)
3210 			goto out;
3211 	}
3212 # endif
3213 #endif
3214 	/* If device/qdisc don't need skb->dst, release it right now while
3215 	 * its hot in this cpu cache.
3216 	 */
3217 	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3218 		skb_dst_drop(skb);
3219 	else
3220 		skb_dst_force(skb);
3221 
3222 	txq = netdev_pick_tx(dev, skb, accel_priv);
3223 	q = rcu_dereference_bh(txq->qdisc);
3224 
3225 	trace_net_dev_queue(skb);
3226 	if (q->enqueue) {
3227 		rc = __dev_xmit_skb(skb, q, dev, txq);
3228 		goto out;
3229 	}
3230 
3231 	/* The device has no queue. Common case for software devices:
3232 	   loopback, all the sorts of tunnels...
3233 
3234 	   Really, it is unlikely that netif_tx_lock protection is necessary
3235 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3236 	   counters.)
3237 	   However, it is possible, that they rely on protection
3238 	   made by us here.
3239 
3240 	   Check this and shot the lock. It is not prone from deadlocks.
3241 	   Either shot noqueue qdisc, it is even simpler 8)
3242 	 */
3243 	if (dev->flags & IFF_UP) {
3244 		int cpu = smp_processor_id(); /* ok because BHs are off */
3245 
3246 		if (txq->xmit_lock_owner != cpu) {
3247 			if (unlikely(__this_cpu_read(xmit_recursion) >
3248 				     XMIT_RECURSION_LIMIT))
3249 				goto recursion_alert;
3250 
3251 			skb = validate_xmit_skb(skb, dev);
3252 			if (!skb)
3253 				goto out;
3254 
3255 			HARD_TX_LOCK(dev, txq, cpu);
3256 
3257 			if (!netif_xmit_stopped(txq)) {
3258 				__this_cpu_inc(xmit_recursion);
3259 				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3260 				__this_cpu_dec(xmit_recursion);
3261 				if (dev_xmit_complete(rc)) {
3262 					HARD_TX_UNLOCK(dev, txq);
3263 					goto out;
3264 				}
3265 			}
3266 			HARD_TX_UNLOCK(dev, txq);
3267 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3268 					     dev->name);
3269 		} else {
3270 			/* Recursion is detected! It is possible,
3271 			 * unfortunately
3272 			 */
3273 recursion_alert:
3274 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3275 					     dev->name);
3276 		}
3277 	}
3278 
3279 	rc = -ENETDOWN;
3280 	rcu_read_unlock_bh();
3281 
3282 	atomic_long_inc(&dev->tx_dropped);
3283 	kfree_skb_list(skb);
3284 	return rc;
3285 out:
3286 	rcu_read_unlock_bh();
3287 	return rc;
3288 }
3289 
3290 int dev_queue_xmit(struct sk_buff *skb)
3291 {
3292 	return __dev_queue_xmit(skb, NULL);
3293 }
3294 EXPORT_SYMBOL(dev_queue_xmit);
3295 
3296 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3297 {
3298 	return __dev_queue_xmit(skb, accel_priv);
3299 }
3300 EXPORT_SYMBOL(dev_queue_xmit_accel);
3301 
3302 
3303 /*=======================================================================
3304 			Receiver routines
3305   =======================================================================*/
3306 
3307 int netdev_max_backlog __read_mostly = 1000;
3308 EXPORT_SYMBOL(netdev_max_backlog);
3309 
3310 int netdev_tstamp_prequeue __read_mostly = 1;
3311 int netdev_budget __read_mostly = 300;
3312 int weight_p __read_mostly = 64;            /* old backlog weight */
3313 
3314 /* Called with irq disabled */
3315 static inline void ____napi_schedule(struct softnet_data *sd,
3316 				     struct napi_struct *napi)
3317 {
3318 	list_add_tail(&napi->poll_list, &sd->poll_list);
3319 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3320 }
3321 
3322 #ifdef CONFIG_RPS
3323 
3324 /* One global table that all flow-based protocols share. */
3325 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3326 EXPORT_SYMBOL(rps_sock_flow_table);
3327 u32 rps_cpu_mask __read_mostly;
3328 EXPORT_SYMBOL(rps_cpu_mask);
3329 
3330 struct static_key rps_needed __read_mostly;
3331 EXPORT_SYMBOL(rps_needed);
3332 
3333 static struct rps_dev_flow *
3334 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3335 	    struct rps_dev_flow *rflow, u16 next_cpu)
3336 {
3337 	if (next_cpu < nr_cpu_ids) {
3338 #ifdef CONFIG_RFS_ACCEL
3339 		struct netdev_rx_queue *rxqueue;
3340 		struct rps_dev_flow_table *flow_table;
3341 		struct rps_dev_flow *old_rflow;
3342 		u32 flow_id;
3343 		u16 rxq_index;
3344 		int rc;
3345 
3346 		/* Should we steer this flow to a different hardware queue? */
3347 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3348 		    !(dev->features & NETIF_F_NTUPLE))
3349 			goto out;
3350 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3351 		if (rxq_index == skb_get_rx_queue(skb))
3352 			goto out;
3353 
3354 		rxqueue = dev->_rx + rxq_index;
3355 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3356 		if (!flow_table)
3357 			goto out;
3358 		flow_id = skb_get_hash(skb) & flow_table->mask;
3359 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3360 							rxq_index, flow_id);
3361 		if (rc < 0)
3362 			goto out;
3363 		old_rflow = rflow;
3364 		rflow = &flow_table->flows[flow_id];
3365 		rflow->filter = rc;
3366 		if (old_rflow->filter == rflow->filter)
3367 			old_rflow->filter = RPS_NO_FILTER;
3368 	out:
3369 #endif
3370 		rflow->last_qtail =
3371 			per_cpu(softnet_data, next_cpu).input_queue_head;
3372 	}
3373 
3374 	rflow->cpu = next_cpu;
3375 	return rflow;
3376 }
3377 
3378 /*
3379  * get_rps_cpu is called from netif_receive_skb and returns the target
3380  * CPU from the RPS map of the receiving queue for a given skb.
3381  * rcu_read_lock must be held on entry.
3382  */
3383 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3384 		       struct rps_dev_flow **rflowp)
3385 {
3386 	const struct rps_sock_flow_table *sock_flow_table;
3387 	struct netdev_rx_queue *rxqueue = dev->_rx;
3388 	struct rps_dev_flow_table *flow_table;
3389 	struct rps_map *map;
3390 	int cpu = -1;
3391 	u32 tcpu;
3392 	u32 hash;
3393 
3394 	if (skb_rx_queue_recorded(skb)) {
3395 		u16 index = skb_get_rx_queue(skb);
3396 
3397 		if (unlikely(index >= dev->real_num_rx_queues)) {
3398 			WARN_ONCE(dev->real_num_rx_queues > 1,
3399 				  "%s received packet on queue %u, but number "
3400 				  "of RX queues is %u\n",
3401 				  dev->name, index, dev->real_num_rx_queues);
3402 			goto done;
3403 		}
3404 		rxqueue += index;
3405 	}
3406 
3407 	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3408 
3409 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3410 	map = rcu_dereference(rxqueue->rps_map);
3411 	if (!flow_table && !map)
3412 		goto done;
3413 
3414 	skb_reset_network_header(skb);
3415 	hash = skb_get_hash(skb);
3416 	if (!hash)
3417 		goto done;
3418 
3419 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3420 	if (flow_table && sock_flow_table) {
3421 		struct rps_dev_flow *rflow;
3422 		u32 next_cpu;
3423 		u32 ident;
3424 
3425 		/* First check into global flow table if there is a match */
3426 		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3427 		if ((ident ^ hash) & ~rps_cpu_mask)
3428 			goto try_rps;
3429 
3430 		next_cpu = ident & rps_cpu_mask;
3431 
3432 		/* OK, now we know there is a match,
3433 		 * we can look at the local (per receive queue) flow table
3434 		 */
3435 		rflow = &flow_table->flows[hash & flow_table->mask];
3436 		tcpu = rflow->cpu;
3437 
3438 		/*
3439 		 * If the desired CPU (where last recvmsg was done) is
3440 		 * different from current CPU (one in the rx-queue flow
3441 		 * table entry), switch if one of the following holds:
3442 		 *   - Current CPU is unset (>= nr_cpu_ids).
3443 		 *   - Current CPU is offline.
3444 		 *   - The current CPU's queue tail has advanced beyond the
3445 		 *     last packet that was enqueued using this table entry.
3446 		 *     This guarantees that all previous packets for the flow
3447 		 *     have been dequeued, thus preserving in order delivery.
3448 		 */
3449 		if (unlikely(tcpu != next_cpu) &&
3450 		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3451 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3452 		      rflow->last_qtail)) >= 0)) {
3453 			tcpu = next_cpu;
3454 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3455 		}
3456 
3457 		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3458 			*rflowp = rflow;
3459 			cpu = tcpu;
3460 			goto done;
3461 		}
3462 	}
3463 
3464 try_rps:
3465 
3466 	if (map) {
3467 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3468 		if (cpu_online(tcpu)) {
3469 			cpu = tcpu;
3470 			goto done;
3471 		}
3472 	}
3473 
3474 done:
3475 	return cpu;
3476 }
3477 
3478 #ifdef CONFIG_RFS_ACCEL
3479 
3480 /**
3481  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3482  * @dev: Device on which the filter was set
3483  * @rxq_index: RX queue index
3484  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3485  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3486  *
3487  * Drivers that implement ndo_rx_flow_steer() should periodically call
3488  * this function for each installed filter and remove the filters for
3489  * which it returns %true.
3490  */
3491 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3492 			 u32 flow_id, u16 filter_id)
3493 {
3494 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3495 	struct rps_dev_flow_table *flow_table;
3496 	struct rps_dev_flow *rflow;
3497 	bool expire = true;
3498 	unsigned int cpu;
3499 
3500 	rcu_read_lock();
3501 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3502 	if (flow_table && flow_id <= flow_table->mask) {
3503 		rflow = &flow_table->flows[flow_id];
3504 		cpu = ACCESS_ONCE(rflow->cpu);
3505 		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3506 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3507 			   rflow->last_qtail) <
3508 		     (int)(10 * flow_table->mask)))
3509 			expire = false;
3510 	}
3511 	rcu_read_unlock();
3512 	return expire;
3513 }
3514 EXPORT_SYMBOL(rps_may_expire_flow);
3515 
3516 #endif /* CONFIG_RFS_ACCEL */
3517 
3518 /* Called from hardirq (IPI) context */
3519 static void rps_trigger_softirq(void *data)
3520 {
3521 	struct softnet_data *sd = data;
3522 
3523 	____napi_schedule(sd, &sd->backlog);
3524 	sd->received_rps++;
3525 }
3526 
3527 #endif /* CONFIG_RPS */
3528 
3529 /*
3530  * Check if this softnet_data structure is another cpu one
3531  * If yes, queue it to our IPI list and return 1
3532  * If no, return 0
3533  */
3534 static int rps_ipi_queued(struct softnet_data *sd)
3535 {
3536 #ifdef CONFIG_RPS
3537 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3538 
3539 	if (sd != mysd) {
3540 		sd->rps_ipi_next = mysd->rps_ipi_list;
3541 		mysd->rps_ipi_list = sd;
3542 
3543 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3544 		return 1;
3545 	}
3546 #endif /* CONFIG_RPS */
3547 	return 0;
3548 }
3549 
3550 #ifdef CONFIG_NET_FLOW_LIMIT
3551 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3552 #endif
3553 
3554 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3555 {
3556 #ifdef CONFIG_NET_FLOW_LIMIT
3557 	struct sd_flow_limit *fl;
3558 	struct softnet_data *sd;
3559 	unsigned int old_flow, new_flow;
3560 
3561 	if (qlen < (netdev_max_backlog >> 1))
3562 		return false;
3563 
3564 	sd = this_cpu_ptr(&softnet_data);
3565 
3566 	rcu_read_lock();
3567 	fl = rcu_dereference(sd->flow_limit);
3568 	if (fl) {
3569 		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3570 		old_flow = fl->history[fl->history_head];
3571 		fl->history[fl->history_head] = new_flow;
3572 
3573 		fl->history_head++;
3574 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3575 
3576 		if (likely(fl->buckets[old_flow]))
3577 			fl->buckets[old_flow]--;
3578 
3579 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3580 			fl->count++;
3581 			rcu_read_unlock();
3582 			return true;
3583 		}
3584 	}
3585 	rcu_read_unlock();
3586 #endif
3587 	return false;
3588 }
3589 
3590 /*
3591  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3592  * queue (may be a remote CPU queue).
3593  */
3594 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3595 			      unsigned int *qtail)
3596 {
3597 	struct softnet_data *sd;
3598 	unsigned long flags;
3599 	unsigned int qlen;
3600 
3601 	sd = &per_cpu(softnet_data, cpu);
3602 
3603 	local_irq_save(flags);
3604 
3605 	rps_lock(sd);
3606 	if (!netif_running(skb->dev))
3607 		goto drop;
3608 	qlen = skb_queue_len(&sd->input_pkt_queue);
3609 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3610 		if (qlen) {
3611 enqueue:
3612 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3613 			input_queue_tail_incr_save(sd, qtail);
3614 			rps_unlock(sd);
3615 			local_irq_restore(flags);
3616 			return NET_RX_SUCCESS;
3617 		}
3618 
3619 		/* Schedule NAPI for backlog device
3620 		 * We can use non atomic operation since we own the queue lock
3621 		 */
3622 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3623 			if (!rps_ipi_queued(sd))
3624 				____napi_schedule(sd, &sd->backlog);
3625 		}
3626 		goto enqueue;
3627 	}
3628 
3629 drop:
3630 	sd->dropped++;
3631 	rps_unlock(sd);
3632 
3633 	local_irq_restore(flags);
3634 
3635 	atomic_long_inc(&skb->dev->rx_dropped);
3636 	kfree_skb(skb);
3637 	return NET_RX_DROP;
3638 }
3639 
3640 static int netif_rx_internal(struct sk_buff *skb)
3641 {
3642 	int ret;
3643 
3644 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3645 
3646 	trace_netif_rx(skb);
3647 #ifdef CONFIG_RPS
3648 	if (static_key_false(&rps_needed)) {
3649 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3650 		int cpu;
3651 
3652 		preempt_disable();
3653 		rcu_read_lock();
3654 
3655 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3656 		if (cpu < 0)
3657 			cpu = smp_processor_id();
3658 
3659 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3660 
3661 		rcu_read_unlock();
3662 		preempt_enable();
3663 	} else
3664 #endif
3665 	{
3666 		unsigned int qtail;
3667 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3668 		put_cpu();
3669 	}
3670 	return ret;
3671 }
3672 
3673 /**
3674  *	netif_rx	-	post buffer to the network code
3675  *	@skb: buffer to post
3676  *
3677  *	This function receives a packet from a device driver and queues it for
3678  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3679  *	may be dropped during processing for congestion control or by the
3680  *	protocol layers.
3681  *
3682  *	return values:
3683  *	NET_RX_SUCCESS	(no congestion)
3684  *	NET_RX_DROP     (packet was dropped)
3685  *
3686  */
3687 
3688 int netif_rx(struct sk_buff *skb)
3689 {
3690 	trace_netif_rx_entry(skb);
3691 
3692 	return netif_rx_internal(skb);
3693 }
3694 EXPORT_SYMBOL(netif_rx);
3695 
3696 int netif_rx_ni(struct sk_buff *skb)
3697 {
3698 	int err;
3699 
3700 	trace_netif_rx_ni_entry(skb);
3701 
3702 	preempt_disable();
3703 	err = netif_rx_internal(skb);
3704 	if (local_softirq_pending())
3705 		do_softirq();
3706 	preempt_enable();
3707 
3708 	return err;
3709 }
3710 EXPORT_SYMBOL(netif_rx_ni);
3711 
3712 static void net_tx_action(struct softirq_action *h)
3713 {
3714 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3715 
3716 	if (sd->completion_queue) {
3717 		struct sk_buff *clist;
3718 
3719 		local_irq_disable();
3720 		clist = sd->completion_queue;
3721 		sd->completion_queue = NULL;
3722 		local_irq_enable();
3723 
3724 		while (clist) {
3725 			struct sk_buff *skb = clist;
3726 			clist = clist->next;
3727 
3728 			WARN_ON(atomic_read(&skb->users));
3729 			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3730 				trace_consume_skb(skb);
3731 			else
3732 				trace_kfree_skb(skb, net_tx_action);
3733 
3734 			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3735 				__kfree_skb(skb);
3736 			else
3737 				__kfree_skb_defer(skb);
3738 		}
3739 
3740 		__kfree_skb_flush();
3741 	}
3742 
3743 	if (sd->output_queue) {
3744 		struct Qdisc *head;
3745 
3746 		local_irq_disable();
3747 		head = sd->output_queue;
3748 		sd->output_queue = NULL;
3749 		sd->output_queue_tailp = &sd->output_queue;
3750 		local_irq_enable();
3751 
3752 		while (head) {
3753 			struct Qdisc *q = head;
3754 			spinlock_t *root_lock;
3755 
3756 			head = head->next_sched;
3757 
3758 			root_lock = qdisc_lock(q);
3759 			spin_lock(root_lock);
3760 			/* We need to make sure head->next_sched is read
3761 			 * before clearing __QDISC_STATE_SCHED
3762 			 */
3763 			smp_mb__before_atomic();
3764 			clear_bit(__QDISC_STATE_SCHED, &q->state);
3765 			qdisc_run(q);
3766 			spin_unlock(root_lock);
3767 		}
3768 	}
3769 }
3770 
3771 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3772 /* This hook is defined here for ATM LANE */
3773 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3774 			     unsigned char *addr) __read_mostly;
3775 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3776 #endif
3777 
3778 static inline struct sk_buff *
3779 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3780 		   struct net_device *orig_dev)
3781 {
3782 #ifdef CONFIG_NET_CLS_ACT
3783 	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3784 	struct tcf_result cl_res;
3785 
3786 	/* If there's at least one ingress present somewhere (so
3787 	 * we get here via enabled static key), remaining devices
3788 	 * that are not configured with an ingress qdisc will bail
3789 	 * out here.
3790 	 */
3791 	if (!cl)
3792 		return skb;
3793 	if (*pt_prev) {
3794 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3795 		*pt_prev = NULL;
3796 	}
3797 
3798 	qdisc_skb_cb(skb)->pkt_len = skb->len;
3799 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3800 	qdisc_bstats_cpu_update(cl->q, skb);
3801 
3802 	switch (tc_classify(skb, cl, &cl_res, false)) {
3803 	case TC_ACT_OK:
3804 	case TC_ACT_RECLASSIFY:
3805 		skb->tc_index = TC_H_MIN(cl_res.classid);
3806 		break;
3807 	case TC_ACT_SHOT:
3808 		qdisc_qstats_cpu_drop(cl->q);
3809 		kfree_skb(skb);
3810 		return NULL;
3811 	case TC_ACT_STOLEN:
3812 	case TC_ACT_QUEUED:
3813 		consume_skb(skb);
3814 		return NULL;
3815 	case TC_ACT_REDIRECT:
3816 		/* skb_mac_header check was done by cls/act_bpf, so
3817 		 * we can safely push the L2 header back before
3818 		 * redirecting to another netdev
3819 		 */
3820 		__skb_push(skb, skb->mac_len);
3821 		skb_do_redirect(skb);
3822 		return NULL;
3823 	default:
3824 		break;
3825 	}
3826 #endif /* CONFIG_NET_CLS_ACT */
3827 	return skb;
3828 }
3829 
3830 /**
3831  *	netdev_is_rx_handler_busy - check if receive handler is registered
3832  *	@dev: device to check
3833  *
3834  *	Check if a receive handler is already registered for a given device.
3835  *	Return true if there one.
3836  *
3837  *	The caller must hold the rtnl_mutex.
3838  */
3839 bool netdev_is_rx_handler_busy(struct net_device *dev)
3840 {
3841 	ASSERT_RTNL();
3842 	return dev && rtnl_dereference(dev->rx_handler);
3843 }
3844 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3845 
3846 /**
3847  *	netdev_rx_handler_register - register receive handler
3848  *	@dev: device to register a handler for
3849  *	@rx_handler: receive handler to register
3850  *	@rx_handler_data: data pointer that is used by rx handler
3851  *
3852  *	Register a receive handler for a device. This handler will then be
3853  *	called from __netif_receive_skb. A negative errno code is returned
3854  *	on a failure.
3855  *
3856  *	The caller must hold the rtnl_mutex.
3857  *
3858  *	For a general description of rx_handler, see enum rx_handler_result.
3859  */
3860 int netdev_rx_handler_register(struct net_device *dev,
3861 			       rx_handler_func_t *rx_handler,
3862 			       void *rx_handler_data)
3863 {
3864 	ASSERT_RTNL();
3865 
3866 	if (dev->rx_handler)
3867 		return -EBUSY;
3868 
3869 	/* Note: rx_handler_data must be set before rx_handler */
3870 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3871 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3872 
3873 	return 0;
3874 }
3875 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3876 
3877 /**
3878  *	netdev_rx_handler_unregister - unregister receive handler
3879  *	@dev: device to unregister a handler from
3880  *
3881  *	Unregister a receive handler from a device.
3882  *
3883  *	The caller must hold the rtnl_mutex.
3884  */
3885 void netdev_rx_handler_unregister(struct net_device *dev)
3886 {
3887 
3888 	ASSERT_RTNL();
3889 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3890 	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3891 	 * section has a guarantee to see a non NULL rx_handler_data
3892 	 * as well.
3893 	 */
3894 	synchronize_net();
3895 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3896 }
3897 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3898 
3899 /*
3900  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3901  * the special handling of PFMEMALLOC skbs.
3902  */
3903 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3904 {
3905 	switch (skb->protocol) {
3906 	case htons(ETH_P_ARP):
3907 	case htons(ETH_P_IP):
3908 	case htons(ETH_P_IPV6):
3909 	case htons(ETH_P_8021Q):
3910 	case htons(ETH_P_8021AD):
3911 		return true;
3912 	default:
3913 		return false;
3914 	}
3915 }
3916 
3917 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3918 			     int *ret, struct net_device *orig_dev)
3919 {
3920 #ifdef CONFIG_NETFILTER_INGRESS
3921 	if (nf_hook_ingress_active(skb)) {
3922 		int ingress_retval;
3923 
3924 		if (*pt_prev) {
3925 			*ret = deliver_skb(skb, *pt_prev, orig_dev);
3926 			*pt_prev = NULL;
3927 		}
3928 
3929 		rcu_read_lock();
3930 		ingress_retval = nf_hook_ingress(skb);
3931 		rcu_read_unlock();
3932 		return ingress_retval;
3933 	}
3934 #endif /* CONFIG_NETFILTER_INGRESS */
3935 	return 0;
3936 }
3937 
3938 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3939 {
3940 	struct packet_type *ptype, *pt_prev;
3941 	rx_handler_func_t *rx_handler;
3942 	struct net_device *orig_dev;
3943 	bool deliver_exact = false;
3944 	int ret = NET_RX_DROP;
3945 	__be16 type;
3946 
3947 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3948 
3949 	trace_netif_receive_skb(skb);
3950 
3951 	orig_dev = skb->dev;
3952 
3953 	skb_reset_network_header(skb);
3954 	if (!skb_transport_header_was_set(skb))
3955 		skb_reset_transport_header(skb);
3956 	skb_reset_mac_len(skb);
3957 
3958 	pt_prev = NULL;
3959 
3960 another_round:
3961 	skb->skb_iif = skb->dev->ifindex;
3962 
3963 	__this_cpu_inc(softnet_data.processed);
3964 
3965 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3966 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3967 		skb = skb_vlan_untag(skb);
3968 		if (unlikely(!skb))
3969 			goto out;
3970 	}
3971 
3972 #ifdef CONFIG_NET_CLS_ACT
3973 	if (skb->tc_verd & TC_NCLS) {
3974 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3975 		goto ncls;
3976 	}
3977 #endif
3978 
3979 	if (pfmemalloc)
3980 		goto skip_taps;
3981 
3982 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3983 		if (pt_prev)
3984 			ret = deliver_skb(skb, pt_prev, orig_dev);
3985 		pt_prev = ptype;
3986 	}
3987 
3988 	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3989 		if (pt_prev)
3990 			ret = deliver_skb(skb, pt_prev, orig_dev);
3991 		pt_prev = ptype;
3992 	}
3993 
3994 skip_taps:
3995 #ifdef CONFIG_NET_INGRESS
3996 	if (static_key_false(&ingress_needed)) {
3997 		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
3998 		if (!skb)
3999 			goto out;
4000 
4001 		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4002 			goto out;
4003 	}
4004 #endif
4005 #ifdef CONFIG_NET_CLS_ACT
4006 	skb->tc_verd = 0;
4007 ncls:
4008 #endif
4009 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4010 		goto drop;
4011 
4012 	if (skb_vlan_tag_present(skb)) {
4013 		if (pt_prev) {
4014 			ret = deliver_skb(skb, pt_prev, orig_dev);
4015 			pt_prev = NULL;
4016 		}
4017 		if (vlan_do_receive(&skb))
4018 			goto another_round;
4019 		else if (unlikely(!skb))
4020 			goto out;
4021 	}
4022 
4023 	rx_handler = rcu_dereference(skb->dev->rx_handler);
4024 	if (rx_handler) {
4025 		if (pt_prev) {
4026 			ret = deliver_skb(skb, pt_prev, orig_dev);
4027 			pt_prev = NULL;
4028 		}
4029 		switch (rx_handler(&skb)) {
4030 		case RX_HANDLER_CONSUMED:
4031 			ret = NET_RX_SUCCESS;
4032 			goto out;
4033 		case RX_HANDLER_ANOTHER:
4034 			goto another_round;
4035 		case RX_HANDLER_EXACT:
4036 			deliver_exact = true;
4037 		case RX_HANDLER_PASS:
4038 			break;
4039 		default:
4040 			BUG();
4041 		}
4042 	}
4043 
4044 	if (unlikely(skb_vlan_tag_present(skb))) {
4045 		if (skb_vlan_tag_get_id(skb))
4046 			skb->pkt_type = PACKET_OTHERHOST;
4047 		/* Note: we might in the future use prio bits
4048 		 * and set skb->priority like in vlan_do_receive()
4049 		 * For the time being, just ignore Priority Code Point
4050 		 */
4051 		skb->vlan_tci = 0;
4052 	}
4053 
4054 	type = skb->protocol;
4055 
4056 	/* deliver only exact match when indicated */
4057 	if (likely(!deliver_exact)) {
4058 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4059 				       &ptype_base[ntohs(type) &
4060 						   PTYPE_HASH_MASK]);
4061 	}
4062 
4063 	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4064 			       &orig_dev->ptype_specific);
4065 
4066 	if (unlikely(skb->dev != orig_dev)) {
4067 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4068 				       &skb->dev->ptype_specific);
4069 	}
4070 
4071 	if (pt_prev) {
4072 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4073 			goto drop;
4074 		else
4075 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4076 	} else {
4077 drop:
4078 		if (!deliver_exact)
4079 			atomic_long_inc(&skb->dev->rx_dropped);
4080 		else
4081 			atomic_long_inc(&skb->dev->rx_nohandler);
4082 		kfree_skb(skb);
4083 		/* Jamal, now you will not able to escape explaining
4084 		 * me how you were going to use this. :-)
4085 		 */
4086 		ret = NET_RX_DROP;
4087 	}
4088 
4089 out:
4090 	return ret;
4091 }
4092 
4093 static int __netif_receive_skb(struct sk_buff *skb)
4094 {
4095 	int ret;
4096 
4097 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4098 		unsigned long pflags = current->flags;
4099 
4100 		/*
4101 		 * PFMEMALLOC skbs are special, they should
4102 		 * - be delivered to SOCK_MEMALLOC sockets only
4103 		 * - stay away from userspace
4104 		 * - have bounded memory usage
4105 		 *
4106 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4107 		 * context down to all allocation sites.
4108 		 */
4109 		current->flags |= PF_MEMALLOC;
4110 		ret = __netif_receive_skb_core(skb, true);
4111 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
4112 	} else
4113 		ret = __netif_receive_skb_core(skb, false);
4114 
4115 	return ret;
4116 }
4117 
4118 static int netif_receive_skb_internal(struct sk_buff *skb)
4119 {
4120 	int ret;
4121 
4122 	net_timestamp_check(netdev_tstamp_prequeue, skb);
4123 
4124 	if (skb_defer_rx_timestamp(skb))
4125 		return NET_RX_SUCCESS;
4126 
4127 	rcu_read_lock();
4128 
4129 #ifdef CONFIG_RPS
4130 	if (static_key_false(&rps_needed)) {
4131 		struct rps_dev_flow voidflow, *rflow = &voidflow;
4132 		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4133 
4134 		if (cpu >= 0) {
4135 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4136 			rcu_read_unlock();
4137 			return ret;
4138 		}
4139 	}
4140 #endif
4141 	ret = __netif_receive_skb(skb);
4142 	rcu_read_unlock();
4143 	return ret;
4144 }
4145 
4146 /**
4147  *	netif_receive_skb - process receive buffer from network
4148  *	@skb: buffer to process
4149  *
4150  *	netif_receive_skb() is the main receive data processing function.
4151  *	It always succeeds. The buffer may be dropped during processing
4152  *	for congestion control or by the protocol layers.
4153  *
4154  *	This function may only be called from softirq context and interrupts
4155  *	should be enabled.
4156  *
4157  *	Return values (usually ignored):
4158  *	NET_RX_SUCCESS: no congestion
4159  *	NET_RX_DROP: packet was dropped
4160  */
4161 int netif_receive_skb(struct sk_buff *skb)
4162 {
4163 	trace_netif_receive_skb_entry(skb);
4164 
4165 	return netif_receive_skb_internal(skb);
4166 }
4167 EXPORT_SYMBOL(netif_receive_skb);
4168 
4169 DEFINE_PER_CPU(struct work_struct, flush_works);
4170 
4171 /* Network device is going away, flush any packets still pending */
4172 static void flush_backlog(struct work_struct *work)
4173 {
4174 	struct sk_buff *skb, *tmp;
4175 	struct softnet_data *sd;
4176 
4177 	local_bh_disable();
4178 	sd = this_cpu_ptr(&softnet_data);
4179 
4180 	local_irq_disable();
4181 	rps_lock(sd);
4182 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4183 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4184 			__skb_unlink(skb, &sd->input_pkt_queue);
4185 			kfree_skb(skb);
4186 			input_queue_head_incr(sd);
4187 		}
4188 	}
4189 	rps_unlock(sd);
4190 	local_irq_enable();
4191 
4192 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4193 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4194 			__skb_unlink(skb, &sd->process_queue);
4195 			kfree_skb(skb);
4196 			input_queue_head_incr(sd);
4197 		}
4198 	}
4199 	local_bh_enable();
4200 }
4201 
4202 static void flush_all_backlogs(void)
4203 {
4204 	unsigned int cpu;
4205 
4206 	get_online_cpus();
4207 
4208 	for_each_online_cpu(cpu)
4209 		queue_work_on(cpu, system_highpri_wq,
4210 			      per_cpu_ptr(&flush_works, cpu));
4211 
4212 	for_each_online_cpu(cpu)
4213 		flush_work(per_cpu_ptr(&flush_works, cpu));
4214 
4215 	put_online_cpus();
4216 }
4217 
4218 static int napi_gro_complete(struct sk_buff *skb)
4219 {
4220 	struct packet_offload *ptype;
4221 	__be16 type = skb->protocol;
4222 	struct list_head *head = &offload_base;
4223 	int err = -ENOENT;
4224 
4225 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4226 
4227 	if (NAPI_GRO_CB(skb)->count == 1) {
4228 		skb_shinfo(skb)->gso_size = 0;
4229 		goto out;
4230 	}
4231 
4232 	rcu_read_lock();
4233 	list_for_each_entry_rcu(ptype, head, list) {
4234 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4235 			continue;
4236 
4237 		err = ptype->callbacks.gro_complete(skb, 0);
4238 		break;
4239 	}
4240 	rcu_read_unlock();
4241 
4242 	if (err) {
4243 		WARN_ON(&ptype->list == head);
4244 		kfree_skb(skb);
4245 		return NET_RX_SUCCESS;
4246 	}
4247 
4248 out:
4249 	return netif_receive_skb_internal(skb);
4250 }
4251 
4252 /* napi->gro_list contains packets ordered by age.
4253  * youngest packets at the head of it.
4254  * Complete skbs in reverse order to reduce latencies.
4255  */
4256 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4257 {
4258 	struct sk_buff *skb, *prev = NULL;
4259 
4260 	/* scan list and build reverse chain */
4261 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4262 		skb->prev = prev;
4263 		prev = skb;
4264 	}
4265 
4266 	for (skb = prev; skb; skb = prev) {
4267 		skb->next = NULL;
4268 
4269 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4270 			return;
4271 
4272 		prev = skb->prev;
4273 		napi_gro_complete(skb);
4274 		napi->gro_count--;
4275 	}
4276 
4277 	napi->gro_list = NULL;
4278 }
4279 EXPORT_SYMBOL(napi_gro_flush);
4280 
4281 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4282 {
4283 	struct sk_buff *p;
4284 	unsigned int maclen = skb->dev->hard_header_len;
4285 	u32 hash = skb_get_hash_raw(skb);
4286 
4287 	for (p = napi->gro_list; p; p = p->next) {
4288 		unsigned long diffs;
4289 
4290 		NAPI_GRO_CB(p)->flush = 0;
4291 
4292 		if (hash != skb_get_hash_raw(p)) {
4293 			NAPI_GRO_CB(p)->same_flow = 0;
4294 			continue;
4295 		}
4296 
4297 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4298 		diffs |= p->vlan_tci ^ skb->vlan_tci;
4299 		diffs |= skb_metadata_dst_cmp(p, skb);
4300 		if (maclen == ETH_HLEN)
4301 			diffs |= compare_ether_header(skb_mac_header(p),
4302 						      skb_mac_header(skb));
4303 		else if (!diffs)
4304 			diffs = memcmp(skb_mac_header(p),
4305 				       skb_mac_header(skb),
4306 				       maclen);
4307 		NAPI_GRO_CB(p)->same_flow = !diffs;
4308 	}
4309 }
4310 
4311 static void skb_gro_reset_offset(struct sk_buff *skb)
4312 {
4313 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4314 	const skb_frag_t *frag0 = &pinfo->frags[0];
4315 
4316 	NAPI_GRO_CB(skb)->data_offset = 0;
4317 	NAPI_GRO_CB(skb)->frag0 = NULL;
4318 	NAPI_GRO_CB(skb)->frag0_len = 0;
4319 
4320 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4321 	    pinfo->nr_frags &&
4322 	    !PageHighMem(skb_frag_page(frag0))) {
4323 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4324 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4325 	}
4326 }
4327 
4328 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4329 {
4330 	struct skb_shared_info *pinfo = skb_shinfo(skb);
4331 
4332 	BUG_ON(skb->end - skb->tail < grow);
4333 
4334 	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4335 
4336 	skb->data_len -= grow;
4337 	skb->tail += grow;
4338 
4339 	pinfo->frags[0].page_offset += grow;
4340 	skb_frag_size_sub(&pinfo->frags[0], grow);
4341 
4342 	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4343 		skb_frag_unref(skb, 0);
4344 		memmove(pinfo->frags, pinfo->frags + 1,
4345 			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4346 	}
4347 }
4348 
4349 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4350 {
4351 	struct sk_buff **pp = NULL;
4352 	struct packet_offload *ptype;
4353 	__be16 type = skb->protocol;
4354 	struct list_head *head = &offload_base;
4355 	int same_flow;
4356 	enum gro_result ret;
4357 	int grow;
4358 
4359 	if (!(skb->dev->features & NETIF_F_GRO))
4360 		goto normal;
4361 
4362 	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4363 		goto normal;
4364 
4365 	gro_list_prepare(napi, skb);
4366 
4367 	rcu_read_lock();
4368 	list_for_each_entry_rcu(ptype, head, list) {
4369 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4370 			continue;
4371 
4372 		skb_set_network_header(skb, skb_gro_offset(skb));
4373 		skb_reset_mac_len(skb);
4374 		NAPI_GRO_CB(skb)->same_flow = 0;
4375 		NAPI_GRO_CB(skb)->flush = 0;
4376 		NAPI_GRO_CB(skb)->free = 0;
4377 		NAPI_GRO_CB(skb)->encap_mark = 0;
4378 		NAPI_GRO_CB(skb)->is_fou = 0;
4379 		NAPI_GRO_CB(skb)->is_atomic = 1;
4380 		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4381 
4382 		/* Setup for GRO checksum validation */
4383 		switch (skb->ip_summed) {
4384 		case CHECKSUM_COMPLETE:
4385 			NAPI_GRO_CB(skb)->csum = skb->csum;
4386 			NAPI_GRO_CB(skb)->csum_valid = 1;
4387 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4388 			break;
4389 		case CHECKSUM_UNNECESSARY:
4390 			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4391 			NAPI_GRO_CB(skb)->csum_valid = 0;
4392 			break;
4393 		default:
4394 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4395 			NAPI_GRO_CB(skb)->csum_valid = 0;
4396 		}
4397 
4398 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4399 		break;
4400 	}
4401 	rcu_read_unlock();
4402 
4403 	if (&ptype->list == head)
4404 		goto normal;
4405 
4406 	same_flow = NAPI_GRO_CB(skb)->same_flow;
4407 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4408 
4409 	if (pp) {
4410 		struct sk_buff *nskb = *pp;
4411 
4412 		*pp = nskb->next;
4413 		nskb->next = NULL;
4414 		napi_gro_complete(nskb);
4415 		napi->gro_count--;
4416 	}
4417 
4418 	if (same_flow)
4419 		goto ok;
4420 
4421 	if (NAPI_GRO_CB(skb)->flush)
4422 		goto normal;
4423 
4424 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4425 		struct sk_buff *nskb = napi->gro_list;
4426 
4427 		/* locate the end of the list to select the 'oldest' flow */
4428 		while (nskb->next) {
4429 			pp = &nskb->next;
4430 			nskb = *pp;
4431 		}
4432 		*pp = NULL;
4433 		nskb->next = NULL;
4434 		napi_gro_complete(nskb);
4435 	} else {
4436 		napi->gro_count++;
4437 	}
4438 	NAPI_GRO_CB(skb)->count = 1;
4439 	NAPI_GRO_CB(skb)->age = jiffies;
4440 	NAPI_GRO_CB(skb)->last = skb;
4441 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4442 	skb->next = napi->gro_list;
4443 	napi->gro_list = skb;
4444 	ret = GRO_HELD;
4445 
4446 pull:
4447 	grow = skb_gro_offset(skb) - skb_headlen(skb);
4448 	if (grow > 0)
4449 		gro_pull_from_frag0(skb, grow);
4450 ok:
4451 	return ret;
4452 
4453 normal:
4454 	ret = GRO_NORMAL;
4455 	goto pull;
4456 }
4457 
4458 struct packet_offload *gro_find_receive_by_type(__be16 type)
4459 {
4460 	struct list_head *offload_head = &offload_base;
4461 	struct packet_offload *ptype;
4462 
4463 	list_for_each_entry_rcu(ptype, offload_head, list) {
4464 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4465 			continue;
4466 		return ptype;
4467 	}
4468 	return NULL;
4469 }
4470 EXPORT_SYMBOL(gro_find_receive_by_type);
4471 
4472 struct packet_offload *gro_find_complete_by_type(__be16 type)
4473 {
4474 	struct list_head *offload_head = &offload_base;
4475 	struct packet_offload *ptype;
4476 
4477 	list_for_each_entry_rcu(ptype, offload_head, list) {
4478 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4479 			continue;
4480 		return ptype;
4481 	}
4482 	return NULL;
4483 }
4484 EXPORT_SYMBOL(gro_find_complete_by_type);
4485 
4486 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4487 {
4488 	switch (ret) {
4489 	case GRO_NORMAL:
4490 		if (netif_receive_skb_internal(skb))
4491 			ret = GRO_DROP;
4492 		break;
4493 
4494 	case GRO_DROP:
4495 		kfree_skb(skb);
4496 		break;
4497 
4498 	case GRO_MERGED_FREE:
4499 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4500 			skb_dst_drop(skb);
4501 			kmem_cache_free(skbuff_head_cache, skb);
4502 		} else {
4503 			__kfree_skb(skb);
4504 		}
4505 		break;
4506 
4507 	case GRO_HELD:
4508 	case GRO_MERGED:
4509 		break;
4510 	}
4511 
4512 	return ret;
4513 }
4514 
4515 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4516 {
4517 	skb_mark_napi_id(skb, napi);
4518 	trace_napi_gro_receive_entry(skb);
4519 
4520 	skb_gro_reset_offset(skb);
4521 
4522 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4523 }
4524 EXPORT_SYMBOL(napi_gro_receive);
4525 
4526 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4527 {
4528 	if (unlikely(skb->pfmemalloc)) {
4529 		consume_skb(skb);
4530 		return;
4531 	}
4532 	__skb_pull(skb, skb_headlen(skb));
4533 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4534 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4535 	skb->vlan_tci = 0;
4536 	skb->dev = napi->dev;
4537 	skb->skb_iif = 0;
4538 	skb->encapsulation = 0;
4539 	skb_shinfo(skb)->gso_type = 0;
4540 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4541 
4542 	napi->skb = skb;
4543 }
4544 
4545 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4546 {
4547 	struct sk_buff *skb = napi->skb;
4548 
4549 	if (!skb) {
4550 		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4551 		if (skb) {
4552 			napi->skb = skb;
4553 			skb_mark_napi_id(skb, napi);
4554 		}
4555 	}
4556 	return skb;
4557 }
4558 EXPORT_SYMBOL(napi_get_frags);
4559 
4560 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4561 				      struct sk_buff *skb,
4562 				      gro_result_t ret)
4563 {
4564 	switch (ret) {
4565 	case GRO_NORMAL:
4566 	case GRO_HELD:
4567 		__skb_push(skb, ETH_HLEN);
4568 		skb->protocol = eth_type_trans(skb, skb->dev);
4569 		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4570 			ret = GRO_DROP;
4571 		break;
4572 
4573 	case GRO_DROP:
4574 	case GRO_MERGED_FREE:
4575 		napi_reuse_skb(napi, skb);
4576 		break;
4577 
4578 	case GRO_MERGED:
4579 		break;
4580 	}
4581 
4582 	return ret;
4583 }
4584 
4585 /* Upper GRO stack assumes network header starts at gro_offset=0
4586  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4587  * We copy ethernet header into skb->data to have a common layout.
4588  */
4589 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4590 {
4591 	struct sk_buff *skb = napi->skb;
4592 	const struct ethhdr *eth;
4593 	unsigned int hlen = sizeof(*eth);
4594 
4595 	napi->skb = NULL;
4596 
4597 	skb_reset_mac_header(skb);
4598 	skb_gro_reset_offset(skb);
4599 
4600 	eth = skb_gro_header_fast(skb, 0);
4601 	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4602 		eth = skb_gro_header_slow(skb, hlen, 0);
4603 		if (unlikely(!eth)) {
4604 			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4605 					     __func__, napi->dev->name);
4606 			napi_reuse_skb(napi, skb);
4607 			return NULL;
4608 		}
4609 	} else {
4610 		gro_pull_from_frag0(skb, hlen);
4611 		NAPI_GRO_CB(skb)->frag0 += hlen;
4612 		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4613 	}
4614 	__skb_pull(skb, hlen);
4615 
4616 	/*
4617 	 * This works because the only protocols we care about don't require
4618 	 * special handling.
4619 	 * We'll fix it up properly in napi_frags_finish()
4620 	 */
4621 	skb->protocol = eth->h_proto;
4622 
4623 	return skb;
4624 }
4625 
4626 gro_result_t napi_gro_frags(struct napi_struct *napi)
4627 {
4628 	struct sk_buff *skb = napi_frags_skb(napi);
4629 
4630 	if (!skb)
4631 		return GRO_DROP;
4632 
4633 	trace_napi_gro_frags_entry(skb);
4634 
4635 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4636 }
4637 EXPORT_SYMBOL(napi_gro_frags);
4638 
4639 /* Compute the checksum from gro_offset and return the folded value
4640  * after adding in any pseudo checksum.
4641  */
4642 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4643 {
4644 	__wsum wsum;
4645 	__sum16 sum;
4646 
4647 	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4648 
4649 	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4650 	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4651 	if (likely(!sum)) {
4652 		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4653 		    !skb->csum_complete_sw)
4654 			netdev_rx_csum_fault(skb->dev);
4655 	}
4656 
4657 	NAPI_GRO_CB(skb)->csum = wsum;
4658 	NAPI_GRO_CB(skb)->csum_valid = 1;
4659 
4660 	return sum;
4661 }
4662 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4663 
4664 /*
4665  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4666  * Note: called with local irq disabled, but exits with local irq enabled.
4667  */
4668 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4669 {
4670 #ifdef CONFIG_RPS
4671 	struct softnet_data *remsd = sd->rps_ipi_list;
4672 
4673 	if (remsd) {
4674 		sd->rps_ipi_list = NULL;
4675 
4676 		local_irq_enable();
4677 
4678 		/* Send pending IPI's to kick RPS processing on remote cpus. */
4679 		while (remsd) {
4680 			struct softnet_data *next = remsd->rps_ipi_next;
4681 
4682 			if (cpu_online(remsd->cpu))
4683 				smp_call_function_single_async(remsd->cpu,
4684 							   &remsd->csd);
4685 			remsd = next;
4686 		}
4687 	} else
4688 #endif
4689 		local_irq_enable();
4690 }
4691 
4692 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4693 {
4694 #ifdef CONFIG_RPS
4695 	return sd->rps_ipi_list != NULL;
4696 #else
4697 	return false;
4698 #endif
4699 }
4700 
4701 static int process_backlog(struct napi_struct *napi, int quota)
4702 {
4703 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4704 	bool again = true;
4705 	int work = 0;
4706 
4707 	/* Check if we have pending ipi, its better to send them now,
4708 	 * not waiting net_rx_action() end.
4709 	 */
4710 	if (sd_has_rps_ipi_waiting(sd)) {
4711 		local_irq_disable();
4712 		net_rps_action_and_irq_enable(sd);
4713 	}
4714 
4715 	napi->weight = weight_p;
4716 	while (again) {
4717 		struct sk_buff *skb;
4718 
4719 		while ((skb = __skb_dequeue(&sd->process_queue))) {
4720 			rcu_read_lock();
4721 			__netif_receive_skb(skb);
4722 			rcu_read_unlock();
4723 			input_queue_head_incr(sd);
4724 			if (++work >= quota)
4725 				return work;
4726 
4727 		}
4728 
4729 		local_irq_disable();
4730 		rps_lock(sd);
4731 		if (skb_queue_empty(&sd->input_pkt_queue)) {
4732 			/*
4733 			 * Inline a custom version of __napi_complete().
4734 			 * only current cpu owns and manipulates this napi,
4735 			 * and NAPI_STATE_SCHED is the only possible flag set
4736 			 * on backlog.
4737 			 * We can use a plain write instead of clear_bit(),
4738 			 * and we dont need an smp_mb() memory barrier.
4739 			 */
4740 			napi->state = 0;
4741 			again = false;
4742 		} else {
4743 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
4744 						   &sd->process_queue);
4745 		}
4746 		rps_unlock(sd);
4747 		local_irq_enable();
4748 	}
4749 
4750 	return work;
4751 }
4752 
4753 /**
4754  * __napi_schedule - schedule for receive
4755  * @n: entry to schedule
4756  *
4757  * The entry's receive function will be scheduled to run.
4758  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4759  */
4760 void __napi_schedule(struct napi_struct *n)
4761 {
4762 	unsigned long flags;
4763 
4764 	local_irq_save(flags);
4765 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4766 	local_irq_restore(flags);
4767 }
4768 EXPORT_SYMBOL(__napi_schedule);
4769 
4770 /**
4771  * __napi_schedule_irqoff - schedule for receive
4772  * @n: entry to schedule
4773  *
4774  * Variant of __napi_schedule() assuming hard irqs are masked
4775  */
4776 void __napi_schedule_irqoff(struct napi_struct *n)
4777 {
4778 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4779 }
4780 EXPORT_SYMBOL(__napi_schedule_irqoff);
4781 
4782 void __napi_complete(struct napi_struct *n)
4783 {
4784 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4785 
4786 	list_del_init(&n->poll_list);
4787 	smp_mb__before_atomic();
4788 	clear_bit(NAPI_STATE_SCHED, &n->state);
4789 }
4790 EXPORT_SYMBOL(__napi_complete);
4791 
4792 void napi_complete_done(struct napi_struct *n, int work_done)
4793 {
4794 	unsigned long flags;
4795 
4796 	/*
4797 	 * don't let napi dequeue from the cpu poll list
4798 	 * just in case its running on a different cpu
4799 	 */
4800 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4801 		return;
4802 
4803 	if (n->gro_list) {
4804 		unsigned long timeout = 0;
4805 
4806 		if (work_done)
4807 			timeout = n->dev->gro_flush_timeout;
4808 
4809 		if (timeout)
4810 			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4811 				      HRTIMER_MODE_REL_PINNED);
4812 		else
4813 			napi_gro_flush(n, false);
4814 	}
4815 	if (likely(list_empty(&n->poll_list))) {
4816 		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4817 	} else {
4818 		/* If n->poll_list is not empty, we need to mask irqs */
4819 		local_irq_save(flags);
4820 		__napi_complete(n);
4821 		local_irq_restore(flags);
4822 	}
4823 }
4824 EXPORT_SYMBOL(napi_complete_done);
4825 
4826 /* must be called under rcu_read_lock(), as we dont take a reference */
4827 static struct napi_struct *napi_by_id(unsigned int napi_id)
4828 {
4829 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4830 	struct napi_struct *napi;
4831 
4832 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4833 		if (napi->napi_id == napi_id)
4834 			return napi;
4835 
4836 	return NULL;
4837 }
4838 
4839 #if defined(CONFIG_NET_RX_BUSY_POLL)
4840 #define BUSY_POLL_BUDGET 8
4841 bool sk_busy_loop(struct sock *sk, int nonblock)
4842 {
4843 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4844 	int (*busy_poll)(struct napi_struct *dev);
4845 	struct napi_struct *napi;
4846 	int rc = false;
4847 
4848 	rcu_read_lock();
4849 
4850 	napi = napi_by_id(sk->sk_napi_id);
4851 	if (!napi)
4852 		goto out;
4853 
4854 	/* Note: ndo_busy_poll method is optional in linux-4.5 */
4855 	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4856 
4857 	do {
4858 		rc = 0;
4859 		local_bh_disable();
4860 		if (busy_poll) {
4861 			rc = busy_poll(napi);
4862 		} else if (napi_schedule_prep(napi)) {
4863 			void *have = netpoll_poll_lock(napi);
4864 
4865 			if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4866 				rc = napi->poll(napi, BUSY_POLL_BUDGET);
4867 				trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
4868 				if (rc == BUSY_POLL_BUDGET) {
4869 					napi_complete_done(napi, rc);
4870 					napi_schedule(napi);
4871 				}
4872 			}
4873 			netpoll_poll_unlock(have);
4874 		}
4875 		if (rc > 0)
4876 			__NET_ADD_STATS(sock_net(sk),
4877 					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
4878 		local_bh_enable();
4879 
4880 		if (rc == LL_FLUSH_FAILED)
4881 			break; /* permanent failure */
4882 
4883 		cpu_relax();
4884 	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
4885 		 !need_resched() && !busy_loop_timeout(end_time));
4886 
4887 	rc = !skb_queue_empty(&sk->sk_receive_queue);
4888 out:
4889 	rcu_read_unlock();
4890 	return rc;
4891 }
4892 EXPORT_SYMBOL(sk_busy_loop);
4893 
4894 #endif /* CONFIG_NET_RX_BUSY_POLL */
4895 
4896 void napi_hash_add(struct napi_struct *napi)
4897 {
4898 	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
4899 	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
4900 		return;
4901 
4902 	spin_lock(&napi_hash_lock);
4903 
4904 	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
4905 	do {
4906 		if (unlikely(++napi_gen_id < NR_CPUS + 1))
4907 			napi_gen_id = NR_CPUS + 1;
4908 	} while (napi_by_id(napi_gen_id));
4909 	napi->napi_id = napi_gen_id;
4910 
4911 	hlist_add_head_rcu(&napi->napi_hash_node,
4912 			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4913 
4914 	spin_unlock(&napi_hash_lock);
4915 }
4916 EXPORT_SYMBOL_GPL(napi_hash_add);
4917 
4918 /* Warning : caller is responsible to make sure rcu grace period
4919  * is respected before freeing memory containing @napi
4920  */
4921 bool napi_hash_del(struct napi_struct *napi)
4922 {
4923 	bool rcu_sync_needed = false;
4924 
4925 	spin_lock(&napi_hash_lock);
4926 
4927 	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
4928 		rcu_sync_needed = true;
4929 		hlist_del_rcu(&napi->napi_hash_node);
4930 	}
4931 	spin_unlock(&napi_hash_lock);
4932 	return rcu_sync_needed;
4933 }
4934 EXPORT_SYMBOL_GPL(napi_hash_del);
4935 
4936 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4937 {
4938 	struct napi_struct *napi;
4939 
4940 	napi = container_of(timer, struct napi_struct, timer);
4941 	if (napi->gro_list)
4942 		napi_schedule(napi);
4943 
4944 	return HRTIMER_NORESTART;
4945 }
4946 
4947 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4948 		    int (*poll)(struct napi_struct *, int), int weight)
4949 {
4950 	INIT_LIST_HEAD(&napi->poll_list);
4951 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4952 	napi->timer.function = napi_watchdog;
4953 	napi->gro_count = 0;
4954 	napi->gro_list = NULL;
4955 	napi->skb = NULL;
4956 	napi->poll = poll;
4957 	if (weight > NAPI_POLL_WEIGHT)
4958 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4959 			    weight, dev->name);
4960 	napi->weight = weight;
4961 	list_add(&napi->dev_list, &dev->napi_list);
4962 	napi->dev = dev;
4963 #ifdef CONFIG_NETPOLL
4964 	spin_lock_init(&napi->poll_lock);
4965 	napi->poll_owner = -1;
4966 #endif
4967 	set_bit(NAPI_STATE_SCHED, &napi->state);
4968 	napi_hash_add(napi);
4969 }
4970 EXPORT_SYMBOL(netif_napi_add);
4971 
4972 void napi_disable(struct napi_struct *n)
4973 {
4974 	might_sleep();
4975 	set_bit(NAPI_STATE_DISABLE, &n->state);
4976 
4977 	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4978 		msleep(1);
4979 	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
4980 		msleep(1);
4981 
4982 	hrtimer_cancel(&n->timer);
4983 
4984 	clear_bit(NAPI_STATE_DISABLE, &n->state);
4985 }
4986 EXPORT_SYMBOL(napi_disable);
4987 
4988 /* Must be called in process context */
4989 void netif_napi_del(struct napi_struct *napi)
4990 {
4991 	might_sleep();
4992 	if (napi_hash_del(napi))
4993 		synchronize_net();
4994 	list_del_init(&napi->dev_list);
4995 	napi_free_frags(napi);
4996 
4997 	kfree_skb_list(napi->gro_list);
4998 	napi->gro_list = NULL;
4999 	napi->gro_count = 0;
5000 }
5001 EXPORT_SYMBOL(netif_napi_del);
5002 
5003 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5004 {
5005 	void *have;
5006 	int work, weight;
5007 
5008 	list_del_init(&n->poll_list);
5009 
5010 	have = netpoll_poll_lock(n);
5011 
5012 	weight = n->weight;
5013 
5014 	/* This NAPI_STATE_SCHED test is for avoiding a race
5015 	 * with netpoll's poll_napi().  Only the entity which
5016 	 * obtains the lock and sees NAPI_STATE_SCHED set will
5017 	 * actually make the ->poll() call.  Therefore we avoid
5018 	 * accidentally calling ->poll() when NAPI is not scheduled.
5019 	 */
5020 	work = 0;
5021 	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5022 		work = n->poll(n, weight);
5023 		trace_napi_poll(n, work, weight);
5024 	}
5025 
5026 	WARN_ON_ONCE(work > weight);
5027 
5028 	if (likely(work < weight))
5029 		goto out_unlock;
5030 
5031 	/* Drivers must not modify the NAPI state if they
5032 	 * consume the entire weight.  In such cases this code
5033 	 * still "owns" the NAPI instance and therefore can
5034 	 * move the instance around on the list at-will.
5035 	 */
5036 	if (unlikely(napi_disable_pending(n))) {
5037 		napi_complete(n);
5038 		goto out_unlock;
5039 	}
5040 
5041 	if (n->gro_list) {
5042 		/* flush too old packets
5043 		 * If HZ < 1000, flush all packets.
5044 		 */
5045 		napi_gro_flush(n, HZ >= 1000);
5046 	}
5047 
5048 	/* Some drivers may have called napi_schedule
5049 	 * prior to exhausting their budget.
5050 	 */
5051 	if (unlikely(!list_empty(&n->poll_list))) {
5052 		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5053 			     n->dev ? n->dev->name : "backlog");
5054 		goto out_unlock;
5055 	}
5056 
5057 	list_add_tail(&n->poll_list, repoll);
5058 
5059 out_unlock:
5060 	netpoll_poll_unlock(have);
5061 
5062 	return work;
5063 }
5064 
5065 static void net_rx_action(struct softirq_action *h)
5066 {
5067 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5068 	unsigned long time_limit = jiffies + 2;
5069 	int budget = netdev_budget;
5070 	LIST_HEAD(list);
5071 	LIST_HEAD(repoll);
5072 
5073 	local_irq_disable();
5074 	list_splice_init(&sd->poll_list, &list);
5075 	local_irq_enable();
5076 
5077 	for (;;) {
5078 		struct napi_struct *n;
5079 
5080 		if (list_empty(&list)) {
5081 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5082 				return;
5083 			break;
5084 		}
5085 
5086 		n = list_first_entry(&list, struct napi_struct, poll_list);
5087 		budget -= napi_poll(n, &repoll);
5088 
5089 		/* If softirq window is exhausted then punt.
5090 		 * Allow this to run for 2 jiffies since which will allow
5091 		 * an average latency of 1.5/HZ.
5092 		 */
5093 		if (unlikely(budget <= 0 ||
5094 			     time_after_eq(jiffies, time_limit))) {
5095 			sd->time_squeeze++;
5096 			break;
5097 		}
5098 	}
5099 
5100 	__kfree_skb_flush();
5101 	local_irq_disable();
5102 
5103 	list_splice_tail_init(&sd->poll_list, &list);
5104 	list_splice_tail(&repoll, &list);
5105 	list_splice(&list, &sd->poll_list);
5106 	if (!list_empty(&sd->poll_list))
5107 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
5108 
5109 	net_rps_action_and_irq_enable(sd);
5110 }
5111 
5112 struct netdev_adjacent {
5113 	struct net_device *dev;
5114 
5115 	/* upper master flag, there can only be one master device per list */
5116 	bool master;
5117 
5118 	/* counter for the number of times this device was added to us */
5119 	u16 ref_nr;
5120 
5121 	/* private field for the users */
5122 	void *private;
5123 
5124 	struct list_head list;
5125 	struct rcu_head rcu;
5126 };
5127 
5128 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5129 						 struct list_head *adj_list)
5130 {
5131 	struct netdev_adjacent *adj;
5132 
5133 	list_for_each_entry(adj, adj_list, list) {
5134 		if (adj->dev == adj_dev)
5135 			return adj;
5136 	}
5137 	return NULL;
5138 }
5139 
5140 /**
5141  * netdev_has_upper_dev - Check if device is linked to an upper device
5142  * @dev: device
5143  * @upper_dev: upper device to check
5144  *
5145  * Find out if a device is linked to specified upper device and return true
5146  * in case it is. Note that this checks only immediate upper device,
5147  * not through a complete stack of devices. The caller must hold the RTNL lock.
5148  */
5149 bool netdev_has_upper_dev(struct net_device *dev,
5150 			  struct net_device *upper_dev)
5151 {
5152 	ASSERT_RTNL();
5153 
5154 	return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5155 }
5156 EXPORT_SYMBOL(netdev_has_upper_dev);
5157 
5158 /**
5159  * netdev_has_any_upper_dev - Check if device is linked to some device
5160  * @dev: device
5161  *
5162  * Find out if a device is linked to an upper device and return true in case
5163  * it is. The caller must hold the RTNL lock.
5164  */
5165 static bool netdev_has_any_upper_dev(struct net_device *dev)
5166 {
5167 	ASSERT_RTNL();
5168 
5169 	return !list_empty(&dev->all_adj_list.upper);
5170 }
5171 
5172 /**
5173  * netdev_master_upper_dev_get - Get master upper device
5174  * @dev: device
5175  *
5176  * Find a master upper device and return pointer to it or NULL in case
5177  * it's not there. The caller must hold the RTNL lock.
5178  */
5179 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5180 {
5181 	struct netdev_adjacent *upper;
5182 
5183 	ASSERT_RTNL();
5184 
5185 	if (list_empty(&dev->adj_list.upper))
5186 		return NULL;
5187 
5188 	upper = list_first_entry(&dev->adj_list.upper,
5189 				 struct netdev_adjacent, list);
5190 	if (likely(upper->master))
5191 		return upper->dev;
5192 	return NULL;
5193 }
5194 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5195 
5196 void *netdev_adjacent_get_private(struct list_head *adj_list)
5197 {
5198 	struct netdev_adjacent *adj;
5199 
5200 	adj = list_entry(adj_list, struct netdev_adjacent, list);
5201 
5202 	return adj->private;
5203 }
5204 EXPORT_SYMBOL(netdev_adjacent_get_private);
5205 
5206 /**
5207  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5208  * @dev: device
5209  * @iter: list_head ** of the current position
5210  *
5211  * Gets the next device from the dev's upper list, starting from iter
5212  * position. The caller must hold RCU read lock.
5213  */
5214 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5215 						 struct list_head **iter)
5216 {
5217 	struct netdev_adjacent *upper;
5218 
5219 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5220 
5221 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5222 
5223 	if (&upper->list == &dev->adj_list.upper)
5224 		return NULL;
5225 
5226 	*iter = &upper->list;
5227 
5228 	return upper->dev;
5229 }
5230 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5231 
5232 /**
5233  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5234  * @dev: device
5235  * @iter: list_head ** of the current position
5236  *
5237  * Gets the next device from the dev's upper list, starting from iter
5238  * position. The caller must hold RCU read lock.
5239  */
5240 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5241 						     struct list_head **iter)
5242 {
5243 	struct netdev_adjacent *upper;
5244 
5245 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5246 
5247 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5248 
5249 	if (&upper->list == &dev->all_adj_list.upper)
5250 		return NULL;
5251 
5252 	*iter = &upper->list;
5253 
5254 	return upper->dev;
5255 }
5256 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5257 
5258 /**
5259  * netdev_lower_get_next_private - Get the next ->private from the
5260  *				   lower neighbour list
5261  * @dev: device
5262  * @iter: list_head ** of the current position
5263  *
5264  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5265  * list, starting from iter position. The caller must hold either hold the
5266  * RTNL lock or its own locking that guarantees that the neighbour lower
5267  * list will remain unchanged.
5268  */
5269 void *netdev_lower_get_next_private(struct net_device *dev,
5270 				    struct list_head **iter)
5271 {
5272 	struct netdev_adjacent *lower;
5273 
5274 	lower = list_entry(*iter, struct netdev_adjacent, list);
5275 
5276 	if (&lower->list == &dev->adj_list.lower)
5277 		return NULL;
5278 
5279 	*iter = lower->list.next;
5280 
5281 	return lower->private;
5282 }
5283 EXPORT_SYMBOL(netdev_lower_get_next_private);
5284 
5285 /**
5286  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5287  *				       lower neighbour list, RCU
5288  *				       variant
5289  * @dev: device
5290  * @iter: list_head ** of the current position
5291  *
5292  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5293  * list, starting from iter position. The caller must hold RCU read lock.
5294  */
5295 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5296 					struct list_head **iter)
5297 {
5298 	struct netdev_adjacent *lower;
5299 
5300 	WARN_ON_ONCE(!rcu_read_lock_held());
5301 
5302 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5303 
5304 	if (&lower->list == &dev->adj_list.lower)
5305 		return NULL;
5306 
5307 	*iter = &lower->list;
5308 
5309 	return lower->private;
5310 }
5311 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5312 
5313 /**
5314  * netdev_lower_get_next - Get the next device from the lower neighbour
5315  *                         list
5316  * @dev: device
5317  * @iter: list_head ** of the current position
5318  *
5319  * Gets the next netdev_adjacent from the dev's lower neighbour
5320  * list, starting from iter position. The caller must hold RTNL lock or
5321  * its own locking that guarantees that the neighbour lower
5322  * list will remain unchanged.
5323  */
5324 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5325 {
5326 	struct netdev_adjacent *lower;
5327 
5328 	lower = list_entry(*iter, struct netdev_adjacent, list);
5329 
5330 	if (&lower->list == &dev->adj_list.lower)
5331 		return NULL;
5332 
5333 	*iter = lower->list.next;
5334 
5335 	return lower->dev;
5336 }
5337 EXPORT_SYMBOL(netdev_lower_get_next);
5338 
5339 /**
5340  * netdev_all_lower_get_next - Get the next device from all lower neighbour list
5341  * @dev: device
5342  * @iter: list_head ** of the current position
5343  *
5344  * Gets the next netdev_adjacent from the dev's all lower neighbour
5345  * list, starting from iter position. The caller must hold RTNL lock or
5346  * its own locking that guarantees that the neighbour all lower
5347  * list will remain unchanged.
5348  */
5349 struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
5350 {
5351 	struct netdev_adjacent *lower;
5352 
5353 	lower = list_entry(*iter, struct netdev_adjacent, list);
5354 
5355 	if (&lower->list == &dev->all_adj_list.lower)
5356 		return NULL;
5357 
5358 	*iter = lower->list.next;
5359 
5360 	return lower->dev;
5361 }
5362 EXPORT_SYMBOL(netdev_all_lower_get_next);
5363 
5364 /**
5365  * netdev_all_lower_get_next_rcu - Get the next device from all
5366  *				   lower neighbour list, RCU variant
5367  * @dev: device
5368  * @iter: list_head ** of the current position
5369  *
5370  * Gets the next netdev_adjacent from the dev's all lower neighbour
5371  * list, starting from iter position. The caller must hold RCU read lock.
5372  */
5373 struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
5374 						 struct list_head **iter)
5375 {
5376 	struct netdev_adjacent *lower;
5377 
5378 	lower = list_first_or_null_rcu(&dev->all_adj_list.lower,
5379 				       struct netdev_adjacent, list);
5380 
5381 	return lower ? lower->dev : NULL;
5382 }
5383 EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
5384 
5385 /**
5386  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5387  *				       lower neighbour list, RCU
5388  *				       variant
5389  * @dev: device
5390  *
5391  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5392  * list. The caller must hold RCU read lock.
5393  */
5394 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5395 {
5396 	struct netdev_adjacent *lower;
5397 
5398 	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5399 			struct netdev_adjacent, list);
5400 	if (lower)
5401 		return lower->private;
5402 	return NULL;
5403 }
5404 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5405 
5406 /**
5407  * netdev_master_upper_dev_get_rcu - Get master upper device
5408  * @dev: device
5409  *
5410  * Find a master upper device and return pointer to it or NULL in case
5411  * it's not there. The caller must hold the RCU read lock.
5412  */
5413 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5414 {
5415 	struct netdev_adjacent *upper;
5416 
5417 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5418 				       struct netdev_adjacent, list);
5419 	if (upper && likely(upper->master))
5420 		return upper->dev;
5421 	return NULL;
5422 }
5423 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5424 
5425 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5426 			      struct net_device *adj_dev,
5427 			      struct list_head *dev_list)
5428 {
5429 	char linkname[IFNAMSIZ+7];
5430 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5431 		"upper_%s" : "lower_%s", adj_dev->name);
5432 	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5433 				 linkname);
5434 }
5435 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5436 			       char *name,
5437 			       struct list_head *dev_list)
5438 {
5439 	char linkname[IFNAMSIZ+7];
5440 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5441 		"upper_%s" : "lower_%s", name);
5442 	sysfs_remove_link(&(dev->dev.kobj), linkname);
5443 }
5444 
5445 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5446 						 struct net_device *adj_dev,
5447 						 struct list_head *dev_list)
5448 {
5449 	return (dev_list == &dev->adj_list.upper ||
5450 		dev_list == &dev->adj_list.lower) &&
5451 		net_eq(dev_net(dev), dev_net(adj_dev));
5452 }
5453 
5454 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5455 					struct net_device *adj_dev,
5456 					u16 ref_nr,
5457 					struct list_head *dev_list,
5458 					void *private, bool master)
5459 {
5460 	struct netdev_adjacent *adj;
5461 	int ret;
5462 
5463 	adj = __netdev_find_adj(adj_dev, dev_list);
5464 
5465 	if (adj) {
5466 		adj->ref_nr += ref_nr;
5467 		return 0;
5468 	}
5469 
5470 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5471 	if (!adj)
5472 		return -ENOMEM;
5473 
5474 	adj->dev = adj_dev;
5475 	adj->master = master;
5476 	adj->ref_nr = ref_nr;
5477 	adj->private = private;
5478 	dev_hold(adj_dev);
5479 
5480 	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5481 		 adj_dev->name, dev->name, adj_dev->name);
5482 
5483 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5484 		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5485 		if (ret)
5486 			goto free_adj;
5487 	}
5488 
5489 	/* Ensure that master link is always the first item in list. */
5490 	if (master) {
5491 		ret = sysfs_create_link(&(dev->dev.kobj),
5492 					&(adj_dev->dev.kobj), "master");
5493 		if (ret)
5494 			goto remove_symlinks;
5495 
5496 		list_add_rcu(&adj->list, dev_list);
5497 	} else {
5498 		list_add_tail_rcu(&adj->list, dev_list);
5499 	}
5500 
5501 	return 0;
5502 
5503 remove_symlinks:
5504 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5505 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5506 free_adj:
5507 	kfree(adj);
5508 	dev_put(adj_dev);
5509 
5510 	return ret;
5511 }
5512 
5513 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5514 					 struct net_device *adj_dev,
5515 					 u16 ref_nr,
5516 					 struct list_head *dev_list)
5517 {
5518 	struct netdev_adjacent *adj;
5519 
5520 	adj = __netdev_find_adj(adj_dev, dev_list);
5521 
5522 	if (!adj) {
5523 		pr_err("tried to remove device %s from %s\n",
5524 		       dev->name, adj_dev->name);
5525 		BUG();
5526 	}
5527 
5528 	if (adj->ref_nr > ref_nr) {
5529 		pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
5530 			 ref_nr, adj->ref_nr-ref_nr);
5531 		adj->ref_nr -= ref_nr;
5532 		return;
5533 	}
5534 
5535 	if (adj->master)
5536 		sysfs_remove_link(&(dev->dev.kobj), "master");
5537 
5538 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5539 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5540 
5541 	list_del_rcu(&adj->list);
5542 	pr_debug("dev_put for %s, because link removed from %s to %s\n",
5543 		 adj_dev->name, dev->name, adj_dev->name);
5544 	dev_put(adj_dev);
5545 	kfree_rcu(adj, rcu);
5546 }
5547 
5548 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5549 					    struct net_device *upper_dev,
5550 					    u16 ref_nr,
5551 					    struct list_head *up_list,
5552 					    struct list_head *down_list,
5553 					    void *private, bool master)
5554 {
5555 	int ret;
5556 
5557 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5558 					   private, master);
5559 	if (ret)
5560 		return ret;
5561 
5562 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5563 					   private, false);
5564 	if (ret) {
5565 		__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5566 		return ret;
5567 	}
5568 
5569 	return 0;
5570 }
5571 
5572 static int __netdev_adjacent_dev_link(struct net_device *dev,
5573 				      struct net_device *upper_dev,
5574 				      u16 ref_nr)
5575 {
5576 	return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5577 						&dev->all_adj_list.upper,
5578 						&upper_dev->all_adj_list.lower,
5579 						NULL, false);
5580 }
5581 
5582 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5583 					       struct net_device *upper_dev,
5584 					       u16 ref_nr,
5585 					       struct list_head *up_list,
5586 					       struct list_head *down_list)
5587 {
5588 	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5589 	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5590 }
5591 
5592 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5593 					 struct net_device *upper_dev,
5594 					 u16 ref_nr)
5595 {
5596 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5597 					   &dev->all_adj_list.upper,
5598 					   &upper_dev->all_adj_list.lower);
5599 }
5600 
5601 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5602 						struct net_device *upper_dev,
5603 						void *private, bool master)
5604 {
5605 	int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5606 
5607 	if (ret)
5608 		return ret;
5609 
5610 	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5611 					       &dev->adj_list.upper,
5612 					       &upper_dev->adj_list.lower,
5613 					       private, master);
5614 	if (ret) {
5615 		__netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5616 		return ret;
5617 	}
5618 
5619 	return 0;
5620 }
5621 
5622 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5623 						   struct net_device *upper_dev)
5624 {
5625 	__netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5626 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5627 					   &dev->adj_list.upper,
5628 					   &upper_dev->adj_list.lower);
5629 }
5630 
5631 static int __netdev_upper_dev_link(struct net_device *dev,
5632 				   struct net_device *upper_dev, bool master,
5633 				   void *upper_priv, void *upper_info)
5634 {
5635 	struct netdev_notifier_changeupper_info changeupper_info;
5636 	struct netdev_adjacent *i, *j, *to_i, *to_j;
5637 	int ret = 0;
5638 
5639 	ASSERT_RTNL();
5640 
5641 	if (dev == upper_dev)
5642 		return -EBUSY;
5643 
5644 	/* To prevent loops, check if dev is not upper device to upper_dev. */
5645 	if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5646 		return -EBUSY;
5647 
5648 	if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5649 		return -EEXIST;
5650 
5651 	if (master && netdev_master_upper_dev_get(dev))
5652 		return -EBUSY;
5653 
5654 	changeupper_info.upper_dev = upper_dev;
5655 	changeupper_info.master = master;
5656 	changeupper_info.linking = true;
5657 	changeupper_info.upper_info = upper_info;
5658 
5659 	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5660 					    &changeupper_info.info);
5661 	ret = notifier_to_errno(ret);
5662 	if (ret)
5663 		return ret;
5664 
5665 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5666 						   master);
5667 	if (ret)
5668 		return ret;
5669 
5670 	/* Now that we linked these devs, make all the upper_dev's
5671 	 * all_adj_list.upper visible to every dev's all_adj_list.lower an
5672 	 * versa, and don't forget the devices itself. All of these
5673 	 * links are non-neighbours.
5674 	 */
5675 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5676 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5677 			pr_debug("Interlinking %s with %s, non-neighbour\n",
5678 				 i->dev->name, j->dev->name);
5679 			ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5680 			if (ret)
5681 				goto rollback_mesh;
5682 		}
5683 	}
5684 
5685 	/* add dev to every upper_dev's upper device */
5686 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5687 		pr_debug("linking %s's upper device %s with %s\n",
5688 			 upper_dev->name, i->dev->name, dev->name);
5689 		ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5690 		if (ret)
5691 			goto rollback_upper_mesh;
5692 	}
5693 
5694 	/* add upper_dev to every dev's lower device */
5695 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5696 		pr_debug("linking %s's lower device %s with %s\n", dev->name,
5697 			 i->dev->name, upper_dev->name);
5698 		ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5699 		if (ret)
5700 			goto rollback_lower_mesh;
5701 	}
5702 
5703 	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5704 					    &changeupper_info.info);
5705 	ret = notifier_to_errno(ret);
5706 	if (ret)
5707 		goto rollback_lower_mesh;
5708 
5709 	return 0;
5710 
5711 rollback_lower_mesh:
5712 	to_i = i;
5713 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5714 		if (i == to_i)
5715 			break;
5716 		__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5717 	}
5718 
5719 	i = NULL;
5720 
5721 rollback_upper_mesh:
5722 	to_i = i;
5723 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5724 		if (i == to_i)
5725 			break;
5726 		__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5727 	}
5728 
5729 	i = j = NULL;
5730 
5731 rollback_mesh:
5732 	to_i = i;
5733 	to_j = j;
5734 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5735 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5736 			if (i == to_i && j == to_j)
5737 				break;
5738 			__netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5739 		}
5740 		if (i == to_i)
5741 			break;
5742 	}
5743 
5744 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5745 
5746 	return ret;
5747 }
5748 
5749 /**
5750  * netdev_upper_dev_link - Add a link to the upper device
5751  * @dev: device
5752  * @upper_dev: new upper device
5753  *
5754  * Adds a link to device which is upper to this one. The caller must hold
5755  * the RTNL lock. On a failure a negative errno code is returned.
5756  * On success the reference counts are adjusted and the function
5757  * returns zero.
5758  */
5759 int netdev_upper_dev_link(struct net_device *dev,
5760 			  struct net_device *upper_dev)
5761 {
5762 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5763 }
5764 EXPORT_SYMBOL(netdev_upper_dev_link);
5765 
5766 /**
5767  * netdev_master_upper_dev_link - Add a master link to the upper device
5768  * @dev: device
5769  * @upper_dev: new upper device
5770  * @upper_priv: upper device private
5771  * @upper_info: upper info to be passed down via notifier
5772  *
5773  * Adds a link to device which is upper to this one. In this case, only
5774  * one master upper device can be linked, although other non-master devices
5775  * might be linked as well. The caller must hold the RTNL lock.
5776  * On a failure a negative errno code is returned. On success the reference
5777  * counts are adjusted and the function returns zero.
5778  */
5779 int netdev_master_upper_dev_link(struct net_device *dev,
5780 				 struct net_device *upper_dev,
5781 				 void *upper_priv, void *upper_info)
5782 {
5783 	return __netdev_upper_dev_link(dev, upper_dev, true,
5784 				       upper_priv, upper_info);
5785 }
5786 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5787 
5788 /**
5789  * netdev_upper_dev_unlink - Removes a link to upper device
5790  * @dev: device
5791  * @upper_dev: new upper device
5792  *
5793  * Removes a link to device which is upper to this one. The caller must hold
5794  * the RTNL lock.
5795  */
5796 void netdev_upper_dev_unlink(struct net_device *dev,
5797 			     struct net_device *upper_dev)
5798 {
5799 	struct netdev_notifier_changeupper_info changeupper_info;
5800 	struct netdev_adjacent *i, *j;
5801 	ASSERT_RTNL();
5802 
5803 	changeupper_info.upper_dev = upper_dev;
5804 	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5805 	changeupper_info.linking = false;
5806 
5807 	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5808 				      &changeupper_info.info);
5809 
5810 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5811 
5812 	/* Here is the tricky part. We must remove all dev's lower
5813 	 * devices from all upper_dev's upper devices and vice
5814 	 * versa, to maintain the graph relationship.
5815 	 */
5816 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5817 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5818 			__netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5819 
5820 	/* remove also the devices itself from lower/upper device
5821 	 * list
5822 	 */
5823 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5824 		__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5825 
5826 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5827 		__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5828 
5829 	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5830 				      &changeupper_info.info);
5831 }
5832 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5833 
5834 /**
5835  * netdev_bonding_info_change - Dispatch event about slave change
5836  * @dev: device
5837  * @bonding_info: info to dispatch
5838  *
5839  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5840  * The caller must hold the RTNL lock.
5841  */
5842 void netdev_bonding_info_change(struct net_device *dev,
5843 				struct netdev_bonding_info *bonding_info)
5844 {
5845 	struct netdev_notifier_bonding_info	info;
5846 
5847 	memcpy(&info.bonding_info, bonding_info,
5848 	       sizeof(struct netdev_bonding_info));
5849 	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5850 				      &info.info);
5851 }
5852 EXPORT_SYMBOL(netdev_bonding_info_change);
5853 
5854 static void netdev_adjacent_add_links(struct net_device *dev)
5855 {
5856 	struct netdev_adjacent *iter;
5857 
5858 	struct net *net = dev_net(dev);
5859 
5860 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5861 		if (!net_eq(net, dev_net(iter->dev)))
5862 			continue;
5863 		netdev_adjacent_sysfs_add(iter->dev, dev,
5864 					  &iter->dev->adj_list.lower);
5865 		netdev_adjacent_sysfs_add(dev, iter->dev,
5866 					  &dev->adj_list.upper);
5867 	}
5868 
5869 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5870 		if (!net_eq(net, dev_net(iter->dev)))
5871 			continue;
5872 		netdev_adjacent_sysfs_add(iter->dev, dev,
5873 					  &iter->dev->adj_list.upper);
5874 		netdev_adjacent_sysfs_add(dev, iter->dev,
5875 					  &dev->adj_list.lower);
5876 	}
5877 }
5878 
5879 static void netdev_adjacent_del_links(struct net_device *dev)
5880 {
5881 	struct netdev_adjacent *iter;
5882 
5883 	struct net *net = dev_net(dev);
5884 
5885 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5886 		if (!net_eq(net, dev_net(iter->dev)))
5887 			continue;
5888 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5889 					  &iter->dev->adj_list.lower);
5890 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5891 					  &dev->adj_list.upper);
5892 	}
5893 
5894 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5895 		if (!net_eq(net, dev_net(iter->dev)))
5896 			continue;
5897 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5898 					  &iter->dev->adj_list.upper);
5899 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5900 					  &dev->adj_list.lower);
5901 	}
5902 }
5903 
5904 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5905 {
5906 	struct netdev_adjacent *iter;
5907 
5908 	struct net *net = dev_net(dev);
5909 
5910 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5911 		if (!net_eq(net, dev_net(iter->dev)))
5912 			continue;
5913 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5914 					  &iter->dev->adj_list.lower);
5915 		netdev_adjacent_sysfs_add(iter->dev, dev,
5916 					  &iter->dev->adj_list.lower);
5917 	}
5918 
5919 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5920 		if (!net_eq(net, dev_net(iter->dev)))
5921 			continue;
5922 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5923 					  &iter->dev->adj_list.upper);
5924 		netdev_adjacent_sysfs_add(iter->dev, dev,
5925 					  &iter->dev->adj_list.upper);
5926 	}
5927 }
5928 
5929 void *netdev_lower_dev_get_private(struct net_device *dev,
5930 				   struct net_device *lower_dev)
5931 {
5932 	struct netdev_adjacent *lower;
5933 
5934 	if (!lower_dev)
5935 		return NULL;
5936 	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5937 	if (!lower)
5938 		return NULL;
5939 
5940 	return lower->private;
5941 }
5942 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5943 
5944 
5945 int dev_get_nest_level(struct net_device *dev)
5946 {
5947 	struct net_device *lower = NULL;
5948 	struct list_head *iter;
5949 	int max_nest = -1;
5950 	int nest;
5951 
5952 	ASSERT_RTNL();
5953 
5954 	netdev_for_each_lower_dev(dev, lower, iter) {
5955 		nest = dev_get_nest_level(lower);
5956 		if (max_nest < nest)
5957 			max_nest = nest;
5958 	}
5959 
5960 	return max_nest + 1;
5961 }
5962 EXPORT_SYMBOL(dev_get_nest_level);
5963 
5964 /**
5965  * netdev_lower_change - Dispatch event about lower device state change
5966  * @lower_dev: device
5967  * @lower_state_info: state to dispatch
5968  *
5969  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
5970  * The caller must hold the RTNL lock.
5971  */
5972 void netdev_lower_state_changed(struct net_device *lower_dev,
5973 				void *lower_state_info)
5974 {
5975 	struct netdev_notifier_changelowerstate_info changelowerstate_info;
5976 
5977 	ASSERT_RTNL();
5978 	changelowerstate_info.lower_state_info = lower_state_info;
5979 	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
5980 				      &changelowerstate_info.info);
5981 }
5982 EXPORT_SYMBOL(netdev_lower_state_changed);
5983 
5984 int netdev_default_l2upper_neigh_construct(struct net_device *dev,
5985 					   struct neighbour *n)
5986 {
5987 	struct net_device *lower_dev, *stop_dev;
5988 	struct list_head *iter;
5989 	int err;
5990 
5991 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
5992 		if (!lower_dev->netdev_ops->ndo_neigh_construct)
5993 			continue;
5994 		err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
5995 		if (err) {
5996 			stop_dev = lower_dev;
5997 			goto rollback;
5998 		}
5999 	}
6000 	return 0;
6001 
6002 rollback:
6003 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6004 		if (lower_dev == stop_dev)
6005 			break;
6006 		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6007 			continue;
6008 		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6009 	}
6010 	return err;
6011 }
6012 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6013 
6014 void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6015 					  struct neighbour *n)
6016 {
6017 	struct net_device *lower_dev;
6018 	struct list_head *iter;
6019 
6020 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6021 		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6022 			continue;
6023 		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6024 	}
6025 }
6026 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6027 
6028 static void dev_change_rx_flags(struct net_device *dev, int flags)
6029 {
6030 	const struct net_device_ops *ops = dev->netdev_ops;
6031 
6032 	if (ops->ndo_change_rx_flags)
6033 		ops->ndo_change_rx_flags(dev, flags);
6034 }
6035 
6036 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6037 {
6038 	unsigned int old_flags = dev->flags;
6039 	kuid_t uid;
6040 	kgid_t gid;
6041 
6042 	ASSERT_RTNL();
6043 
6044 	dev->flags |= IFF_PROMISC;
6045 	dev->promiscuity += inc;
6046 	if (dev->promiscuity == 0) {
6047 		/*
6048 		 * Avoid overflow.
6049 		 * If inc causes overflow, untouch promisc and return error.
6050 		 */
6051 		if (inc < 0)
6052 			dev->flags &= ~IFF_PROMISC;
6053 		else {
6054 			dev->promiscuity -= inc;
6055 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6056 				dev->name);
6057 			return -EOVERFLOW;
6058 		}
6059 	}
6060 	if (dev->flags != old_flags) {
6061 		pr_info("device %s %s promiscuous mode\n",
6062 			dev->name,
6063 			dev->flags & IFF_PROMISC ? "entered" : "left");
6064 		if (audit_enabled) {
6065 			current_uid_gid(&uid, &gid);
6066 			audit_log(current->audit_context, GFP_ATOMIC,
6067 				AUDIT_ANOM_PROMISCUOUS,
6068 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6069 				dev->name, (dev->flags & IFF_PROMISC),
6070 				(old_flags & IFF_PROMISC),
6071 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6072 				from_kuid(&init_user_ns, uid),
6073 				from_kgid(&init_user_ns, gid),
6074 				audit_get_sessionid(current));
6075 		}
6076 
6077 		dev_change_rx_flags(dev, IFF_PROMISC);
6078 	}
6079 	if (notify)
6080 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6081 	return 0;
6082 }
6083 
6084 /**
6085  *	dev_set_promiscuity	- update promiscuity count on a device
6086  *	@dev: device
6087  *	@inc: modifier
6088  *
6089  *	Add or remove promiscuity from a device. While the count in the device
6090  *	remains above zero the interface remains promiscuous. Once it hits zero
6091  *	the device reverts back to normal filtering operation. A negative inc
6092  *	value is used to drop promiscuity on the device.
6093  *	Return 0 if successful or a negative errno code on error.
6094  */
6095 int dev_set_promiscuity(struct net_device *dev, int inc)
6096 {
6097 	unsigned int old_flags = dev->flags;
6098 	int err;
6099 
6100 	err = __dev_set_promiscuity(dev, inc, true);
6101 	if (err < 0)
6102 		return err;
6103 	if (dev->flags != old_flags)
6104 		dev_set_rx_mode(dev);
6105 	return err;
6106 }
6107 EXPORT_SYMBOL(dev_set_promiscuity);
6108 
6109 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6110 {
6111 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6112 
6113 	ASSERT_RTNL();
6114 
6115 	dev->flags |= IFF_ALLMULTI;
6116 	dev->allmulti += inc;
6117 	if (dev->allmulti == 0) {
6118 		/*
6119 		 * Avoid overflow.
6120 		 * If inc causes overflow, untouch allmulti and return error.
6121 		 */
6122 		if (inc < 0)
6123 			dev->flags &= ~IFF_ALLMULTI;
6124 		else {
6125 			dev->allmulti -= inc;
6126 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6127 				dev->name);
6128 			return -EOVERFLOW;
6129 		}
6130 	}
6131 	if (dev->flags ^ old_flags) {
6132 		dev_change_rx_flags(dev, IFF_ALLMULTI);
6133 		dev_set_rx_mode(dev);
6134 		if (notify)
6135 			__dev_notify_flags(dev, old_flags,
6136 					   dev->gflags ^ old_gflags);
6137 	}
6138 	return 0;
6139 }
6140 
6141 /**
6142  *	dev_set_allmulti	- update allmulti count on a device
6143  *	@dev: device
6144  *	@inc: modifier
6145  *
6146  *	Add or remove reception of all multicast frames to a device. While the
6147  *	count in the device remains above zero the interface remains listening
6148  *	to all interfaces. Once it hits zero the device reverts back to normal
6149  *	filtering operation. A negative @inc value is used to drop the counter
6150  *	when releasing a resource needing all multicasts.
6151  *	Return 0 if successful or a negative errno code on error.
6152  */
6153 
6154 int dev_set_allmulti(struct net_device *dev, int inc)
6155 {
6156 	return __dev_set_allmulti(dev, inc, true);
6157 }
6158 EXPORT_SYMBOL(dev_set_allmulti);
6159 
6160 /*
6161  *	Upload unicast and multicast address lists to device and
6162  *	configure RX filtering. When the device doesn't support unicast
6163  *	filtering it is put in promiscuous mode while unicast addresses
6164  *	are present.
6165  */
6166 void __dev_set_rx_mode(struct net_device *dev)
6167 {
6168 	const struct net_device_ops *ops = dev->netdev_ops;
6169 
6170 	/* dev_open will call this function so the list will stay sane. */
6171 	if (!(dev->flags&IFF_UP))
6172 		return;
6173 
6174 	if (!netif_device_present(dev))
6175 		return;
6176 
6177 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6178 		/* Unicast addresses changes may only happen under the rtnl,
6179 		 * therefore calling __dev_set_promiscuity here is safe.
6180 		 */
6181 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6182 			__dev_set_promiscuity(dev, 1, false);
6183 			dev->uc_promisc = true;
6184 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6185 			__dev_set_promiscuity(dev, -1, false);
6186 			dev->uc_promisc = false;
6187 		}
6188 	}
6189 
6190 	if (ops->ndo_set_rx_mode)
6191 		ops->ndo_set_rx_mode(dev);
6192 }
6193 
6194 void dev_set_rx_mode(struct net_device *dev)
6195 {
6196 	netif_addr_lock_bh(dev);
6197 	__dev_set_rx_mode(dev);
6198 	netif_addr_unlock_bh(dev);
6199 }
6200 
6201 /**
6202  *	dev_get_flags - get flags reported to userspace
6203  *	@dev: device
6204  *
6205  *	Get the combination of flag bits exported through APIs to userspace.
6206  */
6207 unsigned int dev_get_flags(const struct net_device *dev)
6208 {
6209 	unsigned int flags;
6210 
6211 	flags = (dev->flags & ~(IFF_PROMISC |
6212 				IFF_ALLMULTI |
6213 				IFF_RUNNING |
6214 				IFF_LOWER_UP |
6215 				IFF_DORMANT)) |
6216 		(dev->gflags & (IFF_PROMISC |
6217 				IFF_ALLMULTI));
6218 
6219 	if (netif_running(dev)) {
6220 		if (netif_oper_up(dev))
6221 			flags |= IFF_RUNNING;
6222 		if (netif_carrier_ok(dev))
6223 			flags |= IFF_LOWER_UP;
6224 		if (netif_dormant(dev))
6225 			flags |= IFF_DORMANT;
6226 	}
6227 
6228 	return flags;
6229 }
6230 EXPORT_SYMBOL(dev_get_flags);
6231 
6232 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6233 {
6234 	unsigned int old_flags = dev->flags;
6235 	int ret;
6236 
6237 	ASSERT_RTNL();
6238 
6239 	/*
6240 	 *	Set the flags on our device.
6241 	 */
6242 
6243 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6244 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6245 			       IFF_AUTOMEDIA)) |
6246 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6247 				    IFF_ALLMULTI));
6248 
6249 	/*
6250 	 *	Load in the correct multicast list now the flags have changed.
6251 	 */
6252 
6253 	if ((old_flags ^ flags) & IFF_MULTICAST)
6254 		dev_change_rx_flags(dev, IFF_MULTICAST);
6255 
6256 	dev_set_rx_mode(dev);
6257 
6258 	/*
6259 	 *	Have we downed the interface. We handle IFF_UP ourselves
6260 	 *	according to user attempts to set it, rather than blindly
6261 	 *	setting it.
6262 	 */
6263 
6264 	ret = 0;
6265 	if ((old_flags ^ flags) & IFF_UP)
6266 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6267 
6268 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6269 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6270 		unsigned int old_flags = dev->flags;
6271 
6272 		dev->gflags ^= IFF_PROMISC;
6273 
6274 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6275 			if (dev->flags != old_flags)
6276 				dev_set_rx_mode(dev);
6277 	}
6278 
6279 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6280 	   is important. Some (broken) drivers set IFF_PROMISC, when
6281 	   IFF_ALLMULTI is requested not asking us and not reporting.
6282 	 */
6283 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6284 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6285 
6286 		dev->gflags ^= IFF_ALLMULTI;
6287 		__dev_set_allmulti(dev, inc, false);
6288 	}
6289 
6290 	return ret;
6291 }
6292 
6293 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6294 			unsigned int gchanges)
6295 {
6296 	unsigned int changes = dev->flags ^ old_flags;
6297 
6298 	if (gchanges)
6299 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6300 
6301 	if (changes & IFF_UP) {
6302 		if (dev->flags & IFF_UP)
6303 			call_netdevice_notifiers(NETDEV_UP, dev);
6304 		else
6305 			call_netdevice_notifiers(NETDEV_DOWN, dev);
6306 	}
6307 
6308 	if (dev->flags & IFF_UP &&
6309 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6310 		struct netdev_notifier_change_info change_info;
6311 
6312 		change_info.flags_changed = changes;
6313 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6314 					      &change_info.info);
6315 	}
6316 }
6317 
6318 /**
6319  *	dev_change_flags - change device settings
6320  *	@dev: device
6321  *	@flags: device state flags
6322  *
6323  *	Change settings on device based state flags. The flags are
6324  *	in the userspace exported format.
6325  */
6326 int dev_change_flags(struct net_device *dev, unsigned int flags)
6327 {
6328 	int ret;
6329 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6330 
6331 	ret = __dev_change_flags(dev, flags);
6332 	if (ret < 0)
6333 		return ret;
6334 
6335 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6336 	__dev_notify_flags(dev, old_flags, changes);
6337 	return ret;
6338 }
6339 EXPORT_SYMBOL(dev_change_flags);
6340 
6341 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6342 {
6343 	const struct net_device_ops *ops = dev->netdev_ops;
6344 
6345 	if (ops->ndo_change_mtu)
6346 		return ops->ndo_change_mtu(dev, new_mtu);
6347 
6348 	dev->mtu = new_mtu;
6349 	return 0;
6350 }
6351 
6352 /**
6353  *	dev_set_mtu - Change maximum transfer unit
6354  *	@dev: device
6355  *	@new_mtu: new transfer unit
6356  *
6357  *	Change the maximum transfer size of the network device.
6358  */
6359 int dev_set_mtu(struct net_device *dev, int new_mtu)
6360 {
6361 	int err, orig_mtu;
6362 
6363 	if (new_mtu == dev->mtu)
6364 		return 0;
6365 
6366 	/* MTU must be positive, and in range */
6367 	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6368 		net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6369 				    dev->name, new_mtu, dev->min_mtu);
6370 		return -EINVAL;
6371 	}
6372 
6373 	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6374 		net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6375 				    dev->name, new_mtu, dev->min_mtu);
6376 		return -EINVAL;
6377 	}
6378 
6379 	if (!netif_device_present(dev))
6380 		return -ENODEV;
6381 
6382 	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6383 	err = notifier_to_errno(err);
6384 	if (err)
6385 		return err;
6386 
6387 	orig_mtu = dev->mtu;
6388 	err = __dev_set_mtu(dev, new_mtu);
6389 
6390 	if (!err) {
6391 		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6392 		err = notifier_to_errno(err);
6393 		if (err) {
6394 			/* setting mtu back and notifying everyone again,
6395 			 * so that they have a chance to revert changes.
6396 			 */
6397 			__dev_set_mtu(dev, orig_mtu);
6398 			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6399 		}
6400 	}
6401 	return err;
6402 }
6403 EXPORT_SYMBOL(dev_set_mtu);
6404 
6405 /**
6406  *	dev_set_group - Change group this device belongs to
6407  *	@dev: device
6408  *	@new_group: group this device should belong to
6409  */
6410 void dev_set_group(struct net_device *dev, int new_group)
6411 {
6412 	dev->group = new_group;
6413 }
6414 EXPORT_SYMBOL(dev_set_group);
6415 
6416 /**
6417  *	dev_set_mac_address - Change Media Access Control Address
6418  *	@dev: device
6419  *	@sa: new address
6420  *
6421  *	Change the hardware (MAC) address of the device
6422  */
6423 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6424 {
6425 	const struct net_device_ops *ops = dev->netdev_ops;
6426 	int err;
6427 
6428 	if (!ops->ndo_set_mac_address)
6429 		return -EOPNOTSUPP;
6430 	if (sa->sa_family != dev->type)
6431 		return -EINVAL;
6432 	if (!netif_device_present(dev))
6433 		return -ENODEV;
6434 	err = ops->ndo_set_mac_address(dev, sa);
6435 	if (err)
6436 		return err;
6437 	dev->addr_assign_type = NET_ADDR_SET;
6438 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6439 	add_device_randomness(dev->dev_addr, dev->addr_len);
6440 	return 0;
6441 }
6442 EXPORT_SYMBOL(dev_set_mac_address);
6443 
6444 /**
6445  *	dev_change_carrier - Change device carrier
6446  *	@dev: device
6447  *	@new_carrier: new value
6448  *
6449  *	Change device carrier
6450  */
6451 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6452 {
6453 	const struct net_device_ops *ops = dev->netdev_ops;
6454 
6455 	if (!ops->ndo_change_carrier)
6456 		return -EOPNOTSUPP;
6457 	if (!netif_device_present(dev))
6458 		return -ENODEV;
6459 	return ops->ndo_change_carrier(dev, new_carrier);
6460 }
6461 EXPORT_SYMBOL(dev_change_carrier);
6462 
6463 /**
6464  *	dev_get_phys_port_id - Get device physical port ID
6465  *	@dev: device
6466  *	@ppid: port ID
6467  *
6468  *	Get device physical port ID
6469  */
6470 int dev_get_phys_port_id(struct net_device *dev,
6471 			 struct netdev_phys_item_id *ppid)
6472 {
6473 	const struct net_device_ops *ops = dev->netdev_ops;
6474 
6475 	if (!ops->ndo_get_phys_port_id)
6476 		return -EOPNOTSUPP;
6477 	return ops->ndo_get_phys_port_id(dev, ppid);
6478 }
6479 EXPORT_SYMBOL(dev_get_phys_port_id);
6480 
6481 /**
6482  *	dev_get_phys_port_name - Get device physical port name
6483  *	@dev: device
6484  *	@name: port name
6485  *	@len: limit of bytes to copy to name
6486  *
6487  *	Get device physical port name
6488  */
6489 int dev_get_phys_port_name(struct net_device *dev,
6490 			   char *name, size_t len)
6491 {
6492 	const struct net_device_ops *ops = dev->netdev_ops;
6493 
6494 	if (!ops->ndo_get_phys_port_name)
6495 		return -EOPNOTSUPP;
6496 	return ops->ndo_get_phys_port_name(dev, name, len);
6497 }
6498 EXPORT_SYMBOL(dev_get_phys_port_name);
6499 
6500 /**
6501  *	dev_change_proto_down - update protocol port state information
6502  *	@dev: device
6503  *	@proto_down: new value
6504  *
6505  *	This info can be used by switch drivers to set the phys state of the
6506  *	port.
6507  */
6508 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6509 {
6510 	const struct net_device_ops *ops = dev->netdev_ops;
6511 
6512 	if (!ops->ndo_change_proto_down)
6513 		return -EOPNOTSUPP;
6514 	if (!netif_device_present(dev))
6515 		return -ENODEV;
6516 	return ops->ndo_change_proto_down(dev, proto_down);
6517 }
6518 EXPORT_SYMBOL(dev_change_proto_down);
6519 
6520 /**
6521  *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
6522  *	@dev: device
6523  *	@fd: new program fd or negative value to clear
6524  *
6525  *	Set or clear a bpf program for a device
6526  */
6527 int dev_change_xdp_fd(struct net_device *dev, int fd)
6528 {
6529 	const struct net_device_ops *ops = dev->netdev_ops;
6530 	struct bpf_prog *prog = NULL;
6531 	struct netdev_xdp xdp = {};
6532 	int err;
6533 
6534 	if (!ops->ndo_xdp)
6535 		return -EOPNOTSUPP;
6536 	if (fd >= 0) {
6537 		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6538 		if (IS_ERR(prog))
6539 			return PTR_ERR(prog);
6540 	}
6541 
6542 	xdp.command = XDP_SETUP_PROG;
6543 	xdp.prog = prog;
6544 	err = ops->ndo_xdp(dev, &xdp);
6545 	if (err < 0 && prog)
6546 		bpf_prog_put(prog);
6547 
6548 	return err;
6549 }
6550 EXPORT_SYMBOL(dev_change_xdp_fd);
6551 
6552 /**
6553  *	dev_new_index	-	allocate an ifindex
6554  *	@net: the applicable net namespace
6555  *
6556  *	Returns a suitable unique value for a new device interface
6557  *	number.  The caller must hold the rtnl semaphore or the
6558  *	dev_base_lock to be sure it remains unique.
6559  */
6560 static int dev_new_index(struct net *net)
6561 {
6562 	int ifindex = net->ifindex;
6563 	for (;;) {
6564 		if (++ifindex <= 0)
6565 			ifindex = 1;
6566 		if (!__dev_get_by_index(net, ifindex))
6567 			return net->ifindex = ifindex;
6568 	}
6569 }
6570 
6571 /* Delayed registration/unregisteration */
6572 static LIST_HEAD(net_todo_list);
6573 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6574 
6575 static void net_set_todo(struct net_device *dev)
6576 {
6577 	list_add_tail(&dev->todo_list, &net_todo_list);
6578 	dev_net(dev)->dev_unreg_count++;
6579 }
6580 
6581 static void rollback_registered_many(struct list_head *head)
6582 {
6583 	struct net_device *dev, *tmp;
6584 	LIST_HEAD(close_head);
6585 
6586 	BUG_ON(dev_boot_phase);
6587 	ASSERT_RTNL();
6588 
6589 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6590 		/* Some devices call without registering
6591 		 * for initialization unwind. Remove those
6592 		 * devices and proceed with the remaining.
6593 		 */
6594 		if (dev->reg_state == NETREG_UNINITIALIZED) {
6595 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6596 				 dev->name, dev);
6597 
6598 			WARN_ON(1);
6599 			list_del(&dev->unreg_list);
6600 			continue;
6601 		}
6602 		dev->dismantle = true;
6603 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
6604 	}
6605 
6606 	/* If device is running, close it first. */
6607 	list_for_each_entry(dev, head, unreg_list)
6608 		list_add_tail(&dev->close_list, &close_head);
6609 	dev_close_many(&close_head, true);
6610 
6611 	list_for_each_entry(dev, head, unreg_list) {
6612 		/* And unlink it from device chain. */
6613 		unlist_netdevice(dev);
6614 
6615 		dev->reg_state = NETREG_UNREGISTERING;
6616 	}
6617 	flush_all_backlogs();
6618 
6619 	synchronize_net();
6620 
6621 	list_for_each_entry(dev, head, unreg_list) {
6622 		struct sk_buff *skb = NULL;
6623 
6624 		/* Shutdown queueing discipline. */
6625 		dev_shutdown(dev);
6626 
6627 
6628 		/* Notify protocols, that we are about to destroy
6629 		   this device. They should clean all the things.
6630 		*/
6631 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6632 
6633 		if (!dev->rtnl_link_ops ||
6634 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6635 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6636 						     GFP_KERNEL);
6637 
6638 		/*
6639 		 *	Flush the unicast and multicast chains
6640 		 */
6641 		dev_uc_flush(dev);
6642 		dev_mc_flush(dev);
6643 
6644 		if (dev->netdev_ops->ndo_uninit)
6645 			dev->netdev_ops->ndo_uninit(dev);
6646 
6647 		if (skb)
6648 			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6649 
6650 		/* Notifier chain MUST detach us all upper devices. */
6651 		WARN_ON(netdev_has_any_upper_dev(dev));
6652 
6653 		/* Remove entries from kobject tree */
6654 		netdev_unregister_kobject(dev);
6655 #ifdef CONFIG_XPS
6656 		/* Remove XPS queueing entries */
6657 		netif_reset_xps_queues_gt(dev, 0);
6658 #endif
6659 	}
6660 
6661 	synchronize_net();
6662 
6663 	list_for_each_entry(dev, head, unreg_list)
6664 		dev_put(dev);
6665 }
6666 
6667 static void rollback_registered(struct net_device *dev)
6668 {
6669 	LIST_HEAD(single);
6670 
6671 	list_add(&dev->unreg_list, &single);
6672 	rollback_registered_many(&single);
6673 	list_del(&single);
6674 }
6675 
6676 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6677 	struct net_device *upper, netdev_features_t features)
6678 {
6679 	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6680 	netdev_features_t feature;
6681 	int feature_bit;
6682 
6683 	for_each_netdev_feature(&upper_disables, feature_bit) {
6684 		feature = __NETIF_F_BIT(feature_bit);
6685 		if (!(upper->wanted_features & feature)
6686 		    && (features & feature)) {
6687 			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6688 				   &feature, upper->name);
6689 			features &= ~feature;
6690 		}
6691 	}
6692 
6693 	return features;
6694 }
6695 
6696 static void netdev_sync_lower_features(struct net_device *upper,
6697 	struct net_device *lower, netdev_features_t features)
6698 {
6699 	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6700 	netdev_features_t feature;
6701 	int feature_bit;
6702 
6703 	for_each_netdev_feature(&upper_disables, feature_bit) {
6704 		feature = __NETIF_F_BIT(feature_bit);
6705 		if (!(features & feature) && (lower->features & feature)) {
6706 			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6707 				   &feature, lower->name);
6708 			lower->wanted_features &= ~feature;
6709 			netdev_update_features(lower);
6710 
6711 			if (unlikely(lower->features & feature))
6712 				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6713 					    &feature, lower->name);
6714 		}
6715 	}
6716 }
6717 
6718 static netdev_features_t netdev_fix_features(struct net_device *dev,
6719 	netdev_features_t features)
6720 {
6721 	/* Fix illegal checksum combinations */
6722 	if ((features & NETIF_F_HW_CSUM) &&
6723 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6724 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6725 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6726 	}
6727 
6728 	/* TSO requires that SG is present as well. */
6729 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6730 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6731 		features &= ~NETIF_F_ALL_TSO;
6732 	}
6733 
6734 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6735 					!(features & NETIF_F_IP_CSUM)) {
6736 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6737 		features &= ~NETIF_F_TSO;
6738 		features &= ~NETIF_F_TSO_ECN;
6739 	}
6740 
6741 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6742 					 !(features & NETIF_F_IPV6_CSUM)) {
6743 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6744 		features &= ~NETIF_F_TSO6;
6745 	}
6746 
6747 	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6748 	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6749 		features &= ~NETIF_F_TSO_MANGLEID;
6750 
6751 	/* TSO ECN requires that TSO is present as well. */
6752 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6753 		features &= ~NETIF_F_TSO_ECN;
6754 
6755 	/* Software GSO depends on SG. */
6756 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6757 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6758 		features &= ~NETIF_F_GSO;
6759 	}
6760 
6761 	/* UFO needs SG and checksumming */
6762 	if (features & NETIF_F_UFO) {
6763 		/* maybe split UFO into V4 and V6? */
6764 		if (!(features & NETIF_F_HW_CSUM) &&
6765 		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6766 		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6767 			netdev_dbg(dev,
6768 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6769 			features &= ~NETIF_F_UFO;
6770 		}
6771 
6772 		if (!(features & NETIF_F_SG)) {
6773 			netdev_dbg(dev,
6774 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6775 			features &= ~NETIF_F_UFO;
6776 		}
6777 	}
6778 
6779 	/* GSO partial features require GSO partial be set */
6780 	if ((features & dev->gso_partial_features) &&
6781 	    !(features & NETIF_F_GSO_PARTIAL)) {
6782 		netdev_dbg(dev,
6783 			   "Dropping partially supported GSO features since no GSO partial.\n");
6784 		features &= ~dev->gso_partial_features;
6785 	}
6786 
6787 #ifdef CONFIG_NET_RX_BUSY_POLL
6788 	if (dev->netdev_ops->ndo_busy_poll)
6789 		features |= NETIF_F_BUSY_POLL;
6790 	else
6791 #endif
6792 		features &= ~NETIF_F_BUSY_POLL;
6793 
6794 	return features;
6795 }
6796 
6797 int __netdev_update_features(struct net_device *dev)
6798 {
6799 	struct net_device *upper, *lower;
6800 	netdev_features_t features;
6801 	struct list_head *iter;
6802 	int err = -1;
6803 
6804 	ASSERT_RTNL();
6805 
6806 	features = netdev_get_wanted_features(dev);
6807 
6808 	if (dev->netdev_ops->ndo_fix_features)
6809 		features = dev->netdev_ops->ndo_fix_features(dev, features);
6810 
6811 	/* driver might be less strict about feature dependencies */
6812 	features = netdev_fix_features(dev, features);
6813 
6814 	/* some features can't be enabled if they're off an an upper device */
6815 	netdev_for_each_upper_dev_rcu(dev, upper, iter)
6816 		features = netdev_sync_upper_features(dev, upper, features);
6817 
6818 	if (dev->features == features)
6819 		goto sync_lower;
6820 
6821 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6822 		&dev->features, &features);
6823 
6824 	if (dev->netdev_ops->ndo_set_features)
6825 		err = dev->netdev_ops->ndo_set_features(dev, features);
6826 	else
6827 		err = 0;
6828 
6829 	if (unlikely(err < 0)) {
6830 		netdev_err(dev,
6831 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
6832 			err, &features, &dev->features);
6833 		/* return non-0 since some features might have changed and
6834 		 * it's better to fire a spurious notification than miss it
6835 		 */
6836 		return -1;
6837 	}
6838 
6839 sync_lower:
6840 	/* some features must be disabled on lower devices when disabled
6841 	 * on an upper device (think: bonding master or bridge)
6842 	 */
6843 	netdev_for_each_lower_dev(dev, lower, iter)
6844 		netdev_sync_lower_features(dev, lower, features);
6845 
6846 	if (!err)
6847 		dev->features = features;
6848 
6849 	return err < 0 ? 0 : 1;
6850 }
6851 
6852 /**
6853  *	netdev_update_features - recalculate device features
6854  *	@dev: the device to check
6855  *
6856  *	Recalculate dev->features set and send notifications if it
6857  *	has changed. Should be called after driver or hardware dependent
6858  *	conditions might have changed that influence the features.
6859  */
6860 void netdev_update_features(struct net_device *dev)
6861 {
6862 	if (__netdev_update_features(dev))
6863 		netdev_features_change(dev);
6864 }
6865 EXPORT_SYMBOL(netdev_update_features);
6866 
6867 /**
6868  *	netdev_change_features - recalculate device features
6869  *	@dev: the device to check
6870  *
6871  *	Recalculate dev->features set and send notifications even
6872  *	if they have not changed. Should be called instead of
6873  *	netdev_update_features() if also dev->vlan_features might
6874  *	have changed to allow the changes to be propagated to stacked
6875  *	VLAN devices.
6876  */
6877 void netdev_change_features(struct net_device *dev)
6878 {
6879 	__netdev_update_features(dev);
6880 	netdev_features_change(dev);
6881 }
6882 EXPORT_SYMBOL(netdev_change_features);
6883 
6884 /**
6885  *	netif_stacked_transfer_operstate -	transfer operstate
6886  *	@rootdev: the root or lower level device to transfer state from
6887  *	@dev: the device to transfer operstate to
6888  *
6889  *	Transfer operational state from root to device. This is normally
6890  *	called when a stacking relationship exists between the root
6891  *	device and the device(a leaf device).
6892  */
6893 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6894 					struct net_device *dev)
6895 {
6896 	if (rootdev->operstate == IF_OPER_DORMANT)
6897 		netif_dormant_on(dev);
6898 	else
6899 		netif_dormant_off(dev);
6900 
6901 	if (netif_carrier_ok(rootdev)) {
6902 		if (!netif_carrier_ok(dev))
6903 			netif_carrier_on(dev);
6904 	} else {
6905 		if (netif_carrier_ok(dev))
6906 			netif_carrier_off(dev);
6907 	}
6908 }
6909 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6910 
6911 #ifdef CONFIG_SYSFS
6912 static int netif_alloc_rx_queues(struct net_device *dev)
6913 {
6914 	unsigned int i, count = dev->num_rx_queues;
6915 	struct netdev_rx_queue *rx;
6916 	size_t sz = count * sizeof(*rx);
6917 
6918 	BUG_ON(count < 1);
6919 
6920 	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6921 	if (!rx) {
6922 		rx = vzalloc(sz);
6923 		if (!rx)
6924 			return -ENOMEM;
6925 	}
6926 	dev->_rx = rx;
6927 
6928 	for (i = 0; i < count; i++)
6929 		rx[i].dev = dev;
6930 	return 0;
6931 }
6932 #endif
6933 
6934 static void netdev_init_one_queue(struct net_device *dev,
6935 				  struct netdev_queue *queue, void *_unused)
6936 {
6937 	/* Initialize queue lock */
6938 	spin_lock_init(&queue->_xmit_lock);
6939 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6940 	queue->xmit_lock_owner = -1;
6941 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6942 	queue->dev = dev;
6943 #ifdef CONFIG_BQL
6944 	dql_init(&queue->dql, HZ);
6945 #endif
6946 }
6947 
6948 static void netif_free_tx_queues(struct net_device *dev)
6949 {
6950 	kvfree(dev->_tx);
6951 }
6952 
6953 static int netif_alloc_netdev_queues(struct net_device *dev)
6954 {
6955 	unsigned int count = dev->num_tx_queues;
6956 	struct netdev_queue *tx;
6957 	size_t sz = count * sizeof(*tx);
6958 
6959 	if (count < 1 || count > 0xffff)
6960 		return -EINVAL;
6961 
6962 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6963 	if (!tx) {
6964 		tx = vzalloc(sz);
6965 		if (!tx)
6966 			return -ENOMEM;
6967 	}
6968 	dev->_tx = tx;
6969 
6970 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6971 	spin_lock_init(&dev->tx_global_lock);
6972 
6973 	return 0;
6974 }
6975 
6976 void netif_tx_stop_all_queues(struct net_device *dev)
6977 {
6978 	unsigned int i;
6979 
6980 	for (i = 0; i < dev->num_tx_queues; i++) {
6981 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6982 		netif_tx_stop_queue(txq);
6983 	}
6984 }
6985 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6986 
6987 /**
6988  *	register_netdevice	- register a network device
6989  *	@dev: device to register
6990  *
6991  *	Take a completed network device structure and add it to the kernel
6992  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6993  *	chain. 0 is returned on success. A negative errno code is returned
6994  *	on a failure to set up the device, or if the name is a duplicate.
6995  *
6996  *	Callers must hold the rtnl semaphore. You may want
6997  *	register_netdev() instead of this.
6998  *
6999  *	BUGS:
7000  *	The locking appears insufficient to guarantee two parallel registers
7001  *	will not get the same name.
7002  */
7003 
7004 int register_netdevice(struct net_device *dev)
7005 {
7006 	int ret;
7007 	struct net *net = dev_net(dev);
7008 
7009 	BUG_ON(dev_boot_phase);
7010 	ASSERT_RTNL();
7011 
7012 	might_sleep();
7013 
7014 	/* When net_device's are persistent, this will be fatal. */
7015 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7016 	BUG_ON(!net);
7017 
7018 	spin_lock_init(&dev->addr_list_lock);
7019 	netdev_set_addr_lockdep_class(dev);
7020 
7021 	ret = dev_get_valid_name(net, dev, dev->name);
7022 	if (ret < 0)
7023 		goto out;
7024 
7025 	/* Init, if this function is available */
7026 	if (dev->netdev_ops->ndo_init) {
7027 		ret = dev->netdev_ops->ndo_init(dev);
7028 		if (ret) {
7029 			if (ret > 0)
7030 				ret = -EIO;
7031 			goto out;
7032 		}
7033 	}
7034 
7035 	if (((dev->hw_features | dev->features) &
7036 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
7037 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7038 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7039 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7040 		ret = -EINVAL;
7041 		goto err_uninit;
7042 	}
7043 
7044 	ret = -EBUSY;
7045 	if (!dev->ifindex)
7046 		dev->ifindex = dev_new_index(net);
7047 	else if (__dev_get_by_index(net, dev->ifindex))
7048 		goto err_uninit;
7049 
7050 	/* Transfer changeable features to wanted_features and enable
7051 	 * software offloads (GSO and GRO).
7052 	 */
7053 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
7054 	dev->features |= NETIF_F_SOFT_FEATURES;
7055 	dev->wanted_features = dev->features & dev->hw_features;
7056 
7057 	if (!(dev->flags & IFF_LOOPBACK))
7058 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
7059 
7060 	/* If IPv4 TCP segmentation offload is supported we should also
7061 	 * allow the device to enable segmenting the frame with the option
7062 	 * of ignoring a static IP ID value.  This doesn't enable the
7063 	 * feature itself but allows the user to enable it later.
7064 	 */
7065 	if (dev->hw_features & NETIF_F_TSO)
7066 		dev->hw_features |= NETIF_F_TSO_MANGLEID;
7067 	if (dev->vlan_features & NETIF_F_TSO)
7068 		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7069 	if (dev->mpls_features & NETIF_F_TSO)
7070 		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7071 	if (dev->hw_enc_features & NETIF_F_TSO)
7072 		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7073 
7074 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7075 	 */
7076 	dev->vlan_features |= NETIF_F_HIGHDMA;
7077 
7078 	/* Make NETIF_F_SG inheritable to tunnel devices.
7079 	 */
7080 	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7081 
7082 	/* Make NETIF_F_SG inheritable to MPLS.
7083 	 */
7084 	dev->mpls_features |= NETIF_F_SG;
7085 
7086 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7087 	ret = notifier_to_errno(ret);
7088 	if (ret)
7089 		goto err_uninit;
7090 
7091 	ret = netdev_register_kobject(dev);
7092 	if (ret)
7093 		goto err_uninit;
7094 	dev->reg_state = NETREG_REGISTERED;
7095 
7096 	__netdev_update_features(dev);
7097 
7098 	/*
7099 	 *	Default initial state at registry is that the
7100 	 *	device is present.
7101 	 */
7102 
7103 	set_bit(__LINK_STATE_PRESENT, &dev->state);
7104 
7105 	linkwatch_init_dev(dev);
7106 
7107 	dev_init_scheduler(dev);
7108 	dev_hold(dev);
7109 	list_netdevice(dev);
7110 	add_device_randomness(dev->dev_addr, dev->addr_len);
7111 
7112 	/* If the device has permanent device address, driver should
7113 	 * set dev_addr and also addr_assign_type should be set to
7114 	 * NET_ADDR_PERM (default value).
7115 	 */
7116 	if (dev->addr_assign_type == NET_ADDR_PERM)
7117 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7118 
7119 	/* Notify protocols, that a new device appeared. */
7120 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7121 	ret = notifier_to_errno(ret);
7122 	if (ret) {
7123 		rollback_registered(dev);
7124 		dev->reg_state = NETREG_UNREGISTERED;
7125 	}
7126 	/*
7127 	 *	Prevent userspace races by waiting until the network
7128 	 *	device is fully setup before sending notifications.
7129 	 */
7130 	if (!dev->rtnl_link_ops ||
7131 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7132 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7133 
7134 out:
7135 	return ret;
7136 
7137 err_uninit:
7138 	if (dev->netdev_ops->ndo_uninit)
7139 		dev->netdev_ops->ndo_uninit(dev);
7140 	goto out;
7141 }
7142 EXPORT_SYMBOL(register_netdevice);
7143 
7144 /**
7145  *	init_dummy_netdev	- init a dummy network device for NAPI
7146  *	@dev: device to init
7147  *
7148  *	This takes a network device structure and initialize the minimum
7149  *	amount of fields so it can be used to schedule NAPI polls without
7150  *	registering a full blown interface. This is to be used by drivers
7151  *	that need to tie several hardware interfaces to a single NAPI
7152  *	poll scheduler due to HW limitations.
7153  */
7154 int init_dummy_netdev(struct net_device *dev)
7155 {
7156 	/* Clear everything. Note we don't initialize spinlocks
7157 	 * are they aren't supposed to be taken by any of the
7158 	 * NAPI code and this dummy netdev is supposed to be
7159 	 * only ever used for NAPI polls
7160 	 */
7161 	memset(dev, 0, sizeof(struct net_device));
7162 
7163 	/* make sure we BUG if trying to hit standard
7164 	 * register/unregister code path
7165 	 */
7166 	dev->reg_state = NETREG_DUMMY;
7167 
7168 	/* NAPI wants this */
7169 	INIT_LIST_HEAD(&dev->napi_list);
7170 
7171 	/* a dummy interface is started by default */
7172 	set_bit(__LINK_STATE_PRESENT, &dev->state);
7173 	set_bit(__LINK_STATE_START, &dev->state);
7174 
7175 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
7176 	 * because users of this 'device' dont need to change
7177 	 * its refcount.
7178 	 */
7179 
7180 	return 0;
7181 }
7182 EXPORT_SYMBOL_GPL(init_dummy_netdev);
7183 
7184 
7185 /**
7186  *	register_netdev	- register a network device
7187  *	@dev: device to register
7188  *
7189  *	Take a completed network device structure and add it to the kernel
7190  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7191  *	chain. 0 is returned on success. A negative errno code is returned
7192  *	on a failure to set up the device, or if the name is a duplicate.
7193  *
7194  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
7195  *	and expands the device name if you passed a format string to
7196  *	alloc_netdev.
7197  */
7198 int register_netdev(struct net_device *dev)
7199 {
7200 	int err;
7201 
7202 	rtnl_lock();
7203 	err = register_netdevice(dev);
7204 	rtnl_unlock();
7205 	return err;
7206 }
7207 EXPORT_SYMBOL(register_netdev);
7208 
7209 int netdev_refcnt_read(const struct net_device *dev)
7210 {
7211 	int i, refcnt = 0;
7212 
7213 	for_each_possible_cpu(i)
7214 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7215 	return refcnt;
7216 }
7217 EXPORT_SYMBOL(netdev_refcnt_read);
7218 
7219 /**
7220  * netdev_wait_allrefs - wait until all references are gone.
7221  * @dev: target net_device
7222  *
7223  * This is called when unregistering network devices.
7224  *
7225  * Any protocol or device that holds a reference should register
7226  * for netdevice notification, and cleanup and put back the
7227  * reference if they receive an UNREGISTER event.
7228  * We can get stuck here if buggy protocols don't correctly
7229  * call dev_put.
7230  */
7231 static void netdev_wait_allrefs(struct net_device *dev)
7232 {
7233 	unsigned long rebroadcast_time, warning_time;
7234 	int refcnt;
7235 
7236 	linkwatch_forget_dev(dev);
7237 
7238 	rebroadcast_time = warning_time = jiffies;
7239 	refcnt = netdev_refcnt_read(dev);
7240 
7241 	while (refcnt != 0) {
7242 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7243 			rtnl_lock();
7244 
7245 			/* Rebroadcast unregister notification */
7246 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7247 
7248 			__rtnl_unlock();
7249 			rcu_barrier();
7250 			rtnl_lock();
7251 
7252 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7253 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7254 				     &dev->state)) {
7255 				/* We must not have linkwatch events
7256 				 * pending on unregister. If this
7257 				 * happens, we simply run the queue
7258 				 * unscheduled, resulting in a noop
7259 				 * for this device.
7260 				 */
7261 				linkwatch_run_queue();
7262 			}
7263 
7264 			__rtnl_unlock();
7265 
7266 			rebroadcast_time = jiffies;
7267 		}
7268 
7269 		msleep(250);
7270 
7271 		refcnt = netdev_refcnt_read(dev);
7272 
7273 		if (time_after(jiffies, warning_time + 10 * HZ)) {
7274 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7275 				 dev->name, refcnt);
7276 			warning_time = jiffies;
7277 		}
7278 	}
7279 }
7280 
7281 /* The sequence is:
7282  *
7283  *	rtnl_lock();
7284  *	...
7285  *	register_netdevice(x1);
7286  *	register_netdevice(x2);
7287  *	...
7288  *	unregister_netdevice(y1);
7289  *	unregister_netdevice(y2);
7290  *      ...
7291  *	rtnl_unlock();
7292  *	free_netdev(y1);
7293  *	free_netdev(y2);
7294  *
7295  * We are invoked by rtnl_unlock().
7296  * This allows us to deal with problems:
7297  * 1) We can delete sysfs objects which invoke hotplug
7298  *    without deadlocking with linkwatch via keventd.
7299  * 2) Since we run with the RTNL semaphore not held, we can sleep
7300  *    safely in order to wait for the netdev refcnt to drop to zero.
7301  *
7302  * We must not return until all unregister events added during
7303  * the interval the lock was held have been completed.
7304  */
7305 void netdev_run_todo(void)
7306 {
7307 	struct list_head list;
7308 
7309 	/* Snapshot list, allow later requests */
7310 	list_replace_init(&net_todo_list, &list);
7311 
7312 	__rtnl_unlock();
7313 
7314 
7315 	/* Wait for rcu callbacks to finish before next phase */
7316 	if (!list_empty(&list))
7317 		rcu_barrier();
7318 
7319 	while (!list_empty(&list)) {
7320 		struct net_device *dev
7321 			= list_first_entry(&list, struct net_device, todo_list);
7322 		list_del(&dev->todo_list);
7323 
7324 		rtnl_lock();
7325 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7326 		__rtnl_unlock();
7327 
7328 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7329 			pr_err("network todo '%s' but state %d\n",
7330 			       dev->name, dev->reg_state);
7331 			dump_stack();
7332 			continue;
7333 		}
7334 
7335 		dev->reg_state = NETREG_UNREGISTERED;
7336 
7337 		netdev_wait_allrefs(dev);
7338 
7339 		/* paranoia */
7340 		BUG_ON(netdev_refcnt_read(dev));
7341 		BUG_ON(!list_empty(&dev->ptype_all));
7342 		BUG_ON(!list_empty(&dev->ptype_specific));
7343 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
7344 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7345 		WARN_ON(dev->dn_ptr);
7346 
7347 		if (dev->destructor)
7348 			dev->destructor(dev);
7349 
7350 		/* Report a network device has been unregistered */
7351 		rtnl_lock();
7352 		dev_net(dev)->dev_unreg_count--;
7353 		__rtnl_unlock();
7354 		wake_up(&netdev_unregistering_wq);
7355 
7356 		/* Free network device */
7357 		kobject_put(&dev->dev.kobj);
7358 	}
7359 }
7360 
7361 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7362  * all the same fields in the same order as net_device_stats, with only
7363  * the type differing, but rtnl_link_stats64 may have additional fields
7364  * at the end for newer counters.
7365  */
7366 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7367 			     const struct net_device_stats *netdev_stats)
7368 {
7369 #if BITS_PER_LONG == 64
7370 	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7371 	memcpy(stats64, netdev_stats, sizeof(*stats64));
7372 	/* zero out counters that only exist in rtnl_link_stats64 */
7373 	memset((char *)stats64 + sizeof(*netdev_stats), 0,
7374 	       sizeof(*stats64) - sizeof(*netdev_stats));
7375 #else
7376 	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7377 	const unsigned long *src = (const unsigned long *)netdev_stats;
7378 	u64 *dst = (u64 *)stats64;
7379 
7380 	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7381 	for (i = 0; i < n; i++)
7382 		dst[i] = src[i];
7383 	/* zero out counters that only exist in rtnl_link_stats64 */
7384 	memset((char *)stats64 + n * sizeof(u64), 0,
7385 	       sizeof(*stats64) - n * sizeof(u64));
7386 #endif
7387 }
7388 EXPORT_SYMBOL(netdev_stats_to_stats64);
7389 
7390 /**
7391  *	dev_get_stats	- get network device statistics
7392  *	@dev: device to get statistics from
7393  *	@storage: place to store stats
7394  *
7395  *	Get network statistics from device. Return @storage.
7396  *	The device driver may provide its own method by setting
7397  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7398  *	otherwise the internal statistics structure is used.
7399  */
7400 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7401 					struct rtnl_link_stats64 *storage)
7402 {
7403 	const struct net_device_ops *ops = dev->netdev_ops;
7404 
7405 	if (ops->ndo_get_stats64) {
7406 		memset(storage, 0, sizeof(*storage));
7407 		ops->ndo_get_stats64(dev, storage);
7408 	} else if (ops->ndo_get_stats) {
7409 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7410 	} else {
7411 		netdev_stats_to_stats64(storage, &dev->stats);
7412 	}
7413 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7414 	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7415 	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7416 	return storage;
7417 }
7418 EXPORT_SYMBOL(dev_get_stats);
7419 
7420 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7421 {
7422 	struct netdev_queue *queue = dev_ingress_queue(dev);
7423 
7424 #ifdef CONFIG_NET_CLS_ACT
7425 	if (queue)
7426 		return queue;
7427 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7428 	if (!queue)
7429 		return NULL;
7430 	netdev_init_one_queue(dev, queue, NULL);
7431 	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7432 	queue->qdisc_sleeping = &noop_qdisc;
7433 	rcu_assign_pointer(dev->ingress_queue, queue);
7434 #endif
7435 	return queue;
7436 }
7437 
7438 static const struct ethtool_ops default_ethtool_ops;
7439 
7440 void netdev_set_default_ethtool_ops(struct net_device *dev,
7441 				    const struct ethtool_ops *ops)
7442 {
7443 	if (dev->ethtool_ops == &default_ethtool_ops)
7444 		dev->ethtool_ops = ops;
7445 }
7446 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7447 
7448 void netdev_freemem(struct net_device *dev)
7449 {
7450 	char *addr = (char *)dev - dev->padded;
7451 
7452 	kvfree(addr);
7453 }
7454 
7455 /**
7456  *	alloc_netdev_mqs - allocate network device
7457  *	@sizeof_priv:		size of private data to allocate space for
7458  *	@name:			device name format string
7459  *	@name_assign_type: 	origin of device name
7460  *	@setup:			callback to initialize device
7461  *	@txqs:			the number of TX subqueues to allocate
7462  *	@rxqs:			the number of RX subqueues to allocate
7463  *
7464  *	Allocates a struct net_device with private data area for driver use
7465  *	and performs basic initialization.  Also allocates subqueue structs
7466  *	for each queue on the device.
7467  */
7468 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7469 		unsigned char name_assign_type,
7470 		void (*setup)(struct net_device *),
7471 		unsigned int txqs, unsigned int rxqs)
7472 {
7473 	struct net_device *dev;
7474 	size_t alloc_size;
7475 	struct net_device *p;
7476 
7477 	BUG_ON(strlen(name) >= sizeof(dev->name));
7478 
7479 	if (txqs < 1) {
7480 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7481 		return NULL;
7482 	}
7483 
7484 #ifdef CONFIG_SYSFS
7485 	if (rxqs < 1) {
7486 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7487 		return NULL;
7488 	}
7489 #endif
7490 
7491 	alloc_size = sizeof(struct net_device);
7492 	if (sizeof_priv) {
7493 		/* ensure 32-byte alignment of private area */
7494 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7495 		alloc_size += sizeof_priv;
7496 	}
7497 	/* ensure 32-byte alignment of whole construct */
7498 	alloc_size += NETDEV_ALIGN - 1;
7499 
7500 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7501 	if (!p)
7502 		p = vzalloc(alloc_size);
7503 	if (!p)
7504 		return NULL;
7505 
7506 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
7507 	dev->padded = (char *)dev - (char *)p;
7508 
7509 	dev->pcpu_refcnt = alloc_percpu(int);
7510 	if (!dev->pcpu_refcnt)
7511 		goto free_dev;
7512 
7513 	if (dev_addr_init(dev))
7514 		goto free_pcpu;
7515 
7516 	dev_mc_init(dev);
7517 	dev_uc_init(dev);
7518 
7519 	dev_net_set(dev, &init_net);
7520 
7521 	dev->gso_max_size = GSO_MAX_SIZE;
7522 	dev->gso_max_segs = GSO_MAX_SEGS;
7523 
7524 	INIT_LIST_HEAD(&dev->napi_list);
7525 	INIT_LIST_HEAD(&dev->unreg_list);
7526 	INIT_LIST_HEAD(&dev->close_list);
7527 	INIT_LIST_HEAD(&dev->link_watch_list);
7528 	INIT_LIST_HEAD(&dev->adj_list.upper);
7529 	INIT_LIST_HEAD(&dev->adj_list.lower);
7530 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
7531 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
7532 	INIT_LIST_HEAD(&dev->ptype_all);
7533 	INIT_LIST_HEAD(&dev->ptype_specific);
7534 #ifdef CONFIG_NET_SCHED
7535 	hash_init(dev->qdisc_hash);
7536 #endif
7537 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7538 	setup(dev);
7539 
7540 	if (!dev->tx_queue_len) {
7541 		dev->priv_flags |= IFF_NO_QUEUE;
7542 		dev->tx_queue_len = 1;
7543 	}
7544 
7545 	dev->num_tx_queues = txqs;
7546 	dev->real_num_tx_queues = txqs;
7547 	if (netif_alloc_netdev_queues(dev))
7548 		goto free_all;
7549 
7550 #ifdef CONFIG_SYSFS
7551 	dev->num_rx_queues = rxqs;
7552 	dev->real_num_rx_queues = rxqs;
7553 	if (netif_alloc_rx_queues(dev))
7554 		goto free_all;
7555 #endif
7556 
7557 	strcpy(dev->name, name);
7558 	dev->name_assign_type = name_assign_type;
7559 	dev->group = INIT_NETDEV_GROUP;
7560 	if (!dev->ethtool_ops)
7561 		dev->ethtool_ops = &default_ethtool_ops;
7562 
7563 	nf_hook_ingress_init(dev);
7564 
7565 	return dev;
7566 
7567 free_all:
7568 	free_netdev(dev);
7569 	return NULL;
7570 
7571 free_pcpu:
7572 	free_percpu(dev->pcpu_refcnt);
7573 free_dev:
7574 	netdev_freemem(dev);
7575 	return NULL;
7576 }
7577 EXPORT_SYMBOL(alloc_netdev_mqs);
7578 
7579 /**
7580  *	free_netdev - free network device
7581  *	@dev: device
7582  *
7583  *	This function does the last stage of destroying an allocated device
7584  * 	interface. The reference to the device object is released.
7585  *	If this is the last reference then it will be freed.
7586  *	Must be called in process context.
7587  */
7588 void free_netdev(struct net_device *dev)
7589 {
7590 	struct napi_struct *p, *n;
7591 
7592 	might_sleep();
7593 	netif_free_tx_queues(dev);
7594 #ifdef CONFIG_SYSFS
7595 	kvfree(dev->_rx);
7596 #endif
7597 
7598 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7599 
7600 	/* Flush device addresses */
7601 	dev_addr_flush(dev);
7602 
7603 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7604 		netif_napi_del(p);
7605 
7606 	free_percpu(dev->pcpu_refcnt);
7607 	dev->pcpu_refcnt = NULL;
7608 
7609 	/*  Compatibility with error handling in drivers */
7610 	if (dev->reg_state == NETREG_UNINITIALIZED) {
7611 		netdev_freemem(dev);
7612 		return;
7613 	}
7614 
7615 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7616 	dev->reg_state = NETREG_RELEASED;
7617 
7618 	/* will free via device release */
7619 	put_device(&dev->dev);
7620 }
7621 EXPORT_SYMBOL(free_netdev);
7622 
7623 /**
7624  *	synchronize_net -  Synchronize with packet receive processing
7625  *
7626  *	Wait for packets currently being received to be done.
7627  *	Does not block later packets from starting.
7628  */
7629 void synchronize_net(void)
7630 {
7631 	might_sleep();
7632 	if (rtnl_is_locked())
7633 		synchronize_rcu_expedited();
7634 	else
7635 		synchronize_rcu();
7636 }
7637 EXPORT_SYMBOL(synchronize_net);
7638 
7639 /**
7640  *	unregister_netdevice_queue - remove device from the kernel
7641  *	@dev: device
7642  *	@head: list
7643  *
7644  *	This function shuts down a device interface and removes it
7645  *	from the kernel tables.
7646  *	If head not NULL, device is queued to be unregistered later.
7647  *
7648  *	Callers must hold the rtnl semaphore.  You may want
7649  *	unregister_netdev() instead of this.
7650  */
7651 
7652 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7653 {
7654 	ASSERT_RTNL();
7655 
7656 	if (head) {
7657 		list_move_tail(&dev->unreg_list, head);
7658 	} else {
7659 		rollback_registered(dev);
7660 		/* Finish processing unregister after unlock */
7661 		net_set_todo(dev);
7662 	}
7663 }
7664 EXPORT_SYMBOL(unregister_netdevice_queue);
7665 
7666 /**
7667  *	unregister_netdevice_many - unregister many devices
7668  *	@head: list of devices
7669  *
7670  *  Note: As most callers use a stack allocated list_head,
7671  *  we force a list_del() to make sure stack wont be corrupted later.
7672  */
7673 void unregister_netdevice_many(struct list_head *head)
7674 {
7675 	struct net_device *dev;
7676 
7677 	if (!list_empty(head)) {
7678 		rollback_registered_many(head);
7679 		list_for_each_entry(dev, head, unreg_list)
7680 			net_set_todo(dev);
7681 		list_del(head);
7682 	}
7683 }
7684 EXPORT_SYMBOL(unregister_netdevice_many);
7685 
7686 /**
7687  *	unregister_netdev - remove device from the kernel
7688  *	@dev: device
7689  *
7690  *	This function shuts down a device interface and removes it
7691  *	from the kernel tables.
7692  *
7693  *	This is just a wrapper for unregister_netdevice that takes
7694  *	the rtnl semaphore.  In general you want to use this and not
7695  *	unregister_netdevice.
7696  */
7697 void unregister_netdev(struct net_device *dev)
7698 {
7699 	rtnl_lock();
7700 	unregister_netdevice(dev);
7701 	rtnl_unlock();
7702 }
7703 EXPORT_SYMBOL(unregister_netdev);
7704 
7705 /**
7706  *	dev_change_net_namespace - move device to different nethost namespace
7707  *	@dev: device
7708  *	@net: network namespace
7709  *	@pat: If not NULL name pattern to try if the current device name
7710  *	      is already taken in the destination network namespace.
7711  *
7712  *	This function shuts down a device interface and moves it
7713  *	to a new network namespace. On success 0 is returned, on
7714  *	a failure a netagive errno code is returned.
7715  *
7716  *	Callers must hold the rtnl semaphore.
7717  */
7718 
7719 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7720 {
7721 	int err;
7722 
7723 	ASSERT_RTNL();
7724 
7725 	/* Don't allow namespace local devices to be moved. */
7726 	err = -EINVAL;
7727 	if (dev->features & NETIF_F_NETNS_LOCAL)
7728 		goto out;
7729 
7730 	/* Ensure the device has been registrered */
7731 	if (dev->reg_state != NETREG_REGISTERED)
7732 		goto out;
7733 
7734 	/* Get out if there is nothing todo */
7735 	err = 0;
7736 	if (net_eq(dev_net(dev), net))
7737 		goto out;
7738 
7739 	/* Pick the destination device name, and ensure
7740 	 * we can use it in the destination network namespace.
7741 	 */
7742 	err = -EEXIST;
7743 	if (__dev_get_by_name(net, dev->name)) {
7744 		/* We get here if we can't use the current device name */
7745 		if (!pat)
7746 			goto out;
7747 		if (dev_get_valid_name(net, dev, pat) < 0)
7748 			goto out;
7749 	}
7750 
7751 	/*
7752 	 * And now a mini version of register_netdevice unregister_netdevice.
7753 	 */
7754 
7755 	/* If device is running close it first. */
7756 	dev_close(dev);
7757 
7758 	/* And unlink it from device chain */
7759 	err = -ENODEV;
7760 	unlist_netdevice(dev);
7761 
7762 	synchronize_net();
7763 
7764 	/* Shutdown queueing discipline. */
7765 	dev_shutdown(dev);
7766 
7767 	/* Notify protocols, that we are about to destroy
7768 	   this device. They should clean all the things.
7769 
7770 	   Note that dev->reg_state stays at NETREG_REGISTERED.
7771 	   This is wanted because this way 8021q and macvlan know
7772 	   the device is just moving and can keep their slaves up.
7773 	*/
7774 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7775 	rcu_barrier();
7776 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7777 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7778 
7779 	/*
7780 	 *	Flush the unicast and multicast chains
7781 	 */
7782 	dev_uc_flush(dev);
7783 	dev_mc_flush(dev);
7784 
7785 	/* Send a netdev-removed uevent to the old namespace */
7786 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7787 	netdev_adjacent_del_links(dev);
7788 
7789 	/* Actually switch the network namespace */
7790 	dev_net_set(dev, net);
7791 
7792 	/* If there is an ifindex conflict assign a new one */
7793 	if (__dev_get_by_index(net, dev->ifindex))
7794 		dev->ifindex = dev_new_index(net);
7795 
7796 	/* Send a netdev-add uevent to the new namespace */
7797 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7798 	netdev_adjacent_add_links(dev);
7799 
7800 	/* Fixup kobjects */
7801 	err = device_rename(&dev->dev, dev->name);
7802 	WARN_ON(err);
7803 
7804 	/* Add the device back in the hashes */
7805 	list_netdevice(dev);
7806 
7807 	/* Notify protocols, that a new device appeared. */
7808 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7809 
7810 	/*
7811 	 *	Prevent userspace races by waiting until the network
7812 	 *	device is fully setup before sending notifications.
7813 	 */
7814 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7815 
7816 	synchronize_net();
7817 	err = 0;
7818 out:
7819 	return err;
7820 }
7821 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7822 
7823 static int dev_cpu_callback(struct notifier_block *nfb,
7824 			    unsigned long action,
7825 			    void *ocpu)
7826 {
7827 	struct sk_buff **list_skb;
7828 	struct sk_buff *skb;
7829 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
7830 	struct softnet_data *sd, *oldsd;
7831 
7832 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7833 		return NOTIFY_OK;
7834 
7835 	local_irq_disable();
7836 	cpu = smp_processor_id();
7837 	sd = &per_cpu(softnet_data, cpu);
7838 	oldsd = &per_cpu(softnet_data, oldcpu);
7839 
7840 	/* Find end of our completion_queue. */
7841 	list_skb = &sd->completion_queue;
7842 	while (*list_skb)
7843 		list_skb = &(*list_skb)->next;
7844 	/* Append completion queue from offline CPU. */
7845 	*list_skb = oldsd->completion_queue;
7846 	oldsd->completion_queue = NULL;
7847 
7848 	/* Append output queue from offline CPU. */
7849 	if (oldsd->output_queue) {
7850 		*sd->output_queue_tailp = oldsd->output_queue;
7851 		sd->output_queue_tailp = oldsd->output_queue_tailp;
7852 		oldsd->output_queue = NULL;
7853 		oldsd->output_queue_tailp = &oldsd->output_queue;
7854 	}
7855 	/* Append NAPI poll list from offline CPU, with one exception :
7856 	 * process_backlog() must be called by cpu owning percpu backlog.
7857 	 * We properly handle process_queue & input_pkt_queue later.
7858 	 */
7859 	while (!list_empty(&oldsd->poll_list)) {
7860 		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7861 							    struct napi_struct,
7862 							    poll_list);
7863 
7864 		list_del_init(&napi->poll_list);
7865 		if (napi->poll == process_backlog)
7866 			napi->state = 0;
7867 		else
7868 			____napi_schedule(sd, napi);
7869 	}
7870 
7871 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
7872 	local_irq_enable();
7873 
7874 	/* Process offline CPU's input_pkt_queue */
7875 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7876 		netif_rx_ni(skb);
7877 		input_queue_head_incr(oldsd);
7878 	}
7879 	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7880 		netif_rx_ni(skb);
7881 		input_queue_head_incr(oldsd);
7882 	}
7883 
7884 	return NOTIFY_OK;
7885 }
7886 
7887 
7888 /**
7889  *	netdev_increment_features - increment feature set by one
7890  *	@all: current feature set
7891  *	@one: new feature set
7892  *	@mask: mask feature set
7893  *
7894  *	Computes a new feature set after adding a device with feature set
7895  *	@one to the master device with current feature set @all.  Will not
7896  *	enable anything that is off in @mask. Returns the new feature set.
7897  */
7898 netdev_features_t netdev_increment_features(netdev_features_t all,
7899 	netdev_features_t one, netdev_features_t mask)
7900 {
7901 	if (mask & NETIF_F_HW_CSUM)
7902 		mask |= NETIF_F_CSUM_MASK;
7903 	mask |= NETIF_F_VLAN_CHALLENGED;
7904 
7905 	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7906 	all &= one | ~NETIF_F_ALL_FOR_ALL;
7907 
7908 	/* If one device supports hw checksumming, set for all. */
7909 	if (all & NETIF_F_HW_CSUM)
7910 		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7911 
7912 	return all;
7913 }
7914 EXPORT_SYMBOL(netdev_increment_features);
7915 
7916 static struct hlist_head * __net_init netdev_create_hash(void)
7917 {
7918 	int i;
7919 	struct hlist_head *hash;
7920 
7921 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7922 	if (hash != NULL)
7923 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
7924 			INIT_HLIST_HEAD(&hash[i]);
7925 
7926 	return hash;
7927 }
7928 
7929 /* Initialize per network namespace state */
7930 static int __net_init netdev_init(struct net *net)
7931 {
7932 	if (net != &init_net)
7933 		INIT_LIST_HEAD(&net->dev_base_head);
7934 
7935 	net->dev_name_head = netdev_create_hash();
7936 	if (net->dev_name_head == NULL)
7937 		goto err_name;
7938 
7939 	net->dev_index_head = netdev_create_hash();
7940 	if (net->dev_index_head == NULL)
7941 		goto err_idx;
7942 
7943 	return 0;
7944 
7945 err_idx:
7946 	kfree(net->dev_name_head);
7947 err_name:
7948 	return -ENOMEM;
7949 }
7950 
7951 /**
7952  *	netdev_drivername - network driver for the device
7953  *	@dev: network device
7954  *
7955  *	Determine network driver for device.
7956  */
7957 const char *netdev_drivername(const struct net_device *dev)
7958 {
7959 	const struct device_driver *driver;
7960 	const struct device *parent;
7961 	const char *empty = "";
7962 
7963 	parent = dev->dev.parent;
7964 	if (!parent)
7965 		return empty;
7966 
7967 	driver = parent->driver;
7968 	if (driver && driver->name)
7969 		return driver->name;
7970 	return empty;
7971 }
7972 
7973 static void __netdev_printk(const char *level, const struct net_device *dev,
7974 			    struct va_format *vaf)
7975 {
7976 	if (dev && dev->dev.parent) {
7977 		dev_printk_emit(level[1] - '0',
7978 				dev->dev.parent,
7979 				"%s %s %s%s: %pV",
7980 				dev_driver_string(dev->dev.parent),
7981 				dev_name(dev->dev.parent),
7982 				netdev_name(dev), netdev_reg_state(dev),
7983 				vaf);
7984 	} else if (dev) {
7985 		printk("%s%s%s: %pV",
7986 		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7987 	} else {
7988 		printk("%s(NULL net_device): %pV", level, vaf);
7989 	}
7990 }
7991 
7992 void netdev_printk(const char *level, const struct net_device *dev,
7993 		   const char *format, ...)
7994 {
7995 	struct va_format vaf;
7996 	va_list args;
7997 
7998 	va_start(args, format);
7999 
8000 	vaf.fmt = format;
8001 	vaf.va = &args;
8002 
8003 	__netdev_printk(level, dev, &vaf);
8004 
8005 	va_end(args);
8006 }
8007 EXPORT_SYMBOL(netdev_printk);
8008 
8009 #define define_netdev_printk_level(func, level)			\
8010 void func(const struct net_device *dev, const char *fmt, ...)	\
8011 {								\
8012 	struct va_format vaf;					\
8013 	va_list args;						\
8014 								\
8015 	va_start(args, fmt);					\
8016 								\
8017 	vaf.fmt = fmt;						\
8018 	vaf.va = &args;						\
8019 								\
8020 	__netdev_printk(level, dev, &vaf);			\
8021 								\
8022 	va_end(args);						\
8023 }								\
8024 EXPORT_SYMBOL(func);
8025 
8026 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8027 define_netdev_printk_level(netdev_alert, KERN_ALERT);
8028 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8029 define_netdev_printk_level(netdev_err, KERN_ERR);
8030 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8031 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8032 define_netdev_printk_level(netdev_info, KERN_INFO);
8033 
8034 static void __net_exit netdev_exit(struct net *net)
8035 {
8036 	kfree(net->dev_name_head);
8037 	kfree(net->dev_index_head);
8038 }
8039 
8040 static struct pernet_operations __net_initdata netdev_net_ops = {
8041 	.init = netdev_init,
8042 	.exit = netdev_exit,
8043 };
8044 
8045 static void __net_exit default_device_exit(struct net *net)
8046 {
8047 	struct net_device *dev, *aux;
8048 	/*
8049 	 * Push all migratable network devices back to the
8050 	 * initial network namespace
8051 	 */
8052 	rtnl_lock();
8053 	for_each_netdev_safe(net, dev, aux) {
8054 		int err;
8055 		char fb_name[IFNAMSIZ];
8056 
8057 		/* Ignore unmoveable devices (i.e. loopback) */
8058 		if (dev->features & NETIF_F_NETNS_LOCAL)
8059 			continue;
8060 
8061 		/* Leave virtual devices for the generic cleanup */
8062 		if (dev->rtnl_link_ops)
8063 			continue;
8064 
8065 		/* Push remaining network devices to init_net */
8066 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8067 		err = dev_change_net_namespace(dev, &init_net, fb_name);
8068 		if (err) {
8069 			pr_emerg("%s: failed to move %s to init_net: %d\n",
8070 				 __func__, dev->name, err);
8071 			BUG();
8072 		}
8073 	}
8074 	rtnl_unlock();
8075 }
8076 
8077 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8078 {
8079 	/* Return with the rtnl_lock held when there are no network
8080 	 * devices unregistering in any network namespace in net_list.
8081 	 */
8082 	struct net *net;
8083 	bool unregistering;
8084 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
8085 
8086 	add_wait_queue(&netdev_unregistering_wq, &wait);
8087 	for (;;) {
8088 		unregistering = false;
8089 		rtnl_lock();
8090 		list_for_each_entry(net, net_list, exit_list) {
8091 			if (net->dev_unreg_count > 0) {
8092 				unregistering = true;
8093 				break;
8094 			}
8095 		}
8096 		if (!unregistering)
8097 			break;
8098 		__rtnl_unlock();
8099 
8100 		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8101 	}
8102 	remove_wait_queue(&netdev_unregistering_wq, &wait);
8103 }
8104 
8105 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8106 {
8107 	/* At exit all network devices most be removed from a network
8108 	 * namespace.  Do this in the reverse order of registration.
8109 	 * Do this across as many network namespaces as possible to
8110 	 * improve batching efficiency.
8111 	 */
8112 	struct net_device *dev;
8113 	struct net *net;
8114 	LIST_HEAD(dev_kill_list);
8115 
8116 	/* To prevent network device cleanup code from dereferencing
8117 	 * loopback devices or network devices that have been freed
8118 	 * wait here for all pending unregistrations to complete,
8119 	 * before unregistring the loopback device and allowing the
8120 	 * network namespace be freed.
8121 	 *
8122 	 * The netdev todo list containing all network devices
8123 	 * unregistrations that happen in default_device_exit_batch
8124 	 * will run in the rtnl_unlock() at the end of
8125 	 * default_device_exit_batch.
8126 	 */
8127 	rtnl_lock_unregistering(net_list);
8128 	list_for_each_entry(net, net_list, exit_list) {
8129 		for_each_netdev_reverse(net, dev) {
8130 			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8131 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8132 			else
8133 				unregister_netdevice_queue(dev, &dev_kill_list);
8134 		}
8135 	}
8136 	unregister_netdevice_many(&dev_kill_list);
8137 	rtnl_unlock();
8138 }
8139 
8140 static struct pernet_operations __net_initdata default_device_ops = {
8141 	.exit = default_device_exit,
8142 	.exit_batch = default_device_exit_batch,
8143 };
8144 
8145 /*
8146  *	Initialize the DEV module. At boot time this walks the device list and
8147  *	unhooks any devices that fail to initialise (normally hardware not
8148  *	present) and leaves us with a valid list of present and active devices.
8149  *
8150  */
8151 
8152 /*
8153  *       This is called single threaded during boot, so no need
8154  *       to take the rtnl semaphore.
8155  */
8156 static int __init net_dev_init(void)
8157 {
8158 	int i, rc = -ENOMEM;
8159 
8160 	BUG_ON(!dev_boot_phase);
8161 
8162 	if (dev_proc_init())
8163 		goto out;
8164 
8165 	if (netdev_kobject_init())
8166 		goto out;
8167 
8168 	INIT_LIST_HEAD(&ptype_all);
8169 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
8170 		INIT_LIST_HEAD(&ptype_base[i]);
8171 
8172 	INIT_LIST_HEAD(&offload_base);
8173 
8174 	if (register_pernet_subsys(&netdev_net_ops))
8175 		goto out;
8176 
8177 	/*
8178 	 *	Initialise the packet receive queues.
8179 	 */
8180 
8181 	for_each_possible_cpu(i) {
8182 		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8183 		struct softnet_data *sd = &per_cpu(softnet_data, i);
8184 
8185 		INIT_WORK(flush, flush_backlog);
8186 
8187 		skb_queue_head_init(&sd->input_pkt_queue);
8188 		skb_queue_head_init(&sd->process_queue);
8189 		INIT_LIST_HEAD(&sd->poll_list);
8190 		sd->output_queue_tailp = &sd->output_queue;
8191 #ifdef CONFIG_RPS
8192 		sd->csd.func = rps_trigger_softirq;
8193 		sd->csd.info = sd;
8194 		sd->cpu = i;
8195 #endif
8196 
8197 		sd->backlog.poll = process_backlog;
8198 		sd->backlog.weight = weight_p;
8199 	}
8200 
8201 	dev_boot_phase = 0;
8202 
8203 	/* The loopback device is special if any other network devices
8204 	 * is present in a network namespace the loopback device must
8205 	 * be present. Since we now dynamically allocate and free the
8206 	 * loopback device ensure this invariant is maintained by
8207 	 * keeping the loopback device as the first device on the
8208 	 * list of network devices.  Ensuring the loopback devices
8209 	 * is the first device that appears and the last network device
8210 	 * that disappears.
8211 	 */
8212 	if (register_pernet_device(&loopback_net_ops))
8213 		goto out;
8214 
8215 	if (register_pernet_device(&default_device_ops))
8216 		goto out;
8217 
8218 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8219 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8220 
8221 	hotcpu_notifier(dev_cpu_callback, 0);
8222 	dst_subsys_init();
8223 	rc = 0;
8224 out:
8225 	return rc;
8226 }
8227 
8228 subsys_initcall(net_dev_init);
8229