xref: /linux/net/core/dev.c (revision 93d90ad708b8da6efc0e487b66111aa9db7f70c7)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <net/mpls.h>
122 #include <linux/ipv6.h>
123 #include <linux/in.h>
124 #include <linux/jhash.h>
125 #include <linux/random.h>
126 #include <trace/events/napi.h>
127 #include <trace/events/net.h>
128 #include <trace/events/skb.h>
129 #include <linux/pci.h>
130 #include <linux/inetdevice.h>
131 #include <linux/cpu_rmap.h>
132 #include <linux/static_key.h>
133 #include <linux/hashtable.h>
134 #include <linux/vmalloc.h>
135 #include <linux/if_macvlan.h>
136 #include <linux/errqueue.h>
137 #include <linux/hrtimer.h>
138 
139 #include "net-sysfs.h"
140 
141 /* Instead of increasing this, you should create a hash table. */
142 #define MAX_GRO_SKBS 8
143 
144 /* This should be increased if a protocol with a bigger head is added. */
145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146 
147 static DEFINE_SPINLOCK(ptype_lock);
148 static DEFINE_SPINLOCK(offload_lock);
149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
150 struct list_head ptype_all __read_mostly;	/* Taps */
151 static struct list_head offload_base __read_mostly;
152 
153 static int netif_rx_internal(struct sk_buff *skb);
154 static int call_netdevice_notifiers_info(unsigned long val,
155 					 struct net_device *dev,
156 					 struct netdev_notifier_info *info);
157 
158 /*
159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
160  * semaphore.
161  *
162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
163  *
164  * Writers must hold the rtnl semaphore while they loop through the
165  * dev_base_head list, and hold dev_base_lock for writing when they do the
166  * actual updates.  This allows pure readers to access the list even
167  * while a writer is preparing to update it.
168  *
169  * To put it another way, dev_base_lock is held for writing only to
170  * protect against pure readers; the rtnl semaphore provides the
171  * protection against other writers.
172  *
173  * See, for example usages, register_netdevice() and
174  * unregister_netdevice(), which must be called with the rtnl
175  * semaphore held.
176  */
177 DEFINE_RWLOCK(dev_base_lock);
178 EXPORT_SYMBOL(dev_base_lock);
179 
180 /* protects napi_hash addition/deletion and napi_gen_id */
181 static DEFINE_SPINLOCK(napi_hash_lock);
182 
183 static unsigned int napi_gen_id;
184 static DEFINE_HASHTABLE(napi_hash, 8);
185 
186 static seqcount_t devnet_rename_seq;
187 
188 static inline void dev_base_seq_inc(struct net *net)
189 {
190 	while (++net->dev_base_seq == 0);
191 }
192 
193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
194 {
195 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
196 
197 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
198 }
199 
200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
201 {
202 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
203 }
204 
205 static inline void rps_lock(struct softnet_data *sd)
206 {
207 #ifdef CONFIG_RPS
208 	spin_lock(&sd->input_pkt_queue.lock);
209 #endif
210 }
211 
212 static inline void rps_unlock(struct softnet_data *sd)
213 {
214 #ifdef CONFIG_RPS
215 	spin_unlock(&sd->input_pkt_queue.lock);
216 #endif
217 }
218 
219 /* Device list insertion */
220 static void list_netdevice(struct net_device *dev)
221 {
222 	struct net *net = dev_net(dev);
223 
224 	ASSERT_RTNL();
225 
226 	write_lock_bh(&dev_base_lock);
227 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
228 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
229 	hlist_add_head_rcu(&dev->index_hlist,
230 			   dev_index_hash(net, dev->ifindex));
231 	write_unlock_bh(&dev_base_lock);
232 
233 	dev_base_seq_inc(net);
234 }
235 
236 /* Device list removal
237  * caller must respect a RCU grace period before freeing/reusing dev
238  */
239 static void unlist_netdevice(struct net_device *dev)
240 {
241 	ASSERT_RTNL();
242 
243 	/* Unlink dev from the device chain */
244 	write_lock_bh(&dev_base_lock);
245 	list_del_rcu(&dev->dev_list);
246 	hlist_del_rcu(&dev->name_hlist);
247 	hlist_del_rcu(&dev->index_hlist);
248 	write_unlock_bh(&dev_base_lock);
249 
250 	dev_base_seq_inc(dev_net(dev));
251 }
252 
253 /*
254  *	Our notifier list
255  */
256 
257 static RAW_NOTIFIER_HEAD(netdev_chain);
258 
259 /*
260  *	Device drivers call our routines to queue packets here. We empty the
261  *	queue in the local softnet handler.
262  */
263 
264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
265 EXPORT_PER_CPU_SYMBOL(softnet_data);
266 
267 #ifdef CONFIG_LOCKDEP
268 /*
269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
270  * according to dev->type
271  */
272 static const unsigned short netdev_lock_type[] =
273 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
274 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
275 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
276 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
277 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
278 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
279 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
280 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
281 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
282 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
283 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
284 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
285 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
286 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
287 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
288 
289 static const char *const netdev_lock_name[] =
290 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
291 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
292 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
293 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
294 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
295 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
296 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
297 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
298 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
299 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
300 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
301 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
302 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
303 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
304 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
305 
306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
308 
309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
310 {
311 	int i;
312 
313 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
314 		if (netdev_lock_type[i] == dev_type)
315 			return i;
316 	/* the last key is used by default */
317 	return ARRAY_SIZE(netdev_lock_type) - 1;
318 }
319 
320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
321 						 unsigned short dev_type)
322 {
323 	int i;
324 
325 	i = netdev_lock_pos(dev_type);
326 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
327 				   netdev_lock_name[i]);
328 }
329 
330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
331 {
332 	int i;
333 
334 	i = netdev_lock_pos(dev->type);
335 	lockdep_set_class_and_name(&dev->addr_list_lock,
336 				   &netdev_addr_lock_key[i],
337 				   netdev_lock_name[i]);
338 }
339 #else
340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
341 						 unsigned short dev_type)
342 {
343 }
344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
345 {
346 }
347 #endif
348 
349 /*******************************************************************************
350 
351 		Protocol management and registration routines
352 
353 *******************************************************************************/
354 
355 /*
356  *	Add a protocol ID to the list. Now that the input handler is
357  *	smarter we can dispense with all the messy stuff that used to be
358  *	here.
359  *
360  *	BEWARE!!! Protocol handlers, mangling input packets,
361  *	MUST BE last in hash buckets and checking protocol handlers
362  *	MUST start from promiscuous ptype_all chain in net_bh.
363  *	It is true now, do not change it.
364  *	Explanation follows: if protocol handler, mangling packet, will
365  *	be the first on list, it is not able to sense, that packet
366  *	is cloned and should be copied-on-write, so that it will
367  *	change it and subsequent readers will get broken packet.
368  *							--ANK (980803)
369  */
370 
371 static inline struct list_head *ptype_head(const struct packet_type *pt)
372 {
373 	if (pt->type == htons(ETH_P_ALL))
374 		return &ptype_all;
375 	else
376 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
377 }
378 
379 /**
380  *	dev_add_pack - add packet handler
381  *	@pt: packet type declaration
382  *
383  *	Add a protocol handler to the networking stack. The passed &packet_type
384  *	is linked into kernel lists and may not be freed until it has been
385  *	removed from the kernel lists.
386  *
387  *	This call does not sleep therefore it can not
388  *	guarantee all CPU's that are in middle of receiving packets
389  *	will see the new packet type (until the next received packet).
390  */
391 
392 void dev_add_pack(struct packet_type *pt)
393 {
394 	struct list_head *head = ptype_head(pt);
395 
396 	spin_lock(&ptype_lock);
397 	list_add_rcu(&pt->list, head);
398 	spin_unlock(&ptype_lock);
399 }
400 EXPORT_SYMBOL(dev_add_pack);
401 
402 /**
403  *	__dev_remove_pack	 - remove packet handler
404  *	@pt: packet type declaration
405  *
406  *	Remove a protocol handler that was previously added to the kernel
407  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
408  *	from the kernel lists and can be freed or reused once this function
409  *	returns.
410  *
411  *      The packet type might still be in use by receivers
412  *	and must not be freed until after all the CPU's have gone
413  *	through a quiescent state.
414  */
415 void __dev_remove_pack(struct packet_type *pt)
416 {
417 	struct list_head *head = ptype_head(pt);
418 	struct packet_type *pt1;
419 
420 	spin_lock(&ptype_lock);
421 
422 	list_for_each_entry(pt1, head, list) {
423 		if (pt == pt1) {
424 			list_del_rcu(&pt->list);
425 			goto out;
426 		}
427 	}
428 
429 	pr_warn("dev_remove_pack: %p not found\n", pt);
430 out:
431 	spin_unlock(&ptype_lock);
432 }
433 EXPORT_SYMBOL(__dev_remove_pack);
434 
435 /**
436  *	dev_remove_pack	 - remove packet handler
437  *	@pt: packet type declaration
438  *
439  *	Remove a protocol handler that was previously added to the kernel
440  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
441  *	from the kernel lists and can be freed or reused once this function
442  *	returns.
443  *
444  *	This call sleeps to guarantee that no CPU is looking at the packet
445  *	type after return.
446  */
447 void dev_remove_pack(struct packet_type *pt)
448 {
449 	__dev_remove_pack(pt);
450 
451 	synchronize_net();
452 }
453 EXPORT_SYMBOL(dev_remove_pack);
454 
455 
456 /**
457  *	dev_add_offload - register offload handlers
458  *	@po: protocol offload declaration
459  *
460  *	Add protocol offload handlers to the networking stack. The passed
461  *	&proto_offload is linked into kernel lists and may not be freed until
462  *	it has been removed from the kernel lists.
463  *
464  *	This call does not sleep therefore it can not
465  *	guarantee all CPU's that are in middle of receiving packets
466  *	will see the new offload handlers (until the next received packet).
467  */
468 void dev_add_offload(struct packet_offload *po)
469 {
470 	struct list_head *head = &offload_base;
471 
472 	spin_lock(&offload_lock);
473 	list_add_rcu(&po->list, head);
474 	spin_unlock(&offload_lock);
475 }
476 EXPORT_SYMBOL(dev_add_offload);
477 
478 /**
479  *	__dev_remove_offload	 - remove offload handler
480  *	@po: packet offload declaration
481  *
482  *	Remove a protocol offload handler that was previously added to the
483  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
484  *	is removed from the kernel lists and can be freed or reused once this
485  *	function returns.
486  *
487  *      The packet type might still be in use by receivers
488  *	and must not be freed until after all the CPU's have gone
489  *	through a quiescent state.
490  */
491 static void __dev_remove_offload(struct packet_offload *po)
492 {
493 	struct list_head *head = &offload_base;
494 	struct packet_offload *po1;
495 
496 	spin_lock(&offload_lock);
497 
498 	list_for_each_entry(po1, head, list) {
499 		if (po == po1) {
500 			list_del_rcu(&po->list);
501 			goto out;
502 		}
503 	}
504 
505 	pr_warn("dev_remove_offload: %p not found\n", po);
506 out:
507 	spin_unlock(&offload_lock);
508 }
509 
510 /**
511  *	dev_remove_offload	 - remove packet offload handler
512  *	@po: packet offload declaration
513  *
514  *	Remove a packet offload handler that was previously added to the kernel
515  *	offload handlers by dev_add_offload(). The passed &offload_type is
516  *	removed from the kernel lists and can be freed or reused once this
517  *	function returns.
518  *
519  *	This call sleeps to guarantee that no CPU is looking at the packet
520  *	type after return.
521  */
522 void dev_remove_offload(struct packet_offload *po)
523 {
524 	__dev_remove_offload(po);
525 
526 	synchronize_net();
527 }
528 EXPORT_SYMBOL(dev_remove_offload);
529 
530 /******************************************************************************
531 
532 		      Device Boot-time Settings Routines
533 
534 *******************************************************************************/
535 
536 /* Boot time configuration table */
537 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
538 
539 /**
540  *	netdev_boot_setup_add	- add new setup entry
541  *	@name: name of the device
542  *	@map: configured settings for the device
543  *
544  *	Adds new setup entry to the dev_boot_setup list.  The function
545  *	returns 0 on error and 1 on success.  This is a generic routine to
546  *	all netdevices.
547  */
548 static int netdev_boot_setup_add(char *name, struct ifmap *map)
549 {
550 	struct netdev_boot_setup *s;
551 	int i;
552 
553 	s = dev_boot_setup;
554 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
555 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
556 			memset(s[i].name, 0, sizeof(s[i].name));
557 			strlcpy(s[i].name, name, IFNAMSIZ);
558 			memcpy(&s[i].map, map, sizeof(s[i].map));
559 			break;
560 		}
561 	}
562 
563 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
564 }
565 
566 /**
567  *	netdev_boot_setup_check	- check boot time settings
568  *	@dev: the netdevice
569  *
570  * 	Check boot time settings for the device.
571  *	The found settings are set for the device to be used
572  *	later in the device probing.
573  *	Returns 0 if no settings found, 1 if they are.
574  */
575 int netdev_boot_setup_check(struct net_device *dev)
576 {
577 	struct netdev_boot_setup *s = dev_boot_setup;
578 	int i;
579 
580 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
581 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
582 		    !strcmp(dev->name, s[i].name)) {
583 			dev->irq 	= s[i].map.irq;
584 			dev->base_addr 	= s[i].map.base_addr;
585 			dev->mem_start 	= s[i].map.mem_start;
586 			dev->mem_end 	= s[i].map.mem_end;
587 			return 1;
588 		}
589 	}
590 	return 0;
591 }
592 EXPORT_SYMBOL(netdev_boot_setup_check);
593 
594 
595 /**
596  *	netdev_boot_base	- get address from boot time settings
597  *	@prefix: prefix for network device
598  *	@unit: id for network device
599  *
600  * 	Check boot time settings for the base address of device.
601  *	The found settings are set for the device to be used
602  *	later in the device probing.
603  *	Returns 0 if no settings found.
604  */
605 unsigned long netdev_boot_base(const char *prefix, int unit)
606 {
607 	const struct netdev_boot_setup *s = dev_boot_setup;
608 	char name[IFNAMSIZ];
609 	int i;
610 
611 	sprintf(name, "%s%d", prefix, unit);
612 
613 	/*
614 	 * If device already registered then return base of 1
615 	 * to indicate not to probe for this interface
616 	 */
617 	if (__dev_get_by_name(&init_net, name))
618 		return 1;
619 
620 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
621 		if (!strcmp(name, s[i].name))
622 			return s[i].map.base_addr;
623 	return 0;
624 }
625 
626 /*
627  * Saves at boot time configured settings for any netdevice.
628  */
629 int __init netdev_boot_setup(char *str)
630 {
631 	int ints[5];
632 	struct ifmap map;
633 
634 	str = get_options(str, ARRAY_SIZE(ints), ints);
635 	if (!str || !*str)
636 		return 0;
637 
638 	/* Save settings */
639 	memset(&map, 0, sizeof(map));
640 	if (ints[0] > 0)
641 		map.irq = ints[1];
642 	if (ints[0] > 1)
643 		map.base_addr = ints[2];
644 	if (ints[0] > 2)
645 		map.mem_start = ints[3];
646 	if (ints[0] > 3)
647 		map.mem_end = ints[4];
648 
649 	/* Add new entry to the list */
650 	return netdev_boot_setup_add(str, &map);
651 }
652 
653 __setup("netdev=", netdev_boot_setup);
654 
655 /*******************************************************************************
656 
657 			    Device Interface Subroutines
658 
659 *******************************************************************************/
660 
661 /**
662  *	__dev_get_by_name	- find a device by its name
663  *	@net: the applicable net namespace
664  *	@name: name to find
665  *
666  *	Find an interface by name. Must be called under RTNL semaphore
667  *	or @dev_base_lock. If the name is found a pointer to the device
668  *	is returned. If the name is not found then %NULL is returned. The
669  *	reference counters are not incremented so the caller must be
670  *	careful with locks.
671  */
672 
673 struct net_device *__dev_get_by_name(struct net *net, const char *name)
674 {
675 	struct net_device *dev;
676 	struct hlist_head *head = dev_name_hash(net, name);
677 
678 	hlist_for_each_entry(dev, head, name_hlist)
679 		if (!strncmp(dev->name, name, IFNAMSIZ))
680 			return dev;
681 
682 	return NULL;
683 }
684 EXPORT_SYMBOL(__dev_get_by_name);
685 
686 /**
687  *	dev_get_by_name_rcu	- find a device by its name
688  *	@net: the applicable net namespace
689  *	@name: name to find
690  *
691  *	Find an interface by name.
692  *	If the name is found a pointer to the device is returned.
693  * 	If the name is not found then %NULL is returned.
694  *	The reference counters are not incremented so the caller must be
695  *	careful with locks. The caller must hold RCU lock.
696  */
697 
698 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
699 {
700 	struct net_device *dev;
701 	struct hlist_head *head = dev_name_hash(net, name);
702 
703 	hlist_for_each_entry_rcu(dev, head, name_hlist)
704 		if (!strncmp(dev->name, name, IFNAMSIZ))
705 			return dev;
706 
707 	return NULL;
708 }
709 EXPORT_SYMBOL(dev_get_by_name_rcu);
710 
711 /**
712  *	dev_get_by_name		- find a device by its name
713  *	@net: the applicable net namespace
714  *	@name: name to find
715  *
716  *	Find an interface by name. This can be called from any
717  *	context and does its own locking. The returned handle has
718  *	the usage count incremented and the caller must use dev_put() to
719  *	release it when it is no longer needed. %NULL is returned if no
720  *	matching device is found.
721  */
722 
723 struct net_device *dev_get_by_name(struct net *net, const char *name)
724 {
725 	struct net_device *dev;
726 
727 	rcu_read_lock();
728 	dev = dev_get_by_name_rcu(net, name);
729 	if (dev)
730 		dev_hold(dev);
731 	rcu_read_unlock();
732 	return dev;
733 }
734 EXPORT_SYMBOL(dev_get_by_name);
735 
736 /**
737  *	__dev_get_by_index - find a device by its ifindex
738  *	@net: the applicable net namespace
739  *	@ifindex: index of device
740  *
741  *	Search for an interface by index. Returns %NULL if the device
742  *	is not found or a pointer to the device. The device has not
743  *	had its reference counter increased so the caller must be careful
744  *	about locking. The caller must hold either the RTNL semaphore
745  *	or @dev_base_lock.
746  */
747 
748 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
749 {
750 	struct net_device *dev;
751 	struct hlist_head *head = dev_index_hash(net, ifindex);
752 
753 	hlist_for_each_entry(dev, head, index_hlist)
754 		if (dev->ifindex == ifindex)
755 			return dev;
756 
757 	return NULL;
758 }
759 EXPORT_SYMBOL(__dev_get_by_index);
760 
761 /**
762  *	dev_get_by_index_rcu - find a device by its ifindex
763  *	@net: the applicable net namespace
764  *	@ifindex: index of device
765  *
766  *	Search for an interface by index. Returns %NULL if the device
767  *	is not found or a pointer to the device. The device has not
768  *	had its reference counter increased so the caller must be careful
769  *	about locking. The caller must hold RCU lock.
770  */
771 
772 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
773 {
774 	struct net_device *dev;
775 	struct hlist_head *head = dev_index_hash(net, ifindex);
776 
777 	hlist_for_each_entry_rcu(dev, head, index_hlist)
778 		if (dev->ifindex == ifindex)
779 			return dev;
780 
781 	return NULL;
782 }
783 EXPORT_SYMBOL(dev_get_by_index_rcu);
784 
785 
786 /**
787  *	dev_get_by_index - find a device by its ifindex
788  *	@net: the applicable net namespace
789  *	@ifindex: index of device
790  *
791  *	Search for an interface by index. Returns NULL if the device
792  *	is not found or a pointer to the device. The device returned has
793  *	had a reference added and the pointer is safe until the user calls
794  *	dev_put to indicate they have finished with it.
795  */
796 
797 struct net_device *dev_get_by_index(struct net *net, int ifindex)
798 {
799 	struct net_device *dev;
800 
801 	rcu_read_lock();
802 	dev = dev_get_by_index_rcu(net, ifindex);
803 	if (dev)
804 		dev_hold(dev);
805 	rcu_read_unlock();
806 	return dev;
807 }
808 EXPORT_SYMBOL(dev_get_by_index);
809 
810 /**
811  *	netdev_get_name - get a netdevice name, knowing its ifindex.
812  *	@net: network namespace
813  *	@name: a pointer to the buffer where the name will be stored.
814  *	@ifindex: the ifindex of the interface to get the name from.
815  *
816  *	The use of raw_seqcount_begin() and cond_resched() before
817  *	retrying is required as we want to give the writers a chance
818  *	to complete when CONFIG_PREEMPT is not set.
819  */
820 int netdev_get_name(struct net *net, char *name, int ifindex)
821 {
822 	struct net_device *dev;
823 	unsigned int seq;
824 
825 retry:
826 	seq = raw_seqcount_begin(&devnet_rename_seq);
827 	rcu_read_lock();
828 	dev = dev_get_by_index_rcu(net, ifindex);
829 	if (!dev) {
830 		rcu_read_unlock();
831 		return -ENODEV;
832 	}
833 
834 	strcpy(name, dev->name);
835 	rcu_read_unlock();
836 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
837 		cond_resched();
838 		goto retry;
839 	}
840 
841 	return 0;
842 }
843 
844 /**
845  *	dev_getbyhwaddr_rcu - find a device by its hardware address
846  *	@net: the applicable net namespace
847  *	@type: media type of device
848  *	@ha: hardware address
849  *
850  *	Search for an interface by MAC address. Returns NULL if the device
851  *	is not found or a pointer to the device.
852  *	The caller must hold RCU or RTNL.
853  *	The returned device has not had its ref count increased
854  *	and the caller must therefore be careful about locking
855  *
856  */
857 
858 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
859 				       const char *ha)
860 {
861 	struct net_device *dev;
862 
863 	for_each_netdev_rcu(net, dev)
864 		if (dev->type == type &&
865 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
866 			return dev;
867 
868 	return NULL;
869 }
870 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
871 
872 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
873 {
874 	struct net_device *dev;
875 
876 	ASSERT_RTNL();
877 	for_each_netdev(net, dev)
878 		if (dev->type == type)
879 			return dev;
880 
881 	return NULL;
882 }
883 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
884 
885 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
886 {
887 	struct net_device *dev, *ret = NULL;
888 
889 	rcu_read_lock();
890 	for_each_netdev_rcu(net, dev)
891 		if (dev->type == type) {
892 			dev_hold(dev);
893 			ret = dev;
894 			break;
895 		}
896 	rcu_read_unlock();
897 	return ret;
898 }
899 EXPORT_SYMBOL(dev_getfirstbyhwtype);
900 
901 /**
902  *	__dev_get_by_flags - find any device with given flags
903  *	@net: the applicable net namespace
904  *	@if_flags: IFF_* values
905  *	@mask: bitmask of bits in if_flags to check
906  *
907  *	Search for any interface with the given flags. Returns NULL if a device
908  *	is not found or a pointer to the device. Must be called inside
909  *	rtnl_lock(), and result refcount is unchanged.
910  */
911 
912 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
913 				      unsigned short mask)
914 {
915 	struct net_device *dev, *ret;
916 
917 	ASSERT_RTNL();
918 
919 	ret = NULL;
920 	for_each_netdev(net, dev) {
921 		if (((dev->flags ^ if_flags) & mask) == 0) {
922 			ret = dev;
923 			break;
924 		}
925 	}
926 	return ret;
927 }
928 EXPORT_SYMBOL(__dev_get_by_flags);
929 
930 /**
931  *	dev_valid_name - check if name is okay for network device
932  *	@name: name string
933  *
934  *	Network device names need to be valid file names to
935  *	to allow sysfs to work.  We also disallow any kind of
936  *	whitespace.
937  */
938 bool dev_valid_name(const char *name)
939 {
940 	if (*name == '\0')
941 		return false;
942 	if (strlen(name) >= IFNAMSIZ)
943 		return false;
944 	if (!strcmp(name, ".") || !strcmp(name, ".."))
945 		return false;
946 
947 	while (*name) {
948 		if (*name == '/' || isspace(*name))
949 			return false;
950 		name++;
951 	}
952 	return true;
953 }
954 EXPORT_SYMBOL(dev_valid_name);
955 
956 /**
957  *	__dev_alloc_name - allocate a name for a device
958  *	@net: network namespace to allocate the device name in
959  *	@name: name format string
960  *	@buf:  scratch buffer and result name string
961  *
962  *	Passed a format string - eg "lt%d" it will try and find a suitable
963  *	id. It scans list of devices to build up a free map, then chooses
964  *	the first empty slot. The caller must hold the dev_base or rtnl lock
965  *	while allocating the name and adding the device in order to avoid
966  *	duplicates.
967  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
968  *	Returns the number of the unit assigned or a negative errno code.
969  */
970 
971 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
972 {
973 	int i = 0;
974 	const char *p;
975 	const int max_netdevices = 8*PAGE_SIZE;
976 	unsigned long *inuse;
977 	struct net_device *d;
978 
979 	p = strnchr(name, IFNAMSIZ-1, '%');
980 	if (p) {
981 		/*
982 		 * Verify the string as this thing may have come from
983 		 * the user.  There must be either one "%d" and no other "%"
984 		 * characters.
985 		 */
986 		if (p[1] != 'd' || strchr(p + 2, '%'))
987 			return -EINVAL;
988 
989 		/* Use one page as a bit array of possible slots */
990 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
991 		if (!inuse)
992 			return -ENOMEM;
993 
994 		for_each_netdev(net, d) {
995 			if (!sscanf(d->name, name, &i))
996 				continue;
997 			if (i < 0 || i >= max_netdevices)
998 				continue;
999 
1000 			/*  avoid cases where sscanf is not exact inverse of printf */
1001 			snprintf(buf, IFNAMSIZ, name, i);
1002 			if (!strncmp(buf, d->name, IFNAMSIZ))
1003 				set_bit(i, inuse);
1004 		}
1005 
1006 		i = find_first_zero_bit(inuse, max_netdevices);
1007 		free_page((unsigned long) inuse);
1008 	}
1009 
1010 	if (buf != name)
1011 		snprintf(buf, IFNAMSIZ, name, i);
1012 	if (!__dev_get_by_name(net, buf))
1013 		return i;
1014 
1015 	/* It is possible to run out of possible slots
1016 	 * when the name is long and there isn't enough space left
1017 	 * for the digits, or if all bits are used.
1018 	 */
1019 	return -ENFILE;
1020 }
1021 
1022 /**
1023  *	dev_alloc_name - allocate a name for a device
1024  *	@dev: device
1025  *	@name: name format string
1026  *
1027  *	Passed a format string - eg "lt%d" it will try and find a suitable
1028  *	id. It scans list of devices to build up a free map, then chooses
1029  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1030  *	while allocating the name and adding the device in order to avoid
1031  *	duplicates.
1032  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1033  *	Returns the number of the unit assigned or a negative errno code.
1034  */
1035 
1036 int dev_alloc_name(struct net_device *dev, const char *name)
1037 {
1038 	char buf[IFNAMSIZ];
1039 	struct net *net;
1040 	int ret;
1041 
1042 	BUG_ON(!dev_net(dev));
1043 	net = dev_net(dev);
1044 	ret = __dev_alloc_name(net, name, buf);
1045 	if (ret >= 0)
1046 		strlcpy(dev->name, buf, IFNAMSIZ);
1047 	return ret;
1048 }
1049 EXPORT_SYMBOL(dev_alloc_name);
1050 
1051 static int dev_alloc_name_ns(struct net *net,
1052 			     struct net_device *dev,
1053 			     const char *name)
1054 {
1055 	char buf[IFNAMSIZ];
1056 	int ret;
1057 
1058 	ret = __dev_alloc_name(net, name, buf);
1059 	if (ret >= 0)
1060 		strlcpy(dev->name, buf, IFNAMSIZ);
1061 	return ret;
1062 }
1063 
1064 static int dev_get_valid_name(struct net *net,
1065 			      struct net_device *dev,
1066 			      const char *name)
1067 {
1068 	BUG_ON(!net);
1069 
1070 	if (!dev_valid_name(name))
1071 		return -EINVAL;
1072 
1073 	if (strchr(name, '%'))
1074 		return dev_alloc_name_ns(net, dev, name);
1075 	else if (__dev_get_by_name(net, name))
1076 		return -EEXIST;
1077 	else if (dev->name != name)
1078 		strlcpy(dev->name, name, IFNAMSIZ);
1079 
1080 	return 0;
1081 }
1082 
1083 /**
1084  *	dev_change_name - change name of a device
1085  *	@dev: device
1086  *	@newname: name (or format string) must be at least IFNAMSIZ
1087  *
1088  *	Change name of a device, can pass format strings "eth%d".
1089  *	for wildcarding.
1090  */
1091 int dev_change_name(struct net_device *dev, const char *newname)
1092 {
1093 	unsigned char old_assign_type;
1094 	char oldname[IFNAMSIZ];
1095 	int err = 0;
1096 	int ret;
1097 	struct net *net;
1098 
1099 	ASSERT_RTNL();
1100 	BUG_ON(!dev_net(dev));
1101 
1102 	net = dev_net(dev);
1103 	if (dev->flags & IFF_UP)
1104 		return -EBUSY;
1105 
1106 	write_seqcount_begin(&devnet_rename_seq);
1107 
1108 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1109 		write_seqcount_end(&devnet_rename_seq);
1110 		return 0;
1111 	}
1112 
1113 	memcpy(oldname, dev->name, IFNAMSIZ);
1114 
1115 	err = dev_get_valid_name(net, dev, newname);
1116 	if (err < 0) {
1117 		write_seqcount_end(&devnet_rename_seq);
1118 		return err;
1119 	}
1120 
1121 	if (oldname[0] && !strchr(oldname, '%'))
1122 		netdev_info(dev, "renamed from %s\n", oldname);
1123 
1124 	old_assign_type = dev->name_assign_type;
1125 	dev->name_assign_type = NET_NAME_RENAMED;
1126 
1127 rollback:
1128 	ret = device_rename(&dev->dev, dev->name);
1129 	if (ret) {
1130 		memcpy(dev->name, oldname, IFNAMSIZ);
1131 		dev->name_assign_type = old_assign_type;
1132 		write_seqcount_end(&devnet_rename_seq);
1133 		return ret;
1134 	}
1135 
1136 	write_seqcount_end(&devnet_rename_seq);
1137 
1138 	netdev_adjacent_rename_links(dev, oldname);
1139 
1140 	write_lock_bh(&dev_base_lock);
1141 	hlist_del_rcu(&dev->name_hlist);
1142 	write_unlock_bh(&dev_base_lock);
1143 
1144 	synchronize_rcu();
1145 
1146 	write_lock_bh(&dev_base_lock);
1147 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1148 	write_unlock_bh(&dev_base_lock);
1149 
1150 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1151 	ret = notifier_to_errno(ret);
1152 
1153 	if (ret) {
1154 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1155 		if (err >= 0) {
1156 			err = ret;
1157 			write_seqcount_begin(&devnet_rename_seq);
1158 			memcpy(dev->name, oldname, IFNAMSIZ);
1159 			memcpy(oldname, newname, IFNAMSIZ);
1160 			dev->name_assign_type = old_assign_type;
1161 			old_assign_type = NET_NAME_RENAMED;
1162 			goto rollback;
1163 		} else {
1164 			pr_err("%s: name change rollback failed: %d\n",
1165 			       dev->name, ret);
1166 		}
1167 	}
1168 
1169 	return err;
1170 }
1171 
1172 /**
1173  *	dev_set_alias - change ifalias of a device
1174  *	@dev: device
1175  *	@alias: name up to IFALIASZ
1176  *	@len: limit of bytes to copy from info
1177  *
1178  *	Set ifalias for a device,
1179  */
1180 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1181 {
1182 	char *new_ifalias;
1183 
1184 	ASSERT_RTNL();
1185 
1186 	if (len >= IFALIASZ)
1187 		return -EINVAL;
1188 
1189 	if (!len) {
1190 		kfree(dev->ifalias);
1191 		dev->ifalias = NULL;
1192 		return 0;
1193 	}
1194 
1195 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1196 	if (!new_ifalias)
1197 		return -ENOMEM;
1198 	dev->ifalias = new_ifalias;
1199 
1200 	strlcpy(dev->ifalias, alias, len+1);
1201 	return len;
1202 }
1203 
1204 
1205 /**
1206  *	netdev_features_change - device changes features
1207  *	@dev: device to cause notification
1208  *
1209  *	Called to indicate a device has changed features.
1210  */
1211 void netdev_features_change(struct net_device *dev)
1212 {
1213 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1214 }
1215 EXPORT_SYMBOL(netdev_features_change);
1216 
1217 /**
1218  *	netdev_state_change - device changes state
1219  *	@dev: device to cause notification
1220  *
1221  *	Called to indicate a device has changed state. This function calls
1222  *	the notifier chains for netdev_chain and sends a NEWLINK message
1223  *	to the routing socket.
1224  */
1225 void netdev_state_change(struct net_device *dev)
1226 {
1227 	if (dev->flags & IFF_UP) {
1228 		struct netdev_notifier_change_info change_info;
1229 
1230 		change_info.flags_changed = 0;
1231 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1232 					      &change_info.info);
1233 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1234 	}
1235 }
1236 EXPORT_SYMBOL(netdev_state_change);
1237 
1238 /**
1239  * 	netdev_notify_peers - notify network peers about existence of @dev
1240  * 	@dev: network device
1241  *
1242  * Generate traffic such that interested network peers are aware of
1243  * @dev, such as by generating a gratuitous ARP. This may be used when
1244  * a device wants to inform the rest of the network about some sort of
1245  * reconfiguration such as a failover event or virtual machine
1246  * migration.
1247  */
1248 void netdev_notify_peers(struct net_device *dev)
1249 {
1250 	rtnl_lock();
1251 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1252 	rtnl_unlock();
1253 }
1254 EXPORT_SYMBOL(netdev_notify_peers);
1255 
1256 static int __dev_open(struct net_device *dev)
1257 {
1258 	const struct net_device_ops *ops = dev->netdev_ops;
1259 	int ret;
1260 
1261 	ASSERT_RTNL();
1262 
1263 	if (!netif_device_present(dev))
1264 		return -ENODEV;
1265 
1266 	/* Block netpoll from trying to do any rx path servicing.
1267 	 * If we don't do this there is a chance ndo_poll_controller
1268 	 * or ndo_poll may be running while we open the device
1269 	 */
1270 	netpoll_poll_disable(dev);
1271 
1272 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1273 	ret = notifier_to_errno(ret);
1274 	if (ret)
1275 		return ret;
1276 
1277 	set_bit(__LINK_STATE_START, &dev->state);
1278 
1279 	if (ops->ndo_validate_addr)
1280 		ret = ops->ndo_validate_addr(dev);
1281 
1282 	if (!ret && ops->ndo_open)
1283 		ret = ops->ndo_open(dev);
1284 
1285 	netpoll_poll_enable(dev);
1286 
1287 	if (ret)
1288 		clear_bit(__LINK_STATE_START, &dev->state);
1289 	else {
1290 		dev->flags |= IFF_UP;
1291 		dev_set_rx_mode(dev);
1292 		dev_activate(dev);
1293 		add_device_randomness(dev->dev_addr, dev->addr_len);
1294 	}
1295 
1296 	return ret;
1297 }
1298 
1299 /**
1300  *	dev_open	- prepare an interface for use.
1301  *	@dev:	device to open
1302  *
1303  *	Takes a device from down to up state. The device's private open
1304  *	function is invoked and then the multicast lists are loaded. Finally
1305  *	the device is moved into the up state and a %NETDEV_UP message is
1306  *	sent to the netdev notifier chain.
1307  *
1308  *	Calling this function on an active interface is a nop. On a failure
1309  *	a negative errno code is returned.
1310  */
1311 int dev_open(struct net_device *dev)
1312 {
1313 	int ret;
1314 
1315 	if (dev->flags & IFF_UP)
1316 		return 0;
1317 
1318 	ret = __dev_open(dev);
1319 	if (ret < 0)
1320 		return ret;
1321 
1322 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1323 	call_netdevice_notifiers(NETDEV_UP, dev);
1324 
1325 	return ret;
1326 }
1327 EXPORT_SYMBOL(dev_open);
1328 
1329 static int __dev_close_many(struct list_head *head)
1330 {
1331 	struct net_device *dev;
1332 
1333 	ASSERT_RTNL();
1334 	might_sleep();
1335 
1336 	list_for_each_entry(dev, head, close_list) {
1337 		/* Temporarily disable netpoll until the interface is down */
1338 		netpoll_poll_disable(dev);
1339 
1340 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1341 
1342 		clear_bit(__LINK_STATE_START, &dev->state);
1343 
1344 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1345 		 * can be even on different cpu. So just clear netif_running().
1346 		 *
1347 		 * dev->stop() will invoke napi_disable() on all of it's
1348 		 * napi_struct instances on this device.
1349 		 */
1350 		smp_mb__after_atomic(); /* Commit netif_running(). */
1351 	}
1352 
1353 	dev_deactivate_many(head);
1354 
1355 	list_for_each_entry(dev, head, close_list) {
1356 		const struct net_device_ops *ops = dev->netdev_ops;
1357 
1358 		/*
1359 		 *	Call the device specific close. This cannot fail.
1360 		 *	Only if device is UP
1361 		 *
1362 		 *	We allow it to be called even after a DETACH hot-plug
1363 		 *	event.
1364 		 */
1365 		if (ops->ndo_stop)
1366 			ops->ndo_stop(dev);
1367 
1368 		dev->flags &= ~IFF_UP;
1369 		netpoll_poll_enable(dev);
1370 	}
1371 
1372 	return 0;
1373 }
1374 
1375 static int __dev_close(struct net_device *dev)
1376 {
1377 	int retval;
1378 	LIST_HEAD(single);
1379 
1380 	list_add(&dev->close_list, &single);
1381 	retval = __dev_close_many(&single);
1382 	list_del(&single);
1383 
1384 	return retval;
1385 }
1386 
1387 static int dev_close_many(struct list_head *head)
1388 {
1389 	struct net_device *dev, *tmp;
1390 
1391 	/* Remove the devices that don't need to be closed */
1392 	list_for_each_entry_safe(dev, tmp, head, close_list)
1393 		if (!(dev->flags & IFF_UP))
1394 			list_del_init(&dev->close_list);
1395 
1396 	__dev_close_many(head);
1397 
1398 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1399 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1400 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1401 		list_del_init(&dev->close_list);
1402 	}
1403 
1404 	return 0;
1405 }
1406 
1407 /**
1408  *	dev_close - shutdown an interface.
1409  *	@dev: device to shutdown
1410  *
1411  *	This function moves an active device into down state. A
1412  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1413  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1414  *	chain.
1415  */
1416 int dev_close(struct net_device *dev)
1417 {
1418 	if (dev->flags & IFF_UP) {
1419 		LIST_HEAD(single);
1420 
1421 		list_add(&dev->close_list, &single);
1422 		dev_close_many(&single);
1423 		list_del(&single);
1424 	}
1425 	return 0;
1426 }
1427 EXPORT_SYMBOL(dev_close);
1428 
1429 
1430 /**
1431  *	dev_disable_lro - disable Large Receive Offload on a device
1432  *	@dev: device
1433  *
1434  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1435  *	called under RTNL.  This is needed if received packets may be
1436  *	forwarded to another interface.
1437  */
1438 void dev_disable_lro(struct net_device *dev)
1439 {
1440 	struct net_device *lower_dev;
1441 	struct list_head *iter;
1442 
1443 	dev->wanted_features &= ~NETIF_F_LRO;
1444 	netdev_update_features(dev);
1445 
1446 	if (unlikely(dev->features & NETIF_F_LRO))
1447 		netdev_WARN(dev, "failed to disable LRO!\n");
1448 
1449 	netdev_for_each_lower_dev(dev, lower_dev, iter)
1450 		dev_disable_lro(lower_dev);
1451 }
1452 EXPORT_SYMBOL(dev_disable_lro);
1453 
1454 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1455 				   struct net_device *dev)
1456 {
1457 	struct netdev_notifier_info info;
1458 
1459 	netdev_notifier_info_init(&info, dev);
1460 	return nb->notifier_call(nb, val, &info);
1461 }
1462 
1463 static int dev_boot_phase = 1;
1464 
1465 /**
1466  *	register_netdevice_notifier - register a network notifier block
1467  *	@nb: notifier
1468  *
1469  *	Register a notifier to be called when network device events occur.
1470  *	The notifier passed is linked into the kernel structures and must
1471  *	not be reused until it has been unregistered. A negative errno code
1472  *	is returned on a failure.
1473  *
1474  * 	When registered all registration and up events are replayed
1475  *	to the new notifier to allow device to have a race free
1476  *	view of the network device list.
1477  */
1478 
1479 int register_netdevice_notifier(struct notifier_block *nb)
1480 {
1481 	struct net_device *dev;
1482 	struct net_device *last;
1483 	struct net *net;
1484 	int err;
1485 
1486 	rtnl_lock();
1487 	err = raw_notifier_chain_register(&netdev_chain, nb);
1488 	if (err)
1489 		goto unlock;
1490 	if (dev_boot_phase)
1491 		goto unlock;
1492 	for_each_net(net) {
1493 		for_each_netdev(net, dev) {
1494 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1495 			err = notifier_to_errno(err);
1496 			if (err)
1497 				goto rollback;
1498 
1499 			if (!(dev->flags & IFF_UP))
1500 				continue;
1501 
1502 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1503 		}
1504 	}
1505 
1506 unlock:
1507 	rtnl_unlock();
1508 	return err;
1509 
1510 rollback:
1511 	last = dev;
1512 	for_each_net(net) {
1513 		for_each_netdev(net, dev) {
1514 			if (dev == last)
1515 				goto outroll;
1516 
1517 			if (dev->flags & IFF_UP) {
1518 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1519 							dev);
1520 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1521 			}
1522 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1523 		}
1524 	}
1525 
1526 outroll:
1527 	raw_notifier_chain_unregister(&netdev_chain, nb);
1528 	goto unlock;
1529 }
1530 EXPORT_SYMBOL(register_netdevice_notifier);
1531 
1532 /**
1533  *	unregister_netdevice_notifier - unregister a network notifier block
1534  *	@nb: notifier
1535  *
1536  *	Unregister a notifier previously registered by
1537  *	register_netdevice_notifier(). The notifier is unlinked into the
1538  *	kernel structures and may then be reused. A negative errno code
1539  *	is returned on a failure.
1540  *
1541  * 	After unregistering unregister and down device events are synthesized
1542  *	for all devices on the device list to the removed notifier to remove
1543  *	the need for special case cleanup code.
1544  */
1545 
1546 int unregister_netdevice_notifier(struct notifier_block *nb)
1547 {
1548 	struct net_device *dev;
1549 	struct net *net;
1550 	int err;
1551 
1552 	rtnl_lock();
1553 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1554 	if (err)
1555 		goto unlock;
1556 
1557 	for_each_net(net) {
1558 		for_each_netdev(net, dev) {
1559 			if (dev->flags & IFF_UP) {
1560 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1561 							dev);
1562 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1563 			}
1564 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1565 		}
1566 	}
1567 unlock:
1568 	rtnl_unlock();
1569 	return err;
1570 }
1571 EXPORT_SYMBOL(unregister_netdevice_notifier);
1572 
1573 /**
1574  *	call_netdevice_notifiers_info - call all network notifier blocks
1575  *	@val: value passed unmodified to notifier function
1576  *	@dev: net_device pointer passed unmodified to notifier function
1577  *	@info: notifier information data
1578  *
1579  *	Call all network notifier blocks.  Parameters and return value
1580  *	are as for raw_notifier_call_chain().
1581  */
1582 
1583 static int call_netdevice_notifiers_info(unsigned long val,
1584 					 struct net_device *dev,
1585 					 struct netdev_notifier_info *info)
1586 {
1587 	ASSERT_RTNL();
1588 	netdev_notifier_info_init(info, dev);
1589 	return raw_notifier_call_chain(&netdev_chain, val, info);
1590 }
1591 
1592 /**
1593  *	call_netdevice_notifiers - call all network notifier blocks
1594  *      @val: value passed unmodified to notifier function
1595  *      @dev: net_device pointer passed unmodified to notifier function
1596  *
1597  *	Call all network notifier blocks.  Parameters and return value
1598  *	are as for raw_notifier_call_chain().
1599  */
1600 
1601 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1602 {
1603 	struct netdev_notifier_info info;
1604 
1605 	return call_netdevice_notifiers_info(val, dev, &info);
1606 }
1607 EXPORT_SYMBOL(call_netdevice_notifiers);
1608 
1609 static struct static_key netstamp_needed __read_mostly;
1610 #ifdef HAVE_JUMP_LABEL
1611 /* We are not allowed to call static_key_slow_dec() from irq context
1612  * If net_disable_timestamp() is called from irq context, defer the
1613  * static_key_slow_dec() calls.
1614  */
1615 static atomic_t netstamp_needed_deferred;
1616 #endif
1617 
1618 void net_enable_timestamp(void)
1619 {
1620 #ifdef HAVE_JUMP_LABEL
1621 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1622 
1623 	if (deferred) {
1624 		while (--deferred)
1625 			static_key_slow_dec(&netstamp_needed);
1626 		return;
1627 	}
1628 #endif
1629 	static_key_slow_inc(&netstamp_needed);
1630 }
1631 EXPORT_SYMBOL(net_enable_timestamp);
1632 
1633 void net_disable_timestamp(void)
1634 {
1635 #ifdef HAVE_JUMP_LABEL
1636 	if (in_interrupt()) {
1637 		atomic_inc(&netstamp_needed_deferred);
1638 		return;
1639 	}
1640 #endif
1641 	static_key_slow_dec(&netstamp_needed);
1642 }
1643 EXPORT_SYMBOL(net_disable_timestamp);
1644 
1645 static inline void net_timestamp_set(struct sk_buff *skb)
1646 {
1647 	skb->tstamp.tv64 = 0;
1648 	if (static_key_false(&netstamp_needed))
1649 		__net_timestamp(skb);
1650 }
1651 
1652 #define net_timestamp_check(COND, SKB)			\
1653 	if (static_key_false(&netstamp_needed)) {		\
1654 		if ((COND) && !(SKB)->tstamp.tv64)	\
1655 			__net_timestamp(SKB);		\
1656 	}						\
1657 
1658 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1659 {
1660 	unsigned int len;
1661 
1662 	if (!(dev->flags & IFF_UP))
1663 		return false;
1664 
1665 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1666 	if (skb->len <= len)
1667 		return true;
1668 
1669 	/* if TSO is enabled, we don't care about the length as the packet
1670 	 * could be forwarded without being segmented before
1671 	 */
1672 	if (skb_is_gso(skb))
1673 		return true;
1674 
1675 	return false;
1676 }
1677 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1678 
1679 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1680 {
1681 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1682 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1683 			atomic_long_inc(&dev->rx_dropped);
1684 			kfree_skb(skb);
1685 			return NET_RX_DROP;
1686 		}
1687 	}
1688 
1689 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1690 		atomic_long_inc(&dev->rx_dropped);
1691 		kfree_skb(skb);
1692 		return NET_RX_DROP;
1693 	}
1694 
1695 	skb_scrub_packet(skb, true);
1696 	skb->protocol = eth_type_trans(skb, dev);
1697 	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1698 
1699 	return 0;
1700 }
1701 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1702 
1703 /**
1704  * dev_forward_skb - loopback an skb to another netif
1705  *
1706  * @dev: destination network device
1707  * @skb: buffer to forward
1708  *
1709  * return values:
1710  *	NET_RX_SUCCESS	(no congestion)
1711  *	NET_RX_DROP     (packet was dropped, but freed)
1712  *
1713  * dev_forward_skb can be used for injecting an skb from the
1714  * start_xmit function of one device into the receive queue
1715  * of another device.
1716  *
1717  * The receiving device may be in another namespace, so
1718  * we have to clear all information in the skb that could
1719  * impact namespace isolation.
1720  */
1721 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1722 {
1723 	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1724 }
1725 EXPORT_SYMBOL_GPL(dev_forward_skb);
1726 
1727 static inline int deliver_skb(struct sk_buff *skb,
1728 			      struct packet_type *pt_prev,
1729 			      struct net_device *orig_dev)
1730 {
1731 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1732 		return -ENOMEM;
1733 	atomic_inc(&skb->users);
1734 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1735 }
1736 
1737 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1738 {
1739 	if (!ptype->af_packet_priv || !skb->sk)
1740 		return false;
1741 
1742 	if (ptype->id_match)
1743 		return ptype->id_match(ptype, skb->sk);
1744 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1745 		return true;
1746 
1747 	return false;
1748 }
1749 
1750 /*
1751  *	Support routine. Sends outgoing frames to any network
1752  *	taps currently in use.
1753  */
1754 
1755 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1756 {
1757 	struct packet_type *ptype;
1758 	struct sk_buff *skb2 = NULL;
1759 	struct packet_type *pt_prev = NULL;
1760 
1761 	rcu_read_lock();
1762 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1763 		/* Never send packets back to the socket
1764 		 * they originated from - MvS (miquels@drinkel.ow.org)
1765 		 */
1766 		if ((ptype->dev == dev || !ptype->dev) &&
1767 		    (!skb_loop_sk(ptype, skb))) {
1768 			if (pt_prev) {
1769 				deliver_skb(skb2, pt_prev, skb->dev);
1770 				pt_prev = ptype;
1771 				continue;
1772 			}
1773 
1774 			skb2 = skb_clone(skb, GFP_ATOMIC);
1775 			if (!skb2)
1776 				break;
1777 
1778 			net_timestamp_set(skb2);
1779 
1780 			/* skb->nh should be correctly
1781 			   set by sender, so that the second statement is
1782 			   just protection against buggy protocols.
1783 			 */
1784 			skb_reset_mac_header(skb2);
1785 
1786 			if (skb_network_header(skb2) < skb2->data ||
1787 			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1788 				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1789 						     ntohs(skb2->protocol),
1790 						     dev->name);
1791 				skb_reset_network_header(skb2);
1792 			}
1793 
1794 			skb2->transport_header = skb2->network_header;
1795 			skb2->pkt_type = PACKET_OUTGOING;
1796 			pt_prev = ptype;
1797 		}
1798 	}
1799 	if (pt_prev)
1800 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1801 	rcu_read_unlock();
1802 }
1803 
1804 /**
1805  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1806  * @dev: Network device
1807  * @txq: number of queues available
1808  *
1809  * If real_num_tx_queues is changed the tc mappings may no longer be
1810  * valid. To resolve this verify the tc mapping remains valid and if
1811  * not NULL the mapping. With no priorities mapping to this
1812  * offset/count pair it will no longer be used. In the worst case TC0
1813  * is invalid nothing can be done so disable priority mappings. If is
1814  * expected that drivers will fix this mapping if they can before
1815  * calling netif_set_real_num_tx_queues.
1816  */
1817 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1818 {
1819 	int i;
1820 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1821 
1822 	/* If TC0 is invalidated disable TC mapping */
1823 	if (tc->offset + tc->count > txq) {
1824 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1825 		dev->num_tc = 0;
1826 		return;
1827 	}
1828 
1829 	/* Invalidated prio to tc mappings set to TC0 */
1830 	for (i = 1; i < TC_BITMASK + 1; i++) {
1831 		int q = netdev_get_prio_tc_map(dev, i);
1832 
1833 		tc = &dev->tc_to_txq[q];
1834 		if (tc->offset + tc->count > txq) {
1835 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1836 				i, q);
1837 			netdev_set_prio_tc_map(dev, i, 0);
1838 		}
1839 	}
1840 }
1841 
1842 #ifdef CONFIG_XPS
1843 static DEFINE_MUTEX(xps_map_mutex);
1844 #define xmap_dereference(P)		\
1845 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1846 
1847 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1848 					int cpu, u16 index)
1849 {
1850 	struct xps_map *map = NULL;
1851 	int pos;
1852 
1853 	if (dev_maps)
1854 		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1855 
1856 	for (pos = 0; map && pos < map->len; pos++) {
1857 		if (map->queues[pos] == index) {
1858 			if (map->len > 1) {
1859 				map->queues[pos] = map->queues[--map->len];
1860 			} else {
1861 				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1862 				kfree_rcu(map, rcu);
1863 				map = NULL;
1864 			}
1865 			break;
1866 		}
1867 	}
1868 
1869 	return map;
1870 }
1871 
1872 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1873 {
1874 	struct xps_dev_maps *dev_maps;
1875 	int cpu, i;
1876 	bool active = false;
1877 
1878 	mutex_lock(&xps_map_mutex);
1879 	dev_maps = xmap_dereference(dev->xps_maps);
1880 
1881 	if (!dev_maps)
1882 		goto out_no_maps;
1883 
1884 	for_each_possible_cpu(cpu) {
1885 		for (i = index; i < dev->num_tx_queues; i++) {
1886 			if (!remove_xps_queue(dev_maps, cpu, i))
1887 				break;
1888 		}
1889 		if (i == dev->num_tx_queues)
1890 			active = true;
1891 	}
1892 
1893 	if (!active) {
1894 		RCU_INIT_POINTER(dev->xps_maps, NULL);
1895 		kfree_rcu(dev_maps, rcu);
1896 	}
1897 
1898 	for (i = index; i < dev->num_tx_queues; i++)
1899 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1900 					     NUMA_NO_NODE);
1901 
1902 out_no_maps:
1903 	mutex_unlock(&xps_map_mutex);
1904 }
1905 
1906 static struct xps_map *expand_xps_map(struct xps_map *map,
1907 				      int cpu, u16 index)
1908 {
1909 	struct xps_map *new_map;
1910 	int alloc_len = XPS_MIN_MAP_ALLOC;
1911 	int i, pos;
1912 
1913 	for (pos = 0; map && pos < map->len; pos++) {
1914 		if (map->queues[pos] != index)
1915 			continue;
1916 		return map;
1917 	}
1918 
1919 	/* Need to add queue to this CPU's existing map */
1920 	if (map) {
1921 		if (pos < map->alloc_len)
1922 			return map;
1923 
1924 		alloc_len = map->alloc_len * 2;
1925 	}
1926 
1927 	/* Need to allocate new map to store queue on this CPU's map */
1928 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1929 			       cpu_to_node(cpu));
1930 	if (!new_map)
1931 		return NULL;
1932 
1933 	for (i = 0; i < pos; i++)
1934 		new_map->queues[i] = map->queues[i];
1935 	new_map->alloc_len = alloc_len;
1936 	new_map->len = pos;
1937 
1938 	return new_map;
1939 }
1940 
1941 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1942 			u16 index)
1943 {
1944 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1945 	struct xps_map *map, *new_map;
1946 	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1947 	int cpu, numa_node_id = -2;
1948 	bool active = false;
1949 
1950 	mutex_lock(&xps_map_mutex);
1951 
1952 	dev_maps = xmap_dereference(dev->xps_maps);
1953 
1954 	/* allocate memory for queue storage */
1955 	for_each_online_cpu(cpu) {
1956 		if (!cpumask_test_cpu(cpu, mask))
1957 			continue;
1958 
1959 		if (!new_dev_maps)
1960 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1961 		if (!new_dev_maps) {
1962 			mutex_unlock(&xps_map_mutex);
1963 			return -ENOMEM;
1964 		}
1965 
1966 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1967 				 NULL;
1968 
1969 		map = expand_xps_map(map, cpu, index);
1970 		if (!map)
1971 			goto error;
1972 
1973 		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1974 	}
1975 
1976 	if (!new_dev_maps)
1977 		goto out_no_new_maps;
1978 
1979 	for_each_possible_cpu(cpu) {
1980 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1981 			/* add queue to CPU maps */
1982 			int pos = 0;
1983 
1984 			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1985 			while ((pos < map->len) && (map->queues[pos] != index))
1986 				pos++;
1987 
1988 			if (pos == map->len)
1989 				map->queues[map->len++] = index;
1990 #ifdef CONFIG_NUMA
1991 			if (numa_node_id == -2)
1992 				numa_node_id = cpu_to_node(cpu);
1993 			else if (numa_node_id != cpu_to_node(cpu))
1994 				numa_node_id = -1;
1995 #endif
1996 		} else if (dev_maps) {
1997 			/* fill in the new device map from the old device map */
1998 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
1999 			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2000 		}
2001 
2002 	}
2003 
2004 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2005 
2006 	/* Cleanup old maps */
2007 	if (dev_maps) {
2008 		for_each_possible_cpu(cpu) {
2009 			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2010 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2011 			if (map && map != new_map)
2012 				kfree_rcu(map, rcu);
2013 		}
2014 
2015 		kfree_rcu(dev_maps, rcu);
2016 	}
2017 
2018 	dev_maps = new_dev_maps;
2019 	active = true;
2020 
2021 out_no_new_maps:
2022 	/* update Tx queue numa node */
2023 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2024 				     (numa_node_id >= 0) ? numa_node_id :
2025 				     NUMA_NO_NODE);
2026 
2027 	if (!dev_maps)
2028 		goto out_no_maps;
2029 
2030 	/* removes queue from unused CPUs */
2031 	for_each_possible_cpu(cpu) {
2032 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2033 			continue;
2034 
2035 		if (remove_xps_queue(dev_maps, cpu, index))
2036 			active = true;
2037 	}
2038 
2039 	/* free map if not active */
2040 	if (!active) {
2041 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2042 		kfree_rcu(dev_maps, rcu);
2043 	}
2044 
2045 out_no_maps:
2046 	mutex_unlock(&xps_map_mutex);
2047 
2048 	return 0;
2049 error:
2050 	/* remove any maps that we added */
2051 	for_each_possible_cpu(cpu) {
2052 		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2053 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2054 				 NULL;
2055 		if (new_map && new_map != map)
2056 			kfree(new_map);
2057 	}
2058 
2059 	mutex_unlock(&xps_map_mutex);
2060 
2061 	kfree(new_dev_maps);
2062 	return -ENOMEM;
2063 }
2064 EXPORT_SYMBOL(netif_set_xps_queue);
2065 
2066 #endif
2067 /*
2068  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2069  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2070  */
2071 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2072 {
2073 	int rc;
2074 
2075 	if (txq < 1 || txq > dev->num_tx_queues)
2076 		return -EINVAL;
2077 
2078 	if (dev->reg_state == NETREG_REGISTERED ||
2079 	    dev->reg_state == NETREG_UNREGISTERING) {
2080 		ASSERT_RTNL();
2081 
2082 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2083 						  txq);
2084 		if (rc)
2085 			return rc;
2086 
2087 		if (dev->num_tc)
2088 			netif_setup_tc(dev, txq);
2089 
2090 		if (txq < dev->real_num_tx_queues) {
2091 			qdisc_reset_all_tx_gt(dev, txq);
2092 #ifdef CONFIG_XPS
2093 			netif_reset_xps_queues_gt(dev, txq);
2094 #endif
2095 		}
2096 	}
2097 
2098 	dev->real_num_tx_queues = txq;
2099 	return 0;
2100 }
2101 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2102 
2103 #ifdef CONFIG_SYSFS
2104 /**
2105  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2106  *	@dev: Network device
2107  *	@rxq: Actual number of RX queues
2108  *
2109  *	This must be called either with the rtnl_lock held or before
2110  *	registration of the net device.  Returns 0 on success, or a
2111  *	negative error code.  If called before registration, it always
2112  *	succeeds.
2113  */
2114 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2115 {
2116 	int rc;
2117 
2118 	if (rxq < 1 || rxq > dev->num_rx_queues)
2119 		return -EINVAL;
2120 
2121 	if (dev->reg_state == NETREG_REGISTERED) {
2122 		ASSERT_RTNL();
2123 
2124 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2125 						  rxq);
2126 		if (rc)
2127 			return rc;
2128 	}
2129 
2130 	dev->real_num_rx_queues = rxq;
2131 	return 0;
2132 }
2133 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2134 #endif
2135 
2136 /**
2137  * netif_get_num_default_rss_queues - default number of RSS queues
2138  *
2139  * This routine should set an upper limit on the number of RSS queues
2140  * used by default by multiqueue devices.
2141  */
2142 int netif_get_num_default_rss_queues(void)
2143 {
2144 	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2145 }
2146 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2147 
2148 static inline void __netif_reschedule(struct Qdisc *q)
2149 {
2150 	struct softnet_data *sd;
2151 	unsigned long flags;
2152 
2153 	local_irq_save(flags);
2154 	sd = this_cpu_ptr(&softnet_data);
2155 	q->next_sched = NULL;
2156 	*sd->output_queue_tailp = q;
2157 	sd->output_queue_tailp = &q->next_sched;
2158 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2159 	local_irq_restore(flags);
2160 }
2161 
2162 void __netif_schedule(struct Qdisc *q)
2163 {
2164 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2165 		__netif_reschedule(q);
2166 }
2167 EXPORT_SYMBOL(__netif_schedule);
2168 
2169 struct dev_kfree_skb_cb {
2170 	enum skb_free_reason reason;
2171 };
2172 
2173 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2174 {
2175 	return (struct dev_kfree_skb_cb *)skb->cb;
2176 }
2177 
2178 void netif_schedule_queue(struct netdev_queue *txq)
2179 {
2180 	rcu_read_lock();
2181 	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2182 		struct Qdisc *q = rcu_dereference(txq->qdisc);
2183 
2184 		__netif_schedule(q);
2185 	}
2186 	rcu_read_unlock();
2187 }
2188 EXPORT_SYMBOL(netif_schedule_queue);
2189 
2190 /**
2191  *	netif_wake_subqueue - allow sending packets on subqueue
2192  *	@dev: network device
2193  *	@queue_index: sub queue index
2194  *
2195  * Resume individual transmit queue of a device with multiple transmit queues.
2196  */
2197 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2198 {
2199 	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2200 
2201 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2202 		struct Qdisc *q;
2203 
2204 		rcu_read_lock();
2205 		q = rcu_dereference(txq->qdisc);
2206 		__netif_schedule(q);
2207 		rcu_read_unlock();
2208 	}
2209 }
2210 EXPORT_SYMBOL(netif_wake_subqueue);
2211 
2212 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2213 {
2214 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2215 		struct Qdisc *q;
2216 
2217 		rcu_read_lock();
2218 		q = rcu_dereference(dev_queue->qdisc);
2219 		__netif_schedule(q);
2220 		rcu_read_unlock();
2221 	}
2222 }
2223 EXPORT_SYMBOL(netif_tx_wake_queue);
2224 
2225 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2226 {
2227 	unsigned long flags;
2228 
2229 	if (likely(atomic_read(&skb->users) == 1)) {
2230 		smp_rmb();
2231 		atomic_set(&skb->users, 0);
2232 	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2233 		return;
2234 	}
2235 	get_kfree_skb_cb(skb)->reason = reason;
2236 	local_irq_save(flags);
2237 	skb->next = __this_cpu_read(softnet_data.completion_queue);
2238 	__this_cpu_write(softnet_data.completion_queue, skb);
2239 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2240 	local_irq_restore(flags);
2241 }
2242 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2243 
2244 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2245 {
2246 	if (in_irq() || irqs_disabled())
2247 		__dev_kfree_skb_irq(skb, reason);
2248 	else
2249 		dev_kfree_skb(skb);
2250 }
2251 EXPORT_SYMBOL(__dev_kfree_skb_any);
2252 
2253 
2254 /**
2255  * netif_device_detach - mark device as removed
2256  * @dev: network device
2257  *
2258  * Mark device as removed from system and therefore no longer available.
2259  */
2260 void netif_device_detach(struct net_device *dev)
2261 {
2262 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2263 	    netif_running(dev)) {
2264 		netif_tx_stop_all_queues(dev);
2265 	}
2266 }
2267 EXPORT_SYMBOL(netif_device_detach);
2268 
2269 /**
2270  * netif_device_attach - mark device as attached
2271  * @dev: network device
2272  *
2273  * Mark device as attached from system and restart if needed.
2274  */
2275 void netif_device_attach(struct net_device *dev)
2276 {
2277 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2278 	    netif_running(dev)) {
2279 		netif_tx_wake_all_queues(dev);
2280 		__netdev_watchdog_up(dev);
2281 	}
2282 }
2283 EXPORT_SYMBOL(netif_device_attach);
2284 
2285 static void skb_warn_bad_offload(const struct sk_buff *skb)
2286 {
2287 	static const netdev_features_t null_features = 0;
2288 	struct net_device *dev = skb->dev;
2289 	const char *driver = "";
2290 
2291 	if (!net_ratelimit())
2292 		return;
2293 
2294 	if (dev && dev->dev.parent)
2295 		driver = dev_driver_string(dev->dev.parent);
2296 
2297 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2298 	     "gso_type=%d ip_summed=%d\n",
2299 	     driver, dev ? &dev->features : &null_features,
2300 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2301 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2302 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2303 }
2304 
2305 /*
2306  * Invalidate hardware checksum when packet is to be mangled, and
2307  * complete checksum manually on outgoing path.
2308  */
2309 int skb_checksum_help(struct sk_buff *skb)
2310 {
2311 	__wsum csum;
2312 	int ret = 0, offset;
2313 
2314 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2315 		goto out_set_summed;
2316 
2317 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2318 		skb_warn_bad_offload(skb);
2319 		return -EINVAL;
2320 	}
2321 
2322 	/* Before computing a checksum, we should make sure no frag could
2323 	 * be modified by an external entity : checksum could be wrong.
2324 	 */
2325 	if (skb_has_shared_frag(skb)) {
2326 		ret = __skb_linearize(skb);
2327 		if (ret)
2328 			goto out;
2329 	}
2330 
2331 	offset = skb_checksum_start_offset(skb);
2332 	BUG_ON(offset >= skb_headlen(skb));
2333 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2334 
2335 	offset += skb->csum_offset;
2336 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2337 
2338 	if (skb_cloned(skb) &&
2339 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2340 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2341 		if (ret)
2342 			goto out;
2343 	}
2344 
2345 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2346 out_set_summed:
2347 	skb->ip_summed = CHECKSUM_NONE;
2348 out:
2349 	return ret;
2350 }
2351 EXPORT_SYMBOL(skb_checksum_help);
2352 
2353 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2354 {
2355 	unsigned int vlan_depth = skb->mac_len;
2356 	__be16 type = skb->protocol;
2357 
2358 	/* Tunnel gso handlers can set protocol to ethernet. */
2359 	if (type == htons(ETH_P_TEB)) {
2360 		struct ethhdr *eth;
2361 
2362 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2363 			return 0;
2364 
2365 		eth = (struct ethhdr *)skb_mac_header(skb);
2366 		type = eth->h_proto;
2367 	}
2368 
2369 	/* if skb->protocol is 802.1Q/AD then the header should already be
2370 	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2371 	 * ETH_HLEN otherwise
2372 	 */
2373 	if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2374 		if (vlan_depth) {
2375 			if (WARN_ON(vlan_depth < VLAN_HLEN))
2376 				return 0;
2377 			vlan_depth -= VLAN_HLEN;
2378 		} else {
2379 			vlan_depth = ETH_HLEN;
2380 		}
2381 		do {
2382 			struct vlan_hdr *vh;
2383 
2384 			if (unlikely(!pskb_may_pull(skb,
2385 						    vlan_depth + VLAN_HLEN)))
2386 				return 0;
2387 
2388 			vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2389 			type = vh->h_vlan_encapsulated_proto;
2390 			vlan_depth += VLAN_HLEN;
2391 		} while (type == htons(ETH_P_8021Q) ||
2392 			 type == htons(ETH_P_8021AD));
2393 	}
2394 
2395 	*depth = vlan_depth;
2396 
2397 	return type;
2398 }
2399 
2400 /**
2401  *	skb_mac_gso_segment - mac layer segmentation handler.
2402  *	@skb: buffer to segment
2403  *	@features: features for the output path (see dev->features)
2404  */
2405 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2406 				    netdev_features_t features)
2407 {
2408 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2409 	struct packet_offload *ptype;
2410 	int vlan_depth = skb->mac_len;
2411 	__be16 type = skb_network_protocol(skb, &vlan_depth);
2412 
2413 	if (unlikely(!type))
2414 		return ERR_PTR(-EINVAL);
2415 
2416 	__skb_pull(skb, vlan_depth);
2417 
2418 	rcu_read_lock();
2419 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2420 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2421 			segs = ptype->callbacks.gso_segment(skb, features);
2422 			break;
2423 		}
2424 	}
2425 	rcu_read_unlock();
2426 
2427 	__skb_push(skb, skb->data - skb_mac_header(skb));
2428 
2429 	return segs;
2430 }
2431 EXPORT_SYMBOL(skb_mac_gso_segment);
2432 
2433 
2434 /* openvswitch calls this on rx path, so we need a different check.
2435  */
2436 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2437 {
2438 	if (tx_path)
2439 		return skb->ip_summed != CHECKSUM_PARTIAL;
2440 	else
2441 		return skb->ip_summed == CHECKSUM_NONE;
2442 }
2443 
2444 /**
2445  *	__skb_gso_segment - Perform segmentation on skb.
2446  *	@skb: buffer to segment
2447  *	@features: features for the output path (see dev->features)
2448  *	@tx_path: whether it is called in TX path
2449  *
2450  *	This function segments the given skb and returns a list of segments.
2451  *
2452  *	It may return NULL if the skb requires no segmentation.  This is
2453  *	only possible when GSO is used for verifying header integrity.
2454  */
2455 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2456 				  netdev_features_t features, bool tx_path)
2457 {
2458 	if (unlikely(skb_needs_check(skb, tx_path))) {
2459 		int err;
2460 
2461 		skb_warn_bad_offload(skb);
2462 
2463 		err = skb_cow_head(skb, 0);
2464 		if (err < 0)
2465 			return ERR_PTR(err);
2466 	}
2467 
2468 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2469 	SKB_GSO_CB(skb)->encap_level = 0;
2470 
2471 	skb_reset_mac_header(skb);
2472 	skb_reset_mac_len(skb);
2473 
2474 	return skb_mac_gso_segment(skb, features);
2475 }
2476 EXPORT_SYMBOL(__skb_gso_segment);
2477 
2478 /* Take action when hardware reception checksum errors are detected. */
2479 #ifdef CONFIG_BUG
2480 void netdev_rx_csum_fault(struct net_device *dev)
2481 {
2482 	if (net_ratelimit()) {
2483 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2484 		dump_stack();
2485 	}
2486 }
2487 EXPORT_SYMBOL(netdev_rx_csum_fault);
2488 #endif
2489 
2490 /* Actually, we should eliminate this check as soon as we know, that:
2491  * 1. IOMMU is present and allows to map all the memory.
2492  * 2. No high memory really exists on this machine.
2493  */
2494 
2495 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2496 {
2497 #ifdef CONFIG_HIGHMEM
2498 	int i;
2499 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2500 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2501 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2502 			if (PageHighMem(skb_frag_page(frag)))
2503 				return 1;
2504 		}
2505 	}
2506 
2507 	if (PCI_DMA_BUS_IS_PHYS) {
2508 		struct device *pdev = dev->dev.parent;
2509 
2510 		if (!pdev)
2511 			return 0;
2512 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2513 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2514 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2515 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2516 				return 1;
2517 		}
2518 	}
2519 #endif
2520 	return 0;
2521 }
2522 
2523 /* If MPLS offload request, verify we are testing hardware MPLS features
2524  * instead of standard features for the netdev.
2525  */
2526 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2527 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2528 					   netdev_features_t features,
2529 					   __be16 type)
2530 {
2531 	if (eth_p_mpls(type))
2532 		features &= skb->dev->mpls_features;
2533 
2534 	return features;
2535 }
2536 #else
2537 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2538 					   netdev_features_t features,
2539 					   __be16 type)
2540 {
2541 	return features;
2542 }
2543 #endif
2544 
2545 static netdev_features_t harmonize_features(struct sk_buff *skb,
2546 	netdev_features_t features)
2547 {
2548 	int tmp;
2549 	__be16 type;
2550 
2551 	type = skb_network_protocol(skb, &tmp);
2552 	features = net_mpls_features(skb, features, type);
2553 
2554 	if (skb->ip_summed != CHECKSUM_NONE &&
2555 	    !can_checksum_protocol(features, type)) {
2556 		features &= ~NETIF_F_ALL_CSUM;
2557 	} else if (illegal_highdma(skb->dev, skb)) {
2558 		features &= ~NETIF_F_SG;
2559 	}
2560 
2561 	return features;
2562 }
2563 
2564 netdev_features_t netif_skb_features(struct sk_buff *skb)
2565 {
2566 	struct net_device *dev = skb->dev;
2567 	netdev_features_t features = dev->features;
2568 	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2569 	__be16 protocol = skb->protocol;
2570 
2571 	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2572 		features &= ~NETIF_F_GSO_MASK;
2573 
2574 	/* If encapsulation offload request, verify we are testing
2575 	 * hardware encapsulation features instead of standard
2576 	 * features for the netdev
2577 	 */
2578 	if (skb->encapsulation)
2579 		features &= dev->hw_enc_features;
2580 
2581 	if (!vlan_tx_tag_present(skb)) {
2582 		if (unlikely(protocol == htons(ETH_P_8021Q) ||
2583 			     protocol == htons(ETH_P_8021AD))) {
2584 			struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2585 			protocol = veh->h_vlan_encapsulated_proto;
2586 		} else {
2587 			goto finalize;
2588 		}
2589 	}
2590 
2591 	features = netdev_intersect_features(features,
2592 					     dev->vlan_features |
2593 					     NETIF_F_HW_VLAN_CTAG_TX |
2594 					     NETIF_F_HW_VLAN_STAG_TX);
2595 
2596 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2597 		features = netdev_intersect_features(features,
2598 						     NETIF_F_SG |
2599 						     NETIF_F_HIGHDMA |
2600 						     NETIF_F_FRAGLIST |
2601 						     NETIF_F_GEN_CSUM |
2602 						     NETIF_F_HW_VLAN_CTAG_TX |
2603 						     NETIF_F_HW_VLAN_STAG_TX);
2604 
2605 finalize:
2606 	if (dev->netdev_ops->ndo_features_check)
2607 		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2608 								features);
2609 
2610 	return harmonize_features(skb, features);
2611 }
2612 EXPORT_SYMBOL(netif_skb_features);
2613 
2614 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2615 		    struct netdev_queue *txq, bool more)
2616 {
2617 	unsigned int len;
2618 	int rc;
2619 
2620 	if (!list_empty(&ptype_all))
2621 		dev_queue_xmit_nit(skb, dev);
2622 
2623 	len = skb->len;
2624 	trace_net_dev_start_xmit(skb, dev);
2625 	rc = netdev_start_xmit(skb, dev, txq, more);
2626 	trace_net_dev_xmit(skb, rc, dev, len);
2627 
2628 	return rc;
2629 }
2630 
2631 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2632 				    struct netdev_queue *txq, int *ret)
2633 {
2634 	struct sk_buff *skb = first;
2635 	int rc = NETDEV_TX_OK;
2636 
2637 	while (skb) {
2638 		struct sk_buff *next = skb->next;
2639 
2640 		skb->next = NULL;
2641 		rc = xmit_one(skb, dev, txq, next != NULL);
2642 		if (unlikely(!dev_xmit_complete(rc))) {
2643 			skb->next = next;
2644 			goto out;
2645 		}
2646 
2647 		skb = next;
2648 		if (netif_xmit_stopped(txq) && skb) {
2649 			rc = NETDEV_TX_BUSY;
2650 			break;
2651 		}
2652 	}
2653 
2654 out:
2655 	*ret = rc;
2656 	return skb;
2657 }
2658 
2659 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2660 					  netdev_features_t features)
2661 {
2662 	if (vlan_tx_tag_present(skb) &&
2663 	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2664 		skb = __vlan_hwaccel_push_inside(skb);
2665 	return skb;
2666 }
2667 
2668 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2669 {
2670 	netdev_features_t features;
2671 
2672 	if (skb->next)
2673 		return skb;
2674 
2675 	features = netif_skb_features(skb);
2676 	skb = validate_xmit_vlan(skb, features);
2677 	if (unlikely(!skb))
2678 		goto out_null;
2679 
2680 	if (netif_needs_gso(dev, skb, features)) {
2681 		struct sk_buff *segs;
2682 
2683 		segs = skb_gso_segment(skb, features);
2684 		if (IS_ERR(segs)) {
2685 			goto out_kfree_skb;
2686 		} else if (segs) {
2687 			consume_skb(skb);
2688 			skb = segs;
2689 		}
2690 	} else {
2691 		if (skb_needs_linearize(skb, features) &&
2692 		    __skb_linearize(skb))
2693 			goto out_kfree_skb;
2694 
2695 		/* If packet is not checksummed and device does not
2696 		 * support checksumming for this protocol, complete
2697 		 * checksumming here.
2698 		 */
2699 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2700 			if (skb->encapsulation)
2701 				skb_set_inner_transport_header(skb,
2702 							       skb_checksum_start_offset(skb));
2703 			else
2704 				skb_set_transport_header(skb,
2705 							 skb_checksum_start_offset(skb));
2706 			if (!(features & NETIF_F_ALL_CSUM) &&
2707 			    skb_checksum_help(skb))
2708 				goto out_kfree_skb;
2709 		}
2710 	}
2711 
2712 	return skb;
2713 
2714 out_kfree_skb:
2715 	kfree_skb(skb);
2716 out_null:
2717 	return NULL;
2718 }
2719 
2720 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2721 {
2722 	struct sk_buff *next, *head = NULL, *tail;
2723 
2724 	for (; skb != NULL; skb = next) {
2725 		next = skb->next;
2726 		skb->next = NULL;
2727 
2728 		/* in case skb wont be segmented, point to itself */
2729 		skb->prev = skb;
2730 
2731 		skb = validate_xmit_skb(skb, dev);
2732 		if (!skb)
2733 			continue;
2734 
2735 		if (!head)
2736 			head = skb;
2737 		else
2738 			tail->next = skb;
2739 		/* If skb was segmented, skb->prev points to
2740 		 * the last segment. If not, it still contains skb.
2741 		 */
2742 		tail = skb->prev;
2743 	}
2744 	return head;
2745 }
2746 
2747 static void qdisc_pkt_len_init(struct sk_buff *skb)
2748 {
2749 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2750 
2751 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2752 
2753 	/* To get more precise estimation of bytes sent on wire,
2754 	 * we add to pkt_len the headers size of all segments
2755 	 */
2756 	if (shinfo->gso_size)  {
2757 		unsigned int hdr_len;
2758 		u16 gso_segs = shinfo->gso_segs;
2759 
2760 		/* mac layer + network layer */
2761 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2762 
2763 		/* + transport layer */
2764 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2765 			hdr_len += tcp_hdrlen(skb);
2766 		else
2767 			hdr_len += sizeof(struct udphdr);
2768 
2769 		if (shinfo->gso_type & SKB_GSO_DODGY)
2770 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2771 						shinfo->gso_size);
2772 
2773 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2774 	}
2775 }
2776 
2777 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2778 				 struct net_device *dev,
2779 				 struct netdev_queue *txq)
2780 {
2781 	spinlock_t *root_lock = qdisc_lock(q);
2782 	bool contended;
2783 	int rc;
2784 
2785 	qdisc_pkt_len_init(skb);
2786 	qdisc_calculate_pkt_len(skb, q);
2787 	/*
2788 	 * Heuristic to force contended enqueues to serialize on a
2789 	 * separate lock before trying to get qdisc main lock.
2790 	 * This permits __QDISC___STATE_RUNNING owner to get the lock more
2791 	 * often and dequeue packets faster.
2792 	 */
2793 	contended = qdisc_is_running(q);
2794 	if (unlikely(contended))
2795 		spin_lock(&q->busylock);
2796 
2797 	spin_lock(root_lock);
2798 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2799 		kfree_skb(skb);
2800 		rc = NET_XMIT_DROP;
2801 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2802 		   qdisc_run_begin(q)) {
2803 		/*
2804 		 * This is a work-conserving queue; there are no old skbs
2805 		 * waiting to be sent out; and the qdisc is not running -
2806 		 * xmit the skb directly.
2807 		 */
2808 
2809 		qdisc_bstats_update(q, skb);
2810 
2811 		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2812 			if (unlikely(contended)) {
2813 				spin_unlock(&q->busylock);
2814 				contended = false;
2815 			}
2816 			__qdisc_run(q);
2817 		} else
2818 			qdisc_run_end(q);
2819 
2820 		rc = NET_XMIT_SUCCESS;
2821 	} else {
2822 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2823 		if (qdisc_run_begin(q)) {
2824 			if (unlikely(contended)) {
2825 				spin_unlock(&q->busylock);
2826 				contended = false;
2827 			}
2828 			__qdisc_run(q);
2829 		}
2830 	}
2831 	spin_unlock(root_lock);
2832 	if (unlikely(contended))
2833 		spin_unlock(&q->busylock);
2834 	return rc;
2835 }
2836 
2837 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2838 static void skb_update_prio(struct sk_buff *skb)
2839 {
2840 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2841 
2842 	if (!skb->priority && skb->sk && map) {
2843 		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2844 
2845 		if (prioidx < map->priomap_len)
2846 			skb->priority = map->priomap[prioidx];
2847 	}
2848 }
2849 #else
2850 #define skb_update_prio(skb)
2851 #endif
2852 
2853 static DEFINE_PER_CPU(int, xmit_recursion);
2854 #define RECURSION_LIMIT 10
2855 
2856 /**
2857  *	dev_loopback_xmit - loop back @skb
2858  *	@skb: buffer to transmit
2859  */
2860 int dev_loopback_xmit(struct sk_buff *skb)
2861 {
2862 	skb_reset_mac_header(skb);
2863 	__skb_pull(skb, skb_network_offset(skb));
2864 	skb->pkt_type = PACKET_LOOPBACK;
2865 	skb->ip_summed = CHECKSUM_UNNECESSARY;
2866 	WARN_ON(!skb_dst(skb));
2867 	skb_dst_force(skb);
2868 	netif_rx_ni(skb);
2869 	return 0;
2870 }
2871 EXPORT_SYMBOL(dev_loopback_xmit);
2872 
2873 /**
2874  *	__dev_queue_xmit - transmit a buffer
2875  *	@skb: buffer to transmit
2876  *	@accel_priv: private data used for L2 forwarding offload
2877  *
2878  *	Queue a buffer for transmission to a network device. The caller must
2879  *	have set the device and priority and built the buffer before calling
2880  *	this function. The function can be called from an interrupt.
2881  *
2882  *	A negative errno code is returned on a failure. A success does not
2883  *	guarantee the frame will be transmitted as it may be dropped due
2884  *	to congestion or traffic shaping.
2885  *
2886  * -----------------------------------------------------------------------------------
2887  *      I notice this method can also return errors from the queue disciplines,
2888  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2889  *      be positive.
2890  *
2891  *      Regardless of the return value, the skb is consumed, so it is currently
2892  *      difficult to retry a send to this method.  (You can bump the ref count
2893  *      before sending to hold a reference for retry if you are careful.)
2894  *
2895  *      When calling this method, interrupts MUST be enabled.  This is because
2896  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2897  *          --BLG
2898  */
2899 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2900 {
2901 	struct net_device *dev = skb->dev;
2902 	struct netdev_queue *txq;
2903 	struct Qdisc *q;
2904 	int rc = -ENOMEM;
2905 
2906 	skb_reset_mac_header(skb);
2907 
2908 	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2909 		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2910 
2911 	/* Disable soft irqs for various locks below. Also
2912 	 * stops preemption for RCU.
2913 	 */
2914 	rcu_read_lock_bh();
2915 
2916 	skb_update_prio(skb);
2917 
2918 	/* If device/qdisc don't need skb->dst, release it right now while
2919 	 * its hot in this cpu cache.
2920 	 */
2921 	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2922 		skb_dst_drop(skb);
2923 	else
2924 		skb_dst_force(skb);
2925 
2926 	txq = netdev_pick_tx(dev, skb, accel_priv);
2927 	q = rcu_dereference_bh(txq->qdisc);
2928 
2929 #ifdef CONFIG_NET_CLS_ACT
2930 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2931 #endif
2932 	trace_net_dev_queue(skb);
2933 	if (q->enqueue) {
2934 		rc = __dev_xmit_skb(skb, q, dev, txq);
2935 		goto out;
2936 	}
2937 
2938 	/* The device has no queue. Common case for software devices:
2939 	   loopback, all the sorts of tunnels...
2940 
2941 	   Really, it is unlikely that netif_tx_lock protection is necessary
2942 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2943 	   counters.)
2944 	   However, it is possible, that they rely on protection
2945 	   made by us here.
2946 
2947 	   Check this and shot the lock. It is not prone from deadlocks.
2948 	   Either shot noqueue qdisc, it is even simpler 8)
2949 	 */
2950 	if (dev->flags & IFF_UP) {
2951 		int cpu = smp_processor_id(); /* ok because BHs are off */
2952 
2953 		if (txq->xmit_lock_owner != cpu) {
2954 
2955 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2956 				goto recursion_alert;
2957 
2958 			skb = validate_xmit_skb(skb, dev);
2959 			if (!skb)
2960 				goto drop;
2961 
2962 			HARD_TX_LOCK(dev, txq, cpu);
2963 
2964 			if (!netif_xmit_stopped(txq)) {
2965 				__this_cpu_inc(xmit_recursion);
2966 				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2967 				__this_cpu_dec(xmit_recursion);
2968 				if (dev_xmit_complete(rc)) {
2969 					HARD_TX_UNLOCK(dev, txq);
2970 					goto out;
2971 				}
2972 			}
2973 			HARD_TX_UNLOCK(dev, txq);
2974 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2975 					     dev->name);
2976 		} else {
2977 			/* Recursion is detected! It is possible,
2978 			 * unfortunately
2979 			 */
2980 recursion_alert:
2981 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2982 					     dev->name);
2983 		}
2984 	}
2985 
2986 	rc = -ENETDOWN;
2987 drop:
2988 	rcu_read_unlock_bh();
2989 
2990 	atomic_long_inc(&dev->tx_dropped);
2991 	kfree_skb_list(skb);
2992 	return rc;
2993 out:
2994 	rcu_read_unlock_bh();
2995 	return rc;
2996 }
2997 
2998 int dev_queue_xmit(struct sk_buff *skb)
2999 {
3000 	return __dev_queue_xmit(skb, NULL);
3001 }
3002 EXPORT_SYMBOL(dev_queue_xmit);
3003 
3004 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3005 {
3006 	return __dev_queue_xmit(skb, accel_priv);
3007 }
3008 EXPORT_SYMBOL(dev_queue_xmit_accel);
3009 
3010 
3011 /*=======================================================================
3012 			Receiver routines
3013   =======================================================================*/
3014 
3015 int netdev_max_backlog __read_mostly = 1000;
3016 EXPORT_SYMBOL(netdev_max_backlog);
3017 
3018 int netdev_tstamp_prequeue __read_mostly = 1;
3019 int netdev_budget __read_mostly = 300;
3020 int weight_p __read_mostly = 64;            /* old backlog weight */
3021 
3022 /* Called with irq disabled */
3023 static inline void ____napi_schedule(struct softnet_data *sd,
3024 				     struct napi_struct *napi)
3025 {
3026 	list_add_tail(&napi->poll_list, &sd->poll_list);
3027 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3028 }
3029 
3030 #ifdef CONFIG_RPS
3031 
3032 /* One global table that all flow-based protocols share. */
3033 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3034 EXPORT_SYMBOL(rps_sock_flow_table);
3035 
3036 struct static_key rps_needed __read_mostly;
3037 
3038 static struct rps_dev_flow *
3039 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3040 	    struct rps_dev_flow *rflow, u16 next_cpu)
3041 {
3042 	if (next_cpu != RPS_NO_CPU) {
3043 #ifdef CONFIG_RFS_ACCEL
3044 		struct netdev_rx_queue *rxqueue;
3045 		struct rps_dev_flow_table *flow_table;
3046 		struct rps_dev_flow *old_rflow;
3047 		u32 flow_id;
3048 		u16 rxq_index;
3049 		int rc;
3050 
3051 		/* Should we steer this flow to a different hardware queue? */
3052 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3053 		    !(dev->features & NETIF_F_NTUPLE))
3054 			goto out;
3055 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3056 		if (rxq_index == skb_get_rx_queue(skb))
3057 			goto out;
3058 
3059 		rxqueue = dev->_rx + rxq_index;
3060 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3061 		if (!flow_table)
3062 			goto out;
3063 		flow_id = skb_get_hash(skb) & flow_table->mask;
3064 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3065 							rxq_index, flow_id);
3066 		if (rc < 0)
3067 			goto out;
3068 		old_rflow = rflow;
3069 		rflow = &flow_table->flows[flow_id];
3070 		rflow->filter = rc;
3071 		if (old_rflow->filter == rflow->filter)
3072 			old_rflow->filter = RPS_NO_FILTER;
3073 	out:
3074 #endif
3075 		rflow->last_qtail =
3076 			per_cpu(softnet_data, next_cpu).input_queue_head;
3077 	}
3078 
3079 	rflow->cpu = next_cpu;
3080 	return rflow;
3081 }
3082 
3083 /*
3084  * get_rps_cpu is called from netif_receive_skb and returns the target
3085  * CPU from the RPS map of the receiving queue for a given skb.
3086  * rcu_read_lock must be held on entry.
3087  */
3088 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3089 		       struct rps_dev_flow **rflowp)
3090 {
3091 	struct netdev_rx_queue *rxqueue;
3092 	struct rps_map *map;
3093 	struct rps_dev_flow_table *flow_table;
3094 	struct rps_sock_flow_table *sock_flow_table;
3095 	int cpu = -1;
3096 	u16 tcpu;
3097 	u32 hash;
3098 
3099 	if (skb_rx_queue_recorded(skb)) {
3100 		u16 index = skb_get_rx_queue(skb);
3101 		if (unlikely(index >= dev->real_num_rx_queues)) {
3102 			WARN_ONCE(dev->real_num_rx_queues > 1,
3103 				  "%s received packet on queue %u, but number "
3104 				  "of RX queues is %u\n",
3105 				  dev->name, index, dev->real_num_rx_queues);
3106 			goto done;
3107 		}
3108 		rxqueue = dev->_rx + index;
3109 	} else
3110 		rxqueue = dev->_rx;
3111 
3112 	map = rcu_dereference(rxqueue->rps_map);
3113 	if (map) {
3114 		if (map->len == 1 &&
3115 		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
3116 			tcpu = map->cpus[0];
3117 			if (cpu_online(tcpu))
3118 				cpu = tcpu;
3119 			goto done;
3120 		}
3121 	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3122 		goto done;
3123 	}
3124 
3125 	skb_reset_network_header(skb);
3126 	hash = skb_get_hash(skb);
3127 	if (!hash)
3128 		goto done;
3129 
3130 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3131 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3132 	if (flow_table && sock_flow_table) {
3133 		u16 next_cpu;
3134 		struct rps_dev_flow *rflow;
3135 
3136 		rflow = &flow_table->flows[hash & flow_table->mask];
3137 		tcpu = rflow->cpu;
3138 
3139 		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3140 
3141 		/*
3142 		 * If the desired CPU (where last recvmsg was done) is
3143 		 * different from current CPU (one in the rx-queue flow
3144 		 * table entry), switch if one of the following holds:
3145 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
3146 		 *   - Current CPU is offline.
3147 		 *   - The current CPU's queue tail has advanced beyond the
3148 		 *     last packet that was enqueued using this table entry.
3149 		 *     This guarantees that all previous packets for the flow
3150 		 *     have been dequeued, thus preserving in order delivery.
3151 		 */
3152 		if (unlikely(tcpu != next_cpu) &&
3153 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3154 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3155 		      rflow->last_qtail)) >= 0)) {
3156 			tcpu = next_cpu;
3157 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3158 		}
3159 
3160 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3161 			*rflowp = rflow;
3162 			cpu = tcpu;
3163 			goto done;
3164 		}
3165 	}
3166 
3167 	if (map) {
3168 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3169 		if (cpu_online(tcpu)) {
3170 			cpu = tcpu;
3171 			goto done;
3172 		}
3173 	}
3174 
3175 done:
3176 	return cpu;
3177 }
3178 
3179 #ifdef CONFIG_RFS_ACCEL
3180 
3181 /**
3182  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3183  * @dev: Device on which the filter was set
3184  * @rxq_index: RX queue index
3185  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3186  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3187  *
3188  * Drivers that implement ndo_rx_flow_steer() should periodically call
3189  * this function for each installed filter and remove the filters for
3190  * which it returns %true.
3191  */
3192 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3193 			 u32 flow_id, u16 filter_id)
3194 {
3195 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3196 	struct rps_dev_flow_table *flow_table;
3197 	struct rps_dev_flow *rflow;
3198 	bool expire = true;
3199 	int cpu;
3200 
3201 	rcu_read_lock();
3202 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3203 	if (flow_table && flow_id <= flow_table->mask) {
3204 		rflow = &flow_table->flows[flow_id];
3205 		cpu = ACCESS_ONCE(rflow->cpu);
3206 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3207 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3208 			   rflow->last_qtail) <
3209 		     (int)(10 * flow_table->mask)))
3210 			expire = false;
3211 	}
3212 	rcu_read_unlock();
3213 	return expire;
3214 }
3215 EXPORT_SYMBOL(rps_may_expire_flow);
3216 
3217 #endif /* CONFIG_RFS_ACCEL */
3218 
3219 /* Called from hardirq (IPI) context */
3220 static void rps_trigger_softirq(void *data)
3221 {
3222 	struct softnet_data *sd = data;
3223 
3224 	____napi_schedule(sd, &sd->backlog);
3225 	sd->received_rps++;
3226 }
3227 
3228 #endif /* CONFIG_RPS */
3229 
3230 /*
3231  * Check if this softnet_data structure is another cpu one
3232  * If yes, queue it to our IPI list and return 1
3233  * If no, return 0
3234  */
3235 static int rps_ipi_queued(struct softnet_data *sd)
3236 {
3237 #ifdef CONFIG_RPS
3238 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3239 
3240 	if (sd != mysd) {
3241 		sd->rps_ipi_next = mysd->rps_ipi_list;
3242 		mysd->rps_ipi_list = sd;
3243 
3244 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3245 		return 1;
3246 	}
3247 #endif /* CONFIG_RPS */
3248 	return 0;
3249 }
3250 
3251 #ifdef CONFIG_NET_FLOW_LIMIT
3252 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3253 #endif
3254 
3255 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3256 {
3257 #ifdef CONFIG_NET_FLOW_LIMIT
3258 	struct sd_flow_limit *fl;
3259 	struct softnet_data *sd;
3260 	unsigned int old_flow, new_flow;
3261 
3262 	if (qlen < (netdev_max_backlog >> 1))
3263 		return false;
3264 
3265 	sd = this_cpu_ptr(&softnet_data);
3266 
3267 	rcu_read_lock();
3268 	fl = rcu_dereference(sd->flow_limit);
3269 	if (fl) {
3270 		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3271 		old_flow = fl->history[fl->history_head];
3272 		fl->history[fl->history_head] = new_flow;
3273 
3274 		fl->history_head++;
3275 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3276 
3277 		if (likely(fl->buckets[old_flow]))
3278 			fl->buckets[old_flow]--;
3279 
3280 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3281 			fl->count++;
3282 			rcu_read_unlock();
3283 			return true;
3284 		}
3285 	}
3286 	rcu_read_unlock();
3287 #endif
3288 	return false;
3289 }
3290 
3291 /*
3292  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3293  * queue (may be a remote CPU queue).
3294  */
3295 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3296 			      unsigned int *qtail)
3297 {
3298 	struct softnet_data *sd;
3299 	unsigned long flags;
3300 	unsigned int qlen;
3301 
3302 	sd = &per_cpu(softnet_data, cpu);
3303 
3304 	local_irq_save(flags);
3305 
3306 	rps_lock(sd);
3307 	qlen = skb_queue_len(&sd->input_pkt_queue);
3308 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3309 		if (qlen) {
3310 enqueue:
3311 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3312 			input_queue_tail_incr_save(sd, qtail);
3313 			rps_unlock(sd);
3314 			local_irq_restore(flags);
3315 			return NET_RX_SUCCESS;
3316 		}
3317 
3318 		/* Schedule NAPI for backlog device
3319 		 * We can use non atomic operation since we own the queue lock
3320 		 */
3321 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3322 			if (!rps_ipi_queued(sd))
3323 				____napi_schedule(sd, &sd->backlog);
3324 		}
3325 		goto enqueue;
3326 	}
3327 
3328 	sd->dropped++;
3329 	rps_unlock(sd);
3330 
3331 	local_irq_restore(flags);
3332 
3333 	atomic_long_inc(&skb->dev->rx_dropped);
3334 	kfree_skb(skb);
3335 	return NET_RX_DROP;
3336 }
3337 
3338 static int netif_rx_internal(struct sk_buff *skb)
3339 {
3340 	int ret;
3341 
3342 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3343 
3344 	trace_netif_rx(skb);
3345 #ifdef CONFIG_RPS
3346 	if (static_key_false(&rps_needed)) {
3347 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3348 		int cpu;
3349 
3350 		preempt_disable();
3351 		rcu_read_lock();
3352 
3353 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3354 		if (cpu < 0)
3355 			cpu = smp_processor_id();
3356 
3357 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3358 
3359 		rcu_read_unlock();
3360 		preempt_enable();
3361 	} else
3362 #endif
3363 	{
3364 		unsigned int qtail;
3365 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3366 		put_cpu();
3367 	}
3368 	return ret;
3369 }
3370 
3371 /**
3372  *	netif_rx	-	post buffer to the network code
3373  *	@skb: buffer to post
3374  *
3375  *	This function receives a packet from a device driver and queues it for
3376  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3377  *	may be dropped during processing for congestion control or by the
3378  *	protocol layers.
3379  *
3380  *	return values:
3381  *	NET_RX_SUCCESS	(no congestion)
3382  *	NET_RX_DROP     (packet was dropped)
3383  *
3384  */
3385 
3386 int netif_rx(struct sk_buff *skb)
3387 {
3388 	trace_netif_rx_entry(skb);
3389 
3390 	return netif_rx_internal(skb);
3391 }
3392 EXPORT_SYMBOL(netif_rx);
3393 
3394 int netif_rx_ni(struct sk_buff *skb)
3395 {
3396 	int err;
3397 
3398 	trace_netif_rx_ni_entry(skb);
3399 
3400 	preempt_disable();
3401 	err = netif_rx_internal(skb);
3402 	if (local_softirq_pending())
3403 		do_softirq();
3404 	preempt_enable();
3405 
3406 	return err;
3407 }
3408 EXPORT_SYMBOL(netif_rx_ni);
3409 
3410 static void net_tx_action(struct softirq_action *h)
3411 {
3412 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3413 
3414 	if (sd->completion_queue) {
3415 		struct sk_buff *clist;
3416 
3417 		local_irq_disable();
3418 		clist = sd->completion_queue;
3419 		sd->completion_queue = NULL;
3420 		local_irq_enable();
3421 
3422 		while (clist) {
3423 			struct sk_buff *skb = clist;
3424 			clist = clist->next;
3425 
3426 			WARN_ON(atomic_read(&skb->users));
3427 			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3428 				trace_consume_skb(skb);
3429 			else
3430 				trace_kfree_skb(skb, net_tx_action);
3431 			__kfree_skb(skb);
3432 		}
3433 	}
3434 
3435 	if (sd->output_queue) {
3436 		struct Qdisc *head;
3437 
3438 		local_irq_disable();
3439 		head = sd->output_queue;
3440 		sd->output_queue = NULL;
3441 		sd->output_queue_tailp = &sd->output_queue;
3442 		local_irq_enable();
3443 
3444 		while (head) {
3445 			struct Qdisc *q = head;
3446 			spinlock_t *root_lock;
3447 
3448 			head = head->next_sched;
3449 
3450 			root_lock = qdisc_lock(q);
3451 			if (spin_trylock(root_lock)) {
3452 				smp_mb__before_atomic();
3453 				clear_bit(__QDISC_STATE_SCHED,
3454 					  &q->state);
3455 				qdisc_run(q);
3456 				spin_unlock(root_lock);
3457 			} else {
3458 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3459 					      &q->state)) {
3460 					__netif_reschedule(q);
3461 				} else {
3462 					smp_mb__before_atomic();
3463 					clear_bit(__QDISC_STATE_SCHED,
3464 						  &q->state);
3465 				}
3466 			}
3467 		}
3468 	}
3469 }
3470 
3471 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3472     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3473 /* This hook is defined here for ATM LANE */
3474 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3475 			     unsigned char *addr) __read_mostly;
3476 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3477 #endif
3478 
3479 #ifdef CONFIG_NET_CLS_ACT
3480 /* TODO: Maybe we should just force sch_ingress to be compiled in
3481  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3482  * a compare and 2 stores extra right now if we dont have it on
3483  * but have CONFIG_NET_CLS_ACT
3484  * NOTE: This doesn't stop any functionality; if you dont have
3485  * the ingress scheduler, you just can't add policies on ingress.
3486  *
3487  */
3488 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3489 {
3490 	struct net_device *dev = skb->dev;
3491 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3492 	int result = TC_ACT_OK;
3493 	struct Qdisc *q;
3494 
3495 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3496 		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3497 				     skb->skb_iif, dev->ifindex);
3498 		return TC_ACT_SHOT;
3499 	}
3500 
3501 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3502 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3503 
3504 	q = rcu_dereference(rxq->qdisc);
3505 	if (q != &noop_qdisc) {
3506 		spin_lock(qdisc_lock(q));
3507 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3508 			result = qdisc_enqueue_root(skb, q);
3509 		spin_unlock(qdisc_lock(q));
3510 	}
3511 
3512 	return result;
3513 }
3514 
3515 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3516 					 struct packet_type **pt_prev,
3517 					 int *ret, struct net_device *orig_dev)
3518 {
3519 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3520 
3521 	if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3522 		goto out;
3523 
3524 	if (*pt_prev) {
3525 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3526 		*pt_prev = NULL;
3527 	}
3528 
3529 	switch (ing_filter(skb, rxq)) {
3530 	case TC_ACT_SHOT:
3531 	case TC_ACT_STOLEN:
3532 		kfree_skb(skb);
3533 		return NULL;
3534 	}
3535 
3536 out:
3537 	skb->tc_verd = 0;
3538 	return skb;
3539 }
3540 #endif
3541 
3542 /**
3543  *	netdev_rx_handler_register - register receive handler
3544  *	@dev: device to register a handler for
3545  *	@rx_handler: receive handler to register
3546  *	@rx_handler_data: data pointer that is used by rx handler
3547  *
3548  *	Register a receive handler for a device. This handler will then be
3549  *	called from __netif_receive_skb. A negative errno code is returned
3550  *	on a failure.
3551  *
3552  *	The caller must hold the rtnl_mutex.
3553  *
3554  *	For a general description of rx_handler, see enum rx_handler_result.
3555  */
3556 int netdev_rx_handler_register(struct net_device *dev,
3557 			       rx_handler_func_t *rx_handler,
3558 			       void *rx_handler_data)
3559 {
3560 	ASSERT_RTNL();
3561 
3562 	if (dev->rx_handler)
3563 		return -EBUSY;
3564 
3565 	/* Note: rx_handler_data must be set before rx_handler */
3566 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3567 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3568 
3569 	return 0;
3570 }
3571 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3572 
3573 /**
3574  *	netdev_rx_handler_unregister - unregister receive handler
3575  *	@dev: device to unregister a handler from
3576  *
3577  *	Unregister a receive handler from a device.
3578  *
3579  *	The caller must hold the rtnl_mutex.
3580  */
3581 void netdev_rx_handler_unregister(struct net_device *dev)
3582 {
3583 
3584 	ASSERT_RTNL();
3585 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3586 	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3587 	 * section has a guarantee to see a non NULL rx_handler_data
3588 	 * as well.
3589 	 */
3590 	synchronize_net();
3591 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3592 }
3593 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3594 
3595 /*
3596  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3597  * the special handling of PFMEMALLOC skbs.
3598  */
3599 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3600 {
3601 	switch (skb->protocol) {
3602 	case htons(ETH_P_ARP):
3603 	case htons(ETH_P_IP):
3604 	case htons(ETH_P_IPV6):
3605 	case htons(ETH_P_8021Q):
3606 	case htons(ETH_P_8021AD):
3607 		return true;
3608 	default:
3609 		return false;
3610 	}
3611 }
3612 
3613 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3614 {
3615 	struct packet_type *ptype, *pt_prev;
3616 	rx_handler_func_t *rx_handler;
3617 	struct net_device *orig_dev;
3618 	struct net_device *null_or_dev;
3619 	bool deliver_exact = false;
3620 	int ret = NET_RX_DROP;
3621 	__be16 type;
3622 
3623 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3624 
3625 	trace_netif_receive_skb(skb);
3626 
3627 	orig_dev = skb->dev;
3628 
3629 	skb_reset_network_header(skb);
3630 	if (!skb_transport_header_was_set(skb))
3631 		skb_reset_transport_header(skb);
3632 	skb_reset_mac_len(skb);
3633 
3634 	pt_prev = NULL;
3635 
3636 	rcu_read_lock();
3637 
3638 another_round:
3639 	skb->skb_iif = skb->dev->ifindex;
3640 
3641 	__this_cpu_inc(softnet_data.processed);
3642 
3643 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3644 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3645 		skb = skb_vlan_untag(skb);
3646 		if (unlikely(!skb))
3647 			goto unlock;
3648 	}
3649 
3650 #ifdef CONFIG_NET_CLS_ACT
3651 	if (skb->tc_verd & TC_NCLS) {
3652 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3653 		goto ncls;
3654 	}
3655 #endif
3656 
3657 	if (pfmemalloc)
3658 		goto skip_taps;
3659 
3660 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3661 		if (!ptype->dev || ptype->dev == skb->dev) {
3662 			if (pt_prev)
3663 				ret = deliver_skb(skb, pt_prev, orig_dev);
3664 			pt_prev = ptype;
3665 		}
3666 	}
3667 
3668 skip_taps:
3669 #ifdef CONFIG_NET_CLS_ACT
3670 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3671 	if (!skb)
3672 		goto unlock;
3673 ncls:
3674 #endif
3675 
3676 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3677 		goto drop;
3678 
3679 	if (vlan_tx_tag_present(skb)) {
3680 		if (pt_prev) {
3681 			ret = deliver_skb(skb, pt_prev, orig_dev);
3682 			pt_prev = NULL;
3683 		}
3684 		if (vlan_do_receive(&skb))
3685 			goto another_round;
3686 		else if (unlikely(!skb))
3687 			goto unlock;
3688 	}
3689 
3690 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3691 	if (rx_handler) {
3692 		if (pt_prev) {
3693 			ret = deliver_skb(skb, pt_prev, orig_dev);
3694 			pt_prev = NULL;
3695 		}
3696 		switch (rx_handler(&skb)) {
3697 		case RX_HANDLER_CONSUMED:
3698 			ret = NET_RX_SUCCESS;
3699 			goto unlock;
3700 		case RX_HANDLER_ANOTHER:
3701 			goto another_round;
3702 		case RX_HANDLER_EXACT:
3703 			deliver_exact = true;
3704 		case RX_HANDLER_PASS:
3705 			break;
3706 		default:
3707 			BUG();
3708 		}
3709 	}
3710 
3711 	if (unlikely(vlan_tx_tag_present(skb))) {
3712 		if (vlan_tx_tag_get_id(skb))
3713 			skb->pkt_type = PACKET_OTHERHOST;
3714 		/* Note: we might in the future use prio bits
3715 		 * and set skb->priority like in vlan_do_receive()
3716 		 * For the time being, just ignore Priority Code Point
3717 		 */
3718 		skb->vlan_tci = 0;
3719 	}
3720 
3721 	/* deliver only exact match when indicated */
3722 	null_or_dev = deliver_exact ? skb->dev : NULL;
3723 
3724 	type = skb->protocol;
3725 	list_for_each_entry_rcu(ptype,
3726 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3727 		if (ptype->type == type &&
3728 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3729 		     ptype->dev == orig_dev)) {
3730 			if (pt_prev)
3731 				ret = deliver_skb(skb, pt_prev, orig_dev);
3732 			pt_prev = ptype;
3733 		}
3734 	}
3735 
3736 	if (pt_prev) {
3737 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3738 			goto drop;
3739 		else
3740 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3741 	} else {
3742 drop:
3743 		atomic_long_inc(&skb->dev->rx_dropped);
3744 		kfree_skb(skb);
3745 		/* Jamal, now you will not able to escape explaining
3746 		 * me how you were going to use this. :-)
3747 		 */
3748 		ret = NET_RX_DROP;
3749 	}
3750 
3751 unlock:
3752 	rcu_read_unlock();
3753 	return ret;
3754 }
3755 
3756 static int __netif_receive_skb(struct sk_buff *skb)
3757 {
3758 	int ret;
3759 
3760 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3761 		unsigned long pflags = current->flags;
3762 
3763 		/*
3764 		 * PFMEMALLOC skbs are special, they should
3765 		 * - be delivered to SOCK_MEMALLOC sockets only
3766 		 * - stay away from userspace
3767 		 * - have bounded memory usage
3768 		 *
3769 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3770 		 * context down to all allocation sites.
3771 		 */
3772 		current->flags |= PF_MEMALLOC;
3773 		ret = __netif_receive_skb_core(skb, true);
3774 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3775 	} else
3776 		ret = __netif_receive_skb_core(skb, false);
3777 
3778 	return ret;
3779 }
3780 
3781 static int netif_receive_skb_internal(struct sk_buff *skb)
3782 {
3783 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3784 
3785 	if (skb_defer_rx_timestamp(skb))
3786 		return NET_RX_SUCCESS;
3787 
3788 #ifdef CONFIG_RPS
3789 	if (static_key_false(&rps_needed)) {
3790 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3791 		int cpu, ret;
3792 
3793 		rcu_read_lock();
3794 
3795 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3796 
3797 		if (cpu >= 0) {
3798 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3799 			rcu_read_unlock();
3800 			return ret;
3801 		}
3802 		rcu_read_unlock();
3803 	}
3804 #endif
3805 	return __netif_receive_skb(skb);
3806 }
3807 
3808 /**
3809  *	netif_receive_skb - process receive buffer from network
3810  *	@skb: buffer to process
3811  *
3812  *	netif_receive_skb() is the main receive data processing function.
3813  *	It always succeeds. The buffer may be dropped during processing
3814  *	for congestion control or by the protocol layers.
3815  *
3816  *	This function may only be called from softirq context and interrupts
3817  *	should be enabled.
3818  *
3819  *	Return values (usually ignored):
3820  *	NET_RX_SUCCESS: no congestion
3821  *	NET_RX_DROP: packet was dropped
3822  */
3823 int netif_receive_skb(struct sk_buff *skb)
3824 {
3825 	trace_netif_receive_skb_entry(skb);
3826 
3827 	return netif_receive_skb_internal(skb);
3828 }
3829 EXPORT_SYMBOL(netif_receive_skb);
3830 
3831 /* Network device is going away, flush any packets still pending
3832  * Called with irqs disabled.
3833  */
3834 static void flush_backlog(void *arg)
3835 {
3836 	struct net_device *dev = arg;
3837 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3838 	struct sk_buff *skb, *tmp;
3839 
3840 	rps_lock(sd);
3841 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3842 		if (skb->dev == dev) {
3843 			__skb_unlink(skb, &sd->input_pkt_queue);
3844 			kfree_skb(skb);
3845 			input_queue_head_incr(sd);
3846 		}
3847 	}
3848 	rps_unlock(sd);
3849 
3850 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3851 		if (skb->dev == dev) {
3852 			__skb_unlink(skb, &sd->process_queue);
3853 			kfree_skb(skb);
3854 			input_queue_head_incr(sd);
3855 		}
3856 	}
3857 }
3858 
3859 static int napi_gro_complete(struct sk_buff *skb)
3860 {
3861 	struct packet_offload *ptype;
3862 	__be16 type = skb->protocol;
3863 	struct list_head *head = &offload_base;
3864 	int err = -ENOENT;
3865 
3866 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3867 
3868 	if (NAPI_GRO_CB(skb)->count == 1) {
3869 		skb_shinfo(skb)->gso_size = 0;
3870 		goto out;
3871 	}
3872 
3873 	rcu_read_lock();
3874 	list_for_each_entry_rcu(ptype, head, list) {
3875 		if (ptype->type != type || !ptype->callbacks.gro_complete)
3876 			continue;
3877 
3878 		err = ptype->callbacks.gro_complete(skb, 0);
3879 		break;
3880 	}
3881 	rcu_read_unlock();
3882 
3883 	if (err) {
3884 		WARN_ON(&ptype->list == head);
3885 		kfree_skb(skb);
3886 		return NET_RX_SUCCESS;
3887 	}
3888 
3889 out:
3890 	return netif_receive_skb_internal(skb);
3891 }
3892 
3893 /* napi->gro_list contains packets ordered by age.
3894  * youngest packets at the head of it.
3895  * Complete skbs in reverse order to reduce latencies.
3896  */
3897 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3898 {
3899 	struct sk_buff *skb, *prev = NULL;
3900 
3901 	/* scan list and build reverse chain */
3902 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3903 		skb->prev = prev;
3904 		prev = skb;
3905 	}
3906 
3907 	for (skb = prev; skb; skb = prev) {
3908 		skb->next = NULL;
3909 
3910 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3911 			return;
3912 
3913 		prev = skb->prev;
3914 		napi_gro_complete(skb);
3915 		napi->gro_count--;
3916 	}
3917 
3918 	napi->gro_list = NULL;
3919 }
3920 EXPORT_SYMBOL(napi_gro_flush);
3921 
3922 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3923 {
3924 	struct sk_buff *p;
3925 	unsigned int maclen = skb->dev->hard_header_len;
3926 	u32 hash = skb_get_hash_raw(skb);
3927 
3928 	for (p = napi->gro_list; p; p = p->next) {
3929 		unsigned long diffs;
3930 
3931 		NAPI_GRO_CB(p)->flush = 0;
3932 
3933 		if (hash != skb_get_hash_raw(p)) {
3934 			NAPI_GRO_CB(p)->same_flow = 0;
3935 			continue;
3936 		}
3937 
3938 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3939 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3940 		if (maclen == ETH_HLEN)
3941 			diffs |= compare_ether_header(skb_mac_header(p),
3942 						      skb_mac_header(skb));
3943 		else if (!diffs)
3944 			diffs = memcmp(skb_mac_header(p),
3945 				       skb_mac_header(skb),
3946 				       maclen);
3947 		NAPI_GRO_CB(p)->same_flow = !diffs;
3948 	}
3949 }
3950 
3951 static void skb_gro_reset_offset(struct sk_buff *skb)
3952 {
3953 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3954 	const skb_frag_t *frag0 = &pinfo->frags[0];
3955 
3956 	NAPI_GRO_CB(skb)->data_offset = 0;
3957 	NAPI_GRO_CB(skb)->frag0 = NULL;
3958 	NAPI_GRO_CB(skb)->frag0_len = 0;
3959 
3960 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3961 	    pinfo->nr_frags &&
3962 	    !PageHighMem(skb_frag_page(frag0))) {
3963 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3964 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3965 	}
3966 }
3967 
3968 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3969 {
3970 	struct skb_shared_info *pinfo = skb_shinfo(skb);
3971 
3972 	BUG_ON(skb->end - skb->tail < grow);
3973 
3974 	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3975 
3976 	skb->data_len -= grow;
3977 	skb->tail += grow;
3978 
3979 	pinfo->frags[0].page_offset += grow;
3980 	skb_frag_size_sub(&pinfo->frags[0], grow);
3981 
3982 	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3983 		skb_frag_unref(skb, 0);
3984 		memmove(pinfo->frags, pinfo->frags + 1,
3985 			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
3986 	}
3987 }
3988 
3989 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3990 {
3991 	struct sk_buff **pp = NULL;
3992 	struct packet_offload *ptype;
3993 	__be16 type = skb->protocol;
3994 	struct list_head *head = &offload_base;
3995 	int same_flow;
3996 	enum gro_result ret;
3997 	int grow;
3998 
3999 	if (!(skb->dev->features & NETIF_F_GRO))
4000 		goto normal;
4001 
4002 	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4003 		goto normal;
4004 
4005 	gro_list_prepare(napi, skb);
4006 
4007 	rcu_read_lock();
4008 	list_for_each_entry_rcu(ptype, head, list) {
4009 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4010 			continue;
4011 
4012 		skb_set_network_header(skb, skb_gro_offset(skb));
4013 		skb_reset_mac_len(skb);
4014 		NAPI_GRO_CB(skb)->same_flow = 0;
4015 		NAPI_GRO_CB(skb)->flush = 0;
4016 		NAPI_GRO_CB(skb)->free = 0;
4017 		NAPI_GRO_CB(skb)->udp_mark = 0;
4018 
4019 		/* Setup for GRO checksum validation */
4020 		switch (skb->ip_summed) {
4021 		case CHECKSUM_COMPLETE:
4022 			NAPI_GRO_CB(skb)->csum = skb->csum;
4023 			NAPI_GRO_CB(skb)->csum_valid = 1;
4024 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4025 			break;
4026 		case CHECKSUM_UNNECESSARY:
4027 			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4028 			NAPI_GRO_CB(skb)->csum_valid = 0;
4029 			break;
4030 		default:
4031 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4032 			NAPI_GRO_CB(skb)->csum_valid = 0;
4033 		}
4034 
4035 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4036 		break;
4037 	}
4038 	rcu_read_unlock();
4039 
4040 	if (&ptype->list == head)
4041 		goto normal;
4042 
4043 	same_flow = NAPI_GRO_CB(skb)->same_flow;
4044 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4045 
4046 	if (pp) {
4047 		struct sk_buff *nskb = *pp;
4048 
4049 		*pp = nskb->next;
4050 		nskb->next = NULL;
4051 		napi_gro_complete(nskb);
4052 		napi->gro_count--;
4053 	}
4054 
4055 	if (same_flow)
4056 		goto ok;
4057 
4058 	if (NAPI_GRO_CB(skb)->flush)
4059 		goto normal;
4060 
4061 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4062 		struct sk_buff *nskb = napi->gro_list;
4063 
4064 		/* locate the end of the list to select the 'oldest' flow */
4065 		while (nskb->next) {
4066 			pp = &nskb->next;
4067 			nskb = *pp;
4068 		}
4069 		*pp = NULL;
4070 		nskb->next = NULL;
4071 		napi_gro_complete(nskb);
4072 	} else {
4073 		napi->gro_count++;
4074 	}
4075 	NAPI_GRO_CB(skb)->count = 1;
4076 	NAPI_GRO_CB(skb)->age = jiffies;
4077 	NAPI_GRO_CB(skb)->last = skb;
4078 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4079 	skb->next = napi->gro_list;
4080 	napi->gro_list = skb;
4081 	ret = GRO_HELD;
4082 
4083 pull:
4084 	grow = skb_gro_offset(skb) - skb_headlen(skb);
4085 	if (grow > 0)
4086 		gro_pull_from_frag0(skb, grow);
4087 ok:
4088 	return ret;
4089 
4090 normal:
4091 	ret = GRO_NORMAL;
4092 	goto pull;
4093 }
4094 
4095 struct packet_offload *gro_find_receive_by_type(__be16 type)
4096 {
4097 	struct list_head *offload_head = &offload_base;
4098 	struct packet_offload *ptype;
4099 
4100 	list_for_each_entry_rcu(ptype, offload_head, list) {
4101 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4102 			continue;
4103 		return ptype;
4104 	}
4105 	return NULL;
4106 }
4107 EXPORT_SYMBOL(gro_find_receive_by_type);
4108 
4109 struct packet_offload *gro_find_complete_by_type(__be16 type)
4110 {
4111 	struct list_head *offload_head = &offload_base;
4112 	struct packet_offload *ptype;
4113 
4114 	list_for_each_entry_rcu(ptype, offload_head, list) {
4115 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4116 			continue;
4117 		return ptype;
4118 	}
4119 	return NULL;
4120 }
4121 EXPORT_SYMBOL(gro_find_complete_by_type);
4122 
4123 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4124 {
4125 	switch (ret) {
4126 	case GRO_NORMAL:
4127 		if (netif_receive_skb_internal(skb))
4128 			ret = GRO_DROP;
4129 		break;
4130 
4131 	case GRO_DROP:
4132 		kfree_skb(skb);
4133 		break;
4134 
4135 	case GRO_MERGED_FREE:
4136 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4137 			kmem_cache_free(skbuff_head_cache, skb);
4138 		else
4139 			__kfree_skb(skb);
4140 		break;
4141 
4142 	case GRO_HELD:
4143 	case GRO_MERGED:
4144 		break;
4145 	}
4146 
4147 	return ret;
4148 }
4149 
4150 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4151 {
4152 	trace_napi_gro_receive_entry(skb);
4153 
4154 	skb_gro_reset_offset(skb);
4155 
4156 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4157 }
4158 EXPORT_SYMBOL(napi_gro_receive);
4159 
4160 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4161 {
4162 	if (unlikely(skb->pfmemalloc)) {
4163 		consume_skb(skb);
4164 		return;
4165 	}
4166 	__skb_pull(skb, skb_headlen(skb));
4167 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4168 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4169 	skb->vlan_tci = 0;
4170 	skb->dev = napi->dev;
4171 	skb->skb_iif = 0;
4172 	skb->encapsulation = 0;
4173 	skb_shinfo(skb)->gso_type = 0;
4174 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4175 
4176 	napi->skb = skb;
4177 }
4178 
4179 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4180 {
4181 	struct sk_buff *skb = napi->skb;
4182 
4183 	if (!skb) {
4184 		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4185 		napi->skb = skb;
4186 	}
4187 	return skb;
4188 }
4189 EXPORT_SYMBOL(napi_get_frags);
4190 
4191 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4192 				      struct sk_buff *skb,
4193 				      gro_result_t ret)
4194 {
4195 	switch (ret) {
4196 	case GRO_NORMAL:
4197 	case GRO_HELD:
4198 		__skb_push(skb, ETH_HLEN);
4199 		skb->protocol = eth_type_trans(skb, skb->dev);
4200 		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4201 			ret = GRO_DROP;
4202 		break;
4203 
4204 	case GRO_DROP:
4205 	case GRO_MERGED_FREE:
4206 		napi_reuse_skb(napi, skb);
4207 		break;
4208 
4209 	case GRO_MERGED:
4210 		break;
4211 	}
4212 
4213 	return ret;
4214 }
4215 
4216 /* Upper GRO stack assumes network header starts at gro_offset=0
4217  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4218  * We copy ethernet header into skb->data to have a common layout.
4219  */
4220 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4221 {
4222 	struct sk_buff *skb = napi->skb;
4223 	const struct ethhdr *eth;
4224 	unsigned int hlen = sizeof(*eth);
4225 
4226 	napi->skb = NULL;
4227 
4228 	skb_reset_mac_header(skb);
4229 	skb_gro_reset_offset(skb);
4230 
4231 	eth = skb_gro_header_fast(skb, 0);
4232 	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4233 		eth = skb_gro_header_slow(skb, hlen, 0);
4234 		if (unlikely(!eth)) {
4235 			napi_reuse_skb(napi, skb);
4236 			return NULL;
4237 		}
4238 	} else {
4239 		gro_pull_from_frag0(skb, hlen);
4240 		NAPI_GRO_CB(skb)->frag0 += hlen;
4241 		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4242 	}
4243 	__skb_pull(skb, hlen);
4244 
4245 	/*
4246 	 * This works because the only protocols we care about don't require
4247 	 * special handling.
4248 	 * We'll fix it up properly in napi_frags_finish()
4249 	 */
4250 	skb->protocol = eth->h_proto;
4251 
4252 	return skb;
4253 }
4254 
4255 gro_result_t napi_gro_frags(struct napi_struct *napi)
4256 {
4257 	struct sk_buff *skb = napi_frags_skb(napi);
4258 
4259 	if (!skb)
4260 		return GRO_DROP;
4261 
4262 	trace_napi_gro_frags_entry(skb);
4263 
4264 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4265 }
4266 EXPORT_SYMBOL(napi_gro_frags);
4267 
4268 /* Compute the checksum from gro_offset and return the folded value
4269  * after adding in any pseudo checksum.
4270  */
4271 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4272 {
4273 	__wsum wsum;
4274 	__sum16 sum;
4275 
4276 	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4277 
4278 	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4279 	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4280 	if (likely(!sum)) {
4281 		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4282 		    !skb->csum_complete_sw)
4283 			netdev_rx_csum_fault(skb->dev);
4284 	}
4285 
4286 	NAPI_GRO_CB(skb)->csum = wsum;
4287 	NAPI_GRO_CB(skb)->csum_valid = 1;
4288 
4289 	return sum;
4290 }
4291 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4292 
4293 /*
4294  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4295  * Note: called with local irq disabled, but exits with local irq enabled.
4296  */
4297 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4298 {
4299 #ifdef CONFIG_RPS
4300 	struct softnet_data *remsd = sd->rps_ipi_list;
4301 
4302 	if (remsd) {
4303 		sd->rps_ipi_list = NULL;
4304 
4305 		local_irq_enable();
4306 
4307 		/* Send pending IPI's to kick RPS processing on remote cpus. */
4308 		while (remsd) {
4309 			struct softnet_data *next = remsd->rps_ipi_next;
4310 
4311 			if (cpu_online(remsd->cpu))
4312 				smp_call_function_single_async(remsd->cpu,
4313 							   &remsd->csd);
4314 			remsd = next;
4315 		}
4316 	} else
4317 #endif
4318 		local_irq_enable();
4319 }
4320 
4321 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4322 {
4323 #ifdef CONFIG_RPS
4324 	return sd->rps_ipi_list != NULL;
4325 #else
4326 	return false;
4327 #endif
4328 }
4329 
4330 static int process_backlog(struct napi_struct *napi, int quota)
4331 {
4332 	int work = 0;
4333 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4334 
4335 	/* Check if we have pending ipi, its better to send them now,
4336 	 * not waiting net_rx_action() end.
4337 	 */
4338 	if (sd_has_rps_ipi_waiting(sd)) {
4339 		local_irq_disable();
4340 		net_rps_action_and_irq_enable(sd);
4341 	}
4342 
4343 	napi->weight = weight_p;
4344 	local_irq_disable();
4345 	while (1) {
4346 		struct sk_buff *skb;
4347 
4348 		while ((skb = __skb_dequeue(&sd->process_queue))) {
4349 			local_irq_enable();
4350 			__netif_receive_skb(skb);
4351 			local_irq_disable();
4352 			input_queue_head_incr(sd);
4353 			if (++work >= quota) {
4354 				local_irq_enable();
4355 				return work;
4356 			}
4357 		}
4358 
4359 		rps_lock(sd);
4360 		if (skb_queue_empty(&sd->input_pkt_queue)) {
4361 			/*
4362 			 * Inline a custom version of __napi_complete().
4363 			 * only current cpu owns and manipulates this napi,
4364 			 * and NAPI_STATE_SCHED is the only possible flag set
4365 			 * on backlog.
4366 			 * We can use a plain write instead of clear_bit(),
4367 			 * and we dont need an smp_mb() memory barrier.
4368 			 */
4369 			napi->state = 0;
4370 			rps_unlock(sd);
4371 
4372 			break;
4373 		}
4374 
4375 		skb_queue_splice_tail_init(&sd->input_pkt_queue,
4376 					   &sd->process_queue);
4377 		rps_unlock(sd);
4378 	}
4379 	local_irq_enable();
4380 
4381 	return work;
4382 }
4383 
4384 /**
4385  * __napi_schedule - schedule for receive
4386  * @n: entry to schedule
4387  *
4388  * The entry's receive function will be scheduled to run.
4389  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4390  */
4391 void __napi_schedule(struct napi_struct *n)
4392 {
4393 	unsigned long flags;
4394 
4395 	local_irq_save(flags);
4396 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4397 	local_irq_restore(flags);
4398 }
4399 EXPORT_SYMBOL(__napi_schedule);
4400 
4401 /**
4402  * __napi_schedule_irqoff - schedule for receive
4403  * @n: entry to schedule
4404  *
4405  * Variant of __napi_schedule() assuming hard irqs are masked
4406  */
4407 void __napi_schedule_irqoff(struct napi_struct *n)
4408 {
4409 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4410 }
4411 EXPORT_SYMBOL(__napi_schedule_irqoff);
4412 
4413 void __napi_complete(struct napi_struct *n)
4414 {
4415 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4416 
4417 	list_del_init(&n->poll_list);
4418 	smp_mb__before_atomic();
4419 	clear_bit(NAPI_STATE_SCHED, &n->state);
4420 }
4421 EXPORT_SYMBOL(__napi_complete);
4422 
4423 void napi_complete_done(struct napi_struct *n, int work_done)
4424 {
4425 	unsigned long flags;
4426 
4427 	/*
4428 	 * don't let napi dequeue from the cpu poll list
4429 	 * just in case its running on a different cpu
4430 	 */
4431 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4432 		return;
4433 
4434 	if (n->gro_list) {
4435 		unsigned long timeout = 0;
4436 
4437 		if (work_done)
4438 			timeout = n->dev->gro_flush_timeout;
4439 
4440 		if (timeout)
4441 			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4442 				      HRTIMER_MODE_REL_PINNED);
4443 		else
4444 			napi_gro_flush(n, false);
4445 	}
4446 	if (likely(list_empty(&n->poll_list))) {
4447 		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4448 	} else {
4449 		/* If n->poll_list is not empty, we need to mask irqs */
4450 		local_irq_save(flags);
4451 		__napi_complete(n);
4452 		local_irq_restore(flags);
4453 	}
4454 }
4455 EXPORT_SYMBOL(napi_complete_done);
4456 
4457 /* must be called under rcu_read_lock(), as we dont take a reference */
4458 struct napi_struct *napi_by_id(unsigned int napi_id)
4459 {
4460 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4461 	struct napi_struct *napi;
4462 
4463 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4464 		if (napi->napi_id == napi_id)
4465 			return napi;
4466 
4467 	return NULL;
4468 }
4469 EXPORT_SYMBOL_GPL(napi_by_id);
4470 
4471 void napi_hash_add(struct napi_struct *napi)
4472 {
4473 	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4474 
4475 		spin_lock(&napi_hash_lock);
4476 
4477 		/* 0 is not a valid id, we also skip an id that is taken
4478 		 * we expect both events to be extremely rare
4479 		 */
4480 		napi->napi_id = 0;
4481 		while (!napi->napi_id) {
4482 			napi->napi_id = ++napi_gen_id;
4483 			if (napi_by_id(napi->napi_id))
4484 				napi->napi_id = 0;
4485 		}
4486 
4487 		hlist_add_head_rcu(&napi->napi_hash_node,
4488 			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4489 
4490 		spin_unlock(&napi_hash_lock);
4491 	}
4492 }
4493 EXPORT_SYMBOL_GPL(napi_hash_add);
4494 
4495 /* Warning : caller is responsible to make sure rcu grace period
4496  * is respected before freeing memory containing @napi
4497  */
4498 void napi_hash_del(struct napi_struct *napi)
4499 {
4500 	spin_lock(&napi_hash_lock);
4501 
4502 	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4503 		hlist_del_rcu(&napi->napi_hash_node);
4504 
4505 	spin_unlock(&napi_hash_lock);
4506 }
4507 EXPORT_SYMBOL_GPL(napi_hash_del);
4508 
4509 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4510 {
4511 	struct napi_struct *napi;
4512 
4513 	napi = container_of(timer, struct napi_struct, timer);
4514 	if (napi->gro_list)
4515 		napi_schedule(napi);
4516 
4517 	return HRTIMER_NORESTART;
4518 }
4519 
4520 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4521 		    int (*poll)(struct napi_struct *, int), int weight)
4522 {
4523 	INIT_LIST_HEAD(&napi->poll_list);
4524 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4525 	napi->timer.function = napi_watchdog;
4526 	napi->gro_count = 0;
4527 	napi->gro_list = NULL;
4528 	napi->skb = NULL;
4529 	napi->poll = poll;
4530 	if (weight > NAPI_POLL_WEIGHT)
4531 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4532 			    weight, dev->name);
4533 	napi->weight = weight;
4534 	list_add(&napi->dev_list, &dev->napi_list);
4535 	napi->dev = dev;
4536 #ifdef CONFIG_NETPOLL
4537 	spin_lock_init(&napi->poll_lock);
4538 	napi->poll_owner = -1;
4539 #endif
4540 	set_bit(NAPI_STATE_SCHED, &napi->state);
4541 }
4542 EXPORT_SYMBOL(netif_napi_add);
4543 
4544 void napi_disable(struct napi_struct *n)
4545 {
4546 	might_sleep();
4547 	set_bit(NAPI_STATE_DISABLE, &n->state);
4548 
4549 	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4550 		msleep(1);
4551 
4552 	hrtimer_cancel(&n->timer);
4553 
4554 	clear_bit(NAPI_STATE_DISABLE, &n->state);
4555 }
4556 EXPORT_SYMBOL(napi_disable);
4557 
4558 void netif_napi_del(struct napi_struct *napi)
4559 {
4560 	list_del_init(&napi->dev_list);
4561 	napi_free_frags(napi);
4562 
4563 	kfree_skb_list(napi->gro_list);
4564 	napi->gro_list = NULL;
4565 	napi->gro_count = 0;
4566 }
4567 EXPORT_SYMBOL(netif_napi_del);
4568 
4569 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4570 {
4571 	void *have;
4572 	int work, weight;
4573 
4574 	list_del_init(&n->poll_list);
4575 
4576 	have = netpoll_poll_lock(n);
4577 
4578 	weight = n->weight;
4579 
4580 	/* This NAPI_STATE_SCHED test is for avoiding a race
4581 	 * with netpoll's poll_napi().  Only the entity which
4582 	 * obtains the lock and sees NAPI_STATE_SCHED set will
4583 	 * actually make the ->poll() call.  Therefore we avoid
4584 	 * accidentally calling ->poll() when NAPI is not scheduled.
4585 	 */
4586 	work = 0;
4587 	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4588 		work = n->poll(n, weight);
4589 		trace_napi_poll(n);
4590 	}
4591 
4592 	WARN_ON_ONCE(work > weight);
4593 
4594 	if (likely(work < weight))
4595 		goto out_unlock;
4596 
4597 	/* Drivers must not modify the NAPI state if they
4598 	 * consume the entire weight.  In such cases this code
4599 	 * still "owns" the NAPI instance and therefore can
4600 	 * move the instance around on the list at-will.
4601 	 */
4602 	if (unlikely(napi_disable_pending(n))) {
4603 		napi_complete(n);
4604 		goto out_unlock;
4605 	}
4606 
4607 	if (n->gro_list) {
4608 		/* flush too old packets
4609 		 * If HZ < 1000, flush all packets.
4610 		 */
4611 		napi_gro_flush(n, HZ >= 1000);
4612 	}
4613 
4614 	/* Some drivers may have called napi_schedule
4615 	 * prior to exhausting their budget.
4616 	 */
4617 	if (unlikely(!list_empty(&n->poll_list))) {
4618 		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4619 			     n->dev ? n->dev->name : "backlog");
4620 		goto out_unlock;
4621 	}
4622 
4623 	list_add_tail(&n->poll_list, repoll);
4624 
4625 out_unlock:
4626 	netpoll_poll_unlock(have);
4627 
4628 	return work;
4629 }
4630 
4631 static void net_rx_action(struct softirq_action *h)
4632 {
4633 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4634 	unsigned long time_limit = jiffies + 2;
4635 	int budget = netdev_budget;
4636 	LIST_HEAD(list);
4637 	LIST_HEAD(repoll);
4638 
4639 	local_irq_disable();
4640 	list_splice_init(&sd->poll_list, &list);
4641 	local_irq_enable();
4642 
4643 	for (;;) {
4644 		struct napi_struct *n;
4645 
4646 		if (list_empty(&list)) {
4647 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4648 				return;
4649 			break;
4650 		}
4651 
4652 		n = list_first_entry(&list, struct napi_struct, poll_list);
4653 		budget -= napi_poll(n, &repoll);
4654 
4655 		/* If softirq window is exhausted then punt.
4656 		 * Allow this to run for 2 jiffies since which will allow
4657 		 * an average latency of 1.5/HZ.
4658 		 */
4659 		if (unlikely(budget <= 0 ||
4660 			     time_after_eq(jiffies, time_limit))) {
4661 			sd->time_squeeze++;
4662 			break;
4663 		}
4664 	}
4665 
4666 	local_irq_disable();
4667 
4668 	list_splice_tail_init(&sd->poll_list, &list);
4669 	list_splice_tail(&repoll, &list);
4670 	list_splice(&list, &sd->poll_list);
4671 	if (!list_empty(&sd->poll_list))
4672 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4673 
4674 	net_rps_action_and_irq_enable(sd);
4675 }
4676 
4677 struct netdev_adjacent {
4678 	struct net_device *dev;
4679 
4680 	/* upper master flag, there can only be one master device per list */
4681 	bool master;
4682 
4683 	/* counter for the number of times this device was added to us */
4684 	u16 ref_nr;
4685 
4686 	/* private field for the users */
4687 	void *private;
4688 
4689 	struct list_head list;
4690 	struct rcu_head rcu;
4691 };
4692 
4693 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4694 						 struct net_device *adj_dev,
4695 						 struct list_head *adj_list)
4696 {
4697 	struct netdev_adjacent *adj;
4698 
4699 	list_for_each_entry(adj, adj_list, list) {
4700 		if (adj->dev == adj_dev)
4701 			return adj;
4702 	}
4703 	return NULL;
4704 }
4705 
4706 /**
4707  * netdev_has_upper_dev - Check if device is linked to an upper device
4708  * @dev: device
4709  * @upper_dev: upper device to check
4710  *
4711  * Find out if a device is linked to specified upper device and return true
4712  * in case it is. Note that this checks only immediate upper device,
4713  * not through a complete stack of devices. The caller must hold the RTNL lock.
4714  */
4715 bool netdev_has_upper_dev(struct net_device *dev,
4716 			  struct net_device *upper_dev)
4717 {
4718 	ASSERT_RTNL();
4719 
4720 	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4721 }
4722 EXPORT_SYMBOL(netdev_has_upper_dev);
4723 
4724 /**
4725  * netdev_has_any_upper_dev - Check if device is linked to some device
4726  * @dev: device
4727  *
4728  * Find out if a device is linked to an upper device and return true in case
4729  * it is. The caller must hold the RTNL lock.
4730  */
4731 static bool netdev_has_any_upper_dev(struct net_device *dev)
4732 {
4733 	ASSERT_RTNL();
4734 
4735 	return !list_empty(&dev->all_adj_list.upper);
4736 }
4737 
4738 /**
4739  * netdev_master_upper_dev_get - Get master upper device
4740  * @dev: device
4741  *
4742  * Find a master upper device and return pointer to it or NULL in case
4743  * it's not there. The caller must hold the RTNL lock.
4744  */
4745 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4746 {
4747 	struct netdev_adjacent *upper;
4748 
4749 	ASSERT_RTNL();
4750 
4751 	if (list_empty(&dev->adj_list.upper))
4752 		return NULL;
4753 
4754 	upper = list_first_entry(&dev->adj_list.upper,
4755 				 struct netdev_adjacent, list);
4756 	if (likely(upper->master))
4757 		return upper->dev;
4758 	return NULL;
4759 }
4760 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4761 
4762 void *netdev_adjacent_get_private(struct list_head *adj_list)
4763 {
4764 	struct netdev_adjacent *adj;
4765 
4766 	adj = list_entry(adj_list, struct netdev_adjacent, list);
4767 
4768 	return adj->private;
4769 }
4770 EXPORT_SYMBOL(netdev_adjacent_get_private);
4771 
4772 /**
4773  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4774  * @dev: device
4775  * @iter: list_head ** of the current position
4776  *
4777  * Gets the next device from the dev's upper list, starting from iter
4778  * position. The caller must hold RCU read lock.
4779  */
4780 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4781 						 struct list_head **iter)
4782 {
4783 	struct netdev_adjacent *upper;
4784 
4785 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4786 
4787 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4788 
4789 	if (&upper->list == &dev->adj_list.upper)
4790 		return NULL;
4791 
4792 	*iter = &upper->list;
4793 
4794 	return upper->dev;
4795 }
4796 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4797 
4798 /**
4799  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4800  * @dev: device
4801  * @iter: list_head ** of the current position
4802  *
4803  * Gets the next device from the dev's upper list, starting from iter
4804  * position. The caller must hold RCU read lock.
4805  */
4806 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4807 						     struct list_head **iter)
4808 {
4809 	struct netdev_adjacent *upper;
4810 
4811 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4812 
4813 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4814 
4815 	if (&upper->list == &dev->all_adj_list.upper)
4816 		return NULL;
4817 
4818 	*iter = &upper->list;
4819 
4820 	return upper->dev;
4821 }
4822 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4823 
4824 /**
4825  * netdev_lower_get_next_private - Get the next ->private from the
4826  *				   lower neighbour list
4827  * @dev: device
4828  * @iter: list_head ** of the current position
4829  *
4830  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4831  * list, starting from iter position. The caller must hold either hold the
4832  * RTNL lock or its own locking that guarantees that the neighbour lower
4833  * list will remain unchainged.
4834  */
4835 void *netdev_lower_get_next_private(struct net_device *dev,
4836 				    struct list_head **iter)
4837 {
4838 	struct netdev_adjacent *lower;
4839 
4840 	lower = list_entry(*iter, struct netdev_adjacent, list);
4841 
4842 	if (&lower->list == &dev->adj_list.lower)
4843 		return NULL;
4844 
4845 	*iter = lower->list.next;
4846 
4847 	return lower->private;
4848 }
4849 EXPORT_SYMBOL(netdev_lower_get_next_private);
4850 
4851 /**
4852  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4853  *				       lower neighbour list, RCU
4854  *				       variant
4855  * @dev: device
4856  * @iter: list_head ** of the current position
4857  *
4858  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4859  * list, starting from iter position. The caller must hold RCU read lock.
4860  */
4861 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4862 					struct list_head **iter)
4863 {
4864 	struct netdev_adjacent *lower;
4865 
4866 	WARN_ON_ONCE(!rcu_read_lock_held());
4867 
4868 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4869 
4870 	if (&lower->list == &dev->adj_list.lower)
4871 		return NULL;
4872 
4873 	*iter = &lower->list;
4874 
4875 	return lower->private;
4876 }
4877 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4878 
4879 /**
4880  * netdev_lower_get_next - Get the next device from the lower neighbour
4881  *                         list
4882  * @dev: device
4883  * @iter: list_head ** of the current position
4884  *
4885  * Gets the next netdev_adjacent from the dev's lower neighbour
4886  * list, starting from iter position. The caller must hold RTNL lock or
4887  * its own locking that guarantees that the neighbour lower
4888  * list will remain unchainged.
4889  */
4890 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4891 {
4892 	struct netdev_adjacent *lower;
4893 
4894 	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4895 
4896 	if (&lower->list == &dev->adj_list.lower)
4897 		return NULL;
4898 
4899 	*iter = &lower->list;
4900 
4901 	return lower->dev;
4902 }
4903 EXPORT_SYMBOL(netdev_lower_get_next);
4904 
4905 /**
4906  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4907  *				       lower neighbour list, RCU
4908  *				       variant
4909  * @dev: device
4910  *
4911  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4912  * list. The caller must hold RCU read lock.
4913  */
4914 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4915 {
4916 	struct netdev_adjacent *lower;
4917 
4918 	lower = list_first_or_null_rcu(&dev->adj_list.lower,
4919 			struct netdev_adjacent, list);
4920 	if (lower)
4921 		return lower->private;
4922 	return NULL;
4923 }
4924 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4925 
4926 /**
4927  * netdev_master_upper_dev_get_rcu - Get master upper device
4928  * @dev: device
4929  *
4930  * Find a master upper device and return pointer to it or NULL in case
4931  * it's not there. The caller must hold the RCU read lock.
4932  */
4933 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4934 {
4935 	struct netdev_adjacent *upper;
4936 
4937 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
4938 				       struct netdev_adjacent, list);
4939 	if (upper && likely(upper->master))
4940 		return upper->dev;
4941 	return NULL;
4942 }
4943 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4944 
4945 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4946 			      struct net_device *adj_dev,
4947 			      struct list_head *dev_list)
4948 {
4949 	char linkname[IFNAMSIZ+7];
4950 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4951 		"upper_%s" : "lower_%s", adj_dev->name);
4952 	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4953 				 linkname);
4954 }
4955 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4956 			       char *name,
4957 			       struct list_head *dev_list)
4958 {
4959 	char linkname[IFNAMSIZ+7];
4960 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4961 		"upper_%s" : "lower_%s", name);
4962 	sysfs_remove_link(&(dev->dev.kobj), linkname);
4963 }
4964 
4965 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4966 						 struct net_device *adj_dev,
4967 						 struct list_head *dev_list)
4968 {
4969 	return (dev_list == &dev->adj_list.upper ||
4970 		dev_list == &dev->adj_list.lower) &&
4971 		net_eq(dev_net(dev), dev_net(adj_dev));
4972 }
4973 
4974 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4975 					struct net_device *adj_dev,
4976 					struct list_head *dev_list,
4977 					void *private, bool master)
4978 {
4979 	struct netdev_adjacent *adj;
4980 	int ret;
4981 
4982 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4983 
4984 	if (adj) {
4985 		adj->ref_nr++;
4986 		return 0;
4987 	}
4988 
4989 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4990 	if (!adj)
4991 		return -ENOMEM;
4992 
4993 	adj->dev = adj_dev;
4994 	adj->master = master;
4995 	adj->ref_nr = 1;
4996 	adj->private = private;
4997 	dev_hold(adj_dev);
4998 
4999 	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5000 		 adj_dev->name, dev->name, adj_dev->name);
5001 
5002 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5003 		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5004 		if (ret)
5005 			goto free_adj;
5006 	}
5007 
5008 	/* Ensure that master link is always the first item in list. */
5009 	if (master) {
5010 		ret = sysfs_create_link(&(dev->dev.kobj),
5011 					&(adj_dev->dev.kobj), "master");
5012 		if (ret)
5013 			goto remove_symlinks;
5014 
5015 		list_add_rcu(&adj->list, dev_list);
5016 	} else {
5017 		list_add_tail_rcu(&adj->list, dev_list);
5018 	}
5019 
5020 	return 0;
5021 
5022 remove_symlinks:
5023 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5024 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5025 free_adj:
5026 	kfree(adj);
5027 	dev_put(adj_dev);
5028 
5029 	return ret;
5030 }
5031 
5032 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5033 					 struct net_device *adj_dev,
5034 					 struct list_head *dev_list)
5035 {
5036 	struct netdev_adjacent *adj;
5037 
5038 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
5039 
5040 	if (!adj) {
5041 		pr_err("tried to remove device %s from %s\n",
5042 		       dev->name, adj_dev->name);
5043 		BUG();
5044 	}
5045 
5046 	if (adj->ref_nr > 1) {
5047 		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5048 			 adj->ref_nr-1);
5049 		adj->ref_nr--;
5050 		return;
5051 	}
5052 
5053 	if (adj->master)
5054 		sysfs_remove_link(&(dev->dev.kobj), "master");
5055 
5056 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5057 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5058 
5059 	list_del_rcu(&adj->list);
5060 	pr_debug("dev_put for %s, because link removed from %s to %s\n",
5061 		 adj_dev->name, dev->name, adj_dev->name);
5062 	dev_put(adj_dev);
5063 	kfree_rcu(adj, rcu);
5064 }
5065 
5066 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5067 					    struct net_device *upper_dev,
5068 					    struct list_head *up_list,
5069 					    struct list_head *down_list,
5070 					    void *private, bool master)
5071 {
5072 	int ret;
5073 
5074 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5075 					   master);
5076 	if (ret)
5077 		return ret;
5078 
5079 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5080 					   false);
5081 	if (ret) {
5082 		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5083 		return ret;
5084 	}
5085 
5086 	return 0;
5087 }
5088 
5089 static int __netdev_adjacent_dev_link(struct net_device *dev,
5090 				      struct net_device *upper_dev)
5091 {
5092 	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5093 						&dev->all_adj_list.upper,
5094 						&upper_dev->all_adj_list.lower,
5095 						NULL, false);
5096 }
5097 
5098 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5099 					       struct net_device *upper_dev,
5100 					       struct list_head *up_list,
5101 					       struct list_head *down_list)
5102 {
5103 	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5104 	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5105 }
5106 
5107 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5108 					 struct net_device *upper_dev)
5109 {
5110 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5111 					   &dev->all_adj_list.upper,
5112 					   &upper_dev->all_adj_list.lower);
5113 }
5114 
5115 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5116 						struct net_device *upper_dev,
5117 						void *private, bool master)
5118 {
5119 	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5120 
5121 	if (ret)
5122 		return ret;
5123 
5124 	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5125 					       &dev->adj_list.upper,
5126 					       &upper_dev->adj_list.lower,
5127 					       private, master);
5128 	if (ret) {
5129 		__netdev_adjacent_dev_unlink(dev, upper_dev);
5130 		return ret;
5131 	}
5132 
5133 	return 0;
5134 }
5135 
5136 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5137 						   struct net_device *upper_dev)
5138 {
5139 	__netdev_adjacent_dev_unlink(dev, upper_dev);
5140 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5141 					   &dev->adj_list.upper,
5142 					   &upper_dev->adj_list.lower);
5143 }
5144 
5145 static int __netdev_upper_dev_link(struct net_device *dev,
5146 				   struct net_device *upper_dev, bool master,
5147 				   void *private)
5148 {
5149 	struct netdev_adjacent *i, *j, *to_i, *to_j;
5150 	int ret = 0;
5151 
5152 	ASSERT_RTNL();
5153 
5154 	if (dev == upper_dev)
5155 		return -EBUSY;
5156 
5157 	/* To prevent loops, check if dev is not upper device to upper_dev. */
5158 	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5159 		return -EBUSY;
5160 
5161 	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5162 		return -EEXIST;
5163 
5164 	if (master && netdev_master_upper_dev_get(dev))
5165 		return -EBUSY;
5166 
5167 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5168 						   master);
5169 	if (ret)
5170 		return ret;
5171 
5172 	/* Now that we linked these devs, make all the upper_dev's
5173 	 * all_adj_list.upper visible to every dev's all_adj_list.lower an
5174 	 * versa, and don't forget the devices itself. All of these
5175 	 * links are non-neighbours.
5176 	 */
5177 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5178 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5179 			pr_debug("Interlinking %s with %s, non-neighbour\n",
5180 				 i->dev->name, j->dev->name);
5181 			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5182 			if (ret)
5183 				goto rollback_mesh;
5184 		}
5185 	}
5186 
5187 	/* add dev to every upper_dev's upper device */
5188 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5189 		pr_debug("linking %s's upper device %s with %s\n",
5190 			 upper_dev->name, i->dev->name, dev->name);
5191 		ret = __netdev_adjacent_dev_link(dev, i->dev);
5192 		if (ret)
5193 			goto rollback_upper_mesh;
5194 	}
5195 
5196 	/* add upper_dev to every dev's lower device */
5197 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5198 		pr_debug("linking %s's lower device %s with %s\n", dev->name,
5199 			 i->dev->name, upper_dev->name);
5200 		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5201 		if (ret)
5202 			goto rollback_lower_mesh;
5203 	}
5204 
5205 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5206 	return 0;
5207 
5208 rollback_lower_mesh:
5209 	to_i = i;
5210 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5211 		if (i == to_i)
5212 			break;
5213 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5214 	}
5215 
5216 	i = NULL;
5217 
5218 rollback_upper_mesh:
5219 	to_i = i;
5220 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5221 		if (i == to_i)
5222 			break;
5223 		__netdev_adjacent_dev_unlink(dev, i->dev);
5224 	}
5225 
5226 	i = j = NULL;
5227 
5228 rollback_mesh:
5229 	to_i = i;
5230 	to_j = j;
5231 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5232 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5233 			if (i == to_i && j == to_j)
5234 				break;
5235 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5236 		}
5237 		if (i == to_i)
5238 			break;
5239 	}
5240 
5241 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5242 
5243 	return ret;
5244 }
5245 
5246 /**
5247  * netdev_upper_dev_link - Add a link to the upper device
5248  * @dev: device
5249  * @upper_dev: new upper device
5250  *
5251  * Adds a link to device which is upper to this one. The caller must hold
5252  * the RTNL lock. On a failure a negative errno code is returned.
5253  * On success the reference counts are adjusted and the function
5254  * returns zero.
5255  */
5256 int netdev_upper_dev_link(struct net_device *dev,
5257 			  struct net_device *upper_dev)
5258 {
5259 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5260 }
5261 EXPORT_SYMBOL(netdev_upper_dev_link);
5262 
5263 /**
5264  * netdev_master_upper_dev_link - Add a master link to the upper device
5265  * @dev: device
5266  * @upper_dev: new upper device
5267  *
5268  * Adds a link to device which is upper to this one. In this case, only
5269  * one master upper device can be linked, although other non-master devices
5270  * might be linked as well. The caller must hold the RTNL lock.
5271  * On a failure a negative errno code is returned. On success the reference
5272  * counts are adjusted and the function returns zero.
5273  */
5274 int netdev_master_upper_dev_link(struct net_device *dev,
5275 				 struct net_device *upper_dev)
5276 {
5277 	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5278 }
5279 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5280 
5281 int netdev_master_upper_dev_link_private(struct net_device *dev,
5282 					 struct net_device *upper_dev,
5283 					 void *private)
5284 {
5285 	return __netdev_upper_dev_link(dev, upper_dev, true, private);
5286 }
5287 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5288 
5289 /**
5290  * netdev_upper_dev_unlink - Removes a link to upper device
5291  * @dev: device
5292  * @upper_dev: new upper device
5293  *
5294  * Removes a link to device which is upper to this one. The caller must hold
5295  * the RTNL lock.
5296  */
5297 void netdev_upper_dev_unlink(struct net_device *dev,
5298 			     struct net_device *upper_dev)
5299 {
5300 	struct netdev_adjacent *i, *j;
5301 	ASSERT_RTNL();
5302 
5303 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5304 
5305 	/* Here is the tricky part. We must remove all dev's lower
5306 	 * devices from all upper_dev's upper devices and vice
5307 	 * versa, to maintain the graph relationship.
5308 	 */
5309 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5310 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5311 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5312 
5313 	/* remove also the devices itself from lower/upper device
5314 	 * list
5315 	 */
5316 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5317 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5318 
5319 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5320 		__netdev_adjacent_dev_unlink(dev, i->dev);
5321 
5322 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5323 }
5324 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5325 
5326 void netdev_adjacent_add_links(struct net_device *dev)
5327 {
5328 	struct netdev_adjacent *iter;
5329 
5330 	struct net *net = dev_net(dev);
5331 
5332 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5333 		if (!net_eq(net,dev_net(iter->dev)))
5334 			continue;
5335 		netdev_adjacent_sysfs_add(iter->dev, dev,
5336 					  &iter->dev->adj_list.lower);
5337 		netdev_adjacent_sysfs_add(dev, iter->dev,
5338 					  &dev->adj_list.upper);
5339 	}
5340 
5341 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5342 		if (!net_eq(net,dev_net(iter->dev)))
5343 			continue;
5344 		netdev_adjacent_sysfs_add(iter->dev, dev,
5345 					  &iter->dev->adj_list.upper);
5346 		netdev_adjacent_sysfs_add(dev, iter->dev,
5347 					  &dev->adj_list.lower);
5348 	}
5349 }
5350 
5351 void netdev_adjacent_del_links(struct net_device *dev)
5352 {
5353 	struct netdev_adjacent *iter;
5354 
5355 	struct net *net = dev_net(dev);
5356 
5357 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5358 		if (!net_eq(net,dev_net(iter->dev)))
5359 			continue;
5360 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5361 					  &iter->dev->adj_list.lower);
5362 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5363 					  &dev->adj_list.upper);
5364 	}
5365 
5366 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5367 		if (!net_eq(net,dev_net(iter->dev)))
5368 			continue;
5369 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5370 					  &iter->dev->adj_list.upper);
5371 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5372 					  &dev->adj_list.lower);
5373 	}
5374 }
5375 
5376 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5377 {
5378 	struct netdev_adjacent *iter;
5379 
5380 	struct net *net = dev_net(dev);
5381 
5382 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5383 		if (!net_eq(net,dev_net(iter->dev)))
5384 			continue;
5385 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5386 					  &iter->dev->adj_list.lower);
5387 		netdev_adjacent_sysfs_add(iter->dev, dev,
5388 					  &iter->dev->adj_list.lower);
5389 	}
5390 
5391 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5392 		if (!net_eq(net,dev_net(iter->dev)))
5393 			continue;
5394 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5395 					  &iter->dev->adj_list.upper);
5396 		netdev_adjacent_sysfs_add(iter->dev, dev,
5397 					  &iter->dev->adj_list.upper);
5398 	}
5399 }
5400 
5401 void *netdev_lower_dev_get_private(struct net_device *dev,
5402 				   struct net_device *lower_dev)
5403 {
5404 	struct netdev_adjacent *lower;
5405 
5406 	if (!lower_dev)
5407 		return NULL;
5408 	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5409 	if (!lower)
5410 		return NULL;
5411 
5412 	return lower->private;
5413 }
5414 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5415 
5416 
5417 int dev_get_nest_level(struct net_device *dev,
5418 		       bool (*type_check)(struct net_device *dev))
5419 {
5420 	struct net_device *lower = NULL;
5421 	struct list_head *iter;
5422 	int max_nest = -1;
5423 	int nest;
5424 
5425 	ASSERT_RTNL();
5426 
5427 	netdev_for_each_lower_dev(dev, lower, iter) {
5428 		nest = dev_get_nest_level(lower, type_check);
5429 		if (max_nest < nest)
5430 			max_nest = nest;
5431 	}
5432 
5433 	if (type_check(dev))
5434 		max_nest++;
5435 
5436 	return max_nest;
5437 }
5438 EXPORT_SYMBOL(dev_get_nest_level);
5439 
5440 static void dev_change_rx_flags(struct net_device *dev, int flags)
5441 {
5442 	const struct net_device_ops *ops = dev->netdev_ops;
5443 
5444 	if (ops->ndo_change_rx_flags)
5445 		ops->ndo_change_rx_flags(dev, flags);
5446 }
5447 
5448 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5449 {
5450 	unsigned int old_flags = dev->flags;
5451 	kuid_t uid;
5452 	kgid_t gid;
5453 
5454 	ASSERT_RTNL();
5455 
5456 	dev->flags |= IFF_PROMISC;
5457 	dev->promiscuity += inc;
5458 	if (dev->promiscuity == 0) {
5459 		/*
5460 		 * Avoid overflow.
5461 		 * If inc causes overflow, untouch promisc and return error.
5462 		 */
5463 		if (inc < 0)
5464 			dev->flags &= ~IFF_PROMISC;
5465 		else {
5466 			dev->promiscuity -= inc;
5467 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5468 				dev->name);
5469 			return -EOVERFLOW;
5470 		}
5471 	}
5472 	if (dev->flags != old_flags) {
5473 		pr_info("device %s %s promiscuous mode\n",
5474 			dev->name,
5475 			dev->flags & IFF_PROMISC ? "entered" : "left");
5476 		if (audit_enabled) {
5477 			current_uid_gid(&uid, &gid);
5478 			audit_log(current->audit_context, GFP_ATOMIC,
5479 				AUDIT_ANOM_PROMISCUOUS,
5480 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5481 				dev->name, (dev->flags & IFF_PROMISC),
5482 				(old_flags & IFF_PROMISC),
5483 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
5484 				from_kuid(&init_user_ns, uid),
5485 				from_kgid(&init_user_ns, gid),
5486 				audit_get_sessionid(current));
5487 		}
5488 
5489 		dev_change_rx_flags(dev, IFF_PROMISC);
5490 	}
5491 	if (notify)
5492 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
5493 	return 0;
5494 }
5495 
5496 /**
5497  *	dev_set_promiscuity	- update promiscuity count on a device
5498  *	@dev: device
5499  *	@inc: modifier
5500  *
5501  *	Add or remove promiscuity from a device. While the count in the device
5502  *	remains above zero the interface remains promiscuous. Once it hits zero
5503  *	the device reverts back to normal filtering operation. A negative inc
5504  *	value is used to drop promiscuity on the device.
5505  *	Return 0 if successful or a negative errno code on error.
5506  */
5507 int dev_set_promiscuity(struct net_device *dev, int inc)
5508 {
5509 	unsigned int old_flags = dev->flags;
5510 	int err;
5511 
5512 	err = __dev_set_promiscuity(dev, inc, true);
5513 	if (err < 0)
5514 		return err;
5515 	if (dev->flags != old_flags)
5516 		dev_set_rx_mode(dev);
5517 	return err;
5518 }
5519 EXPORT_SYMBOL(dev_set_promiscuity);
5520 
5521 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5522 {
5523 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5524 
5525 	ASSERT_RTNL();
5526 
5527 	dev->flags |= IFF_ALLMULTI;
5528 	dev->allmulti += inc;
5529 	if (dev->allmulti == 0) {
5530 		/*
5531 		 * Avoid overflow.
5532 		 * If inc causes overflow, untouch allmulti and return error.
5533 		 */
5534 		if (inc < 0)
5535 			dev->flags &= ~IFF_ALLMULTI;
5536 		else {
5537 			dev->allmulti -= inc;
5538 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5539 				dev->name);
5540 			return -EOVERFLOW;
5541 		}
5542 	}
5543 	if (dev->flags ^ old_flags) {
5544 		dev_change_rx_flags(dev, IFF_ALLMULTI);
5545 		dev_set_rx_mode(dev);
5546 		if (notify)
5547 			__dev_notify_flags(dev, old_flags,
5548 					   dev->gflags ^ old_gflags);
5549 	}
5550 	return 0;
5551 }
5552 
5553 /**
5554  *	dev_set_allmulti	- update allmulti count on a device
5555  *	@dev: device
5556  *	@inc: modifier
5557  *
5558  *	Add or remove reception of all multicast frames to a device. While the
5559  *	count in the device remains above zero the interface remains listening
5560  *	to all interfaces. Once it hits zero the device reverts back to normal
5561  *	filtering operation. A negative @inc value is used to drop the counter
5562  *	when releasing a resource needing all multicasts.
5563  *	Return 0 if successful or a negative errno code on error.
5564  */
5565 
5566 int dev_set_allmulti(struct net_device *dev, int inc)
5567 {
5568 	return __dev_set_allmulti(dev, inc, true);
5569 }
5570 EXPORT_SYMBOL(dev_set_allmulti);
5571 
5572 /*
5573  *	Upload unicast and multicast address lists to device and
5574  *	configure RX filtering. When the device doesn't support unicast
5575  *	filtering it is put in promiscuous mode while unicast addresses
5576  *	are present.
5577  */
5578 void __dev_set_rx_mode(struct net_device *dev)
5579 {
5580 	const struct net_device_ops *ops = dev->netdev_ops;
5581 
5582 	/* dev_open will call this function so the list will stay sane. */
5583 	if (!(dev->flags&IFF_UP))
5584 		return;
5585 
5586 	if (!netif_device_present(dev))
5587 		return;
5588 
5589 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5590 		/* Unicast addresses changes may only happen under the rtnl,
5591 		 * therefore calling __dev_set_promiscuity here is safe.
5592 		 */
5593 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5594 			__dev_set_promiscuity(dev, 1, false);
5595 			dev->uc_promisc = true;
5596 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5597 			__dev_set_promiscuity(dev, -1, false);
5598 			dev->uc_promisc = false;
5599 		}
5600 	}
5601 
5602 	if (ops->ndo_set_rx_mode)
5603 		ops->ndo_set_rx_mode(dev);
5604 }
5605 
5606 void dev_set_rx_mode(struct net_device *dev)
5607 {
5608 	netif_addr_lock_bh(dev);
5609 	__dev_set_rx_mode(dev);
5610 	netif_addr_unlock_bh(dev);
5611 }
5612 
5613 /**
5614  *	dev_get_flags - get flags reported to userspace
5615  *	@dev: device
5616  *
5617  *	Get the combination of flag bits exported through APIs to userspace.
5618  */
5619 unsigned int dev_get_flags(const struct net_device *dev)
5620 {
5621 	unsigned int flags;
5622 
5623 	flags = (dev->flags & ~(IFF_PROMISC |
5624 				IFF_ALLMULTI |
5625 				IFF_RUNNING |
5626 				IFF_LOWER_UP |
5627 				IFF_DORMANT)) |
5628 		(dev->gflags & (IFF_PROMISC |
5629 				IFF_ALLMULTI));
5630 
5631 	if (netif_running(dev)) {
5632 		if (netif_oper_up(dev))
5633 			flags |= IFF_RUNNING;
5634 		if (netif_carrier_ok(dev))
5635 			flags |= IFF_LOWER_UP;
5636 		if (netif_dormant(dev))
5637 			flags |= IFF_DORMANT;
5638 	}
5639 
5640 	return flags;
5641 }
5642 EXPORT_SYMBOL(dev_get_flags);
5643 
5644 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5645 {
5646 	unsigned int old_flags = dev->flags;
5647 	int ret;
5648 
5649 	ASSERT_RTNL();
5650 
5651 	/*
5652 	 *	Set the flags on our device.
5653 	 */
5654 
5655 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5656 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5657 			       IFF_AUTOMEDIA)) |
5658 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5659 				    IFF_ALLMULTI));
5660 
5661 	/*
5662 	 *	Load in the correct multicast list now the flags have changed.
5663 	 */
5664 
5665 	if ((old_flags ^ flags) & IFF_MULTICAST)
5666 		dev_change_rx_flags(dev, IFF_MULTICAST);
5667 
5668 	dev_set_rx_mode(dev);
5669 
5670 	/*
5671 	 *	Have we downed the interface. We handle IFF_UP ourselves
5672 	 *	according to user attempts to set it, rather than blindly
5673 	 *	setting it.
5674 	 */
5675 
5676 	ret = 0;
5677 	if ((old_flags ^ flags) & IFF_UP)
5678 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5679 
5680 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
5681 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
5682 		unsigned int old_flags = dev->flags;
5683 
5684 		dev->gflags ^= IFF_PROMISC;
5685 
5686 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
5687 			if (dev->flags != old_flags)
5688 				dev_set_rx_mode(dev);
5689 	}
5690 
5691 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5692 	   is important. Some (broken) drivers set IFF_PROMISC, when
5693 	   IFF_ALLMULTI is requested not asking us and not reporting.
5694 	 */
5695 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5696 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5697 
5698 		dev->gflags ^= IFF_ALLMULTI;
5699 		__dev_set_allmulti(dev, inc, false);
5700 	}
5701 
5702 	return ret;
5703 }
5704 
5705 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5706 			unsigned int gchanges)
5707 {
5708 	unsigned int changes = dev->flags ^ old_flags;
5709 
5710 	if (gchanges)
5711 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5712 
5713 	if (changes & IFF_UP) {
5714 		if (dev->flags & IFF_UP)
5715 			call_netdevice_notifiers(NETDEV_UP, dev);
5716 		else
5717 			call_netdevice_notifiers(NETDEV_DOWN, dev);
5718 	}
5719 
5720 	if (dev->flags & IFF_UP &&
5721 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5722 		struct netdev_notifier_change_info change_info;
5723 
5724 		change_info.flags_changed = changes;
5725 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5726 					      &change_info.info);
5727 	}
5728 }
5729 
5730 /**
5731  *	dev_change_flags - change device settings
5732  *	@dev: device
5733  *	@flags: device state flags
5734  *
5735  *	Change settings on device based state flags. The flags are
5736  *	in the userspace exported format.
5737  */
5738 int dev_change_flags(struct net_device *dev, unsigned int flags)
5739 {
5740 	int ret;
5741 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5742 
5743 	ret = __dev_change_flags(dev, flags);
5744 	if (ret < 0)
5745 		return ret;
5746 
5747 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5748 	__dev_notify_flags(dev, old_flags, changes);
5749 	return ret;
5750 }
5751 EXPORT_SYMBOL(dev_change_flags);
5752 
5753 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5754 {
5755 	const struct net_device_ops *ops = dev->netdev_ops;
5756 
5757 	if (ops->ndo_change_mtu)
5758 		return ops->ndo_change_mtu(dev, new_mtu);
5759 
5760 	dev->mtu = new_mtu;
5761 	return 0;
5762 }
5763 
5764 /**
5765  *	dev_set_mtu - Change maximum transfer unit
5766  *	@dev: device
5767  *	@new_mtu: new transfer unit
5768  *
5769  *	Change the maximum transfer size of the network device.
5770  */
5771 int dev_set_mtu(struct net_device *dev, int new_mtu)
5772 {
5773 	int err, orig_mtu;
5774 
5775 	if (new_mtu == dev->mtu)
5776 		return 0;
5777 
5778 	/*	MTU must be positive.	 */
5779 	if (new_mtu < 0)
5780 		return -EINVAL;
5781 
5782 	if (!netif_device_present(dev))
5783 		return -ENODEV;
5784 
5785 	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5786 	err = notifier_to_errno(err);
5787 	if (err)
5788 		return err;
5789 
5790 	orig_mtu = dev->mtu;
5791 	err = __dev_set_mtu(dev, new_mtu);
5792 
5793 	if (!err) {
5794 		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5795 		err = notifier_to_errno(err);
5796 		if (err) {
5797 			/* setting mtu back and notifying everyone again,
5798 			 * so that they have a chance to revert changes.
5799 			 */
5800 			__dev_set_mtu(dev, orig_mtu);
5801 			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5802 		}
5803 	}
5804 	return err;
5805 }
5806 EXPORT_SYMBOL(dev_set_mtu);
5807 
5808 /**
5809  *	dev_set_group - Change group this device belongs to
5810  *	@dev: device
5811  *	@new_group: group this device should belong to
5812  */
5813 void dev_set_group(struct net_device *dev, int new_group)
5814 {
5815 	dev->group = new_group;
5816 }
5817 EXPORT_SYMBOL(dev_set_group);
5818 
5819 /**
5820  *	dev_set_mac_address - Change Media Access Control Address
5821  *	@dev: device
5822  *	@sa: new address
5823  *
5824  *	Change the hardware (MAC) address of the device
5825  */
5826 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5827 {
5828 	const struct net_device_ops *ops = dev->netdev_ops;
5829 	int err;
5830 
5831 	if (!ops->ndo_set_mac_address)
5832 		return -EOPNOTSUPP;
5833 	if (sa->sa_family != dev->type)
5834 		return -EINVAL;
5835 	if (!netif_device_present(dev))
5836 		return -ENODEV;
5837 	err = ops->ndo_set_mac_address(dev, sa);
5838 	if (err)
5839 		return err;
5840 	dev->addr_assign_type = NET_ADDR_SET;
5841 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5842 	add_device_randomness(dev->dev_addr, dev->addr_len);
5843 	return 0;
5844 }
5845 EXPORT_SYMBOL(dev_set_mac_address);
5846 
5847 /**
5848  *	dev_change_carrier - Change device carrier
5849  *	@dev: device
5850  *	@new_carrier: new value
5851  *
5852  *	Change device carrier
5853  */
5854 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5855 {
5856 	const struct net_device_ops *ops = dev->netdev_ops;
5857 
5858 	if (!ops->ndo_change_carrier)
5859 		return -EOPNOTSUPP;
5860 	if (!netif_device_present(dev))
5861 		return -ENODEV;
5862 	return ops->ndo_change_carrier(dev, new_carrier);
5863 }
5864 EXPORT_SYMBOL(dev_change_carrier);
5865 
5866 /**
5867  *	dev_get_phys_port_id - Get device physical port ID
5868  *	@dev: device
5869  *	@ppid: port ID
5870  *
5871  *	Get device physical port ID
5872  */
5873 int dev_get_phys_port_id(struct net_device *dev,
5874 			 struct netdev_phys_item_id *ppid)
5875 {
5876 	const struct net_device_ops *ops = dev->netdev_ops;
5877 
5878 	if (!ops->ndo_get_phys_port_id)
5879 		return -EOPNOTSUPP;
5880 	return ops->ndo_get_phys_port_id(dev, ppid);
5881 }
5882 EXPORT_SYMBOL(dev_get_phys_port_id);
5883 
5884 /**
5885  *	dev_new_index	-	allocate an ifindex
5886  *	@net: the applicable net namespace
5887  *
5888  *	Returns a suitable unique value for a new device interface
5889  *	number.  The caller must hold the rtnl semaphore or the
5890  *	dev_base_lock to be sure it remains unique.
5891  */
5892 static int dev_new_index(struct net *net)
5893 {
5894 	int ifindex = net->ifindex;
5895 	for (;;) {
5896 		if (++ifindex <= 0)
5897 			ifindex = 1;
5898 		if (!__dev_get_by_index(net, ifindex))
5899 			return net->ifindex = ifindex;
5900 	}
5901 }
5902 
5903 /* Delayed registration/unregisteration */
5904 static LIST_HEAD(net_todo_list);
5905 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5906 
5907 static void net_set_todo(struct net_device *dev)
5908 {
5909 	list_add_tail(&dev->todo_list, &net_todo_list);
5910 	dev_net(dev)->dev_unreg_count++;
5911 }
5912 
5913 static void rollback_registered_many(struct list_head *head)
5914 {
5915 	struct net_device *dev, *tmp;
5916 	LIST_HEAD(close_head);
5917 
5918 	BUG_ON(dev_boot_phase);
5919 	ASSERT_RTNL();
5920 
5921 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5922 		/* Some devices call without registering
5923 		 * for initialization unwind. Remove those
5924 		 * devices and proceed with the remaining.
5925 		 */
5926 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5927 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5928 				 dev->name, dev);
5929 
5930 			WARN_ON(1);
5931 			list_del(&dev->unreg_list);
5932 			continue;
5933 		}
5934 		dev->dismantle = true;
5935 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5936 	}
5937 
5938 	/* If device is running, close it first. */
5939 	list_for_each_entry(dev, head, unreg_list)
5940 		list_add_tail(&dev->close_list, &close_head);
5941 	dev_close_many(&close_head);
5942 
5943 	list_for_each_entry(dev, head, unreg_list) {
5944 		/* And unlink it from device chain. */
5945 		unlist_netdevice(dev);
5946 
5947 		dev->reg_state = NETREG_UNREGISTERING;
5948 	}
5949 
5950 	synchronize_net();
5951 
5952 	list_for_each_entry(dev, head, unreg_list) {
5953 		struct sk_buff *skb = NULL;
5954 
5955 		/* Shutdown queueing discipline. */
5956 		dev_shutdown(dev);
5957 
5958 
5959 		/* Notify protocols, that we are about to destroy
5960 		   this device. They should clean all the things.
5961 		*/
5962 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5963 
5964 		if (!dev->rtnl_link_ops ||
5965 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5966 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
5967 						     GFP_KERNEL);
5968 
5969 		/*
5970 		 *	Flush the unicast and multicast chains
5971 		 */
5972 		dev_uc_flush(dev);
5973 		dev_mc_flush(dev);
5974 
5975 		if (dev->netdev_ops->ndo_uninit)
5976 			dev->netdev_ops->ndo_uninit(dev);
5977 
5978 		if (skb)
5979 			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
5980 
5981 		/* Notifier chain MUST detach us all upper devices. */
5982 		WARN_ON(netdev_has_any_upper_dev(dev));
5983 
5984 		/* Remove entries from kobject tree */
5985 		netdev_unregister_kobject(dev);
5986 #ifdef CONFIG_XPS
5987 		/* Remove XPS queueing entries */
5988 		netif_reset_xps_queues_gt(dev, 0);
5989 #endif
5990 	}
5991 
5992 	synchronize_net();
5993 
5994 	list_for_each_entry(dev, head, unreg_list)
5995 		dev_put(dev);
5996 }
5997 
5998 static void rollback_registered(struct net_device *dev)
5999 {
6000 	LIST_HEAD(single);
6001 
6002 	list_add(&dev->unreg_list, &single);
6003 	rollback_registered_many(&single);
6004 	list_del(&single);
6005 }
6006 
6007 static netdev_features_t netdev_fix_features(struct net_device *dev,
6008 	netdev_features_t features)
6009 {
6010 	/* Fix illegal checksum combinations */
6011 	if ((features & NETIF_F_HW_CSUM) &&
6012 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6013 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6014 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6015 	}
6016 
6017 	/* TSO requires that SG is present as well. */
6018 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6019 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6020 		features &= ~NETIF_F_ALL_TSO;
6021 	}
6022 
6023 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6024 					!(features & NETIF_F_IP_CSUM)) {
6025 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6026 		features &= ~NETIF_F_TSO;
6027 		features &= ~NETIF_F_TSO_ECN;
6028 	}
6029 
6030 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6031 					 !(features & NETIF_F_IPV6_CSUM)) {
6032 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6033 		features &= ~NETIF_F_TSO6;
6034 	}
6035 
6036 	/* TSO ECN requires that TSO is present as well. */
6037 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6038 		features &= ~NETIF_F_TSO_ECN;
6039 
6040 	/* Software GSO depends on SG. */
6041 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6042 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6043 		features &= ~NETIF_F_GSO;
6044 	}
6045 
6046 	/* UFO needs SG and checksumming */
6047 	if (features & NETIF_F_UFO) {
6048 		/* maybe split UFO into V4 and V6? */
6049 		if (!((features & NETIF_F_GEN_CSUM) ||
6050 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6051 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6052 			netdev_dbg(dev,
6053 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6054 			features &= ~NETIF_F_UFO;
6055 		}
6056 
6057 		if (!(features & NETIF_F_SG)) {
6058 			netdev_dbg(dev,
6059 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6060 			features &= ~NETIF_F_UFO;
6061 		}
6062 	}
6063 
6064 #ifdef CONFIG_NET_RX_BUSY_POLL
6065 	if (dev->netdev_ops->ndo_busy_poll)
6066 		features |= NETIF_F_BUSY_POLL;
6067 	else
6068 #endif
6069 		features &= ~NETIF_F_BUSY_POLL;
6070 
6071 	return features;
6072 }
6073 
6074 int __netdev_update_features(struct net_device *dev)
6075 {
6076 	netdev_features_t features;
6077 	int err = 0;
6078 
6079 	ASSERT_RTNL();
6080 
6081 	features = netdev_get_wanted_features(dev);
6082 
6083 	if (dev->netdev_ops->ndo_fix_features)
6084 		features = dev->netdev_ops->ndo_fix_features(dev, features);
6085 
6086 	/* driver might be less strict about feature dependencies */
6087 	features = netdev_fix_features(dev, features);
6088 
6089 	if (dev->features == features)
6090 		return 0;
6091 
6092 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6093 		&dev->features, &features);
6094 
6095 	if (dev->netdev_ops->ndo_set_features)
6096 		err = dev->netdev_ops->ndo_set_features(dev, features);
6097 
6098 	if (unlikely(err < 0)) {
6099 		netdev_err(dev,
6100 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
6101 			err, &features, &dev->features);
6102 		return -1;
6103 	}
6104 
6105 	if (!err)
6106 		dev->features = features;
6107 
6108 	return 1;
6109 }
6110 
6111 /**
6112  *	netdev_update_features - recalculate device features
6113  *	@dev: the device to check
6114  *
6115  *	Recalculate dev->features set and send notifications if it
6116  *	has changed. Should be called after driver or hardware dependent
6117  *	conditions might have changed that influence the features.
6118  */
6119 void netdev_update_features(struct net_device *dev)
6120 {
6121 	if (__netdev_update_features(dev))
6122 		netdev_features_change(dev);
6123 }
6124 EXPORT_SYMBOL(netdev_update_features);
6125 
6126 /**
6127  *	netdev_change_features - recalculate device features
6128  *	@dev: the device to check
6129  *
6130  *	Recalculate dev->features set and send notifications even
6131  *	if they have not changed. Should be called instead of
6132  *	netdev_update_features() if also dev->vlan_features might
6133  *	have changed to allow the changes to be propagated to stacked
6134  *	VLAN devices.
6135  */
6136 void netdev_change_features(struct net_device *dev)
6137 {
6138 	__netdev_update_features(dev);
6139 	netdev_features_change(dev);
6140 }
6141 EXPORT_SYMBOL(netdev_change_features);
6142 
6143 /**
6144  *	netif_stacked_transfer_operstate -	transfer operstate
6145  *	@rootdev: the root or lower level device to transfer state from
6146  *	@dev: the device to transfer operstate to
6147  *
6148  *	Transfer operational state from root to device. This is normally
6149  *	called when a stacking relationship exists between the root
6150  *	device and the device(a leaf device).
6151  */
6152 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6153 					struct net_device *dev)
6154 {
6155 	if (rootdev->operstate == IF_OPER_DORMANT)
6156 		netif_dormant_on(dev);
6157 	else
6158 		netif_dormant_off(dev);
6159 
6160 	if (netif_carrier_ok(rootdev)) {
6161 		if (!netif_carrier_ok(dev))
6162 			netif_carrier_on(dev);
6163 	} else {
6164 		if (netif_carrier_ok(dev))
6165 			netif_carrier_off(dev);
6166 	}
6167 }
6168 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6169 
6170 #ifdef CONFIG_SYSFS
6171 static int netif_alloc_rx_queues(struct net_device *dev)
6172 {
6173 	unsigned int i, count = dev->num_rx_queues;
6174 	struct netdev_rx_queue *rx;
6175 
6176 	BUG_ON(count < 1);
6177 
6178 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
6179 	if (!rx)
6180 		return -ENOMEM;
6181 
6182 	dev->_rx = rx;
6183 
6184 	for (i = 0; i < count; i++)
6185 		rx[i].dev = dev;
6186 	return 0;
6187 }
6188 #endif
6189 
6190 static void netdev_init_one_queue(struct net_device *dev,
6191 				  struct netdev_queue *queue, void *_unused)
6192 {
6193 	/* Initialize queue lock */
6194 	spin_lock_init(&queue->_xmit_lock);
6195 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6196 	queue->xmit_lock_owner = -1;
6197 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6198 	queue->dev = dev;
6199 #ifdef CONFIG_BQL
6200 	dql_init(&queue->dql, HZ);
6201 #endif
6202 }
6203 
6204 static void netif_free_tx_queues(struct net_device *dev)
6205 {
6206 	kvfree(dev->_tx);
6207 }
6208 
6209 static int netif_alloc_netdev_queues(struct net_device *dev)
6210 {
6211 	unsigned int count = dev->num_tx_queues;
6212 	struct netdev_queue *tx;
6213 	size_t sz = count * sizeof(*tx);
6214 
6215 	BUG_ON(count < 1 || count > 0xffff);
6216 
6217 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6218 	if (!tx) {
6219 		tx = vzalloc(sz);
6220 		if (!tx)
6221 			return -ENOMEM;
6222 	}
6223 	dev->_tx = tx;
6224 
6225 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6226 	spin_lock_init(&dev->tx_global_lock);
6227 
6228 	return 0;
6229 }
6230 
6231 /**
6232  *	register_netdevice	- register a network device
6233  *	@dev: device to register
6234  *
6235  *	Take a completed network device structure and add it to the kernel
6236  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6237  *	chain. 0 is returned on success. A negative errno code is returned
6238  *	on a failure to set up the device, or if the name is a duplicate.
6239  *
6240  *	Callers must hold the rtnl semaphore. You may want
6241  *	register_netdev() instead of this.
6242  *
6243  *	BUGS:
6244  *	The locking appears insufficient to guarantee two parallel registers
6245  *	will not get the same name.
6246  */
6247 
6248 int register_netdevice(struct net_device *dev)
6249 {
6250 	int ret;
6251 	struct net *net = dev_net(dev);
6252 
6253 	BUG_ON(dev_boot_phase);
6254 	ASSERT_RTNL();
6255 
6256 	might_sleep();
6257 
6258 	/* When net_device's are persistent, this will be fatal. */
6259 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6260 	BUG_ON(!net);
6261 
6262 	spin_lock_init(&dev->addr_list_lock);
6263 	netdev_set_addr_lockdep_class(dev);
6264 
6265 	dev->iflink = -1;
6266 
6267 	ret = dev_get_valid_name(net, dev, dev->name);
6268 	if (ret < 0)
6269 		goto out;
6270 
6271 	/* Init, if this function is available */
6272 	if (dev->netdev_ops->ndo_init) {
6273 		ret = dev->netdev_ops->ndo_init(dev);
6274 		if (ret) {
6275 			if (ret > 0)
6276 				ret = -EIO;
6277 			goto out;
6278 		}
6279 	}
6280 
6281 	if (((dev->hw_features | dev->features) &
6282 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
6283 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6284 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6285 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6286 		ret = -EINVAL;
6287 		goto err_uninit;
6288 	}
6289 
6290 	ret = -EBUSY;
6291 	if (!dev->ifindex)
6292 		dev->ifindex = dev_new_index(net);
6293 	else if (__dev_get_by_index(net, dev->ifindex))
6294 		goto err_uninit;
6295 
6296 	if (dev->iflink == -1)
6297 		dev->iflink = dev->ifindex;
6298 
6299 	/* Transfer changeable features to wanted_features and enable
6300 	 * software offloads (GSO and GRO).
6301 	 */
6302 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
6303 	dev->features |= NETIF_F_SOFT_FEATURES;
6304 	dev->wanted_features = dev->features & dev->hw_features;
6305 
6306 	if (!(dev->flags & IFF_LOOPBACK)) {
6307 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
6308 	}
6309 
6310 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6311 	 */
6312 	dev->vlan_features |= NETIF_F_HIGHDMA;
6313 
6314 	/* Make NETIF_F_SG inheritable to tunnel devices.
6315 	 */
6316 	dev->hw_enc_features |= NETIF_F_SG;
6317 
6318 	/* Make NETIF_F_SG inheritable to MPLS.
6319 	 */
6320 	dev->mpls_features |= NETIF_F_SG;
6321 
6322 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6323 	ret = notifier_to_errno(ret);
6324 	if (ret)
6325 		goto err_uninit;
6326 
6327 	ret = netdev_register_kobject(dev);
6328 	if (ret)
6329 		goto err_uninit;
6330 	dev->reg_state = NETREG_REGISTERED;
6331 
6332 	__netdev_update_features(dev);
6333 
6334 	/*
6335 	 *	Default initial state at registry is that the
6336 	 *	device is present.
6337 	 */
6338 
6339 	set_bit(__LINK_STATE_PRESENT, &dev->state);
6340 
6341 	linkwatch_init_dev(dev);
6342 
6343 	dev_init_scheduler(dev);
6344 	dev_hold(dev);
6345 	list_netdevice(dev);
6346 	add_device_randomness(dev->dev_addr, dev->addr_len);
6347 
6348 	/* If the device has permanent device address, driver should
6349 	 * set dev_addr and also addr_assign_type should be set to
6350 	 * NET_ADDR_PERM (default value).
6351 	 */
6352 	if (dev->addr_assign_type == NET_ADDR_PERM)
6353 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6354 
6355 	/* Notify protocols, that a new device appeared. */
6356 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6357 	ret = notifier_to_errno(ret);
6358 	if (ret) {
6359 		rollback_registered(dev);
6360 		dev->reg_state = NETREG_UNREGISTERED;
6361 	}
6362 	/*
6363 	 *	Prevent userspace races by waiting until the network
6364 	 *	device is fully setup before sending notifications.
6365 	 */
6366 	if (!dev->rtnl_link_ops ||
6367 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6368 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6369 
6370 out:
6371 	return ret;
6372 
6373 err_uninit:
6374 	if (dev->netdev_ops->ndo_uninit)
6375 		dev->netdev_ops->ndo_uninit(dev);
6376 	goto out;
6377 }
6378 EXPORT_SYMBOL(register_netdevice);
6379 
6380 /**
6381  *	init_dummy_netdev	- init a dummy network device for NAPI
6382  *	@dev: device to init
6383  *
6384  *	This takes a network device structure and initialize the minimum
6385  *	amount of fields so it can be used to schedule NAPI polls without
6386  *	registering a full blown interface. This is to be used by drivers
6387  *	that need to tie several hardware interfaces to a single NAPI
6388  *	poll scheduler due to HW limitations.
6389  */
6390 int init_dummy_netdev(struct net_device *dev)
6391 {
6392 	/* Clear everything. Note we don't initialize spinlocks
6393 	 * are they aren't supposed to be taken by any of the
6394 	 * NAPI code and this dummy netdev is supposed to be
6395 	 * only ever used for NAPI polls
6396 	 */
6397 	memset(dev, 0, sizeof(struct net_device));
6398 
6399 	/* make sure we BUG if trying to hit standard
6400 	 * register/unregister code path
6401 	 */
6402 	dev->reg_state = NETREG_DUMMY;
6403 
6404 	/* NAPI wants this */
6405 	INIT_LIST_HEAD(&dev->napi_list);
6406 
6407 	/* a dummy interface is started by default */
6408 	set_bit(__LINK_STATE_PRESENT, &dev->state);
6409 	set_bit(__LINK_STATE_START, &dev->state);
6410 
6411 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
6412 	 * because users of this 'device' dont need to change
6413 	 * its refcount.
6414 	 */
6415 
6416 	return 0;
6417 }
6418 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6419 
6420 
6421 /**
6422  *	register_netdev	- register a network device
6423  *	@dev: device to register
6424  *
6425  *	Take a completed network device structure and add it to the kernel
6426  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6427  *	chain. 0 is returned on success. A negative errno code is returned
6428  *	on a failure to set up the device, or if the name is a duplicate.
6429  *
6430  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
6431  *	and expands the device name if you passed a format string to
6432  *	alloc_netdev.
6433  */
6434 int register_netdev(struct net_device *dev)
6435 {
6436 	int err;
6437 
6438 	rtnl_lock();
6439 	err = register_netdevice(dev);
6440 	rtnl_unlock();
6441 	return err;
6442 }
6443 EXPORT_SYMBOL(register_netdev);
6444 
6445 int netdev_refcnt_read(const struct net_device *dev)
6446 {
6447 	int i, refcnt = 0;
6448 
6449 	for_each_possible_cpu(i)
6450 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6451 	return refcnt;
6452 }
6453 EXPORT_SYMBOL(netdev_refcnt_read);
6454 
6455 /**
6456  * netdev_wait_allrefs - wait until all references are gone.
6457  * @dev: target net_device
6458  *
6459  * This is called when unregistering network devices.
6460  *
6461  * Any protocol or device that holds a reference should register
6462  * for netdevice notification, and cleanup and put back the
6463  * reference if they receive an UNREGISTER event.
6464  * We can get stuck here if buggy protocols don't correctly
6465  * call dev_put.
6466  */
6467 static void netdev_wait_allrefs(struct net_device *dev)
6468 {
6469 	unsigned long rebroadcast_time, warning_time;
6470 	int refcnt;
6471 
6472 	linkwatch_forget_dev(dev);
6473 
6474 	rebroadcast_time = warning_time = jiffies;
6475 	refcnt = netdev_refcnt_read(dev);
6476 
6477 	while (refcnt != 0) {
6478 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6479 			rtnl_lock();
6480 
6481 			/* Rebroadcast unregister notification */
6482 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6483 
6484 			__rtnl_unlock();
6485 			rcu_barrier();
6486 			rtnl_lock();
6487 
6488 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6489 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6490 				     &dev->state)) {
6491 				/* We must not have linkwatch events
6492 				 * pending on unregister. If this
6493 				 * happens, we simply run the queue
6494 				 * unscheduled, resulting in a noop
6495 				 * for this device.
6496 				 */
6497 				linkwatch_run_queue();
6498 			}
6499 
6500 			__rtnl_unlock();
6501 
6502 			rebroadcast_time = jiffies;
6503 		}
6504 
6505 		msleep(250);
6506 
6507 		refcnt = netdev_refcnt_read(dev);
6508 
6509 		if (time_after(jiffies, warning_time + 10 * HZ)) {
6510 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6511 				 dev->name, refcnt);
6512 			warning_time = jiffies;
6513 		}
6514 	}
6515 }
6516 
6517 /* The sequence is:
6518  *
6519  *	rtnl_lock();
6520  *	...
6521  *	register_netdevice(x1);
6522  *	register_netdevice(x2);
6523  *	...
6524  *	unregister_netdevice(y1);
6525  *	unregister_netdevice(y2);
6526  *      ...
6527  *	rtnl_unlock();
6528  *	free_netdev(y1);
6529  *	free_netdev(y2);
6530  *
6531  * We are invoked by rtnl_unlock().
6532  * This allows us to deal with problems:
6533  * 1) We can delete sysfs objects which invoke hotplug
6534  *    without deadlocking with linkwatch via keventd.
6535  * 2) Since we run with the RTNL semaphore not held, we can sleep
6536  *    safely in order to wait for the netdev refcnt to drop to zero.
6537  *
6538  * We must not return until all unregister events added during
6539  * the interval the lock was held have been completed.
6540  */
6541 void netdev_run_todo(void)
6542 {
6543 	struct list_head list;
6544 
6545 	/* Snapshot list, allow later requests */
6546 	list_replace_init(&net_todo_list, &list);
6547 
6548 	__rtnl_unlock();
6549 
6550 
6551 	/* Wait for rcu callbacks to finish before next phase */
6552 	if (!list_empty(&list))
6553 		rcu_barrier();
6554 
6555 	while (!list_empty(&list)) {
6556 		struct net_device *dev
6557 			= list_first_entry(&list, struct net_device, todo_list);
6558 		list_del(&dev->todo_list);
6559 
6560 		rtnl_lock();
6561 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6562 		__rtnl_unlock();
6563 
6564 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6565 			pr_err("network todo '%s' but state %d\n",
6566 			       dev->name, dev->reg_state);
6567 			dump_stack();
6568 			continue;
6569 		}
6570 
6571 		dev->reg_state = NETREG_UNREGISTERED;
6572 
6573 		on_each_cpu(flush_backlog, dev, 1);
6574 
6575 		netdev_wait_allrefs(dev);
6576 
6577 		/* paranoia */
6578 		BUG_ON(netdev_refcnt_read(dev));
6579 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6580 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6581 		WARN_ON(dev->dn_ptr);
6582 
6583 		if (dev->destructor)
6584 			dev->destructor(dev);
6585 
6586 		/* Report a network device has been unregistered */
6587 		rtnl_lock();
6588 		dev_net(dev)->dev_unreg_count--;
6589 		__rtnl_unlock();
6590 		wake_up(&netdev_unregistering_wq);
6591 
6592 		/* Free network device */
6593 		kobject_put(&dev->dev.kobj);
6594 	}
6595 }
6596 
6597 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6598  * fields in the same order, with only the type differing.
6599  */
6600 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6601 			     const struct net_device_stats *netdev_stats)
6602 {
6603 #if BITS_PER_LONG == 64
6604 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6605 	memcpy(stats64, netdev_stats, sizeof(*stats64));
6606 #else
6607 	size_t i, n = sizeof(*stats64) / sizeof(u64);
6608 	const unsigned long *src = (const unsigned long *)netdev_stats;
6609 	u64 *dst = (u64 *)stats64;
6610 
6611 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6612 		     sizeof(*stats64) / sizeof(u64));
6613 	for (i = 0; i < n; i++)
6614 		dst[i] = src[i];
6615 #endif
6616 }
6617 EXPORT_SYMBOL(netdev_stats_to_stats64);
6618 
6619 /**
6620  *	dev_get_stats	- get network device statistics
6621  *	@dev: device to get statistics from
6622  *	@storage: place to store stats
6623  *
6624  *	Get network statistics from device. Return @storage.
6625  *	The device driver may provide its own method by setting
6626  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6627  *	otherwise the internal statistics structure is used.
6628  */
6629 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6630 					struct rtnl_link_stats64 *storage)
6631 {
6632 	const struct net_device_ops *ops = dev->netdev_ops;
6633 
6634 	if (ops->ndo_get_stats64) {
6635 		memset(storage, 0, sizeof(*storage));
6636 		ops->ndo_get_stats64(dev, storage);
6637 	} else if (ops->ndo_get_stats) {
6638 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6639 	} else {
6640 		netdev_stats_to_stats64(storage, &dev->stats);
6641 	}
6642 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6643 	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6644 	return storage;
6645 }
6646 EXPORT_SYMBOL(dev_get_stats);
6647 
6648 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6649 {
6650 	struct netdev_queue *queue = dev_ingress_queue(dev);
6651 
6652 #ifdef CONFIG_NET_CLS_ACT
6653 	if (queue)
6654 		return queue;
6655 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6656 	if (!queue)
6657 		return NULL;
6658 	netdev_init_one_queue(dev, queue, NULL);
6659 	queue->qdisc = &noop_qdisc;
6660 	queue->qdisc_sleeping = &noop_qdisc;
6661 	rcu_assign_pointer(dev->ingress_queue, queue);
6662 #endif
6663 	return queue;
6664 }
6665 
6666 static const struct ethtool_ops default_ethtool_ops;
6667 
6668 void netdev_set_default_ethtool_ops(struct net_device *dev,
6669 				    const struct ethtool_ops *ops)
6670 {
6671 	if (dev->ethtool_ops == &default_ethtool_ops)
6672 		dev->ethtool_ops = ops;
6673 }
6674 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6675 
6676 void netdev_freemem(struct net_device *dev)
6677 {
6678 	char *addr = (char *)dev - dev->padded;
6679 
6680 	kvfree(addr);
6681 }
6682 
6683 /**
6684  *	alloc_netdev_mqs - allocate network device
6685  *	@sizeof_priv:		size of private data to allocate space for
6686  *	@name:			device name format string
6687  *	@name_assign_type: 	origin of device name
6688  *	@setup:			callback to initialize device
6689  *	@txqs:			the number of TX subqueues to allocate
6690  *	@rxqs:			the number of RX subqueues to allocate
6691  *
6692  *	Allocates a struct net_device with private data area for driver use
6693  *	and performs basic initialization.  Also allocates subqueue structs
6694  *	for each queue on the device.
6695  */
6696 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6697 		unsigned char name_assign_type,
6698 		void (*setup)(struct net_device *),
6699 		unsigned int txqs, unsigned int rxqs)
6700 {
6701 	struct net_device *dev;
6702 	size_t alloc_size;
6703 	struct net_device *p;
6704 
6705 	BUG_ON(strlen(name) >= sizeof(dev->name));
6706 
6707 	if (txqs < 1) {
6708 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6709 		return NULL;
6710 	}
6711 
6712 #ifdef CONFIG_SYSFS
6713 	if (rxqs < 1) {
6714 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6715 		return NULL;
6716 	}
6717 #endif
6718 
6719 	alloc_size = sizeof(struct net_device);
6720 	if (sizeof_priv) {
6721 		/* ensure 32-byte alignment of private area */
6722 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6723 		alloc_size += sizeof_priv;
6724 	}
6725 	/* ensure 32-byte alignment of whole construct */
6726 	alloc_size += NETDEV_ALIGN - 1;
6727 
6728 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6729 	if (!p)
6730 		p = vzalloc(alloc_size);
6731 	if (!p)
6732 		return NULL;
6733 
6734 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6735 	dev->padded = (char *)dev - (char *)p;
6736 
6737 	dev->pcpu_refcnt = alloc_percpu(int);
6738 	if (!dev->pcpu_refcnt)
6739 		goto free_dev;
6740 
6741 	if (dev_addr_init(dev))
6742 		goto free_pcpu;
6743 
6744 	dev_mc_init(dev);
6745 	dev_uc_init(dev);
6746 
6747 	dev_net_set(dev, &init_net);
6748 
6749 	dev->gso_max_size = GSO_MAX_SIZE;
6750 	dev->gso_max_segs = GSO_MAX_SEGS;
6751 	dev->gso_min_segs = 0;
6752 
6753 	INIT_LIST_HEAD(&dev->napi_list);
6754 	INIT_LIST_HEAD(&dev->unreg_list);
6755 	INIT_LIST_HEAD(&dev->close_list);
6756 	INIT_LIST_HEAD(&dev->link_watch_list);
6757 	INIT_LIST_HEAD(&dev->adj_list.upper);
6758 	INIT_LIST_HEAD(&dev->adj_list.lower);
6759 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
6760 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
6761 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6762 	setup(dev);
6763 
6764 	dev->num_tx_queues = txqs;
6765 	dev->real_num_tx_queues = txqs;
6766 	if (netif_alloc_netdev_queues(dev))
6767 		goto free_all;
6768 
6769 #ifdef CONFIG_SYSFS
6770 	dev->num_rx_queues = rxqs;
6771 	dev->real_num_rx_queues = rxqs;
6772 	if (netif_alloc_rx_queues(dev))
6773 		goto free_all;
6774 #endif
6775 
6776 	strcpy(dev->name, name);
6777 	dev->name_assign_type = name_assign_type;
6778 	dev->group = INIT_NETDEV_GROUP;
6779 	if (!dev->ethtool_ops)
6780 		dev->ethtool_ops = &default_ethtool_ops;
6781 	return dev;
6782 
6783 free_all:
6784 	free_netdev(dev);
6785 	return NULL;
6786 
6787 free_pcpu:
6788 	free_percpu(dev->pcpu_refcnt);
6789 free_dev:
6790 	netdev_freemem(dev);
6791 	return NULL;
6792 }
6793 EXPORT_SYMBOL(alloc_netdev_mqs);
6794 
6795 /**
6796  *	free_netdev - free network device
6797  *	@dev: device
6798  *
6799  *	This function does the last stage of destroying an allocated device
6800  * 	interface. The reference to the device object is released.
6801  *	If this is the last reference then it will be freed.
6802  */
6803 void free_netdev(struct net_device *dev)
6804 {
6805 	struct napi_struct *p, *n;
6806 
6807 	release_net(dev_net(dev));
6808 
6809 	netif_free_tx_queues(dev);
6810 #ifdef CONFIG_SYSFS
6811 	kfree(dev->_rx);
6812 #endif
6813 
6814 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6815 
6816 	/* Flush device addresses */
6817 	dev_addr_flush(dev);
6818 
6819 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6820 		netif_napi_del(p);
6821 
6822 	free_percpu(dev->pcpu_refcnt);
6823 	dev->pcpu_refcnt = NULL;
6824 
6825 	/*  Compatibility with error handling in drivers */
6826 	if (dev->reg_state == NETREG_UNINITIALIZED) {
6827 		netdev_freemem(dev);
6828 		return;
6829 	}
6830 
6831 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6832 	dev->reg_state = NETREG_RELEASED;
6833 
6834 	/* will free via device release */
6835 	put_device(&dev->dev);
6836 }
6837 EXPORT_SYMBOL(free_netdev);
6838 
6839 /**
6840  *	synchronize_net -  Synchronize with packet receive processing
6841  *
6842  *	Wait for packets currently being received to be done.
6843  *	Does not block later packets from starting.
6844  */
6845 void synchronize_net(void)
6846 {
6847 	might_sleep();
6848 	if (rtnl_is_locked())
6849 		synchronize_rcu_expedited();
6850 	else
6851 		synchronize_rcu();
6852 }
6853 EXPORT_SYMBOL(synchronize_net);
6854 
6855 /**
6856  *	unregister_netdevice_queue - remove device from the kernel
6857  *	@dev: device
6858  *	@head: list
6859  *
6860  *	This function shuts down a device interface and removes it
6861  *	from the kernel tables.
6862  *	If head not NULL, device is queued to be unregistered later.
6863  *
6864  *	Callers must hold the rtnl semaphore.  You may want
6865  *	unregister_netdev() instead of this.
6866  */
6867 
6868 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6869 {
6870 	ASSERT_RTNL();
6871 
6872 	if (head) {
6873 		list_move_tail(&dev->unreg_list, head);
6874 	} else {
6875 		rollback_registered(dev);
6876 		/* Finish processing unregister after unlock */
6877 		net_set_todo(dev);
6878 	}
6879 }
6880 EXPORT_SYMBOL(unregister_netdevice_queue);
6881 
6882 /**
6883  *	unregister_netdevice_many - unregister many devices
6884  *	@head: list of devices
6885  *
6886  *  Note: As most callers use a stack allocated list_head,
6887  *  we force a list_del() to make sure stack wont be corrupted later.
6888  */
6889 void unregister_netdevice_many(struct list_head *head)
6890 {
6891 	struct net_device *dev;
6892 
6893 	if (!list_empty(head)) {
6894 		rollback_registered_many(head);
6895 		list_for_each_entry(dev, head, unreg_list)
6896 			net_set_todo(dev);
6897 		list_del(head);
6898 	}
6899 }
6900 EXPORT_SYMBOL(unregister_netdevice_many);
6901 
6902 /**
6903  *	unregister_netdev - remove device from the kernel
6904  *	@dev: device
6905  *
6906  *	This function shuts down a device interface and removes it
6907  *	from the kernel tables.
6908  *
6909  *	This is just a wrapper for unregister_netdevice that takes
6910  *	the rtnl semaphore.  In general you want to use this and not
6911  *	unregister_netdevice.
6912  */
6913 void unregister_netdev(struct net_device *dev)
6914 {
6915 	rtnl_lock();
6916 	unregister_netdevice(dev);
6917 	rtnl_unlock();
6918 }
6919 EXPORT_SYMBOL(unregister_netdev);
6920 
6921 /**
6922  *	dev_change_net_namespace - move device to different nethost namespace
6923  *	@dev: device
6924  *	@net: network namespace
6925  *	@pat: If not NULL name pattern to try if the current device name
6926  *	      is already taken in the destination network namespace.
6927  *
6928  *	This function shuts down a device interface and moves it
6929  *	to a new network namespace. On success 0 is returned, on
6930  *	a failure a netagive errno code is returned.
6931  *
6932  *	Callers must hold the rtnl semaphore.
6933  */
6934 
6935 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6936 {
6937 	int err;
6938 
6939 	ASSERT_RTNL();
6940 
6941 	/* Don't allow namespace local devices to be moved. */
6942 	err = -EINVAL;
6943 	if (dev->features & NETIF_F_NETNS_LOCAL)
6944 		goto out;
6945 
6946 	/* Ensure the device has been registrered */
6947 	if (dev->reg_state != NETREG_REGISTERED)
6948 		goto out;
6949 
6950 	/* Get out if there is nothing todo */
6951 	err = 0;
6952 	if (net_eq(dev_net(dev), net))
6953 		goto out;
6954 
6955 	/* Pick the destination device name, and ensure
6956 	 * we can use it in the destination network namespace.
6957 	 */
6958 	err = -EEXIST;
6959 	if (__dev_get_by_name(net, dev->name)) {
6960 		/* We get here if we can't use the current device name */
6961 		if (!pat)
6962 			goto out;
6963 		if (dev_get_valid_name(net, dev, pat) < 0)
6964 			goto out;
6965 	}
6966 
6967 	/*
6968 	 * And now a mini version of register_netdevice unregister_netdevice.
6969 	 */
6970 
6971 	/* If device is running close it first. */
6972 	dev_close(dev);
6973 
6974 	/* And unlink it from device chain */
6975 	err = -ENODEV;
6976 	unlist_netdevice(dev);
6977 
6978 	synchronize_net();
6979 
6980 	/* Shutdown queueing discipline. */
6981 	dev_shutdown(dev);
6982 
6983 	/* Notify protocols, that we are about to destroy
6984 	   this device. They should clean all the things.
6985 
6986 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6987 	   This is wanted because this way 8021q and macvlan know
6988 	   the device is just moving and can keep their slaves up.
6989 	*/
6990 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6991 	rcu_barrier();
6992 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6993 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6994 
6995 	/*
6996 	 *	Flush the unicast and multicast chains
6997 	 */
6998 	dev_uc_flush(dev);
6999 	dev_mc_flush(dev);
7000 
7001 	/* Send a netdev-removed uevent to the old namespace */
7002 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7003 	netdev_adjacent_del_links(dev);
7004 
7005 	/* Actually switch the network namespace */
7006 	dev_net_set(dev, net);
7007 
7008 	/* If there is an ifindex conflict assign a new one */
7009 	if (__dev_get_by_index(net, dev->ifindex)) {
7010 		int iflink = (dev->iflink == dev->ifindex);
7011 		dev->ifindex = dev_new_index(net);
7012 		if (iflink)
7013 			dev->iflink = dev->ifindex;
7014 	}
7015 
7016 	/* Send a netdev-add uevent to the new namespace */
7017 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7018 	netdev_adjacent_add_links(dev);
7019 
7020 	/* Fixup kobjects */
7021 	err = device_rename(&dev->dev, dev->name);
7022 	WARN_ON(err);
7023 
7024 	/* Add the device back in the hashes */
7025 	list_netdevice(dev);
7026 
7027 	/* Notify protocols, that a new device appeared. */
7028 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7029 
7030 	/*
7031 	 *	Prevent userspace races by waiting until the network
7032 	 *	device is fully setup before sending notifications.
7033 	 */
7034 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7035 
7036 	synchronize_net();
7037 	err = 0;
7038 out:
7039 	return err;
7040 }
7041 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7042 
7043 static int dev_cpu_callback(struct notifier_block *nfb,
7044 			    unsigned long action,
7045 			    void *ocpu)
7046 {
7047 	struct sk_buff **list_skb;
7048 	struct sk_buff *skb;
7049 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
7050 	struct softnet_data *sd, *oldsd;
7051 
7052 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7053 		return NOTIFY_OK;
7054 
7055 	local_irq_disable();
7056 	cpu = smp_processor_id();
7057 	sd = &per_cpu(softnet_data, cpu);
7058 	oldsd = &per_cpu(softnet_data, oldcpu);
7059 
7060 	/* Find end of our completion_queue. */
7061 	list_skb = &sd->completion_queue;
7062 	while (*list_skb)
7063 		list_skb = &(*list_skb)->next;
7064 	/* Append completion queue from offline CPU. */
7065 	*list_skb = oldsd->completion_queue;
7066 	oldsd->completion_queue = NULL;
7067 
7068 	/* Append output queue from offline CPU. */
7069 	if (oldsd->output_queue) {
7070 		*sd->output_queue_tailp = oldsd->output_queue;
7071 		sd->output_queue_tailp = oldsd->output_queue_tailp;
7072 		oldsd->output_queue = NULL;
7073 		oldsd->output_queue_tailp = &oldsd->output_queue;
7074 	}
7075 	/* Append NAPI poll list from offline CPU, with one exception :
7076 	 * process_backlog() must be called by cpu owning percpu backlog.
7077 	 * We properly handle process_queue & input_pkt_queue later.
7078 	 */
7079 	while (!list_empty(&oldsd->poll_list)) {
7080 		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7081 							    struct napi_struct,
7082 							    poll_list);
7083 
7084 		list_del_init(&napi->poll_list);
7085 		if (napi->poll == process_backlog)
7086 			napi->state = 0;
7087 		else
7088 			____napi_schedule(sd, napi);
7089 	}
7090 
7091 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
7092 	local_irq_enable();
7093 
7094 	/* Process offline CPU's input_pkt_queue */
7095 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7096 		netif_rx_internal(skb);
7097 		input_queue_head_incr(oldsd);
7098 	}
7099 	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7100 		netif_rx_internal(skb);
7101 		input_queue_head_incr(oldsd);
7102 	}
7103 
7104 	return NOTIFY_OK;
7105 }
7106 
7107 
7108 /**
7109  *	netdev_increment_features - increment feature set by one
7110  *	@all: current feature set
7111  *	@one: new feature set
7112  *	@mask: mask feature set
7113  *
7114  *	Computes a new feature set after adding a device with feature set
7115  *	@one to the master device with current feature set @all.  Will not
7116  *	enable anything that is off in @mask. Returns the new feature set.
7117  */
7118 netdev_features_t netdev_increment_features(netdev_features_t all,
7119 	netdev_features_t one, netdev_features_t mask)
7120 {
7121 	if (mask & NETIF_F_GEN_CSUM)
7122 		mask |= NETIF_F_ALL_CSUM;
7123 	mask |= NETIF_F_VLAN_CHALLENGED;
7124 
7125 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7126 	all &= one | ~NETIF_F_ALL_FOR_ALL;
7127 
7128 	/* If one device supports hw checksumming, set for all. */
7129 	if (all & NETIF_F_GEN_CSUM)
7130 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7131 
7132 	return all;
7133 }
7134 EXPORT_SYMBOL(netdev_increment_features);
7135 
7136 static struct hlist_head * __net_init netdev_create_hash(void)
7137 {
7138 	int i;
7139 	struct hlist_head *hash;
7140 
7141 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7142 	if (hash != NULL)
7143 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
7144 			INIT_HLIST_HEAD(&hash[i]);
7145 
7146 	return hash;
7147 }
7148 
7149 /* Initialize per network namespace state */
7150 static int __net_init netdev_init(struct net *net)
7151 {
7152 	if (net != &init_net)
7153 		INIT_LIST_HEAD(&net->dev_base_head);
7154 
7155 	net->dev_name_head = netdev_create_hash();
7156 	if (net->dev_name_head == NULL)
7157 		goto err_name;
7158 
7159 	net->dev_index_head = netdev_create_hash();
7160 	if (net->dev_index_head == NULL)
7161 		goto err_idx;
7162 
7163 	return 0;
7164 
7165 err_idx:
7166 	kfree(net->dev_name_head);
7167 err_name:
7168 	return -ENOMEM;
7169 }
7170 
7171 /**
7172  *	netdev_drivername - network driver for the device
7173  *	@dev: network device
7174  *
7175  *	Determine network driver for device.
7176  */
7177 const char *netdev_drivername(const struct net_device *dev)
7178 {
7179 	const struct device_driver *driver;
7180 	const struct device *parent;
7181 	const char *empty = "";
7182 
7183 	parent = dev->dev.parent;
7184 	if (!parent)
7185 		return empty;
7186 
7187 	driver = parent->driver;
7188 	if (driver && driver->name)
7189 		return driver->name;
7190 	return empty;
7191 }
7192 
7193 static void __netdev_printk(const char *level, const struct net_device *dev,
7194 			    struct va_format *vaf)
7195 {
7196 	if (dev && dev->dev.parent) {
7197 		dev_printk_emit(level[1] - '0',
7198 				dev->dev.parent,
7199 				"%s %s %s%s: %pV",
7200 				dev_driver_string(dev->dev.parent),
7201 				dev_name(dev->dev.parent),
7202 				netdev_name(dev), netdev_reg_state(dev),
7203 				vaf);
7204 	} else if (dev) {
7205 		printk("%s%s%s: %pV",
7206 		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7207 	} else {
7208 		printk("%s(NULL net_device): %pV", level, vaf);
7209 	}
7210 }
7211 
7212 void netdev_printk(const char *level, const struct net_device *dev,
7213 		   const char *format, ...)
7214 {
7215 	struct va_format vaf;
7216 	va_list args;
7217 
7218 	va_start(args, format);
7219 
7220 	vaf.fmt = format;
7221 	vaf.va = &args;
7222 
7223 	__netdev_printk(level, dev, &vaf);
7224 
7225 	va_end(args);
7226 }
7227 EXPORT_SYMBOL(netdev_printk);
7228 
7229 #define define_netdev_printk_level(func, level)			\
7230 void func(const struct net_device *dev, const char *fmt, ...)	\
7231 {								\
7232 	struct va_format vaf;					\
7233 	va_list args;						\
7234 								\
7235 	va_start(args, fmt);					\
7236 								\
7237 	vaf.fmt = fmt;						\
7238 	vaf.va = &args;						\
7239 								\
7240 	__netdev_printk(level, dev, &vaf);			\
7241 								\
7242 	va_end(args);						\
7243 }								\
7244 EXPORT_SYMBOL(func);
7245 
7246 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7247 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7248 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7249 define_netdev_printk_level(netdev_err, KERN_ERR);
7250 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7251 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7252 define_netdev_printk_level(netdev_info, KERN_INFO);
7253 
7254 static void __net_exit netdev_exit(struct net *net)
7255 {
7256 	kfree(net->dev_name_head);
7257 	kfree(net->dev_index_head);
7258 }
7259 
7260 static struct pernet_operations __net_initdata netdev_net_ops = {
7261 	.init = netdev_init,
7262 	.exit = netdev_exit,
7263 };
7264 
7265 static void __net_exit default_device_exit(struct net *net)
7266 {
7267 	struct net_device *dev, *aux;
7268 	/*
7269 	 * Push all migratable network devices back to the
7270 	 * initial network namespace
7271 	 */
7272 	rtnl_lock();
7273 	for_each_netdev_safe(net, dev, aux) {
7274 		int err;
7275 		char fb_name[IFNAMSIZ];
7276 
7277 		/* Ignore unmoveable devices (i.e. loopback) */
7278 		if (dev->features & NETIF_F_NETNS_LOCAL)
7279 			continue;
7280 
7281 		/* Leave virtual devices for the generic cleanup */
7282 		if (dev->rtnl_link_ops)
7283 			continue;
7284 
7285 		/* Push remaining network devices to init_net */
7286 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7287 		err = dev_change_net_namespace(dev, &init_net, fb_name);
7288 		if (err) {
7289 			pr_emerg("%s: failed to move %s to init_net: %d\n",
7290 				 __func__, dev->name, err);
7291 			BUG();
7292 		}
7293 	}
7294 	rtnl_unlock();
7295 }
7296 
7297 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7298 {
7299 	/* Return with the rtnl_lock held when there are no network
7300 	 * devices unregistering in any network namespace in net_list.
7301 	 */
7302 	struct net *net;
7303 	bool unregistering;
7304 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
7305 
7306 	add_wait_queue(&netdev_unregistering_wq, &wait);
7307 	for (;;) {
7308 		unregistering = false;
7309 		rtnl_lock();
7310 		list_for_each_entry(net, net_list, exit_list) {
7311 			if (net->dev_unreg_count > 0) {
7312 				unregistering = true;
7313 				break;
7314 			}
7315 		}
7316 		if (!unregistering)
7317 			break;
7318 		__rtnl_unlock();
7319 
7320 		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7321 	}
7322 	remove_wait_queue(&netdev_unregistering_wq, &wait);
7323 }
7324 
7325 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7326 {
7327 	/* At exit all network devices most be removed from a network
7328 	 * namespace.  Do this in the reverse order of registration.
7329 	 * Do this across as many network namespaces as possible to
7330 	 * improve batching efficiency.
7331 	 */
7332 	struct net_device *dev;
7333 	struct net *net;
7334 	LIST_HEAD(dev_kill_list);
7335 
7336 	/* To prevent network device cleanup code from dereferencing
7337 	 * loopback devices or network devices that have been freed
7338 	 * wait here for all pending unregistrations to complete,
7339 	 * before unregistring the loopback device and allowing the
7340 	 * network namespace be freed.
7341 	 *
7342 	 * The netdev todo list containing all network devices
7343 	 * unregistrations that happen in default_device_exit_batch
7344 	 * will run in the rtnl_unlock() at the end of
7345 	 * default_device_exit_batch.
7346 	 */
7347 	rtnl_lock_unregistering(net_list);
7348 	list_for_each_entry(net, net_list, exit_list) {
7349 		for_each_netdev_reverse(net, dev) {
7350 			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7351 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7352 			else
7353 				unregister_netdevice_queue(dev, &dev_kill_list);
7354 		}
7355 	}
7356 	unregister_netdevice_many(&dev_kill_list);
7357 	rtnl_unlock();
7358 }
7359 
7360 static struct pernet_operations __net_initdata default_device_ops = {
7361 	.exit = default_device_exit,
7362 	.exit_batch = default_device_exit_batch,
7363 };
7364 
7365 /*
7366  *	Initialize the DEV module. At boot time this walks the device list and
7367  *	unhooks any devices that fail to initialise (normally hardware not
7368  *	present) and leaves us with a valid list of present and active devices.
7369  *
7370  */
7371 
7372 /*
7373  *       This is called single threaded during boot, so no need
7374  *       to take the rtnl semaphore.
7375  */
7376 static int __init net_dev_init(void)
7377 {
7378 	int i, rc = -ENOMEM;
7379 
7380 	BUG_ON(!dev_boot_phase);
7381 
7382 	if (dev_proc_init())
7383 		goto out;
7384 
7385 	if (netdev_kobject_init())
7386 		goto out;
7387 
7388 	INIT_LIST_HEAD(&ptype_all);
7389 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
7390 		INIT_LIST_HEAD(&ptype_base[i]);
7391 
7392 	INIT_LIST_HEAD(&offload_base);
7393 
7394 	if (register_pernet_subsys(&netdev_net_ops))
7395 		goto out;
7396 
7397 	/*
7398 	 *	Initialise the packet receive queues.
7399 	 */
7400 
7401 	for_each_possible_cpu(i) {
7402 		struct softnet_data *sd = &per_cpu(softnet_data, i);
7403 
7404 		skb_queue_head_init(&sd->input_pkt_queue);
7405 		skb_queue_head_init(&sd->process_queue);
7406 		INIT_LIST_HEAD(&sd->poll_list);
7407 		sd->output_queue_tailp = &sd->output_queue;
7408 #ifdef CONFIG_RPS
7409 		sd->csd.func = rps_trigger_softirq;
7410 		sd->csd.info = sd;
7411 		sd->cpu = i;
7412 #endif
7413 
7414 		sd->backlog.poll = process_backlog;
7415 		sd->backlog.weight = weight_p;
7416 	}
7417 
7418 	dev_boot_phase = 0;
7419 
7420 	/* The loopback device is special if any other network devices
7421 	 * is present in a network namespace the loopback device must
7422 	 * be present. Since we now dynamically allocate and free the
7423 	 * loopback device ensure this invariant is maintained by
7424 	 * keeping the loopback device as the first device on the
7425 	 * list of network devices.  Ensuring the loopback devices
7426 	 * is the first device that appears and the last network device
7427 	 * that disappears.
7428 	 */
7429 	if (register_pernet_device(&loopback_net_ops))
7430 		goto out;
7431 
7432 	if (register_pernet_device(&default_device_ops))
7433 		goto out;
7434 
7435 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7436 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7437 
7438 	hotcpu_notifier(dev_cpu_callback, 0);
7439 	dst_init();
7440 	rc = 0;
7441 out:
7442 	return rc;
7443 }
7444 
7445 subsys_initcall(net_dev_init);
7446