xref: /linux/net/core/dev.c (revision 2d87650a3bf1b80f7d0d150ee1af3f8a89e5b7aa)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <linux/ipv6.h>
122 #include <linux/in.h>
123 #include <linux/jhash.h>
124 #include <linux/random.h>
125 #include <trace/events/napi.h>
126 #include <trace/events/net.h>
127 #include <trace/events/skb.h>
128 #include <linux/pci.h>
129 #include <linux/inetdevice.h>
130 #include <linux/cpu_rmap.h>
131 #include <linux/static_key.h>
132 #include <linux/hashtable.h>
133 #include <linux/vmalloc.h>
134 #include <linux/if_macvlan.h>
135 
136 #include "net-sysfs.h"
137 
138 /* Instead of increasing this, you should create a hash table. */
139 #define MAX_GRO_SKBS 8
140 
141 /* This should be increased if a protocol with a bigger head is added. */
142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
143 
144 static DEFINE_SPINLOCK(ptype_lock);	/* held by dev_add_pack()/__dev_remove_pack() while they edit the ptype lists */
145 static DEFINE_SPINLOCK(offload_lock);	/* held by dev_add_offload()/__dev_remove_offload() while they edit offload_base */
146 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;	/* handlers, bucketed by ntohs(type) & PTYPE_HASH_MASK */
147 struct list_head ptype_all __read_mostly;	/* Taps: handlers whose type is ETH_P_ALL */
148 static struct list_head offload_base __read_mostly;	/* list of registered struct packet_offload */
149 
150 /*
151  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
152  * semaphore.
153  *
154  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
155  *
156  * Writers must hold the rtnl semaphore while they loop through the
157  * dev_base_head list, and hold dev_base_lock for writing when they do the
158  * actual updates.  This allows pure readers to access the list even
159  * while a writer is preparing to update it.
160  *
161  * To put it another way, dev_base_lock is held for writing only to
162  * protect against pure readers; the rtnl semaphore provides the
163  * protection against other writers.
164  *
165  * See, for example usages, register_netdevice() and
166  * unregister_netdevice(), which must be called with the rtnl
167  * semaphore held.
168  */
169 DEFINE_RWLOCK(dev_base_lock);
170 EXPORT_SYMBOL(dev_base_lock);
171 
172 /* protects napi_hash addition/deletion and napi_gen_id */
173 static DEFINE_SPINLOCK(napi_hash_lock);
174 
175 static unsigned int napi_gen_id;
176 static DEFINE_HASHTABLE(napi_hash, 8);	/* second argument is the number of hash bits */
177 
178 static seqcount_t devnet_rename_seq;	/* sampled by netdev_get_name() so it can retry if the count changed while it copied dev->name */
179 
180 static inline void dev_base_seq_inc(struct net *net)
181 {
182 	while (++net->dev_base_seq == 0);
183 }
184 
185 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
186 {
187 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
188 
189 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
190 }
191 
192 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
193 {
194 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
195 }
196 
/* Take the input_pkt_queue lock of @sd; compiles to nothing without CONFIG_RPS. */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_lock(queue_lock);
#endif
}
203 
/* Release the input_pkt_queue lock of @sd; compiles to nothing without CONFIG_RPS. */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_unlock(queue_lock);
#endif
}
210 
211 /* Device list insertion */
212 static void list_netdevice(struct net_device *dev)
213 {
214 	struct net *net = dev_net(dev);
215 
216 	ASSERT_RTNL();
217 
218 	write_lock_bh(&dev_base_lock);
219 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
220 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
221 	hlist_add_head_rcu(&dev->index_hlist,
222 			   dev_index_hash(net, dev->ifindex));
223 	write_unlock_bh(&dev_base_lock);
224 
225 	dev_base_seq_inc(net);
226 }
227 
228 /* Device list removal
229  * caller must respect a RCU grace period before freeing/reusing dev
230  */
231 static void unlist_netdevice(struct net_device *dev)
232 {
233 	ASSERT_RTNL();
234 
235 	/* Unlink dev from the device chain */
236 	write_lock_bh(&dev_base_lock);
237 	list_del_rcu(&dev->dev_list);
238 	hlist_del_rcu(&dev->name_hlist);
239 	hlist_del_rcu(&dev->index_hlist);
240 	write_unlock_bh(&dev_base_lock);
241 
242 	dev_base_seq_inc(dev_net(dev));
243 }
244 
245 /*
246  *	Our notifier list: head of the raw notifier chain named netdev_chain.
247  */
248 
249 static RAW_NOTIFIER_HEAD(netdev_chain);
250 
251 /*
252  *	Device drivers call our routines to queue packets here. We empty the
253  *	queue in the local softnet handler.
254  */
255 
256 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);	/* one struct softnet_data per CPU */
257 EXPORT_PER_CPU_SYMBOL(softnet_data);
258 
259 #ifdef CONFIG_LOCKDEP
260 /*
261  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
262  * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the
 * index that netdev_lock_pos() finds for a device type in the first is
 * used to pick the name from the second and the key from the two
 * lock_class_key arrays below.  The last entry (ARPHRD_NONE /
 * "_xmit_NONE") is what netdev_lock_pos() falls back to for any type
 * that is not listed.
263  */
264 static const unsigned short netdev_lock_type[] =
265 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
266 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
267 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
268 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
269 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
270 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
271 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
272 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
273 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
274 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
275 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
276 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
277 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
278 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
279 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
280 
281 static const char *const netdev_lock_name[] =
282 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
283 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
284 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
285 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
286 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
287 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
288 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
289 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
290 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
291 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
292 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
293 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
294 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
295 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
296 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
297 
298 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];	/* used by netdev_set_xmit_lockdep_class() */
299 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];	/* used by netdev_set_addr_lockdep_class() */
300 
301 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
302 {
303 	int i;
304 
305 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
306 		if (netdev_lock_type[i] == dev_type)
307 			return i;
308 	/* the last key is used by default */
309 	return ARRAY_SIZE(netdev_lock_type) - 1;
310 }
311 
312 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
313 						 unsigned short dev_type)
314 {
315 	int i;
316 
317 	i = netdev_lock_pos(dev_type);
318 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
319 				   netdev_lock_name[i]);
320 }
321 
322 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
323 {
324 	int i;
325 
326 	i = netdev_lock_pos(dev->type);
327 	lockdep_set_class_and_name(&dev->addr_list_lock,
328 				   &netdev_addr_lock_key[i],
329 				   netdev_lock_name[i]);
330 }
331 #else
332 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
333 						 unsigned short dev_type)
334 {
335 }
/* CONFIG_LOCKDEP is off: there is no lock class to assign, so do nothing. */
static inline void
netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
339 #endif
340 
341 /*******************************************************************************
342 
343 		Protocol management and registration routines
344 
345 *******************************************************************************/
346 
347 /*
348  *	Add a protocol ID to the list. Now that the input handler is
349  *	smarter we can dispense with all the messy stuff that used to be
350  *	here.
351  *
352  *	BEWARE!!! Protocol handlers, mangling input packets,
353  *	MUST BE last in hash buckets and checking protocol handlers
354  *	MUST start from promiscuous ptype_all chain in net_bh.
355  *	It is true now, do not change it.
356  *	Explanation follows: if protocol handler, mangling packet, will
357  *	be the first on list, it is not able to sense, that packet
358  *	is cloned and should be copied-on-write, so that it will
359  *	change it and subsequent readers will get broken packet.
360  *							--ANK (980803)
361  */
362 
363 static inline struct list_head *ptype_head(const struct packet_type *pt)
364 {
365 	if (pt->type == htons(ETH_P_ALL))
366 		return &ptype_all;
367 	else
368 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
369 }
370 
371 /**
372  *	dev_add_pack - add packet handler
373  *	@pt: packet type declaration
374  *
375  *	Add a protocol handler to the networking stack. The passed &packet_type
376  *	is linked into kernel lists and may not be freed until it has been
377  *	removed from the kernel lists.
378  *
379  *	This call does not sleep therefore it can not
380  *	guarantee all CPU's that are in middle of receiving packets
381  *	will see the new packet type (until the next received packet).
382  */
383 
384 void dev_add_pack(struct packet_type *pt)
385 {
386 	struct list_head *head = ptype_head(pt);
387 
388 	spin_lock(&ptype_lock);
389 	list_add_rcu(&pt->list, head);
390 	spin_unlock(&ptype_lock);
391 }
392 EXPORT_SYMBOL(dev_add_pack);
393 
394 /**
395  *	__dev_remove_pack	 - remove packet handler
396  *	@pt: packet type declaration
397  *
398  *	Remove a protocol handler that was previously added to the kernel
399  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
400  *	from the kernel lists and can be freed or reused once this function
401  *	returns.
402  *
403  *      The packet type might still be in use by receivers
404  *	and must not be freed until after all the CPU's have gone
405  *	through a quiescent state.
406  */
407 void __dev_remove_pack(struct packet_type *pt)
408 {
409 	struct list_head *head = ptype_head(pt);
410 	struct packet_type *pt1;
411 
412 	spin_lock(&ptype_lock);
413 
414 	list_for_each_entry(pt1, head, list) {
415 		if (pt == pt1) {
416 			list_del_rcu(&pt->list);
417 			goto out;
418 		}
419 	}
420 
421 	pr_warn("dev_remove_pack: %p not found\n", pt);
422 out:
423 	spin_unlock(&ptype_lock);
424 }
425 EXPORT_SYMBOL(__dev_remove_pack);
426 
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(): unlink it with
 *	__dev_remove_pack() and then call synchronize_net().  The passed
 *	&packet_type can be freed or reused once this function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	/* unlink first ... */
	__dev_remove_pack(pt);

	/* ... then wait, so no receiver can still be using @pt */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
446 
447 
448 /**
449  *	dev_add_offload - register offload handlers
450  *	@po: protocol offload declaration
451  *
452  *	Add protocol offload handlers to the networking stack. The passed
453  *	&proto_offload is linked into kernel lists and may not be freed until
454  *	it has been removed from the kernel lists.
455  *
456  *	This call does not sleep therefore it can not
457  *	guarantee all CPU's that are in middle of receiving packets
458  *	will see the new offload handlers (until the next received packet).
459  */
460 void dev_add_offload(struct packet_offload *po)
461 {
462 	struct list_head *head = &offload_base;
463 
464 	spin_lock(&offload_lock);
465 	list_add_rcu(&po->list, head);
466 	spin_unlock(&offload_lock);
467 }
468 EXPORT_SYMBOL(dev_add_offload);
469 
470 /**
471  *	__dev_remove_offload	 - remove offload handler
472  *	@po: packet offload declaration
473  *
474  *	Remove a protocol offload handler that was previously added to the
475  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
476  *	is removed from the kernel lists and can be freed or reused once this
477  *	function returns.
478  *
479  *      The packet type might still be in use by receivers
480  *	and must not be freed until after all the CPU's have gone
481  *	through a quiescent state.
482  */
483 void __dev_remove_offload(struct packet_offload *po)
484 {
485 	struct list_head *head = &offload_base;
486 	struct packet_offload *po1;
487 
488 	spin_lock(&offload_lock);
489 
490 	list_for_each_entry(po1, head, list) {
491 		if (po == po1) {
492 			list_del_rcu(&po->list);
493 			goto out;
494 		}
495 	}
496 
497 	pr_warn("dev_remove_offload: %p not found\n", po);
498 out:
499 	spin_unlock(&offload_lock);
500 }
501 EXPORT_SYMBOL(__dev_remove_offload);
502 
/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(): unlink it with
 *	__dev_remove_offload() and then call synchronize_net().  The passed
 *	&packet_offload can be freed or reused once this function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the offload
 *	handler after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	/* unlink first ... */
	__dev_remove_offload(po);

	/* ... then wait, so no receiver can still be using @po */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
522 
523 /******************************************************************************
524 
525 		      Device Boot-time Settings Routines
526 
527 *******************************************************************************/
528 
529 /* Boot time configuration table; a slot whose name starts with '\0' or ' ' is treated as unused */
530 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
531 
532 /**
533  *	netdev_boot_setup_add	- add new setup entry
534  *	@name: name of the device
535  *	@map: configured settings for the device
536  *
537  *	Adds new setup entry to the dev_boot_setup list.  The function
538  *	returns 0 on error and 1 on success.  This is a generic routine to
539  *	all netdevices.
540  */
541 static int netdev_boot_setup_add(char *name, struct ifmap *map)
542 {
543 	struct netdev_boot_setup *s;
544 	int i;
545 
546 	s = dev_boot_setup;
547 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
548 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
549 			memset(s[i].name, 0, sizeof(s[i].name));
550 			strlcpy(s[i].name, name, IFNAMSIZ);
551 			memcpy(&s[i].map, map, sizeof(s[i].map));
552 			break;
553 		}
554 	}
555 
556 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
557 }
558 
559 /**
560  *	netdev_boot_setup_check	- check boot time settings
561  *	@dev: the netdevice
562  *
563  * 	Check boot time settings for the device.
564  *	The found settings are set for the device to be used
565  *	later in the device probing.
566  *	Returns 0 if no settings found, 1 if they are.
567  */
568 int netdev_boot_setup_check(struct net_device *dev)
569 {
570 	struct netdev_boot_setup *s = dev_boot_setup;
571 	int i;
572 
573 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
574 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
575 		    !strcmp(dev->name, s[i].name)) {
576 			dev->irq 	= s[i].map.irq;
577 			dev->base_addr 	= s[i].map.base_addr;
578 			dev->mem_start 	= s[i].map.mem_start;
579 			dev->mem_end 	= s[i].map.mem_end;
580 			return 1;
581 		}
582 	}
583 	return 0;
584 }
585 EXPORT_SYMBOL(netdev_boot_setup_check);
586 
587 
588 /**
589  *	netdev_boot_base	- get address from boot time settings
590  *	@prefix: prefix for network device
591  *	@unit: id for network device
592  *
593  * 	Check boot time settings for the base address of device.
594  *	The found settings are set for the device to be used
595  *	later in the device probing.
596  *	Returns 0 if no settings found.
597  */
598 unsigned long netdev_boot_base(const char *prefix, int unit)
599 {
600 	const struct netdev_boot_setup *s = dev_boot_setup;
601 	char name[IFNAMSIZ];
602 	int i;
603 
604 	sprintf(name, "%s%d", prefix, unit);
605 
606 	/*
607 	 * If device already registered then return base of 1
608 	 * to indicate not to probe for this interface
609 	 */
610 	if (__dev_get_by_name(&init_net, name))
611 		return 1;
612 
613 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
614 		if (!strcmp(name, s[i].name))
615 			return s[i].map.base_addr;
616 	return 0;
617 }
618 
619 /*
620  * Saves at boot time configured settings for any netdevice.
621  */
622 int __init netdev_boot_setup(char *str)
623 {
624 	int ints[5];
625 	struct ifmap map;
626 
627 	str = get_options(str, ARRAY_SIZE(ints), ints);
628 	if (!str || !*str)
629 		return 0;
630 
631 	/* Save settings */
632 	memset(&map, 0, sizeof(map));
633 	if (ints[0] > 0)
634 		map.irq = ints[1];
635 	if (ints[0] > 1)
636 		map.base_addr = ints[2];
637 	if (ints[0] > 2)
638 		map.mem_start = ints[3];
639 	if (ints[0] > 3)
640 		map.mem_end = ints[4];
641 
642 	/* Add new entry to the list */
643 	return netdev_boot_setup_add(str, &map);
644 }
645 
646 __setup("netdev=", netdev_boot_setup);
647 
648 /*******************************************************************************
649 
650 			    Device Interface Subroutines
651 
652 *******************************************************************************/
653 
654 /**
655  *	__dev_get_by_name	- find a device by its name
656  *	@net: the applicable net namespace
657  *	@name: name to find
658  *
659  *	Find an interface by name. Must be called under RTNL semaphore
660  *	or @dev_base_lock. If the name is found a pointer to the device
661  *	is returned. If the name is not found then %NULL is returned. The
662  *	reference counters are not incremented so the caller must be
663  *	careful with locks.
664  */
665 
666 struct net_device *__dev_get_by_name(struct net *net, const char *name)
667 {
668 	struct net_device *dev;
669 	struct hlist_head *head = dev_name_hash(net, name);
670 
671 	hlist_for_each_entry(dev, head, name_hlist)
672 		if (!strncmp(dev->name, name, IFNAMSIZ))
673 			return dev;
674 
675 	return NULL;
676 }
677 EXPORT_SYMBOL(__dev_get_by_name);
678 
679 /**
680  *	dev_get_by_name_rcu	- find a device by its name
681  *	@net: the applicable net namespace
682  *	@name: name to find
683  *
684  *	Find an interface by name.
685  *	If the name is found a pointer to the device is returned.
686  * 	If the name is not found then %NULL is returned.
687  *	The reference counters are not incremented so the caller must be
688  *	careful with locks. The caller must hold RCU lock.
689  */
690 
691 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
692 {
693 	struct net_device *dev;
694 	struct hlist_head *head = dev_name_hash(net, name);
695 
696 	hlist_for_each_entry_rcu(dev, head, name_hlist)
697 		if (!strncmp(dev->name, name, IFNAMSIZ))
698 			return dev;
699 
700 	return NULL;
701 }
702 EXPORT_SYMBOL(dev_get_by_name_rcu);
703 
704 /**
705  *	dev_get_by_name		- find a device by its name
706  *	@net: the applicable net namespace
707  *	@name: name to find
708  *
709  *	Find an interface by name. This can be called from any
710  *	context and does its own locking. The returned handle has
711  *	the usage count incremented and the caller must use dev_put() to
712  *	release it when it is no longer needed. %NULL is returned if no
713  *	matching device is found.
714  */
715 
716 struct net_device *dev_get_by_name(struct net *net, const char *name)
717 {
718 	struct net_device *dev;
719 
720 	rcu_read_lock();
721 	dev = dev_get_by_name_rcu(net, name);
722 	if (dev)
723 		dev_hold(dev);
724 	rcu_read_unlock();
725 	return dev;
726 }
727 EXPORT_SYMBOL(dev_get_by_name);
728 
729 /**
730  *	__dev_get_by_index - find a device by its ifindex
731  *	@net: the applicable net namespace
732  *	@ifindex: index of device
733  *
734  *	Search for an interface by index. Returns %NULL if the device
735  *	is not found or a pointer to the device. The device has not
736  *	had its reference counter increased so the caller must be careful
737  *	about locking. The caller must hold either the RTNL semaphore
738  *	or @dev_base_lock.
739  */
740 
741 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
742 {
743 	struct net_device *dev;
744 	struct hlist_head *head = dev_index_hash(net, ifindex);
745 
746 	hlist_for_each_entry(dev, head, index_hlist)
747 		if (dev->ifindex == ifindex)
748 			return dev;
749 
750 	return NULL;
751 }
752 EXPORT_SYMBOL(__dev_get_by_index);
753 
754 /**
755  *	dev_get_by_index_rcu - find a device by its ifindex
756  *	@net: the applicable net namespace
757  *	@ifindex: index of device
758  *
759  *	Search for an interface by index. Returns %NULL if the device
760  *	is not found or a pointer to the device. The device has not
761  *	had its reference counter increased so the caller must be careful
762  *	about locking. The caller must hold RCU lock.
763  */
764 
765 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
766 {
767 	struct net_device *dev;
768 	struct hlist_head *head = dev_index_hash(net, ifindex);
769 
770 	hlist_for_each_entry_rcu(dev, head, index_hlist)
771 		if (dev->ifindex == ifindex)
772 			return dev;
773 
774 	return NULL;
775 }
776 EXPORT_SYMBOL(dev_get_by_index_rcu);
777 
778 
779 /**
780  *	dev_get_by_index - find a device by its ifindex
781  *	@net: the applicable net namespace
782  *	@ifindex: index of device
783  *
784  *	Search for an interface by index. Returns NULL if the device
785  *	is not found or a pointer to the device. The device returned has
786  *	had a reference added and the pointer is safe until the user calls
787  *	dev_put to indicate they have finished with it.
788  */
789 
790 struct net_device *dev_get_by_index(struct net *net, int ifindex)
791 {
792 	struct net_device *dev;
793 
794 	rcu_read_lock();
795 	dev = dev_get_by_index_rcu(net, ifindex);
796 	if (dev)
797 		dev_hold(dev);
798 	rcu_read_unlock();
799 	return dev;
800 }
801 EXPORT_SYMBOL(dev_get_by_index);
802 
803 /**
804  *	netdev_get_name - get a netdevice name, knowing its ifindex.
805  *	@net: network namespace
806  *	@name: a pointer to the buffer where the name will be stored.
807  *	@ifindex: the ifindex of the interface to get the name from.
808  *
809  *	The use of raw_seqcount_begin() and cond_resched() before
810  *	retrying is required as we want to give the writers a chance
811  *	to complete when CONFIG_PREEMPT is not set.
812  */
813 int netdev_get_name(struct net *net, char *name, int ifindex)
814 {
815 	struct net_device *dev;
816 	unsigned int seq;
817 
818 retry:
819 	seq = raw_seqcount_begin(&devnet_rename_seq);
820 	rcu_read_lock();
821 	dev = dev_get_by_index_rcu(net, ifindex);
822 	if (!dev) {
823 		rcu_read_unlock();
824 		return -ENODEV;
825 	}
826 
827 	strcpy(name, dev->name);
828 	rcu_read_unlock();
829 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
830 		cond_resched();
831 		goto retry;
832 	}
833 
834 	return 0;
835 }
836 
837 /**
838  *	dev_getbyhwaddr_rcu - find a device by its hardware address
839  *	@net: the applicable net namespace
840  *	@type: media type of device
841  *	@ha: hardware address
842  *
843  *	Search for an interface by MAC address. Returns NULL if the device
844  *	is not found or a pointer to the device.
845  *	The caller must hold RCU or RTNL.
846  *	The returned device has not had its ref count increased
847  *	and the caller must therefore be careful about locking
848  *
849  */
850 
851 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
852 				       const char *ha)
853 {
854 	struct net_device *dev;
855 
856 	for_each_netdev_rcu(net, dev)
857 		if (dev->type == type &&
858 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
859 			return dev;
860 
861 	return NULL;
862 }
863 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
864 
865 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
866 {
867 	struct net_device *dev;
868 
869 	ASSERT_RTNL();
870 	for_each_netdev(net, dev)
871 		if (dev->type == type)
872 			return dev;
873 
874 	return NULL;
875 }
876 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
877 
878 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
879 {
880 	struct net_device *dev, *ret = NULL;
881 
882 	rcu_read_lock();
883 	for_each_netdev_rcu(net, dev)
884 		if (dev->type == type) {
885 			dev_hold(dev);
886 			ret = dev;
887 			break;
888 		}
889 	rcu_read_unlock();
890 	return ret;
891 }
892 EXPORT_SYMBOL(dev_getfirstbyhwtype);
893 
894 /**
895  *	dev_get_by_flags_rcu - find any device with given flags
896  *	@net: the applicable net namespace
897  *	@if_flags: IFF_* values
898  *	@mask: bitmask of bits in if_flags to check
899  *
900  *	Search for any interface with the given flags. Returns NULL if a device
901  *	is not found or a pointer to the device. Must be called inside
902  *	rcu_read_lock(), and result refcount is unchanged.
903  */
904 
905 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
906 				    unsigned short mask)
907 {
908 	struct net_device *dev, *ret;
909 
910 	ret = NULL;
911 	for_each_netdev_rcu(net, dev) {
912 		if (((dev->flags ^ if_flags) & mask) == 0) {
913 			ret = dev;
914 			break;
915 		}
916 	}
917 	return ret;
918 }
919 EXPORT_SYMBOL(dev_get_by_flags_rcu);
920 
921 /**
922  *	dev_valid_name - check if name is okay for network device
923  *	@name: name string
924  *
925  *	Network device names need to be valid file names to
926  *	to allow sysfs to work.  We also disallow any kind of
927  *	whitespace.
928  */
929 bool dev_valid_name(const char *name)
930 {
931 	if (*name == '\0')
932 		return false;
933 	if (strlen(name) >= IFNAMSIZ)
934 		return false;
935 	if (!strcmp(name, ".") || !strcmp(name, ".."))
936 		return false;
937 
938 	while (*name) {
939 		if (*name == '/' || isspace(*name))
940 			return false;
941 		name++;
942 	}
943 	return true;
944 }
945 EXPORT_SYMBOL(dev_valid_name);
946 
947 /**
948  *	__dev_alloc_name - allocate a name for a device
949  *	@net: network namespace to allocate the device name in
950  *	@name: name format string
951  *	@buf:  scratch buffer and result name string
952  *
953  *	Passed a format string - eg "lt%d" it will try and find a suitable
954  *	id. It scans list of devices to build up a free map, then chooses
955  *	the first empty slot. The caller must hold the dev_base or rtnl lock
956  *	while allocating the name and adding the device in order to avoid
957  *	duplicates.
958  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
959  *	Returns the number of the unit assigned or a negative errno code.
960  */
961 
962 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
963 {
964 	int i = 0;
965 	const char *p;
966 	const int max_netdevices = 8*PAGE_SIZE;
967 	unsigned long *inuse;
968 	struct net_device *d;
969 
970 	p = strnchr(name, IFNAMSIZ-1, '%');
971 	if (p) {
972 		/*
973 		 * Verify the string as this thing may have come from
974 		 * the user.  There must be either one "%d" and no other "%"
975 		 * characters.
976 		 */
977 		if (p[1] != 'd' || strchr(p + 2, '%'))
978 			return -EINVAL;
979 
980 		/* Use one page as a bit array of possible slots */
981 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
982 		if (!inuse)
983 			return -ENOMEM;
984 
985 		for_each_netdev(net, d) {
986 			if (!sscanf(d->name, name, &i))
987 				continue;
988 			if (i < 0 || i >= max_netdevices)
989 				continue;
990 
991 			/*  avoid cases where sscanf is not exact inverse of printf */
992 			snprintf(buf, IFNAMSIZ, name, i);
993 			if (!strncmp(buf, d->name, IFNAMSIZ))
994 				set_bit(i, inuse);
995 		}
996 
997 		i = find_first_zero_bit(inuse, max_netdevices);
998 		free_page((unsigned long) inuse);
999 	}
1000 
1001 	if (buf != name)
1002 		snprintf(buf, IFNAMSIZ, name, i);
1003 	if (!__dev_get_by_name(net, buf))
1004 		return i;
1005 
1006 	/* It is possible to run out of possible slots
1007 	 * when the name is long and there isn't enough space left
1008 	 * for the digits, or if all bits are used.
1009 	 */
1010 	return -ENFILE;
1011 }
1012 
1013 /**
1014  *	dev_alloc_name - allocate a name for a device
1015  *	@dev: device
1016  *	@name: name format string
1017  *
1018  *	Passed a format string - eg "lt%d" it will try and find a suitable
1019  *	id. It scans list of devices to build up a free map, then chooses
1020  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1021  *	while allocating the name and adding the device in order to avoid
1022  *	duplicates.
1023  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1024  *	Returns the number of the unit assigned or a negative errno code.
1025  */
1026 
1027 int dev_alloc_name(struct net_device *dev, const char *name)
1028 {
1029 	char buf[IFNAMSIZ];
1030 	struct net *net;
1031 	int ret;
1032 
1033 	BUG_ON(!dev_net(dev));
1034 	net = dev_net(dev);
1035 	ret = __dev_alloc_name(net, name, buf);
1036 	if (ret >= 0)
1037 		strlcpy(dev->name, buf, IFNAMSIZ);
1038 	return ret;
1039 }
1040 EXPORT_SYMBOL(dev_alloc_name);
1041 
1042 static int dev_alloc_name_ns(struct net *net,
1043 			     struct net_device *dev,
1044 			     const char *name)
1045 {
1046 	char buf[IFNAMSIZ];
1047 	int ret;
1048 
1049 	ret = __dev_alloc_name(net, name, buf);
1050 	if (ret >= 0)
1051 		strlcpy(dev->name, buf, IFNAMSIZ);
1052 	return ret;
1053 }
1054 
1055 static int dev_get_valid_name(struct net *net,
1056 			      struct net_device *dev,
1057 			      const char *name)
1058 {
1059 	BUG_ON(!net);
1060 
1061 	if (!dev_valid_name(name))
1062 		return -EINVAL;
1063 
1064 	if (strchr(name, '%'))
1065 		return dev_alloc_name_ns(net, dev, name);
1066 	else if (__dev_get_by_name(net, name))
1067 		return -EEXIST;
1068 	else if (dev->name != name)
1069 		strlcpy(dev->name, name, IFNAMSIZ);
1070 
1071 	return 0;
1072 }
1073 
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: new name, or a format string such as "eth%d" for
 *		  wildcarding; it is compared against dev->name over
 *		  IFNAMSIZ bytes
 *
 *	Renames a device that is not IFF_UP.  Must be called with RTNL held
 *	(see ASSERT_RTNL() below).
 *
 *	Returns -EBUSY if the device is up, 0 if @newname already equals the
 *	current name, a negative errno from dev_get_valid_name() or
 *	device_rename() on failure, and otherwise the non-negative result of
 *	dev_get_valid_name().  If a NETDEV_CHANGENAME notifier rejects the
 *	change, the old name is restored and that notifier's errno is
 *	returned.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	/* Every exit path below must pair this with write_seqcount_end() */
	write_seqcount_begin(&devnet_rename_seq);

	/* Nothing to do when the name is unchanged */
	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	/* Keep the current name so it can be restored on failure */
	memcpy(oldname, dev->name, IFNAMSIZ);

	/* On success this has already written the new name into dev->name */
	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	/*
	 * Reached a second time, via the goto below, when a notifier vetoes
	 * the rename; dev->name then already holds oldname again.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	/* Unhash, wait for an RCU grace period, then rehash under dev->name */
	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/* First veto: remember it and redo the rename with oldname */
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			/* The rollback itself was vetoed: only report it */
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
1149 
1150 /**
1151  *	dev_set_alias - change ifalias of a device
1152  *	@dev: device
1153  *	@alias: name up to IFALIASZ
1154  *	@len: limit of bytes to copy from info
1155  *
1156  *	Set ifalias for a device,
1157  */
1158 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1159 {
1160 	char *new_ifalias;
1161 
1162 	ASSERT_RTNL();
1163 
1164 	if (len >= IFALIASZ)
1165 		return -EINVAL;
1166 
1167 	if (!len) {
1168 		kfree(dev->ifalias);
1169 		dev->ifalias = NULL;
1170 		return 0;
1171 	}
1172 
1173 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1174 	if (!new_ifalias)
1175 		return -ENOMEM;
1176 	dev->ifalias = new_ifalias;
1177 
1178 	strlcpy(dev->ifalias, alias, len+1);
1179 	return len;
1180 }
1181 
1182 
1183 /**
1184  *	netdev_features_change - device changes features
1185  *	@dev: device to cause notification
1186  *
1187  *	Called to indicate a device has changed features.
1188  */
1189 void netdev_features_change(struct net_device *dev)
1190 {
1191 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1192 }
1193 EXPORT_SYMBOL(netdev_features_change);
1194 
1195 /**
1196  *	netdev_state_change - device changes state
1197  *	@dev: device to cause notification
1198  *
1199  *	Called to indicate a device has changed state. This function calls
1200  *	the notifier chains for netdev_chain and sends a NEWLINK message
1201  *	to the routing socket.
1202  */
1203 void netdev_state_change(struct net_device *dev)
1204 {
1205 	if (dev->flags & IFF_UP) {
1206 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1207 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1208 	}
1209 }
1210 EXPORT_SYMBOL(netdev_state_change);
1211 
1212 /**
1213  * 	netdev_notify_peers - notify network peers about existence of @dev
1214  * 	@dev: network device
1215  *
1216  * Generate traffic such that interested network peers are aware of
1217  * @dev, such as by generating a gratuitous ARP. This may be used when
1218  * a device wants to inform the rest of the network about some sort of
1219  * reconfiguration such as a failover event or virtual machine
1220  * migration.
1221  */
1222 void netdev_notify_peers(struct net_device *dev)
1223 {
1224 	rtnl_lock();
1225 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1226 	rtnl_unlock();
1227 }
1228 EXPORT_SYMBOL(netdev_notify_peers);
1229 
1230 static int __dev_open(struct net_device *dev)
1231 {
1232 	const struct net_device_ops *ops = dev->netdev_ops;
1233 	int ret;
1234 
1235 	ASSERT_RTNL();
1236 
1237 	if (!netif_device_present(dev))
1238 		return -ENODEV;
1239 
1240 	/* Block netpoll from trying to do any rx path servicing.
1241 	 * If we don't do this there is a chance ndo_poll_controller
1242 	 * or ndo_poll may be running while we open the device
1243 	 */
1244 	netpoll_rx_disable(dev);
1245 
1246 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1247 	ret = notifier_to_errno(ret);
1248 	if (ret)
1249 		return ret;
1250 
1251 	set_bit(__LINK_STATE_START, &dev->state);
1252 
1253 	if (ops->ndo_validate_addr)
1254 		ret = ops->ndo_validate_addr(dev);
1255 
1256 	if (!ret && ops->ndo_open)
1257 		ret = ops->ndo_open(dev);
1258 
1259 	netpoll_rx_enable(dev);
1260 
1261 	if (ret)
1262 		clear_bit(__LINK_STATE_START, &dev->state);
1263 	else {
1264 		dev->flags |= IFF_UP;
1265 		net_dmaengine_get();
1266 		dev_set_rx_mode(dev);
1267 		dev_activate(dev);
1268 		add_device_randomness(dev->dev_addr, dev->addr_len);
1269 	}
1270 
1271 	return ret;
1272 }
1273 
1274 /**
1275  *	dev_open	- prepare an interface for use.
1276  *	@dev:	device to open
1277  *
1278  *	Takes a device from down to up state. The device's private open
1279  *	function is invoked and then the multicast lists are loaded. Finally
1280  *	the device is moved into the up state and a %NETDEV_UP message is
1281  *	sent to the netdev notifier chain.
1282  *
1283  *	Calling this function on an active interface is a nop. On a failure
1284  *	a negative errno code is returned.
1285  */
1286 int dev_open(struct net_device *dev)
1287 {
1288 	int ret;
1289 
1290 	if (dev->flags & IFF_UP)
1291 		return 0;
1292 
1293 	ret = __dev_open(dev);
1294 	if (ret < 0)
1295 		return ret;
1296 
1297 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1298 	call_netdevice_notifiers(NETDEV_UP, dev);
1299 
1300 	return ret;
1301 }
1302 EXPORT_SYMBOL(dev_open);
1303 
1304 static int __dev_close_many(struct list_head *head)
1305 {
1306 	struct net_device *dev;
1307 
1308 	ASSERT_RTNL();
1309 	might_sleep();
1310 
1311 	list_for_each_entry(dev, head, close_list) {
1312 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1313 
1314 		clear_bit(__LINK_STATE_START, &dev->state);
1315 
1316 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1317 		 * can be even on different cpu. So just clear netif_running().
1318 		 *
1319 		 * dev->stop() will invoke napi_disable() on all of it's
1320 		 * napi_struct instances on this device.
1321 		 */
1322 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1323 	}
1324 
1325 	dev_deactivate_many(head);
1326 
1327 	list_for_each_entry(dev, head, close_list) {
1328 		const struct net_device_ops *ops = dev->netdev_ops;
1329 
1330 		/*
1331 		 *	Call the device specific close. This cannot fail.
1332 		 *	Only if device is UP
1333 		 *
1334 		 *	We allow it to be called even after a DETACH hot-plug
1335 		 *	event.
1336 		 */
1337 		if (ops->ndo_stop)
1338 			ops->ndo_stop(dev);
1339 
1340 		dev->flags &= ~IFF_UP;
1341 		net_dmaengine_put();
1342 	}
1343 
1344 	return 0;
1345 }
1346 
1347 static int __dev_close(struct net_device *dev)
1348 {
1349 	int retval;
1350 	LIST_HEAD(single);
1351 
1352 	/* Temporarily disable netpoll until the interface is down */
1353 	netpoll_rx_disable(dev);
1354 
1355 	list_add(&dev->close_list, &single);
1356 	retval = __dev_close_many(&single);
1357 	list_del(&single);
1358 
1359 	netpoll_rx_enable(dev);
1360 	return retval;
1361 }
1362 
1363 static int dev_close_many(struct list_head *head)
1364 {
1365 	struct net_device *dev, *tmp;
1366 
1367 	/* Remove the devices that don't need to be closed */
1368 	list_for_each_entry_safe(dev, tmp, head, close_list)
1369 		if (!(dev->flags & IFF_UP))
1370 			list_del_init(&dev->close_list);
1371 
1372 	__dev_close_many(head);
1373 
1374 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1375 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1377 		list_del_init(&dev->close_list);
1378 	}
1379 
1380 	return 0;
1381 }
1382 
1383 /**
1384  *	dev_close - shutdown an interface.
1385  *	@dev: device to shutdown
1386  *
1387  *	This function moves an active device into down state. A
1388  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1389  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1390  *	chain.
1391  */
1392 int dev_close(struct net_device *dev)
1393 {
1394 	if (dev->flags & IFF_UP) {
1395 		LIST_HEAD(single);
1396 
1397 		/* Block netpoll rx while the interface is going down */
1398 		netpoll_rx_disable(dev);
1399 
1400 		list_add(&dev->close_list, &single);
1401 		dev_close_many(&single);
1402 		list_del(&single);
1403 
1404 		netpoll_rx_enable(dev);
1405 	}
1406 	return 0;
1407 }
1408 EXPORT_SYMBOL(dev_close);
1409 
1410 
1411 /**
1412  *	dev_disable_lro - disable Large Receive Offload on a device
1413  *	@dev: device
1414  *
1415  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1416  *	called under RTNL.  This is needed if received packets may be
1417  *	forwarded to another interface.
1418  */
1419 void dev_disable_lro(struct net_device *dev)
1420 {
1421 	/*
1422 	 * If we're trying to disable lro on a vlan device
1423 	 * use the underlying physical device instead
1424 	 */
1425 	if (is_vlan_dev(dev))
1426 		dev = vlan_dev_real_dev(dev);
1427 
1428 	/* the same for macvlan devices */
1429 	if (netif_is_macvlan(dev))
1430 		dev = macvlan_dev_real_dev(dev);
1431 
1432 	dev->wanted_features &= ~NETIF_F_LRO;
1433 	netdev_update_features(dev);
1434 
1435 	if (unlikely(dev->features & NETIF_F_LRO))
1436 		netdev_WARN(dev, "failed to disable LRO!\n");
1437 }
1438 EXPORT_SYMBOL(dev_disable_lro);
1439 
1440 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1441 				   struct net_device *dev)
1442 {
1443 	struct netdev_notifier_info info;
1444 
1445 	netdev_notifier_info_init(&info, dev);
1446 	return nb->notifier_call(nb, val, &info);
1447 }
1448 
1449 static int dev_boot_phase = 1;
1450 
1451 /**
1452  *	register_netdevice_notifier - register a network notifier block
1453  *	@nb: notifier
1454  *
1455  *	Register a notifier to be called when network device events occur.
1456  *	The notifier passed is linked into the kernel structures and must
1457  *	not be reused until it has been unregistered. A negative errno code
1458  *	is returned on a failure.
1459  *
1460  * 	When registered all registration and up events are replayed
1461  *	to the new notifier to allow device to have a race free
1462  *	view of the network device list.
1463  */
1464 
1465 int register_netdevice_notifier(struct notifier_block *nb)
1466 {
1467 	struct net_device *dev;
1468 	struct net_device *last;
1469 	struct net *net;
1470 	int err;
1471 
1472 	rtnl_lock();
1473 	err = raw_notifier_chain_register(&netdev_chain, nb);
1474 	if (err)
1475 		goto unlock;
1476 	if (dev_boot_phase)
1477 		goto unlock;
1478 	for_each_net(net) {
1479 		for_each_netdev(net, dev) {
1480 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1481 			err = notifier_to_errno(err);
1482 			if (err)
1483 				goto rollback;
1484 
1485 			if (!(dev->flags & IFF_UP))
1486 				continue;
1487 
1488 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1489 		}
1490 	}
1491 
1492 unlock:
1493 	rtnl_unlock();
1494 	return err;
1495 
1496 rollback:
1497 	last = dev;
1498 	for_each_net(net) {
1499 		for_each_netdev(net, dev) {
1500 			if (dev == last)
1501 				goto outroll;
1502 
1503 			if (dev->flags & IFF_UP) {
1504 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1505 							dev);
1506 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1507 			}
1508 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1509 		}
1510 	}
1511 
1512 outroll:
1513 	raw_notifier_chain_unregister(&netdev_chain, nb);
1514 	goto unlock;
1515 }
1516 EXPORT_SYMBOL(register_netdevice_notifier);
1517 
1518 /**
1519  *	unregister_netdevice_notifier - unregister a network notifier block
1520  *	@nb: notifier
1521  *
1522  *	Unregister a notifier previously registered by
1523  *	register_netdevice_notifier(). The notifier is unlinked into the
1524  *	kernel structures and may then be reused. A negative errno code
1525  *	is returned on a failure.
1526  *
1527  * 	After unregistering unregister and down device events are synthesized
1528  *	for all devices on the device list to the removed notifier to remove
1529  *	the need for special case cleanup code.
1530  */
1531 
1532 int unregister_netdevice_notifier(struct notifier_block *nb)
1533 {
1534 	struct net_device *dev;
1535 	struct net *net;
1536 	int err;
1537 
1538 	rtnl_lock();
1539 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1540 	if (err)
1541 		goto unlock;
1542 
1543 	for_each_net(net) {
1544 		for_each_netdev(net, dev) {
1545 			if (dev->flags & IFF_UP) {
1546 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1547 							dev);
1548 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1549 			}
1550 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1551 		}
1552 	}
1553 unlock:
1554 	rtnl_unlock();
1555 	return err;
1556 }
1557 EXPORT_SYMBOL(unregister_netdevice_notifier);
1558 
1559 /**
1560  *	call_netdevice_notifiers_info - call all network notifier blocks
1561  *	@val: value passed unmodified to notifier function
1562  *	@dev: net_device pointer passed unmodified to notifier function
1563  *	@info: notifier information data
1564  *
1565  *	Call all network notifier blocks.  Parameters and return value
1566  *	are as for raw_notifier_call_chain().
1567  */
1568 
1569 int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1570 				  struct netdev_notifier_info *info)
1571 {
1572 	ASSERT_RTNL();
1573 	netdev_notifier_info_init(info, dev);
1574 	return raw_notifier_call_chain(&netdev_chain, val, info);
1575 }
1576 EXPORT_SYMBOL(call_netdevice_notifiers_info);
1577 
1578 /**
1579  *	call_netdevice_notifiers - call all network notifier blocks
1580  *      @val: value passed unmodified to notifier function
1581  *      @dev: net_device pointer passed unmodified to notifier function
1582  *
1583  *	Call all network notifier blocks.  Parameters and return value
1584  *	are as for raw_notifier_call_chain().
1585  */
1586 
1587 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1588 {
1589 	struct netdev_notifier_info info;
1590 
1591 	return call_netdevice_notifiers_info(val, dev, &info);
1592 }
1593 EXPORT_SYMBOL(call_netdevice_notifiers);
1594 
1595 static struct static_key netstamp_needed __read_mostly;
1596 #ifdef HAVE_JUMP_LABEL
1597 /* We are not allowed to call static_key_slow_dec() from irq context
1598  * If net_disable_timestamp() is called from irq context, defer the
1599  * static_key_slow_dec() calls.
1600  */
1601 static atomic_t netstamp_needed_deferred;
1602 #endif
1603 
1604 void net_enable_timestamp(void)
1605 {
1606 #ifdef HAVE_JUMP_LABEL
1607 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1608 
1609 	if (deferred) {
1610 		while (--deferred)
1611 			static_key_slow_dec(&netstamp_needed);
1612 		return;
1613 	}
1614 #endif
1615 	static_key_slow_inc(&netstamp_needed);
1616 }
1617 EXPORT_SYMBOL(net_enable_timestamp);
1618 
1619 void net_disable_timestamp(void)
1620 {
1621 #ifdef HAVE_JUMP_LABEL
1622 	if (in_interrupt()) {
1623 		atomic_inc(&netstamp_needed_deferred);
1624 		return;
1625 	}
1626 #endif
1627 	static_key_slow_dec(&netstamp_needed);
1628 }
1629 EXPORT_SYMBOL(net_disable_timestamp);
1630 
1631 static inline void net_timestamp_set(struct sk_buff *skb)
1632 {
1633 	skb->tstamp.tv64 = 0;
1634 	if (static_key_false(&netstamp_needed))
1635 		__net_timestamp(skb);
1636 }
1637 
/*
 * Stamp SKB if timestamping is enabled, COND holds and SKB carries no
 * timestamp yet.  COND is only evaluated while the netstamp_needed key
 * is enabled.  Wrapped in do { } while (0) so the macro is a single
 * statement and cannot capture an "else" that follows its use.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1643 
1644 static inline bool is_skb_forwardable(struct net_device *dev,
1645 				      struct sk_buff *skb)
1646 {
1647 	unsigned int len;
1648 
1649 	if (!(dev->flags & IFF_UP))
1650 		return false;
1651 
1652 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1653 	if (skb->len <= len)
1654 		return true;
1655 
1656 	/* if TSO is enabled, we don't care about the length as the packet
1657 	 * could be forwarded without being segmented before
1658 	 */
1659 	if (skb_is_gso(skb))
1660 		return true;
1661 
1662 	return false;
1663 }
1664 
1665 /**
1666  * dev_forward_skb - loopback an skb to another netif
1667  *
1668  * @dev: destination network device
1669  * @skb: buffer to forward
1670  *
1671  * return values:
1672  *	NET_RX_SUCCESS	(no congestion)
1673  *	NET_RX_DROP     (packet was dropped, but freed)
1674  *
1675  * dev_forward_skb can be used for injecting an skb from the
1676  * start_xmit function of one device into the receive queue
1677  * of another device.
1678  *
1679  * The receiving device may be in another namespace, so
1680  * we have to clear all information in the skb that could
1681  * impact namespace isolation.
1682  */
1683 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1684 {
1685 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1686 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1687 			atomic_long_inc(&dev->rx_dropped);
1688 			kfree_skb(skb);
1689 			return NET_RX_DROP;
1690 		}
1691 	}
1692 
1693 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1694 		atomic_long_inc(&dev->rx_dropped);
1695 		kfree_skb(skb);
1696 		return NET_RX_DROP;
1697 	}
1698 
1699 	skb_scrub_packet(skb, true);
1700 	skb->protocol = eth_type_trans(skb, dev);
1701 
1702 	return netif_rx(skb);
1703 }
1704 EXPORT_SYMBOL_GPL(dev_forward_skb);
1705 
1706 static inline int deliver_skb(struct sk_buff *skb,
1707 			      struct packet_type *pt_prev,
1708 			      struct net_device *orig_dev)
1709 {
1710 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1711 		return -ENOMEM;
1712 	atomic_inc(&skb->users);
1713 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1714 }
1715 
1716 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1717 {
1718 	if (!ptype->af_packet_priv || !skb->sk)
1719 		return false;
1720 
1721 	if (ptype->id_match)
1722 		return ptype->id_match(ptype, skb->sk);
1723 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1724 		return true;
1725 
1726 	return false;
1727 }
1728 
1729 /*
1730  *	Support routine. Sends outgoing frames to any network
1731  *	taps currently in use.
1732  */
1733 
1734 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1735 {
1736 	struct packet_type *ptype;
1737 	struct sk_buff *skb2 = NULL;
1738 	struct packet_type *pt_prev = NULL;
1739 
1740 	rcu_read_lock();
1741 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1742 		/* Never send packets back to the socket
1743 		 * they originated from - MvS (miquels@drinkel.ow.org)
1744 		 */
1745 		if ((ptype->dev == dev || !ptype->dev) &&
1746 		    (!skb_loop_sk(ptype, skb))) {
1747 			if (pt_prev) {
1748 				deliver_skb(skb2, pt_prev, skb->dev);
1749 				pt_prev = ptype;
1750 				continue;
1751 			}
1752 
1753 			skb2 = skb_clone(skb, GFP_ATOMIC);
1754 			if (!skb2)
1755 				break;
1756 
1757 			net_timestamp_set(skb2);
1758 
1759 			/* skb->nh should be correctly
1760 			   set by sender, so that the second statement is
1761 			   just protection against buggy protocols.
1762 			 */
1763 			skb_reset_mac_header(skb2);
1764 
1765 			if (skb_network_header(skb2) < skb2->data ||
1766 			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1767 				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1768 						     ntohs(skb2->protocol),
1769 						     dev->name);
1770 				skb_reset_network_header(skb2);
1771 			}
1772 
1773 			skb2->transport_header = skb2->network_header;
1774 			skb2->pkt_type = PACKET_OUTGOING;
1775 			pt_prev = ptype;
1776 		}
1777 	}
1778 	if (pt_prev)
1779 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1780 	rcu_read_unlock();
1781 }
1782 
1783 /**
1784  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1785  * @dev: Network device
1786  * @txq: number of queues available
1787  *
1788  * If real_num_tx_queues is changed the tc mappings may no longer be
1789  * valid. To resolve this verify the tc mapping remains valid and if
1790  * not NULL the mapping. With no priorities mapping to this
1791  * offset/count pair it will no longer be used. In the worst case TC0
1792  * is invalid nothing can be done so disable priority mappings. If is
1793  * expected that drivers will fix this mapping if they can before
1794  * calling netif_set_real_num_tx_queues.
1795  */
1796 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1797 {
1798 	int i;
1799 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1800 
1801 	/* If TC0 is invalidated disable TC mapping */
1802 	if (tc->offset + tc->count > txq) {
1803 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1804 		dev->num_tc = 0;
1805 		return;
1806 	}
1807 
1808 	/* Invalidated prio to tc mappings set to TC0 */
1809 	for (i = 1; i < TC_BITMASK + 1; i++) {
1810 		int q = netdev_get_prio_tc_map(dev, i);
1811 
1812 		tc = &dev->tc_to_txq[q];
1813 		if (tc->offset + tc->count > txq) {
1814 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1815 				i, q);
1816 			netdev_set_prio_tc_map(dev, i, 0);
1817 		}
1818 	}
1819 }
1820 
1821 #ifdef CONFIG_XPS
1822 static DEFINE_MUTEX(xps_map_mutex);
1823 #define xmap_dereference(P)		\
1824 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1825 
1826 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1827 					int cpu, u16 index)
1828 {
1829 	struct xps_map *map = NULL;
1830 	int pos;
1831 
1832 	if (dev_maps)
1833 		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1834 
1835 	for (pos = 0; map && pos < map->len; pos++) {
1836 		if (map->queues[pos] == index) {
1837 			if (map->len > 1) {
1838 				map->queues[pos] = map->queues[--map->len];
1839 			} else {
1840 				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1841 				kfree_rcu(map, rcu);
1842 				map = NULL;
1843 			}
1844 			break;
1845 		}
1846 	}
1847 
1848 	return map;
1849 }
1850 
1851 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1852 {
1853 	struct xps_dev_maps *dev_maps;
1854 	int cpu, i;
1855 	bool active = false;
1856 
1857 	mutex_lock(&xps_map_mutex);
1858 	dev_maps = xmap_dereference(dev->xps_maps);
1859 
1860 	if (!dev_maps)
1861 		goto out_no_maps;
1862 
1863 	for_each_possible_cpu(cpu) {
1864 		for (i = index; i < dev->num_tx_queues; i++) {
1865 			if (!remove_xps_queue(dev_maps, cpu, i))
1866 				break;
1867 		}
1868 		if (i == dev->num_tx_queues)
1869 			active = true;
1870 	}
1871 
1872 	if (!active) {
1873 		RCU_INIT_POINTER(dev->xps_maps, NULL);
1874 		kfree_rcu(dev_maps, rcu);
1875 	}
1876 
1877 	for (i = index; i < dev->num_tx_queues; i++)
1878 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1879 					     NUMA_NO_NODE);
1880 
1881 out_no_maps:
1882 	mutex_unlock(&xps_map_mutex);
1883 }
1884 
1885 static struct xps_map *expand_xps_map(struct xps_map *map,
1886 				      int cpu, u16 index)
1887 {
1888 	struct xps_map *new_map;
1889 	int alloc_len = XPS_MIN_MAP_ALLOC;
1890 	int i, pos;
1891 
1892 	for (pos = 0; map && pos < map->len; pos++) {
1893 		if (map->queues[pos] != index)
1894 			continue;
1895 		return map;
1896 	}
1897 
1898 	/* Need to add queue to this CPU's existing map */
1899 	if (map) {
1900 		if (pos < map->alloc_len)
1901 			return map;
1902 
1903 		alloc_len = map->alloc_len * 2;
1904 	}
1905 
1906 	/* Need to allocate new map to store queue on this CPU's map */
1907 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1908 			       cpu_to_node(cpu));
1909 	if (!new_map)
1910 		return NULL;
1911 
1912 	for (i = 0; i < pos; i++)
1913 		new_map->queues[i] = map->queues[i];
1914 	new_map->alloc_len = alloc_len;
1915 	new_map->len = pos;
1916 
1917 	return new_map;
1918 }
1919 
/*
 * netif_set_xps_queue - map TX queue @index of @dev to the CPUs in @mask
 *
 * Builds a replacement xps_dev_maps in which every online CPU in @mask
 * has @index in its per-CPU map, carries the existing per-CPU maps over
 * for all other CPUs, publishes the result with rcu_assign_pointer() and
 * frees superseded maps via kfree_rcu().  The queue is then removed from
 * the maps of all CPUs that are not selected, and the queue's NUMA node
 * is updated.  Everything runs under xps_map_mutex.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails, in which case
 * the maps allocated by this call are freed again.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	struct xps_map *map, *new_map;
	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
	/* -2: no CPU seen yet, -1: CPUs on more than one node (see below) */
	int cpu, numa_node_id = -2;
	bool active = false;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_online_cpu(cpu) {
		if (!cpumask_test_cpu(cpu, mask))
			continue;

		/* Allocated lazily, on the first online CPU found in @mask */
		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;

		/* May hand back the existing map or a larger copy of it */
		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
	}

	/* No online CPU in @mask: nothing to add, only the removal below */
	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			/* Append only if the queue is not already listed */
			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			/* Track whether all selected CPUs share one node */
			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
		}

	}

	/* Publish the new maps */
	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (dev_maps) {
		for_each_possible_cpu(cpu) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			/* Free only per-CPU maps that were replaced */
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}

		kfree_rcu(dev_maps, rcu);
	}

	/* From here on dev_maps refers to the maps just published */
	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
			continue;

		/* A non-NULL result means this CPU still has a map */
		if (remove_xps_queue(dev_maps, cpu, index))
			active = true;
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;
		/* Maps shared with the old dev_maps must be left alone */
		if (new_map && new_map != map)
			kfree(new_map);
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
2044 
2045 #endif
2046 /*
2047  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2048  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2049  */
2050 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2051 {
2052 	int rc;
2053 
2054 	if (txq < 1 || txq > dev->num_tx_queues)
2055 		return -EINVAL;
2056 
2057 	if (dev->reg_state == NETREG_REGISTERED ||
2058 	    dev->reg_state == NETREG_UNREGISTERING) {
2059 		ASSERT_RTNL();
2060 
2061 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2062 						  txq);
2063 		if (rc)
2064 			return rc;
2065 
2066 		if (dev->num_tc)
2067 			netif_setup_tc(dev, txq);
2068 
2069 		if (txq < dev->real_num_tx_queues) {
2070 			qdisc_reset_all_tx_gt(dev, txq);
2071 #ifdef CONFIG_XPS
2072 			netif_reset_xps_queues_gt(dev, txq);
2073 #endif
2074 		}
2075 	}
2076 
2077 	dev->real_num_tx_queues = txq;
2078 	return 0;
2079 }
2080 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2081 
2082 #ifdef CONFIG_RPS
2083 /**
2084  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2085  *	@dev: Network device
2086  *	@rxq: Actual number of RX queues
2087  *
2088  *	This must be called either with the rtnl_lock held or before
2089  *	registration of the net device.  Returns 0 on success, or a
2090  *	negative error code.  If called before registration, it always
2091  *	succeeds.
2092  */
2093 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2094 {
2095 	int rc;
2096 
2097 	if (rxq < 1 || rxq > dev->num_rx_queues)
2098 		return -EINVAL;
2099 
2100 	if (dev->reg_state == NETREG_REGISTERED) {
2101 		ASSERT_RTNL();
2102 
2103 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2104 						  rxq);
2105 		if (rc)
2106 			return rc;
2107 	}
2108 
2109 	dev->real_num_rx_queues = rxq;
2110 	return 0;
2111 }
2112 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2113 #endif
2114 
2115 /**
2116  * netif_get_num_default_rss_queues - default number of RSS queues
2117  *
2118  * This routine should set an upper limit on the number of RSS queues
2119  * used by default by multiqueue devices.
2120  */
2121 int netif_get_num_default_rss_queues(void)
2122 {
2123 	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2124 }
2125 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2126 
2127 static inline void __netif_reschedule(struct Qdisc *q)
2128 {
2129 	struct softnet_data *sd;
2130 	unsigned long flags;
2131 
2132 	local_irq_save(flags);
2133 	sd = &__get_cpu_var(softnet_data);
2134 	q->next_sched = NULL;
2135 	*sd->output_queue_tailp = q;
2136 	sd->output_queue_tailp = &q->next_sched;
2137 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2138 	local_irq_restore(flags);
2139 }
2140 
2141 void __netif_schedule(struct Qdisc *q)
2142 {
2143 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2144 		__netif_reschedule(q);
2145 }
2146 EXPORT_SYMBOL(__netif_schedule);
2147 
2148 struct dev_kfree_skb_cb {
2149 	enum skb_free_reason reason;
2150 };
2151 
2152 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2153 {
2154 	return (struct dev_kfree_skb_cb *)skb->cb;
2155 }
2156 
2157 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2158 {
2159 	unsigned long flags;
2160 
2161 	if (likely(atomic_read(&skb->users) == 1)) {
2162 		smp_rmb();
2163 		atomic_set(&skb->users, 0);
2164 	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2165 		return;
2166 	}
2167 	get_kfree_skb_cb(skb)->reason = reason;
2168 	local_irq_save(flags);
2169 	skb->next = __this_cpu_read(softnet_data.completion_queue);
2170 	__this_cpu_write(softnet_data.completion_queue, skb);
2171 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2172 	local_irq_restore(flags);
2173 }
2174 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2175 
2176 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2177 {
2178 	if (in_irq() || irqs_disabled())
2179 		__dev_kfree_skb_irq(skb, reason);
2180 	else
2181 		dev_kfree_skb(skb);
2182 }
2183 EXPORT_SYMBOL(__dev_kfree_skb_any);
2184 
2185 
2186 /**
2187  * netif_device_detach - mark device as removed
2188  * @dev: network device
2189  *
2190  * Mark device as removed from system and therefore no longer available.
2191  */
2192 void netif_device_detach(struct net_device *dev)
2193 {
2194 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2195 	    netif_running(dev)) {
2196 		netif_tx_stop_all_queues(dev);
2197 	}
2198 }
2199 EXPORT_SYMBOL(netif_device_detach);
2200 
2201 /**
2202  * netif_device_attach - mark device as attached
2203  * @dev: network device
2204  *
2205  * Mark device as attached from system and restart if needed.
2206  */
2207 void netif_device_attach(struct net_device *dev)
2208 {
2209 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2210 	    netif_running(dev)) {
2211 		netif_tx_wake_all_queues(dev);
2212 		__netdev_watchdog_up(dev);
2213 	}
2214 }
2215 EXPORT_SYMBOL(netif_device_attach);
2216 
2217 static void skb_warn_bad_offload(const struct sk_buff *skb)
2218 {
2219 	static const netdev_features_t null_features = 0;
2220 	struct net_device *dev = skb->dev;
2221 	const char *driver = "";
2222 
2223 	if (!net_ratelimit())
2224 		return;
2225 
2226 	if (dev && dev->dev.parent)
2227 		driver = dev_driver_string(dev->dev.parent);
2228 
2229 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2230 	     "gso_type=%d ip_summed=%d\n",
2231 	     driver, dev ? &dev->features : &null_features,
2232 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2233 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2234 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2235 }
2236 
2237 /*
2238  * Invalidate hardware checksum when packet is to be mangled, and
2239  * complete checksum manually on outgoing path.
 *
 * A CHECKSUM_COMPLETE skb is only downgraded to CHECKSUM_NONE.  Otherwise
 * the checksum is computed in software from the checksum start offset to
 * the end of the packet, folded, and stored csum_offset bytes past the
 * checksum start; ip_summed then becomes CHECKSUM_NONE.
 *
 * Returns 0 on success, -EINVAL if the skb has a gso_size set, or the
 * error from __skb_linearize() / pskb_expand_head().  On error ip_summed
 * is left unchanged.
2240  */
2241 int skb_checksum_help(struct sk_buff *skb)
2242 {
2243 	__wsum csum;
2244 	int ret = 0, offset;
2245 
2246 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2247 		goto out_set_summed;
2248 
	/* A GSO skb is not handled here: warn and reject it */
2249 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2250 		skb_warn_bad_offload(skb);
2251 		return -EINVAL;
2252 	}
2253 
2254 	/* Before computing a checksum, we should make sure no frag could
2255 	 * be modified by an external entity : checksum could be wrong.
2256 	 */
2257 	if (skb_has_shared_frag(skb)) {
2258 		ret = __skb_linearize(skb);
2259 		if (ret)
2260 			goto out;
2261 	}
2262 
	/* The checksum start must lie inside the linear area */
2263 	offset = skb_checksum_start_offset(skb);
2264 	BUG_ON(offset >= skb_headlen(skb));
2265 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2266 
	/* ... and so must the 16-bit field the result is written to */
2267 	offset += skb->csum_offset;
2268 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2269 
	/* Reallocate the head if this clone may not write the checksum field */
2270 	if (skb_cloned(skb) &&
2271 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2272 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2273 		if (ret)
2274 			goto out;
2275 	}
2276 
2277 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2278 out_set_summed:
2279 	skb->ip_summed = CHECKSUM_NONE;
2280 out:
2281 	return ret;
2282 }
2283 EXPORT_SYMBOL(skb_checksum_help);
2284 
2285 __be16 skb_network_protocol(struct sk_buff *skb)
2286 {
2287 	__be16 type = skb->protocol;
2288 	int vlan_depth = ETH_HLEN;
2289 
2290 	/* Tunnel gso handlers can set protocol to ethernet. */
2291 	if (type == htons(ETH_P_TEB)) {
2292 		struct ethhdr *eth;
2293 
2294 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2295 			return 0;
2296 
2297 		eth = (struct ethhdr *)skb_mac_header(skb);
2298 		type = eth->h_proto;
2299 	}
2300 
2301 	while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2302 		struct vlan_hdr *vh;
2303 
2304 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2305 			return 0;
2306 
2307 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2308 		type = vh->h_vlan_encapsulated_proto;
2309 		vlan_depth += VLAN_HLEN;
2310 	}
2311 
2312 	return type;
2313 }
2314 
2315 /**
2316  *	skb_mac_gso_segment - mac layer segmentation handler.
2317  *	@skb: buffer to segment
2318  *	@features: features for the output path (see dev->features)
 *
 *	Looks up the packet offload registered for the skb's network
 *	protocol and calls its gso_segment callback.  Returns the value
 *	of that callback, ERR_PTR(-EINVAL) if the protocol cannot be
 *	determined, ERR_PTR(-EPROTONOSUPPORT) if no matching offload
 *	with a gso_segment callback is registered, or, on the
 *	non-CHECKSUM_PARTIAL path, ERR_PTR() of the gso_send_check
 *	result (NULL when that check passes and the skb needs no
 *	software segmentation).
2319  */
2320 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2321 				    netdev_features_t features)
2322 {
2323 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2324 	struct packet_offload *ptype;
2325 	__be16 type = skb_network_protocol(skb);
2326 
2327 	if (unlikely(!type))
2328 		return ERR_PTR(-EINVAL);
2329 
	/* Step over the mac header; it is pushed back before returning */
2330 	__skb_pull(skb, skb->mac_len);
2331 
2332 	rcu_read_lock();
2333 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2334 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2335 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2336 				int err;
2337 
2338 				err = ptype->callbacks.gso_send_check(skb);
2339 				segs = ERR_PTR(err);
				/* check failed, or the device can send it as is */
2340 				if (err || skb_gso_ok(skb, features))
2341 					break;
				/* move data back to the network header */
2342 				__skb_push(skb, (skb->data -
2343 						 skb_network_header(skb)));
2344 			}
2345 			segs = ptype->callbacks.gso_segment(skb, features);
2346 			break;
2347 		}
2348 	}
2349 	rcu_read_unlock();
2350 
	/* Restore skb->data to the mac header */
2351 	__skb_push(skb, skb->data - skb_mac_header(skb));
2352 
2353 	return segs;
2354 }
2355 EXPORT_SYMBOL(skb_mac_gso_segment);
2356 
2357 
2358 /* openvswitch calls this on rx path, so we need a different check.
2359  */
2360 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2361 {
2362 	if (tx_path)
2363 		return skb->ip_summed != CHECKSUM_PARTIAL;
2364 	else
2365 		return skb->ip_summed == CHECKSUM_NONE;
2366 }
2367 
2368 /**
2369  *	__skb_gso_segment - Perform segmentation on skb.
2370  *	@skb: buffer to segment
2371  *	@features: features for the output path (see dev->features)
2372  *	@tx_path: whether it is called in TX path
2373  *
2374  *	This function segments the given skb and returns a list of segments.
2375  *
2376  *	It may return NULL if the skb requires no segmentation.  This is
2377  *	only possible when GSO is used for verifying header integrity.
2378  */
2379 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2380 				  netdev_features_t features, bool tx_path)
2381 {
2382 	if (unlikely(skb_needs_check(skb, tx_path))) {
2383 		int err;
2384 
2385 		skb_warn_bad_offload(skb);
2386 
2387 		if (skb_header_cloned(skb) &&
2388 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2389 			return ERR_PTR(err);
2390 	}
2391 
2392 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2393 	SKB_GSO_CB(skb)->encap_level = 0;
2394 
2395 	skb_reset_mac_header(skb);
2396 	skb_reset_mac_len(skb);
2397 
2398 	return skb_mac_gso_segment(skb, features);
2399 }
2400 EXPORT_SYMBOL(__skb_gso_segment);
2401 
/*
 * Report a hardware receive checksum failure: a rate-limited error
 * message naming the device (or "<unknown>") followed by a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2413 
/*
 * Return 1 if @skb has a page fragment that @dev cannot DMA from, else 0.
 * Always 0 without CONFIG_HIGHMEM.
 *
 * Actually, this check should go away as soon as we know that:
 * 1. An IOMMU is present and allows mapping all of memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int idx;

	/* highmem pages on a device without NETIF_F_HIGHDMA */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[idx])))
				return 1;
		}
	}

	/* pages beyond the parent device's DMA mask */
	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		if (!parent)
			return 0;

		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			skb_frag_t *frag = &shinfo->frags[idx];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			if (!parent->dma_mask ||
			    addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2446 
2447 struct dev_gso_cb {
2448 	void (*destructor)(struct sk_buff *skb);
2449 };
2450 
2451 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2452 
2453 static void dev_gso_skb_destructor(struct sk_buff *skb)
2454 {
2455 	struct dev_gso_cb *cb;
2456 
2457 	kfree_skb_list(skb->next);
2458 	skb->next = NULL;
2459 
2460 	cb = DEV_GSO_CB(skb);
2461 	if (cb->destructor)
2462 		cb->destructor(skb);
2463 }
2464 
2465 /**
2466  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2467  *	@skb: buffer to segment
2468  *	@features: device features as applicable to this skb
2469  *
2470  *	This function segments the given skb and stores the list of segments
2471  *	in skb->next.
2472  */
2473 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2474 {
2475 	struct sk_buff *segs;
2476 
2477 	segs = skb_gso_segment(skb, features);
2478 
2479 	/* Verifying header integrity only. */
2480 	if (!segs)
2481 		return 0;
2482 
2483 	if (IS_ERR(segs))
2484 		return PTR_ERR(segs);
2485 
2486 	skb->next = segs;
2487 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2488 	skb->destructor = dev_gso_skb_destructor;
2489 
2490 	return 0;
2491 }
2492 
2493 static netdev_features_t harmonize_features(struct sk_buff *skb,
2494 	netdev_features_t features)
2495 {
2496 	if (skb->ip_summed != CHECKSUM_NONE &&
2497 	    !can_checksum_protocol(features, skb_network_protocol(skb))) {
2498 		features &= ~NETIF_F_ALL_CSUM;
2499 	} else if (illegal_highdma(skb->dev, skb)) {
2500 		features &= ~NETIF_F_SG;
2501 	}
2502 
2503 	return features;
2504 }
2505 
2506 netdev_features_t netif_skb_features(struct sk_buff *skb)
2507 {
2508 	__be16 protocol = skb->protocol;
2509 	netdev_features_t features = skb->dev->features;
2510 
2511 	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2512 		features &= ~NETIF_F_GSO_MASK;
2513 
2514 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2515 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2516 		protocol = veh->h_vlan_encapsulated_proto;
2517 	} else if (!vlan_tx_tag_present(skb)) {
2518 		return harmonize_features(skb, features);
2519 	}
2520 
2521 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2522 					       NETIF_F_HW_VLAN_STAG_TX);
2523 
2524 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2525 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2526 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2527 				NETIF_F_HW_VLAN_STAG_TX;
2528 
2529 	return harmonize_features(skb, features);
2530 }
2531 EXPORT_SYMBOL(netif_skb_features);
2532 
2533 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2534 			struct netdev_queue *txq, void *accel_priv)
2535 {
2536 	const struct net_device_ops *ops = dev->netdev_ops;
2537 	int rc = NETDEV_TX_OK;
2538 	unsigned int skb_len;
2539 
2540 	if (likely(!skb->next)) {
2541 		netdev_features_t features;
2542 
2543 		/*
2544 		 * If device doesn't need skb->dst, release it right now while
2545 		 * its hot in this cpu cache
2546 		 */
2547 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2548 			skb_dst_drop(skb);
2549 
2550 		features = netif_skb_features(skb);
2551 
2552 		if (vlan_tx_tag_present(skb) &&
2553 		    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2554 			skb = __vlan_put_tag(skb, skb->vlan_proto,
2555 					     vlan_tx_tag_get(skb));
2556 			if (unlikely(!skb))
2557 				goto out;
2558 
2559 			skb->vlan_tci = 0;
2560 		}
2561 
2562 		/* If encapsulation offload request, verify we are testing
2563 		 * hardware encapsulation features instead of standard
2564 		 * features for the netdev
2565 		 */
2566 		if (skb->encapsulation)
2567 			features &= dev->hw_enc_features;
2568 
2569 		if (netif_needs_gso(skb, features)) {
2570 			if (unlikely(dev_gso_segment(skb, features)))
2571 				goto out_kfree_skb;
2572 			if (skb->next)
2573 				goto gso;
2574 		} else {
2575 			if (skb_needs_linearize(skb, features) &&
2576 			    __skb_linearize(skb))
2577 				goto out_kfree_skb;
2578 
2579 			/* If packet is not checksummed and device does not
2580 			 * support checksumming for this protocol, complete
2581 			 * checksumming here.
2582 			 */
2583 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2584 				if (skb->encapsulation)
2585 					skb_set_inner_transport_header(skb,
2586 						skb_checksum_start_offset(skb));
2587 				else
2588 					skb_set_transport_header(skb,
2589 						skb_checksum_start_offset(skb));
2590 				if (!(features & NETIF_F_ALL_CSUM) &&
2591 				     skb_checksum_help(skb))
2592 					goto out_kfree_skb;
2593 			}
2594 		}
2595 
2596 		if (!list_empty(&ptype_all))
2597 			dev_queue_xmit_nit(skb, dev);
2598 
2599 		skb_len = skb->len;
2600 		if (accel_priv)
2601 			rc = ops->ndo_dfwd_start_xmit(skb, dev, accel_priv);
2602 		else
2603 			rc = ops->ndo_start_xmit(skb, dev);
2604 
2605 		trace_net_dev_xmit(skb, rc, dev, skb_len);
2606 		if (rc == NETDEV_TX_OK && txq)
2607 			txq_trans_update(txq);
2608 		return rc;
2609 	}
2610 
2611 gso:
2612 	do {
2613 		struct sk_buff *nskb = skb->next;
2614 
2615 		skb->next = nskb->next;
2616 		nskb->next = NULL;
2617 
2618 		if (!list_empty(&ptype_all))
2619 			dev_queue_xmit_nit(nskb, dev);
2620 
2621 		skb_len = nskb->len;
2622 		if (accel_priv)
2623 			rc = ops->ndo_dfwd_start_xmit(nskb, dev, accel_priv);
2624 		else
2625 			rc = ops->ndo_start_xmit(nskb, dev);
2626 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2627 		if (unlikely(rc != NETDEV_TX_OK)) {
2628 			if (rc & ~NETDEV_TX_MASK)
2629 				goto out_kfree_gso_skb;
2630 			nskb->next = skb->next;
2631 			skb->next = nskb;
2632 			return rc;
2633 		}
2634 		txq_trans_update(txq);
2635 		if (unlikely(netif_xmit_stopped(txq) && skb->next))
2636 			return NETDEV_TX_BUSY;
2637 	} while (skb->next);
2638 
2639 out_kfree_gso_skb:
2640 	if (likely(skb->next == NULL)) {
2641 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2642 		consume_skb(skb);
2643 		return rc;
2644 	}
2645 out_kfree_skb:
2646 	kfree_skb(skb);
2647 out:
2648 	return rc;
2649 }
2650 EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2651 
2652 static void qdisc_pkt_len_init(struct sk_buff *skb)
2653 {
2654 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2655 
2656 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2657 
2658 	/* To get more precise estimation of bytes sent on wire,
2659 	 * we add to pkt_len the headers size of all segments
2660 	 */
2661 	if (shinfo->gso_size)  {
2662 		unsigned int hdr_len;
2663 		u16 gso_segs = shinfo->gso_segs;
2664 
2665 		/* mac layer + network layer */
2666 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2667 
2668 		/* + transport layer */
2669 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2670 			hdr_len += tcp_hdrlen(skb);
2671 		else
2672 			hdr_len += sizeof(struct udphdr);
2673 
2674 		if (shinfo->gso_type & SKB_GSO_DODGY)
2675 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2676 						shinfo->gso_size);
2677 
2678 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2679 	}
2680 }
2681 
/*
 * Hand @skb to qdisc @q of queue @txq on @dev.
 *
 * Under the qdisc root lock one of three things happens:
 *  - the qdisc is deactivated: the skb is freed, NET_XMIT_DROP;
 *  - the qdisc can be bypassed, is empty and not running: the skb is
 *    transmitted directly via sch_direct_xmit(), NET_XMIT_SUCCESS;
 *  - otherwise the skb is enqueued and the qdisc is run if nobody else
 *    is running it; the masked enqueue result is returned.
 */
2682 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2683 				 struct net_device *dev,
2684 				 struct netdev_queue *txq)
2685 {
2686 	spinlock_t *root_lock = qdisc_lock(q);
2687 	bool contended;
2688 	int rc;
2689 
2690 	qdisc_pkt_len_init(skb);
2691 	qdisc_calculate_pkt_len(skb, q);
2692 	/*
2693 	 * Heuristic to force contended enqueues to serialize on a
2694 	 * separate lock before trying to get qdisc main lock.
2695 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2696 	 * and dequeue packets faster.
2697 	 */
2698 	contended = qdisc_is_running(q);
2699 	if (unlikely(contended))
2700 		spin_lock(&q->busylock);
2701 
2702 	spin_lock(root_lock);
2703 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2704 		kfree_skb(skb);
2705 		rc = NET_XMIT_DROP;
2706 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2707 		   qdisc_run_begin(q)) {
2708 		/*
2709 		 * This is a work-conserving queue; there are no old skbs
2710 		 * waiting to be sent out; and the qdisc is not running -
2711 		 * xmit the skb directly.
2712 		 */
2713 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2714 			skb_dst_force(skb);
2715 
2716 		qdisc_bstats_update(q, skb);
2717 
		/* nonzero return: keep running the qdisc; busylock is released first */
2718 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2719 			if (unlikely(contended)) {
2720 				spin_unlock(&q->busylock);
2721 				contended = false;
2722 			}
2723 			__qdisc_run(q);
2724 		} else
2725 			qdisc_run_end(q);
2726 
2727 		rc = NET_XMIT_SUCCESS;
2728 	} else {
2729 		skb_dst_force(skb);
2730 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2731 		if (qdisc_run_begin(q)) {
2732 			if (unlikely(contended)) {
2733 				spin_unlock(&q->busylock);
2734 				contended = false;
2735 			}
2736 			__qdisc_run(q);
2737 		}
2738 	}
2739 	spin_unlock(root_lock);
	/* busylock still held if it was not dropped before running the qdisc */
2740 	if (unlikely(contended))
2741 		spin_unlock(&q->busylock);
2742 	return rc;
2743 }
2744 
2745 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2746 static void skb_update_prio(struct sk_buff *skb)
2747 {
2748 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2749 
2750 	if (!skb->priority && skb->sk && map) {
2751 		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2752 
2753 		if (prioidx < map->priomap_len)
2754 			skb->priority = map->priomap[prioidx];
2755 	}
2756 }
2757 #else
2758 #define skb_update_prio(skb)
2759 #endif
2760 
/*
 * Per-CPU nesting depth of dev_hard_start_xmit() calls made from the
 * queue-less path of dev_queue_xmit().  Once the depth exceeds
 * RECURSION_LIMIT the packet is dropped and a "Dead loop" message is
 * logged.
 */
2761 static DEFINE_PER_CPU(int, xmit_recursion);
2762 #define RECURSION_LIMIT 10
2763 
2764 /**
2765  *	dev_loopback_xmit - loop back @skb
2766  *	@skb: buffer to transmit
2767  */
2768 int dev_loopback_xmit(struct sk_buff *skb)
2769 {
2770 	skb_reset_mac_header(skb);
2771 	__skb_pull(skb, skb_network_offset(skb));
2772 	skb->pkt_type = PACKET_LOOPBACK;
2773 	skb->ip_summed = CHECKSUM_UNNECESSARY;
2774 	WARN_ON(!skb_dst(skb));
2775 	skb_dst_force(skb);
2776 	netif_rx_ni(skb);
2777 	return 0;
2778 }
2779 EXPORT_SYMBOL(dev_loopback_xmit);
2780 
2781 /**
2782  *	dev_queue_xmit - transmit a buffer
2783  *	@skb: buffer to transmit
2784  *
2785  *	Queue a buffer for transmission to a network device. The caller must
2786  *	have set the device and priority and built the buffer before calling
2787  *	this function. The function can be called from an interrupt.
2788  *
2789  *	A negative errno code is returned on a failure. A success does not
2790  *	guarantee the frame will be transmitted as it may be dropped due
2791  *	to congestion or traffic shaping.
2792  *
2793  * -----------------------------------------------------------------------------------
2794  *      I notice this method can also return errors from the queue disciplines,
2795  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2796  *      be positive.
2797  *
2798  *      Regardless of the return value, the skb is consumed, so it is currently
2799  *      difficult to retry a send to this method.  (You can bump the ref count
2800  *      before sending to hold a reference for retry if you are careful.)
2801  *
2802  *      When calling this method, interrupts MUST be enabled.  This is because
2803  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2804  *          --BLG
 *
 *	Two paths exist.  If the selected queue's qdisc has an enqueue
 *	operation the skb goes through __dev_xmit_skb().  Otherwise (a
 *	device without a queue) the driver is called directly under the
 *	TX lock, provided the device is up, the queue is not stopped and
 *	this CPU does not already own the TX lock; in every other case
 *	the skb is freed and -ENETDOWN is returned.
2805  */
2806 int dev_queue_xmit(struct sk_buff *skb)
2807 {
2808 	struct net_device *dev = skb->dev;
2809 	struct netdev_queue *txq;
2810 	struct Qdisc *q;
2811 	int rc = -ENOMEM;
2812 
2813 	skb_reset_mac_header(skb);
2814 
2815 	/* Disable soft irqs for various locks below. Also
2816 	 * stops preemption for RCU.
2817 	 */
2818 	rcu_read_lock_bh();
2819 
2820 	skb_update_prio(skb);
2821 
	/* Choose the TX queue and fetch its qdisc under RCU */
2822 	txq = netdev_pick_tx(dev, skb);
2823 	q = rcu_dereference_bh(txq->qdisc);
2824 
2825 #ifdef CONFIG_NET_CLS_ACT
2826 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2827 #endif
2828 	trace_net_dev_queue(skb);
	/* Normal case: the qdisc queues packets */
2829 	if (q->enqueue) {
2830 		rc = __dev_xmit_skb(skb, q, dev, txq);
2831 		goto out;
2832 	}
2833 
2834 	/* The device has no queue. Common case for software devices:
2835 	   loopback, all the sorts of tunnels...
2836 
2837 	   Really, it is unlikely that netif_tx_lock protection is necessary
2838 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2839 	   counters.)
2840 	   However, it is possible, that they rely on protection
2841 	   made by us here.
2842 
2843 	   Check this and shot the lock. It is not prone from deadlocks.
2844 	   Either shot noqueue qdisc, it is even simpler 8)
2845 	 */
2846 	if (dev->flags & IFF_UP) {
2847 		int cpu = smp_processor_id(); /* ok because BHs are off */
2848 
		/* Owning the TX lock already means we re-entered on this CPU */
2849 		if (txq->xmit_lock_owner != cpu) {
2850 
2851 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2852 				goto recursion_alert;
2853 
2854 			HARD_TX_LOCK(dev, txq, cpu);
2855 
2856 			if (!netif_xmit_stopped(txq)) {
				/* track nesting depth across the driver call */
2857 				__this_cpu_inc(xmit_recursion);
2858 				rc = dev_hard_start_xmit(skb, dev, txq, NULL);
2859 				__this_cpu_dec(xmit_recursion);
2860 				if (dev_xmit_complete(rc)) {
2861 					HARD_TX_UNLOCK(dev, txq);
2862 					goto out;
2863 				}
2864 			}
2865 			HARD_TX_UNLOCK(dev, txq);
2866 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2867 					     dev->name);
2868 		} else {
2869 			/* Recursion is detected! It is possible,
2870 			 * unfortunately
2871 			 */
2872 recursion_alert:
2873 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2874 					     dev->name);
2875 		}
2876 	}
2877 
	/* Failure on the queue-less path: drop the packet */
2878 	rc = -ENETDOWN;
2879 	rcu_read_unlock_bh();
2880 
2881 	kfree_skb(skb);
2882 	return rc;
2883 out:
2884 	rcu_read_unlock_bh();
2885 	return rc;
2886 }
2887 EXPORT_SYMBOL(dev_queue_xmit);
2888 
2889 
2890 /*=======================================================================
2891 			Receiver routines
2892   =======================================================================*/
2893 
/*
 * Limit on the length of a per-CPU input packet queue; compared against
 * the queue length in enqueue_to_backlog() and, halved, in
 * skb_flow_limit().
 */
2894 int netdev_max_backlog __read_mostly = 1000;
2895 EXPORT_SYMBOL(netdev_max_backlog);
2896 
/* NOTE(review): the users of the three tunables below are outside this part of the file */
2897 int netdev_tstamp_prequeue __read_mostly = 1;
2898 int netdev_budget __read_mostly = 300;
2899 int weight_p __read_mostly = 64;            /* old backlog weight */
2900 
2901 /* Called with irq disabled */
2902 static inline void ____napi_schedule(struct softnet_data *sd,
2903 				     struct napi_struct *napi)
2904 {
2905 	list_add_tail(&napi->poll_list, &sd->poll_list);
2906 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2907 }
2908 
2909 #ifdef CONFIG_RPS
2910 
2911 /* One global table that all flow-based protocols share. */
/* RCU-protected; get_rps_cpu() reads it to find the CPU wanted for a flow hash */
2912 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2913 EXPORT_SYMBOL(rps_sock_flow_table);
2914 
/* NOTE(review): presumably enabled while any RPS/RFS configuration exists - users are outside this part of the file */
2915 struct static_key rps_needed __read_mostly;
2916 
/*
 * Point flow entry @rflow at @next_cpu and return the entry to use from
 * now on.
 *
 * When @next_cpu is a real CPU, last_qtail is set to that CPU's current
 * input_queue_head.  With CONFIG_RFS_ACCEL the driver may additionally be
 * asked, through ndo_rx_flow_steer(), to steer the flow to the RX queue
 * that the CPU reverse-map associates with @next_cpu; on success the
 * returned entry is the one in that queue's flow table, not @rflow.
 */
2917 static struct rps_dev_flow *
2918 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2919 	    struct rps_dev_flow *rflow, u16 next_cpu)
2920 {
2921 	if (next_cpu != RPS_NO_CPU) {
2922 #ifdef CONFIG_RFS_ACCEL
2923 		struct netdev_rx_queue *rxqueue;
2924 		struct rps_dev_flow_table *flow_table;
2925 		struct rps_dev_flow *old_rflow;
2926 		u32 flow_id;
2927 		u16 rxq_index;
2928 		int rc;
2929 
2930 		/* Should we steer this flow to a different hardware queue? */
2931 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2932 		    !(dev->features & NETIF_F_NTUPLE))
2933 			goto out;
2934 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* already arriving on the right queue */
2935 		if (rxq_index == skb_get_rx_queue(skb))
2936 			goto out;
2937 
2938 		rxqueue = dev->_rx + rxq_index;
2939 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2940 		if (!flow_table)
2941 			goto out;
2942 		flow_id = skb->rxhash & flow_table->mask;
2943 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2944 							rxq_index, flow_id);
2945 		if (rc < 0)
2946 			goto out;
		/* switch to the entry of the new queue; rc is the filter id */
2947 		old_rflow = rflow;
2948 		rflow = &flow_table->flows[flow_id];
2949 		rflow->filter = rc;
2950 		if (old_rflow->filter == rflow->filter)
2951 			old_rflow->filter = RPS_NO_FILTER;
2952 	out:
2953 #endif
2954 		rflow->last_qtail =
2955 			per_cpu(softnet_data, next_cpu).input_queue_head;
2956 	}
2957 
2958 	rflow->cpu = next_cpu;
2959 	return rflow;
2960 }
2961 
2962 /*
2963  * get_rps_cpu is called from netif_receive_skb and returns the target
2964  * CPU from the RPS map of the receiving queue for a given skb.
2965  * rcu_read_lock must be held on entry.
 *
 * Returns the chosen CPU, or -1 when no CPU is selected.  *rflowp is
 * written only when the CPU comes from the flow table; in every other
 * case it is left untouched.
2966  */
2967 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2968 		       struct rps_dev_flow **rflowp)
2969 {
2970 	struct netdev_rx_queue *rxqueue;
2971 	struct rps_map *map;
2972 	struct rps_dev_flow_table *flow_table;
2973 	struct rps_sock_flow_table *sock_flow_table;
2974 	int cpu = -1;
2975 	u16 tcpu;
2976 
	/* Find the RX queue the packet arrived on (queue 0 if not recorded) */
2977 	if (skb_rx_queue_recorded(skb)) {
2978 		u16 index = skb_get_rx_queue(skb);
2979 		if (unlikely(index >= dev->real_num_rx_queues)) {
2980 			WARN_ONCE(dev->real_num_rx_queues > 1,
2981 				  "%s received packet on queue %u, but number "
2982 				  "of RX queues is %u\n",
2983 				  dev->name, index, dev->real_num_rx_queues);
2984 			goto done;
2985 		}
2986 		rxqueue = dev->_rx + index;
2987 	} else
2988 		rxqueue = dev->_rx;
2989 
2990 	map = rcu_dereference(rxqueue->rps_map);
2991 	if (map) {
		/* single-CPU map and no flow table: no hash needed */
2992 		if (map->len == 1 &&
2993 		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2994 			tcpu = map->cpus[0];
2995 			if (cpu_online(tcpu))
2996 				cpu = tcpu;
2997 			goto done;
2998 		}
2999 	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
		/* neither a map nor a flow table on this queue */
3000 		goto done;
3001 	}
3002 
3003 	skb_reset_network_header(skb);
	/* a zero hash means the packet cannot be steered */
3004 	if (!skb_get_hash(skb))
3005 		goto done;
3006 
3007 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3008 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3009 	if (flow_table && sock_flow_table) {
3010 		u16 next_cpu;
3011 		struct rps_dev_flow *rflow;
3012 
3013 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3014 		tcpu = rflow->cpu;
3015 
3016 		next_cpu = sock_flow_table->ents[skb->rxhash &
3017 		    sock_flow_table->mask];
3018 
3019 		/*
3020 		 * If the desired CPU (where last recvmsg was done) is
3021 		 * different from current CPU (one in the rx-queue flow
3022 		 * table entry), switch if one of the following holds:
3023 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
3024 		 *   - Current CPU is offline.
3025 		 *   - The current CPU's queue tail has advanced beyond the
3026 		 *     last packet that was enqueued using this table entry.
3027 		 *     This guarantees that all previous packets for the flow
3028 		 *     have been dequeued, thus preserving in order delivery.
3029 		 */
3030 		if (unlikely(tcpu != next_cpu) &&
3031 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3032 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3033 		      rflow->last_qtail)) >= 0)) {
3034 			tcpu = next_cpu;
3035 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3036 		}
3037 
3038 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3039 			*rflowp = rflow;
3040 			cpu = tcpu;
3041 			goto done;
3042 		}
3043 	}
3044 
	/* Fall back to the RPS map: scale the 32-bit hash onto [0, map->len) */
3045 	if (map) {
3046 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3047 
3048 		if (cpu_online(tcpu)) {
3049 			cpu = tcpu;
3050 			goto done;
3051 		}
3052 	}
3053 
3054 done:
3055 	return cpu;
3056 }
3057 
3058 #ifdef CONFIG_RFS_ACCEL
3059 
3060 /**
3061  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3062  * @dev: Device on which the filter was set
3063  * @rxq_index: RX queue index
3064  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3065  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3066  *
3067  * Drivers that implement ndo_rx_flow_steer() should periodically call
3068  * this function for each installed filter and remove the filters for
3069  * which it returns %true.
3070  */
3071 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3072 			 u32 flow_id, u16 filter_id)
3073 {
3074 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3075 	struct rps_dev_flow_table *flow_table;
3076 	struct rps_dev_flow *rflow;
3077 	bool expire = true;
3078 	int cpu;
3079 
3080 	rcu_read_lock();
3081 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3082 	if (flow_table && flow_id <= flow_table->mask) {
3083 		rflow = &flow_table->flows[flow_id];
3084 		cpu = ACCESS_ONCE(rflow->cpu);
3085 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3086 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3087 			   rflow->last_qtail) <
3088 		     (int)(10 * flow_table->mask)))
3089 			expire = false;
3090 	}
3091 	rcu_read_unlock();
3092 	return expire;
3093 }
3094 EXPORT_SYMBOL(rps_may_expire_flow);
3095 
3096 #endif /* CONFIG_RFS_ACCEL */
3097 
3098 /* Called from hardirq (IPI) context */
3099 static void rps_trigger_softirq(void *data)
3100 {
3101 	struct softnet_data *sd = data;
3102 
3103 	____napi_schedule(sd, &sd->backlog);
3104 	sd->received_rps++;
3105 }
3106 
3107 #endif /* CONFIG_RPS */
3108 
/*
 * Check whether @sd belongs to another CPU.
 * If so, link it into this CPU's IPI list, raise NET_RX_SOFTIRQ and
 * return 1.  Return 0 when @sd is this CPU's own structure, and always
 * when CONFIG_RPS is not set.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = &__get_cpu_var(softnet_data);

	if (sd == local)
		return 0;

	/* push @sd on the head of our pending IPI list */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3129 
#ifdef CONFIG_NET_FLOW_LIMIT
/*
 * Flow limit table length, 4096 (1 << 12) by default.  Only defined when
 * CONFIG_NET_FLOW_LIMIT is enabled.
 * NOTE(review): presumably this sizes the per-CPU sd_flow_limit bucket
 * array used by skb_flow_limit() and must stay a power of two, since that
 * function masks the hash with (num_buckets - 1) -- confirm where
 * num_buckets is set.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3133 
3134 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3135 {
3136 #ifdef CONFIG_NET_FLOW_LIMIT
3137 	struct sd_flow_limit *fl;
3138 	struct softnet_data *sd;
3139 	unsigned int old_flow, new_flow;
3140 
3141 	if (qlen < (netdev_max_backlog >> 1))
3142 		return false;
3143 
3144 	sd = &__get_cpu_var(softnet_data);
3145 
3146 	rcu_read_lock();
3147 	fl = rcu_dereference(sd->flow_limit);
3148 	if (fl) {
3149 		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3150 		old_flow = fl->history[fl->history_head];
3151 		fl->history[fl->history_head] = new_flow;
3152 
3153 		fl->history_head++;
3154 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3155 
3156 		if (likely(fl->buckets[old_flow]))
3157 			fl->buckets[old_flow]--;
3158 
3159 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3160 			fl->count++;
3161 			rcu_read_unlock();
3162 			return true;
3163 		}
3164 	}
3165 	rcu_read_unlock();
3166 #endif
3167 	return false;
3168 }
3169 
3170 /*
3171  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3172  * queue (may be a remote CPU queue).
3173  */
3174 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3175 			      unsigned int *qtail)
3176 {
3177 	struct softnet_data *sd;
3178 	unsigned long flags;
3179 	unsigned int qlen;
3180 
3181 	sd = &per_cpu(softnet_data, cpu);
3182 
3183 	local_irq_save(flags);
3184 
3185 	rps_lock(sd);
3186 	qlen = skb_queue_len(&sd->input_pkt_queue);
3187 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3188 		if (skb_queue_len(&sd->input_pkt_queue)) {
3189 enqueue:
3190 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3191 			input_queue_tail_incr_save(sd, qtail);
3192 			rps_unlock(sd);
3193 			local_irq_restore(flags);
3194 			return NET_RX_SUCCESS;
3195 		}
3196 
3197 		/* Schedule NAPI for backlog device
3198 		 * We can use non atomic operation since we own the queue lock
3199 		 */
3200 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3201 			if (!rps_ipi_queued(sd))
3202 				____napi_schedule(sd, &sd->backlog);
3203 		}
3204 		goto enqueue;
3205 	}
3206 
3207 	sd->dropped++;
3208 	rps_unlock(sd);
3209 
3210 	local_irq_restore(flags);
3211 
3212 	atomic_long_inc(&skb->dev->rx_dropped);
3213 	kfree_skb(skb);
3214 	return NET_RX_DROP;
3215 }
3216 
3217 /**
3218  *	netif_rx	-	post buffer to the network code
3219  *	@skb: buffer to post
3220  *
3221  *	This function receives a packet from a device driver and queues it for
3222  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3223  *	may be dropped during processing for congestion control or by the
3224  *	protocol layers.
3225  *
3226  *	return values:
3227  *	NET_RX_SUCCESS	(no congestion)
3228  *	NET_RX_DROP     (packet was dropped)
3229  *
3230  */
3231 
3232 int netif_rx(struct sk_buff *skb)
3233 {
3234 	int ret;
3235 
3236 	/* if netpoll wants it, pretend we never saw it */
3237 	if (netpoll_rx(skb))
3238 		return NET_RX_DROP;
3239 
3240 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3241 
3242 	trace_netif_rx(skb);
3243 #ifdef CONFIG_RPS
3244 	if (static_key_false(&rps_needed)) {
3245 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3246 		int cpu;
3247 
3248 		preempt_disable();
3249 		rcu_read_lock();
3250 
3251 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3252 		if (cpu < 0)
3253 			cpu = smp_processor_id();
3254 
3255 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3256 
3257 		rcu_read_unlock();
3258 		preempt_enable();
3259 	} else
3260 #endif
3261 	{
3262 		unsigned int qtail;
3263 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3264 		put_cpu();
3265 	}
3266 	return ret;
3267 }
3268 EXPORT_SYMBOL(netif_rx);
3269 
/*
 * Variant of netif_rx() that, with preemption disabled, also runs any
 * softirq pending on this CPU right after the skb has been queued.
 * Returns the value netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int rc;

	preempt_disable();

	rc = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
3283 
3284 static void net_tx_action(struct softirq_action *h)
3285 {
3286 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3287 
3288 	if (sd->completion_queue) {
3289 		struct sk_buff *clist;
3290 
3291 		local_irq_disable();
3292 		clist = sd->completion_queue;
3293 		sd->completion_queue = NULL;
3294 		local_irq_enable();
3295 
3296 		while (clist) {
3297 			struct sk_buff *skb = clist;
3298 			clist = clist->next;
3299 
3300 			WARN_ON(atomic_read(&skb->users));
3301 			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3302 				trace_consume_skb(skb);
3303 			else
3304 				trace_kfree_skb(skb, net_tx_action);
3305 			__kfree_skb(skb);
3306 		}
3307 	}
3308 
3309 	if (sd->output_queue) {
3310 		struct Qdisc *head;
3311 
3312 		local_irq_disable();
3313 		head = sd->output_queue;
3314 		sd->output_queue = NULL;
3315 		sd->output_queue_tailp = &sd->output_queue;
3316 		local_irq_enable();
3317 
3318 		while (head) {
3319 			struct Qdisc *q = head;
3320 			spinlock_t *root_lock;
3321 
3322 			head = head->next_sched;
3323 
3324 			root_lock = qdisc_lock(q);
3325 			if (spin_trylock(root_lock)) {
3326 				smp_mb__before_clear_bit();
3327 				clear_bit(__QDISC_STATE_SCHED,
3328 					  &q->state);
3329 				qdisc_run(q);
3330 				spin_unlock(root_lock);
3331 			} else {
3332 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3333 					      &q->state)) {
3334 					__netif_reschedule(q);
3335 				} else {
3336 					smp_mb__before_clear_bit();
3337 					clear_bit(__QDISC_STATE_SCHED,
3338 						  &q->state);
3339 				}
3340 			}
3341 		}
3342 	}
3343 }
3344 
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE.
 * It is a function pointer taking a device and an address, exported
 * GPL-only, and it only exists when both the bridge and ATM LANE are
 * configured (built-in or modular).  Nothing in this part of the file
 * assigns or calls it.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3352 
3353 #ifdef CONFIG_NET_CLS_ACT
3354 /* TODO: Maybe we should just force sch_ingress to be compiled in
3355  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3356  * a compare and 2 stores extra right now if we dont have it on
3357  * but have CONFIG_NET_CLS_ACT
3358  * NOTE: This doesn't stop any functionality; if you dont have
3359  * the ingress scheduler, you just can't add policies on ingress.
3360  *
3361  */
3362 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3363 {
3364 	struct net_device *dev = skb->dev;
3365 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3366 	int result = TC_ACT_OK;
3367 	struct Qdisc *q;
3368 
3369 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3370 		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3371 				     skb->skb_iif, dev->ifindex);
3372 		return TC_ACT_SHOT;
3373 	}
3374 
3375 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3376 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3377 
3378 	q = rxq->qdisc;
3379 	if (q != &noop_qdisc) {
3380 		spin_lock(qdisc_lock(q));
3381 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3382 			result = qdisc_enqueue_root(skb, q);
3383 		spin_unlock(qdisc_lock(q));
3384 	}
3385 
3386 	return result;
3387 }
3388 
3389 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3390 					 struct packet_type **pt_prev,
3391 					 int *ret, struct net_device *orig_dev)
3392 {
3393 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3394 
3395 	if (!rxq || rxq->qdisc == &noop_qdisc)
3396 		goto out;
3397 
3398 	if (*pt_prev) {
3399 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3400 		*pt_prev = NULL;
3401 	}
3402 
3403 	switch (ing_filter(skb, rxq)) {
3404 	case TC_ACT_SHOT:
3405 	case TC_ACT_STOLEN:
3406 		kfree_skb(skb);
3407 		return NULL;
3408 	}
3409 
3410 out:
3411 	skb->tc_verd = 0;
3412 	return skb;
3413 }
3414 #endif
3415 
3416 /**
3417  *	netdev_rx_handler_register - register receive handler
3418  *	@dev: device to register a handler for
3419  *	@rx_handler: receive handler to register
3420  *	@rx_handler_data: data pointer that is used by rx handler
3421  *
3422  *	Register a receive hander for a device. This handler will then be
3423  *	called from __netif_receive_skb. A negative errno code is returned
3424  *	on a failure.
3425  *
3426  *	The caller must hold the rtnl_mutex.
3427  *
3428  *	For a general description of rx_handler, see enum rx_handler_result.
3429  */
3430 int netdev_rx_handler_register(struct net_device *dev,
3431 			       rx_handler_func_t *rx_handler,
3432 			       void *rx_handler_data)
3433 {
3434 	ASSERT_RTNL();
3435 
3436 	if (dev->rx_handler)
3437 		return -EBUSY;
3438 
3439 	/* Note: rx_handler_data must be set before rx_handler */
3440 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3441 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3442 
3443 	return 0;
3444 }
3445 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3446 
/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device. The handler pointer is
 *	cleared first; the handler data is cleared only after
 *	synchronize_net() has returned.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

	ASSERT_RTNL();
	/* clear the handler first so that new readers stop calling it */
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	/* only now is it safe to drop the data the handler relied on */
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3468 
3469 /*
3470  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3471  * the special handling of PFMEMALLOC skbs.
3472  */
3473 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3474 {
3475 	switch (skb->protocol) {
3476 	case __constant_htons(ETH_P_ARP):
3477 	case __constant_htons(ETH_P_IP):
3478 	case __constant_htons(ETH_P_IPV6):
3479 	case __constant_htons(ETH_P_8021Q):
3480 	case __constant_htons(ETH_P_8021AD):
3481 		return true;
3482 	default:
3483 		return false;
3484 	}
3485 }
3486 
/*
 * Core of the receive path for one skb.
 *
 * In order: netpoll check, header offset reset, VLAN untagging, delivery
 * to the ptype_all taps, ingress filtering (CONFIG_NET_CLS_ACT), VLAN
 * device handling, the device rx_handler, and finally the protocol
 * handlers hashed in ptype_base for skb->protocol.  Delivery is always one
 * handler behind (pt_prev): a handler is only called through deliver_skb()
 * once the next candidate is known, and the last one is called directly.
 *
 * @pfmemalloc: when true the ptype_all taps are skipped and the skb is
 * dropped unless skb_pfmemalloc_protocol() accepts its protocol.
 *
 * Returns NET_RX_DROP by default (also when nobody took the skb), otherwise
 * the value of the last deliver_skb()/handler call, or NET_RX_SUCCESS when
 * the rx_handler consumed the skb.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		goto out;

	/* remember the device the skb arrived on; skb->dev may change below */
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

	/* re-entered whenever VLAN handling or the rx_handler asks for the
	 * skb to be processed again
	 */
another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = vlan_untag(skb);
		if (unlikely(!skb))
			goto unlock;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and bypass both the taps and ingress filtering */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	/* taps registered for all protocols, either global or bound to this device */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

skip_taps:
#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto unlock;
ncls:
#endif

	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (vlan_tx_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto unlock;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		/* flush the pending handler before the rx_handler may take the skb */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto unlock;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(vlan_tx_tag_present(skb))) {
		/* still tagged here: a non-zero VLAN id marks the packet as
		 * not for this host
		 */
		if (vlan_tx_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* last handler: called directly instead of through deliver_skb() */
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

unlock:
	rcu_read_unlock();
out:
	return ret;
}
3634 
3635 static int __netif_receive_skb(struct sk_buff *skb)
3636 {
3637 	int ret;
3638 
3639 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3640 		unsigned long pflags = current->flags;
3641 
3642 		/*
3643 		 * PFMEMALLOC skbs are special, they should
3644 		 * - be delivered to SOCK_MEMALLOC sockets only
3645 		 * - stay away from userspace
3646 		 * - have bounded memory usage
3647 		 *
3648 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3649 		 * context down to all allocation sites.
3650 		 */
3651 		current->flags |= PF_MEMALLOC;
3652 		ret = __netif_receive_skb_core(skb, true);
3653 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3654 	} else
3655 		ret = __netif_receive_skb_core(skb, false);
3656 
3657 	return ret;
3658 }
3659 
3660 /**
3661  *	netif_receive_skb - process receive buffer from network
3662  *	@skb: buffer to process
3663  *
3664  *	netif_receive_skb() is the main receive data processing function.
3665  *	It always succeeds. The buffer may be dropped during processing
3666  *	for congestion control or by the protocol layers.
3667  *
3668  *	This function may only be called from softirq context and interrupts
3669  *	should be enabled.
3670  *
3671  *	Return values (usually ignored):
3672  *	NET_RX_SUCCESS: no congestion
3673  *	NET_RX_DROP: packet was dropped
3674  */
3675 int netif_receive_skb(struct sk_buff *skb)
3676 {
3677 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3678 
3679 	if (skb_defer_rx_timestamp(skb))
3680 		return NET_RX_SUCCESS;
3681 
3682 #ifdef CONFIG_RPS
3683 	if (static_key_false(&rps_needed)) {
3684 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3685 		int cpu, ret;
3686 
3687 		rcu_read_lock();
3688 
3689 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3690 
3691 		if (cpu >= 0) {
3692 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3693 			rcu_read_unlock();
3694 			return ret;
3695 		}
3696 		rcu_read_unlock();
3697 	}
3698 #endif
3699 	return __netif_receive_skb(skb);
3700 }
3701 EXPORT_SYMBOL(netif_receive_skb);
3702 
3703 /* Network device is going away, flush any packets still pending
3704  * Called with irqs disabled.
3705  */
3706 static void flush_backlog(void *arg)
3707 {
3708 	struct net_device *dev = arg;
3709 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3710 	struct sk_buff *skb, *tmp;
3711 
3712 	rps_lock(sd);
3713 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3714 		if (skb->dev == dev) {
3715 			__skb_unlink(skb, &sd->input_pkt_queue);
3716 			kfree_skb(skb);
3717 			input_queue_head_incr(sd);
3718 		}
3719 	}
3720 	rps_unlock(sd);
3721 
3722 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3723 		if (skb->dev == dev) {
3724 			__skb_unlink(skb, &sd->process_queue);
3725 			kfree_skb(skb);
3726 			input_queue_head_incr(sd);
3727 		}
3728 	}
3729 }
3730 
3731 static int napi_gro_complete(struct sk_buff *skb)
3732 {
3733 	struct packet_offload *ptype;
3734 	__be16 type = skb->protocol;
3735 	struct list_head *head = &offload_base;
3736 	int err = -ENOENT;
3737 
3738 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3739 
3740 	if (NAPI_GRO_CB(skb)->count == 1) {
3741 		skb_shinfo(skb)->gso_size = 0;
3742 		goto out;
3743 	}
3744 
3745 	rcu_read_lock();
3746 	list_for_each_entry_rcu(ptype, head, list) {
3747 		if (ptype->type != type || !ptype->callbacks.gro_complete)
3748 			continue;
3749 
3750 		err = ptype->callbacks.gro_complete(skb, 0);
3751 		break;
3752 	}
3753 	rcu_read_unlock();
3754 
3755 	if (err) {
3756 		WARN_ON(&ptype->list == head);
3757 		kfree_skb(skb);
3758 		return NET_RX_SUCCESS;
3759 	}
3760 
3761 out:
3762 	return netif_receive_skb(skb);
3763 }
3764 
3765 /* napi->gro_list contains packets ordered by age.
3766  * youngest packets at the head of it.
3767  * Complete skbs in reverse order to reduce latencies.
3768  */
3769 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3770 {
3771 	struct sk_buff *skb, *prev = NULL;
3772 
3773 	/* scan list and build reverse chain */
3774 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3775 		skb->prev = prev;
3776 		prev = skb;
3777 	}
3778 
3779 	for (skb = prev; skb; skb = prev) {
3780 		skb->next = NULL;
3781 
3782 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3783 			return;
3784 
3785 		prev = skb->prev;
3786 		napi_gro_complete(skb);
3787 		napi->gro_count--;
3788 	}
3789 
3790 	napi->gro_list = NULL;
3791 }
3792 EXPORT_SYMBOL(napi_gro_flush);
3793 
3794 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3795 {
3796 	struct sk_buff *p;
3797 	unsigned int maclen = skb->dev->hard_header_len;
3798 
3799 	for (p = napi->gro_list; p; p = p->next) {
3800 		unsigned long diffs;
3801 
3802 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3803 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3804 		if (maclen == ETH_HLEN)
3805 			diffs |= compare_ether_header(skb_mac_header(p),
3806 						      skb_gro_mac_header(skb));
3807 		else if (!diffs)
3808 			diffs = memcmp(skb_mac_header(p),
3809 				       skb_gro_mac_header(skb),
3810 				       maclen);
3811 		NAPI_GRO_CB(p)->same_flow = !diffs;
3812 		NAPI_GRO_CB(p)->flush = 0;
3813 	}
3814 }
3815 
3816 static void skb_gro_reset_offset(struct sk_buff *skb)
3817 {
3818 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3819 	const skb_frag_t *frag0 = &pinfo->frags[0];
3820 
3821 	NAPI_GRO_CB(skb)->data_offset = 0;
3822 	NAPI_GRO_CB(skb)->frag0 = NULL;
3823 	NAPI_GRO_CB(skb)->frag0_len = 0;
3824 
3825 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3826 	    pinfo->nr_frags &&
3827 	    !PageHighMem(skb_frag_page(frag0))) {
3828 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3829 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3830 	}
3831 }
3832 
3833 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3834 {
3835 	struct sk_buff **pp = NULL;
3836 	struct packet_offload *ptype;
3837 	__be16 type = skb->protocol;
3838 	struct list_head *head = &offload_base;
3839 	int same_flow;
3840 	enum gro_result ret;
3841 
3842 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3843 		goto normal;
3844 
3845 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3846 		goto normal;
3847 
3848 	skb_gro_reset_offset(skb);
3849 	gro_list_prepare(napi, skb);
3850 
3851 	rcu_read_lock();
3852 	list_for_each_entry_rcu(ptype, head, list) {
3853 		if (ptype->type != type || !ptype->callbacks.gro_receive)
3854 			continue;
3855 
3856 		skb_set_network_header(skb, skb_gro_offset(skb));
3857 		skb_reset_mac_len(skb);
3858 		NAPI_GRO_CB(skb)->same_flow = 0;
3859 		NAPI_GRO_CB(skb)->flush = 0;
3860 		NAPI_GRO_CB(skb)->free = 0;
3861 
3862 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3863 		break;
3864 	}
3865 	rcu_read_unlock();
3866 
3867 	if (&ptype->list == head)
3868 		goto normal;
3869 
3870 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3871 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3872 
3873 	if (pp) {
3874 		struct sk_buff *nskb = *pp;
3875 
3876 		*pp = nskb->next;
3877 		nskb->next = NULL;
3878 		napi_gro_complete(nskb);
3879 		napi->gro_count--;
3880 	}
3881 
3882 	if (same_flow)
3883 		goto ok;
3884 
3885 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3886 		goto normal;
3887 
3888 	napi->gro_count++;
3889 	NAPI_GRO_CB(skb)->count = 1;
3890 	NAPI_GRO_CB(skb)->age = jiffies;
3891 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3892 	skb->next = napi->gro_list;
3893 	napi->gro_list = skb;
3894 	ret = GRO_HELD;
3895 
3896 pull:
3897 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3898 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3899 
3900 		BUG_ON(skb->end - skb->tail < grow);
3901 
3902 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3903 
3904 		skb->tail += grow;
3905 		skb->data_len -= grow;
3906 
3907 		skb_shinfo(skb)->frags[0].page_offset += grow;
3908 		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3909 
3910 		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3911 			skb_frag_unref(skb, 0);
3912 			memmove(skb_shinfo(skb)->frags,
3913 				skb_shinfo(skb)->frags + 1,
3914 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3915 		}
3916 	}
3917 
3918 ok:
3919 	return ret;
3920 
3921 normal:
3922 	ret = GRO_NORMAL;
3923 	goto pull;
3924 }
3925 
3926 
3927 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3928 {
3929 	switch (ret) {
3930 	case GRO_NORMAL:
3931 		if (netif_receive_skb(skb))
3932 			ret = GRO_DROP;
3933 		break;
3934 
3935 	case GRO_DROP:
3936 		kfree_skb(skb);
3937 		break;
3938 
3939 	case GRO_MERGED_FREE:
3940 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3941 			kmem_cache_free(skbuff_head_cache, skb);
3942 		else
3943 			__kfree_skb(skb);
3944 		break;
3945 
3946 	case GRO_HELD:
3947 	case GRO_MERGED:
3948 		break;
3949 	}
3950 
3951 	return ret;
3952 }
3953 
3954 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3955 {
3956 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3957 }
3958 EXPORT_SYMBOL(napi_gro_receive);
3959 
3960 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3961 {
3962 	__skb_pull(skb, skb_headlen(skb));
3963 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
3964 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3965 	skb->vlan_tci = 0;
3966 	skb->dev = napi->dev;
3967 	skb->skb_iif = 0;
3968 
3969 	napi->skb = skb;
3970 }
3971 
3972 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3973 {
3974 	struct sk_buff *skb = napi->skb;
3975 
3976 	if (!skb) {
3977 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3978 		napi->skb = skb;
3979 	}
3980 	return skb;
3981 }
3982 EXPORT_SYMBOL(napi_get_frags);
3983 
3984 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3985 			       gro_result_t ret)
3986 {
3987 	switch (ret) {
3988 	case GRO_NORMAL:
3989 		if (netif_receive_skb(skb))
3990 			ret = GRO_DROP;
3991 		break;
3992 
3993 	case GRO_DROP:
3994 	case GRO_MERGED_FREE:
3995 		napi_reuse_skb(napi, skb);
3996 		break;
3997 
3998 	case GRO_HELD:
3999 	case GRO_MERGED:
4000 		break;
4001 	}
4002 
4003 	return ret;
4004 }
4005 
4006 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4007 {
4008 	struct sk_buff *skb = napi->skb;
4009 
4010 	napi->skb = NULL;
4011 
4012 	if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) {
4013 		napi_reuse_skb(napi, skb);
4014 		return NULL;
4015 	}
4016 	skb->protocol = eth_type_trans(skb, skb->dev);
4017 
4018 	return skb;
4019 }
4020 
4021 gro_result_t napi_gro_frags(struct napi_struct *napi)
4022 {
4023 	struct sk_buff *skb = napi_frags_skb(napi);
4024 
4025 	if (!skb)
4026 		return GRO_DROP;
4027 
4028 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4029 }
4030 EXPORT_SYMBOL(napi_gro_frags);
4031 
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * The pending list of @sd is detached while interrupts are still off;
 * the IPIs themselves are sent with interrupts enabled, and only to CPUs
 * that are online.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;
	struct softnet_data *next;

	if (pending) {
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		for (; pending; pending = next) {
			/* read the link before the remote CPU is kicked */
			next = pending->rps_ipi_next;

			if (cpu_online(pending->cpu))
				__smp_call_function_single(pending->cpu,
							   &pending->csd, 0);
		}
		return;
	}
#endif
	local_irq_enable();
}
4059 
/*
 * NAPI poll function of the per CPU backlog device.
 *
 * Packets are taken from sd->process_queue and handed to
 * __netif_receive_skb() with interrupts enabled.  Whenever that queue runs
 * dry, the contents of sd->input_pkt_queue are spliced onto it under the
 * rps lock.  If what was spliced cannot fill the remaining quota, the NAPI
 * instance is removed from the poll list right away and the quota is
 * lowered so the loop ends once those packets are done.
 *
 * Returns the number of packets processed, at most @quota.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
	/* Check if we have pending IPIs; it is better to send them now
	 * than to wait for the end of net_rx_action().
	 */
	if (sd->rps_ipi_list) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}
#endif
	napi->weight = weight_p;
	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		/* interrupts are off while dequeuing, on while processing */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		/* process_queue is empty: refill it from the input queue */
		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);

		if (qlen < quota - work) {
			/*
			 * Inline a custom version of __napi_complete().
			 * Only the current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we don't need an smp_mb() memory barrier.
			 */
			list_del(&napi->poll_list);
			napi->state = 0;

			/* stop after the packets that were just spliced */
			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
4116 
4117 /**
4118  * __napi_schedule - schedule for receive
4119  * @n: entry to schedule
4120  *
4121  * The entry's receive function will be scheduled to run
4122  */
4123 void __napi_schedule(struct napi_struct *n)
4124 {
4125 	unsigned long flags;
4126 
4127 	local_irq_save(flags);
4128 	____napi_schedule(&__get_cpu_var(softnet_data), n);
4129 	local_irq_restore(flags);
4130 }
4131 EXPORT_SYMBOL(__napi_schedule);
4132 
/*
 * Mark NAPI processing of @n as finished: unlink it from the poll list and
 * clear NAPI_STATE_SCHED.
 *
 * BUGs if @n is not currently scheduled or still holds packets on its GRO
 * list.  napi_complete() in this file calls it with local interrupts
 * disabled.
 */
void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
	BUG_ON(n->gro_list);

	list_del(&n->poll_list);
	/* order the list removal before the SCHED bit is seen as clear */
	smp_mb__before_clear_bit();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
4143 
4144 void napi_complete(struct napi_struct *n)
4145 {
4146 	unsigned long flags;
4147 
4148 	/*
4149 	 * don't let napi dequeue from the cpu poll list
4150 	 * just in case its running on a different cpu
4151 	 */
4152 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4153 		return;
4154 
4155 	napi_gro_flush(n, false);
4156 	local_irq_save(flags);
4157 	__napi_complete(n);
4158 	local_irq_restore(flags);
4159 }
4160 EXPORT_SYMBOL(napi_complete);
4161 
4162 /* must be called under rcu_read_lock(), as we dont take a reference */
4163 struct napi_struct *napi_by_id(unsigned int napi_id)
4164 {
4165 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4166 	struct napi_struct *napi;
4167 
4168 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4169 		if (napi->napi_id == napi_id)
4170 			return napi;
4171 
4172 	return NULL;
4173 }
4174 EXPORT_SYMBOL_GPL(napi_by_id);
4175 
4176 void napi_hash_add(struct napi_struct *napi)
4177 {
4178 	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4179 
4180 		spin_lock(&napi_hash_lock);
4181 
4182 		/* 0 is not a valid id, we also skip an id that is taken
4183 		 * we expect both events to be extremely rare
4184 		 */
4185 		napi->napi_id = 0;
4186 		while (!napi->napi_id) {
4187 			napi->napi_id = ++napi_gen_id;
4188 			if (napi_by_id(napi->napi_id))
4189 				napi->napi_id = 0;
4190 		}
4191 
4192 		hlist_add_head_rcu(&napi->napi_hash_node,
4193 			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4194 
4195 		spin_unlock(&napi_hash_lock);
4196 	}
4197 }
4198 EXPORT_SYMBOL_GPL(napi_hash_add);
4199 
/* Remove @napi from napi_hash if NAPI_STATE_HASHED is set on it; the bit
 * is cleared in the same step, so calling this on an instance that is not
 * hashed does nothing.
 *
 * Warning : caller is responsible to make sure rcu grace period
 * is respected before freeing memory containing @napi
 */
void napi_hash_del(struct napi_struct *napi)
{
	spin_lock(&napi_hash_lock);

	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
		hlist_del_rcu(&napi->napi_hash_node);

	spin_unlock(&napi_hash_lock);
}
EXPORT_SYMBOL_GPL(napi_hash_del);
4213 
4214 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4215 		    int (*poll)(struct napi_struct *, int), int weight)
4216 {
4217 	INIT_LIST_HEAD(&napi->poll_list);
4218 	napi->gro_count = 0;
4219 	napi->gro_list = NULL;
4220 	napi->skb = NULL;
4221 	napi->poll = poll;
4222 	if (weight > NAPI_POLL_WEIGHT)
4223 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4224 			    weight, dev->name);
4225 	napi->weight = weight;
4226 	list_add(&napi->dev_list, &dev->napi_list);
4227 	napi->dev = dev;
4228 #ifdef CONFIG_NETPOLL
4229 	spin_lock_init(&napi->poll_lock);
4230 	napi->poll_owner = -1;
4231 #endif
4232 	set_bit(NAPI_STATE_SCHED, &napi->state);
4233 }
4234 EXPORT_SYMBOL(netif_napi_add);
4235 
4236 void netif_napi_del(struct napi_struct *napi)
4237 {
4238 	list_del_init(&napi->dev_list);
4239 	napi_free_frags(napi);
4240 
4241 	kfree_skb_list(napi->gro_list);
4242 	napi->gro_list = NULL;
4243 	napi->gro_count = 0;
4244 }
4245 EXPORT_SYMBOL(netif_napi_del);
4246 
/* Softirq handler that polls the NAPI instances queued on this CPU's
 * softnet_data poll_list.
 *
 * Polling continues until the list is empty, the packet budget (seeded
 * from netdev_budget) is used up, or the time limit of two jiffies is
 * reached.  In the latter two cases time_squeeze is bumped and
 * NET_RX_SOFTIRQ is raised again before leaving.  @h is not used.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	/* The list is examined with interrupts off; they are re-enabled
	 * around each ->poll() call below.
	 */
	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		/* A poll routine is not expected to report more work than
		 * the weight it was given.
		 */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				/* napi_complete() is called with interrupts
				 * enabled, then they are turned off again.
				 */
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else {
				if (n->gro_list) {
					/* flush too old packets
					 * If HZ < 1000, flush all packets.
					 */
					local_irq_enable();
					napi_gro_flush(n, HZ >= 1000);
					local_irq_disable();
				}
				/* Requeue at the tail so that the other
				 * instances on the list get their turn.
				 */
				list_move_tail(&n->poll_list, &sd->poll_list);
			}
		}

		netpoll_poll_unlock(have);
	}
out:
	/* Reached with interrupts disabled; the helper re-enables them. */
	net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	/* Ran out of budget or time: account for it and ask for the
	 * softirq to be run again.
	 */
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
4341 
/* One entry of a device's adjacency list: records another device that is
 * linked above or below the list's owner.
 */
struct netdev_adjacent {
	/* the adjacent device this entry refers to */
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	/* linkage into the adjacency list this entry lives on */
	struct list_head list;
	/* used to free the entry through kfree_rcu() */
	struct rcu_head rcu;
};
4357 
4358 static struct netdev_adjacent *__netdev_find_adj_rcu(struct net_device *dev,
4359 						     struct net_device *adj_dev,
4360 						     struct list_head *adj_list)
4361 {
4362 	struct netdev_adjacent *adj;
4363 
4364 	list_for_each_entry_rcu(adj, adj_list, list) {
4365 		if (adj->dev == adj_dev)
4366 			return adj;
4367 	}
4368 	return NULL;
4369 }
4370 
4371 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4372 						 struct net_device *adj_dev,
4373 						 struct list_head *adj_list)
4374 {
4375 	struct netdev_adjacent *adj;
4376 
4377 	list_for_each_entry(adj, adj_list, list) {
4378 		if (adj->dev == adj_dev)
4379 			return adj;
4380 	}
4381 	return NULL;
4382 }
4383 
4384 /**
4385  * netdev_has_upper_dev - Check if device is linked to an upper device
4386  * @dev: device
4387  * @upper_dev: upper device to check
4388  *
4389  * Find out if a device is linked to specified upper device and return true
4390  * in case it is. Note that this checks only immediate upper device,
4391  * not through a complete stack of devices. The caller must hold the RTNL lock.
4392  */
4393 bool netdev_has_upper_dev(struct net_device *dev,
4394 			  struct net_device *upper_dev)
4395 {
4396 	ASSERT_RTNL();
4397 
4398 	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4399 }
4400 EXPORT_SYMBOL(netdev_has_upper_dev);
4401 
4402 /**
4403  * netdev_has_any_upper_dev - Check if device is linked to some device
4404  * @dev: device
4405  *
4406  * Find out if a device is linked to an upper device and return true in case
4407  * it is. The caller must hold the RTNL lock.
4408  */
4409 bool netdev_has_any_upper_dev(struct net_device *dev)
4410 {
4411 	ASSERT_RTNL();
4412 
4413 	return !list_empty(&dev->all_adj_list.upper);
4414 }
4415 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4416 
4417 /**
4418  * netdev_master_upper_dev_get - Get master upper device
4419  * @dev: device
4420  *
4421  * Find a master upper device and return pointer to it or NULL in case
4422  * it's not there. The caller must hold the RTNL lock.
4423  */
4424 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4425 {
4426 	struct netdev_adjacent *upper;
4427 
4428 	ASSERT_RTNL();
4429 
4430 	if (list_empty(&dev->adj_list.upper))
4431 		return NULL;
4432 
4433 	upper = list_first_entry(&dev->adj_list.upper,
4434 				 struct netdev_adjacent, list);
4435 	if (likely(upper->master))
4436 		return upper->dev;
4437 	return NULL;
4438 }
4439 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4440 
4441 void *netdev_adjacent_get_private(struct list_head *adj_list)
4442 {
4443 	struct netdev_adjacent *adj;
4444 
4445 	adj = list_entry(adj_list, struct netdev_adjacent, list);
4446 
4447 	return adj->private;
4448 }
4449 EXPORT_SYMBOL(netdev_adjacent_get_private);
4450 
4451 /**
4452  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4453  * @dev: device
4454  * @iter: list_head ** of the current position
4455  *
4456  * Gets the next device from the dev's upper list, starting from iter
4457  * position. The caller must hold RCU read lock.
4458  */
4459 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4460 						     struct list_head **iter)
4461 {
4462 	struct netdev_adjacent *upper;
4463 
4464 	WARN_ON_ONCE(!rcu_read_lock_held());
4465 
4466 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4467 
4468 	if (&upper->list == &dev->all_adj_list.upper)
4469 		return NULL;
4470 
4471 	*iter = &upper->list;
4472 
4473 	return upper->dev;
4474 }
4475 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4476 
4477 /**
4478  * netdev_lower_get_next_private - Get the next ->private from the
4479  *				   lower neighbour list
4480  * @dev: device
4481  * @iter: list_head ** of the current position
4482  *
4483  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4484  * list, starting from iter position. The caller must hold either hold the
4485  * RTNL lock or its own locking that guarantees that the neighbour lower
4486  * list will remain unchainged.
4487  */
4488 void *netdev_lower_get_next_private(struct net_device *dev,
4489 				    struct list_head **iter)
4490 {
4491 	struct netdev_adjacent *lower;
4492 
4493 	lower = list_entry(*iter, struct netdev_adjacent, list);
4494 
4495 	if (&lower->list == &dev->adj_list.lower)
4496 		return NULL;
4497 
4498 	if (iter)
4499 		*iter = lower->list.next;
4500 
4501 	return lower->private;
4502 }
4503 EXPORT_SYMBOL(netdev_lower_get_next_private);
4504 
4505 /**
4506  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4507  *				       lower neighbour list, RCU
4508  *				       variant
4509  * @dev: device
4510  * @iter: list_head ** of the current position
4511  *
4512  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4513  * list, starting from iter position. The caller must hold RCU read lock.
4514  */
4515 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4516 					struct list_head **iter)
4517 {
4518 	struct netdev_adjacent *lower;
4519 
4520 	WARN_ON_ONCE(!rcu_read_lock_held());
4521 
4522 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4523 
4524 	if (&lower->list == &dev->adj_list.lower)
4525 		return NULL;
4526 
4527 	if (iter)
4528 		*iter = &lower->list;
4529 
4530 	return lower->private;
4531 }
4532 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4533 
4534 /**
4535  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4536  *				       lower neighbour list, RCU
4537  *				       variant
4538  * @dev: device
4539  *
4540  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4541  * list. The caller must hold RCU read lock.
4542  */
4543 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4544 {
4545 	struct netdev_adjacent *lower;
4546 
4547 	lower = list_first_or_null_rcu(&dev->adj_list.lower,
4548 			struct netdev_adjacent, list);
4549 	if (lower)
4550 		return lower->private;
4551 	return NULL;
4552 }
4553 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4554 
4555 /**
4556  * netdev_master_upper_dev_get_rcu - Get master upper device
4557  * @dev: device
4558  *
4559  * Find a master upper device and return pointer to it or NULL in case
4560  * it's not there. The caller must hold the RCU read lock.
4561  */
4562 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4563 {
4564 	struct netdev_adjacent *upper;
4565 
4566 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
4567 				       struct netdev_adjacent, list);
4568 	if (upper && likely(upper->master))
4569 		return upper->dev;
4570 	return NULL;
4571 }
4572 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4573 
4574 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4575 					struct net_device *adj_dev,
4576 					struct list_head *dev_list,
4577 					void *private, bool master)
4578 {
4579 	struct netdev_adjacent *adj;
4580 	char linkname[IFNAMSIZ+7];
4581 	int ret;
4582 
4583 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4584 
4585 	if (adj) {
4586 		adj->ref_nr++;
4587 		return 0;
4588 	}
4589 
4590 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4591 	if (!adj)
4592 		return -ENOMEM;
4593 
4594 	adj->dev = adj_dev;
4595 	adj->master = master;
4596 	adj->ref_nr = 1;
4597 	adj->private = private;
4598 	dev_hold(adj_dev);
4599 
4600 	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4601 		 adj_dev->name, dev->name, adj_dev->name);
4602 
4603 	if (dev_list == &dev->adj_list.lower) {
4604 		sprintf(linkname, "lower_%s", adj_dev->name);
4605 		ret = sysfs_create_link(&(dev->dev.kobj),
4606 					&(adj_dev->dev.kobj), linkname);
4607 		if (ret)
4608 			goto free_adj;
4609 	} else if (dev_list == &dev->adj_list.upper) {
4610 		sprintf(linkname, "upper_%s", adj_dev->name);
4611 		ret = sysfs_create_link(&(dev->dev.kobj),
4612 					&(adj_dev->dev.kobj), linkname);
4613 		if (ret)
4614 			goto free_adj;
4615 	}
4616 
4617 	/* Ensure that master link is always the first item in list. */
4618 	if (master) {
4619 		ret = sysfs_create_link(&(dev->dev.kobj),
4620 					&(adj_dev->dev.kobj), "master");
4621 		if (ret)
4622 			goto remove_symlinks;
4623 
4624 		list_add_rcu(&adj->list, dev_list);
4625 	} else {
4626 		list_add_tail_rcu(&adj->list, dev_list);
4627 	}
4628 
4629 	return 0;
4630 
4631 remove_symlinks:
4632 	if (dev_list == &dev->adj_list.lower) {
4633 		sprintf(linkname, "lower_%s", adj_dev->name);
4634 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4635 	} else if (dev_list == &dev->adj_list.upper) {
4636 		sprintf(linkname, "upper_%s", adj_dev->name);
4637 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4638 	}
4639 
4640 free_adj:
4641 	kfree(adj);
4642 	dev_put(adj_dev);
4643 
4644 	return ret;
4645 }
4646 
4647 void __netdev_adjacent_dev_remove(struct net_device *dev,
4648 				  struct net_device *adj_dev,
4649 				  struct list_head *dev_list)
4650 {
4651 	struct netdev_adjacent *adj;
4652 	char linkname[IFNAMSIZ+7];
4653 
4654 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4655 
4656 	if (!adj) {
4657 		pr_err("tried to remove device %s from %s\n",
4658 		       dev->name, adj_dev->name);
4659 		BUG();
4660 	}
4661 
4662 	if (adj->ref_nr > 1) {
4663 		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4664 			 adj->ref_nr-1);
4665 		adj->ref_nr--;
4666 		return;
4667 	}
4668 
4669 	if (adj->master)
4670 		sysfs_remove_link(&(dev->dev.kobj), "master");
4671 
4672 	if (dev_list == &dev->adj_list.lower) {
4673 		sprintf(linkname, "lower_%s", adj_dev->name);
4674 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4675 	} else if (dev_list == &dev->adj_list.upper) {
4676 		sprintf(linkname, "upper_%s", adj_dev->name);
4677 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4678 	}
4679 
4680 	list_del_rcu(&adj->list);
4681 	pr_debug("dev_put for %s, because link removed from %s to %s\n",
4682 		 adj_dev->name, dev->name, adj_dev->name);
4683 	dev_put(adj_dev);
4684 	kfree_rcu(adj, rcu);
4685 }
4686 
4687 int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4688 				     struct net_device *upper_dev,
4689 				     struct list_head *up_list,
4690 				     struct list_head *down_list,
4691 				     void *private, bool master)
4692 {
4693 	int ret;
4694 
4695 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4696 					   master);
4697 	if (ret)
4698 		return ret;
4699 
4700 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4701 					   false);
4702 	if (ret) {
4703 		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4704 		return ret;
4705 	}
4706 
4707 	return 0;
4708 }
4709 
4710 int __netdev_adjacent_dev_link(struct net_device *dev,
4711 			       struct net_device *upper_dev)
4712 {
4713 	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4714 						&dev->all_adj_list.upper,
4715 						&upper_dev->all_adj_list.lower,
4716 						NULL, false);
4717 }
4718 
/* Undo a two-way link: remove @upper_dev's entry from @up_list first,
 * then @dev's entry from @down_list.
 */
void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					struct net_device *upper_dev,
					struct list_head *up_list,
					struct list_head *down_list)
{
	/* upward direction */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);

	/* downward direction */
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
4727 
4728 void __netdev_adjacent_dev_unlink(struct net_device *dev,
4729 				  struct net_device *upper_dev)
4730 {
4731 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4732 					   &dev->all_adj_list.upper,
4733 					   &upper_dev->all_adj_list.lower);
4734 }
4735 
4736 int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4737 					 struct net_device *upper_dev,
4738 					 void *private, bool master)
4739 {
4740 	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4741 
4742 	if (ret)
4743 		return ret;
4744 
4745 	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4746 					       &dev->adj_list.upper,
4747 					       &upper_dev->adj_list.lower,
4748 					       private, master);
4749 	if (ret) {
4750 		__netdev_adjacent_dev_unlink(dev, upper_dev);
4751 		return ret;
4752 	}
4753 
4754 	return 0;
4755 }
4756 
4757 void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4758 					    struct net_device *upper_dev)
4759 {
4760 	__netdev_adjacent_dev_unlink(dev, upper_dev);
4761 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4762 					   &dev->adj_list.upper,
4763 					   &upper_dev->adj_list.lower);
4764 }
4765 
/* Common implementation behind the netdev_*upper_dev_link*() entry
 * points: make @upper_dev an upper device of @dev.
 *
 * @master marks the link as a master link and @private is stored with
 * the neighbour entries.  Besides the neighbour link itself, every
 * device on @dev's all_adj_list.lower (and @dev) is linked with every
 * device on @upper_dev's all_adj_list.upper (and @upper_dev).
 *
 * Returns 0 on success (after sending NETDEV_CHANGEUPPER for @dev),
 * -EBUSY if @dev == @upper_dev, if @dev is already found above
 * @upper_dev, or if a master is requested while @dev already has one,
 * -EEXIST if @upper_dev is already found above @dev, or the error of a
 * failed linking step, in which case the links made so far are undone.
 * The RTNL lock must be held (ASSERT_RTNL).
 */
static int __netdev_upper_dev_link(struct net_device *dev,
				   struct net_device *upper_dev, bool master,
				   void *private)
{
	struct netdev_adjacent *i, *j, *to_i, *to_j;
	int ret = 0;

	ASSERT_RTNL();

	if (dev == upper_dev)
		return -EBUSY;

	/* To prevent loops, check if dev is not upper device to upper_dev. */
	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
		return -EBUSY;

	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
		return -EEXIST;

	if (master && netdev_master_upper_dev_get(dev))
		return -EBUSY;

	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
						   master);
	if (ret)
		return ret;

	/* Now that we linked these devs, make all the upper_dev's
	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
	 * vice versa, and don't forget the devices themselves. All of these
	 * links are non-neighbours.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			pr_debug("Interlinking %s with %s, non-neighbour\n",
				 i->dev->name, j->dev->name);
			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
			if (ret)
				goto rollback_mesh;
		}
	}

	/* add dev to every upper_dev's upper device */
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		pr_debug("linking %s's upper device %s with %s\n",
			 upper_dev->name, i->dev->name, dev->name);
		ret = __netdev_adjacent_dev_link(dev, i->dev);
		if (ret)
			goto rollback_upper_mesh;
	}

	/* add upper_dev to every dev's lower device */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		pr_debug("linking %s's lower device %s with %s\n", dev->name,
			 i->dev->name, upper_dev->name);
		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
		if (ret)
			goto rollback_lower_mesh;
	}

	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
	return 0;

	/* The rollback stages below fall through into one another.  Each
	 * one replays its loop up to the entry at which linking failed
	 * (remembered in to_i / to_j) and unlinks what had been linked.
	 */
rollback_lower_mesh:
	to_i = i;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
	}

	/* the previous stage had completed, so undo all of it */
	i = NULL;

rollback_upper_mesh:
	to_i = i;
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(dev, i->dev);
	}

	/* the mesh stage had completed, so undo all of it */
	i = j = NULL;

rollback_mesh:
	to_i = i;
	to_j = j;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			if (i == to_i && j == to_j)
				break;
			__netdev_adjacent_dev_unlink(i->dev, j->dev);
		}
		if (i == to_i)
			break;
	}

	/* finally drop the neighbour link made at the start */
	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	return ret;
}
4866 
4867 /**
4868  * netdev_upper_dev_link - Add a link to the upper device
4869  * @dev: device
4870  * @upper_dev: new upper device
4871  *
4872  * Adds a link to device which is upper to this one. The caller must hold
4873  * the RTNL lock. On a failure a negative errno code is returned.
4874  * On success the reference counts are adjusted and the function
4875  * returns zero.
4876  */
4877 int netdev_upper_dev_link(struct net_device *dev,
4878 			  struct net_device *upper_dev)
4879 {
4880 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
4881 }
4882 EXPORT_SYMBOL(netdev_upper_dev_link);
4883 
4884 /**
4885  * netdev_master_upper_dev_link - Add a master link to the upper device
4886  * @dev: device
4887  * @upper_dev: new upper device
4888  *
4889  * Adds a link to device which is upper to this one. In this case, only
4890  * one master upper device can be linked, although other non-master devices
4891  * might be linked as well. The caller must hold the RTNL lock.
4892  * On a failure a negative errno code is returned. On success the reference
4893  * counts are adjusted and the function returns zero.
4894  */
4895 int netdev_master_upper_dev_link(struct net_device *dev,
4896 				 struct net_device *upper_dev)
4897 {
4898 	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
4899 }
4900 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4901 
4902 int netdev_master_upper_dev_link_private(struct net_device *dev,
4903 					 struct net_device *upper_dev,
4904 					 void *private)
4905 {
4906 	return __netdev_upper_dev_link(dev, upper_dev, true, private);
4907 }
4908 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
4909 
4910 /**
4911  * netdev_upper_dev_unlink - Removes a link to upper device
4912  * @dev: device
4913  * @upper_dev: new upper device
4914  *
4915  * Removes a link to device which is upper to this one. The caller must hold
4916  * the RTNL lock.
4917  */
4918 void netdev_upper_dev_unlink(struct net_device *dev,
4919 			     struct net_device *upper_dev)
4920 {
4921 	struct netdev_adjacent *i, *j;
4922 	ASSERT_RTNL();
4923 
4924 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
4925 
4926 	/* Here is the tricky part. We must remove all dev's lower
4927 	 * devices from all upper_dev's upper devices and vice
4928 	 * versa, to maintain the graph relationship.
4929 	 */
4930 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
4931 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
4932 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
4933 
4934 	/* remove also the devices itself from lower/upper device
4935 	 * list
4936 	 */
4937 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
4938 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
4939 
4940 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
4941 		__netdev_adjacent_dev_unlink(dev, i->dev);
4942 
4943 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4944 }
4945 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4946 
4947 void *netdev_lower_dev_get_private_rcu(struct net_device *dev,
4948 				       struct net_device *lower_dev)
4949 {
4950 	struct netdev_adjacent *lower;
4951 
4952 	if (!lower_dev)
4953 		return NULL;
4954 	lower = __netdev_find_adj_rcu(dev, lower_dev, &dev->adj_list.lower);
4955 	if (!lower)
4956 		return NULL;
4957 
4958 	return lower->private;
4959 }
4960 EXPORT_SYMBOL(netdev_lower_dev_get_private_rcu);
4961 
4962 void *netdev_lower_dev_get_private(struct net_device *dev,
4963 				   struct net_device *lower_dev)
4964 {
4965 	struct netdev_adjacent *lower;
4966 
4967 	if (!lower_dev)
4968 		return NULL;
4969 	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
4970 	if (!lower)
4971 		return NULL;
4972 
4973 	return lower->private;
4974 }
4975 EXPORT_SYMBOL(netdev_lower_dev_get_private);
4976 
4977 static void dev_change_rx_flags(struct net_device *dev, int flags)
4978 {
4979 	const struct net_device_ops *ops = dev->netdev_ops;
4980 
4981 	if (ops->ndo_change_rx_flags)
4982 		ops->ndo_change_rx_flags(dev, flags);
4983 }
4984 
4985 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
4986 {
4987 	unsigned int old_flags = dev->flags;
4988 	kuid_t uid;
4989 	kgid_t gid;
4990 
4991 	ASSERT_RTNL();
4992 
4993 	dev->flags |= IFF_PROMISC;
4994 	dev->promiscuity += inc;
4995 	if (dev->promiscuity == 0) {
4996 		/*
4997 		 * Avoid overflow.
4998 		 * If inc causes overflow, untouch promisc and return error.
4999 		 */
5000 		if (inc < 0)
5001 			dev->flags &= ~IFF_PROMISC;
5002 		else {
5003 			dev->promiscuity -= inc;
5004 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5005 				dev->name);
5006 			return -EOVERFLOW;
5007 		}
5008 	}
5009 	if (dev->flags != old_flags) {
5010 		pr_info("device %s %s promiscuous mode\n",
5011 			dev->name,
5012 			dev->flags & IFF_PROMISC ? "entered" : "left");
5013 		if (audit_enabled) {
5014 			current_uid_gid(&uid, &gid);
5015 			audit_log(current->audit_context, GFP_ATOMIC,
5016 				AUDIT_ANOM_PROMISCUOUS,
5017 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5018 				dev->name, (dev->flags & IFF_PROMISC),
5019 				(old_flags & IFF_PROMISC),
5020 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
5021 				from_kuid(&init_user_ns, uid),
5022 				from_kgid(&init_user_ns, gid),
5023 				audit_get_sessionid(current));
5024 		}
5025 
5026 		dev_change_rx_flags(dev, IFF_PROMISC);
5027 	}
5028 	if (notify)
5029 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
5030 	return 0;
5031 }
5032 
5033 /**
5034  *	dev_set_promiscuity	- update promiscuity count on a device
5035  *	@dev: device
5036  *	@inc: modifier
5037  *
5038  *	Add or remove promiscuity from a device. While the count in the device
5039  *	remains above zero the interface remains promiscuous. Once it hits zero
5040  *	the device reverts back to normal filtering operation. A negative inc
5041  *	value is used to drop promiscuity on the device.
5042  *	Return 0 if successful or a negative errno code on error.
5043  */
5044 int dev_set_promiscuity(struct net_device *dev, int inc)
5045 {
5046 	unsigned int old_flags = dev->flags;
5047 	int err;
5048 
5049 	err = __dev_set_promiscuity(dev, inc, true);
5050 	if (err < 0)
5051 		return err;
5052 	if (dev->flags != old_flags)
5053 		dev_set_rx_mode(dev);
5054 	return err;
5055 }
5056 EXPORT_SYMBOL(dev_set_promiscuity);
5057 
5058 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5059 {
5060 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5061 
5062 	ASSERT_RTNL();
5063 
5064 	dev->flags |= IFF_ALLMULTI;
5065 	dev->allmulti += inc;
5066 	if (dev->allmulti == 0) {
5067 		/*
5068 		 * Avoid overflow.
5069 		 * If inc causes overflow, untouch allmulti and return error.
5070 		 */
5071 		if (inc < 0)
5072 			dev->flags &= ~IFF_ALLMULTI;
5073 		else {
5074 			dev->allmulti -= inc;
5075 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5076 				dev->name);
5077 			return -EOVERFLOW;
5078 		}
5079 	}
5080 	if (dev->flags ^ old_flags) {
5081 		dev_change_rx_flags(dev, IFF_ALLMULTI);
5082 		dev_set_rx_mode(dev);
5083 		if (notify)
5084 			__dev_notify_flags(dev, old_flags,
5085 					   dev->gflags ^ old_gflags);
5086 	}
5087 	return 0;
5088 }
5089 
5090 /**
5091  *	dev_set_allmulti	- update allmulti count on a device
5092  *	@dev: device
5093  *	@inc: modifier
5094  *
5095  *	Add or remove reception of all multicast frames to a device. While the
5096  *	count in the device remains above zero the interface remains listening
5097  *	to all interfaces. Once it hits zero the device reverts back to normal
5098  *	filtering operation. A negative @inc value is used to drop the counter
5099  *	when releasing a resource needing all multicasts.
5100  *	Return 0 if successful or a negative errno code on error.
5101  */
5102 
5103 int dev_set_allmulti(struct net_device *dev, int inc)
5104 {
5105 	return __dev_set_allmulti(dev, inc, true);
5106 }
5107 EXPORT_SYMBOL(dev_set_allmulti);
5108 
5109 /*
5110  *	Upload unicast and multicast address lists to device and
5111  *	configure RX filtering. When the device doesn't support unicast
5112  *	filtering it is put in promiscuous mode while unicast addresses
5113  *	are present.
5114  */
5115 void __dev_set_rx_mode(struct net_device *dev)
5116 {
5117 	const struct net_device_ops *ops = dev->netdev_ops;
5118 
5119 	/* dev_open will call this function so the list will stay sane. */
5120 	if (!(dev->flags&IFF_UP))
5121 		return;
5122 
5123 	if (!netif_device_present(dev))
5124 		return;
5125 
5126 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5127 		/* Unicast addresses changes may only happen under the rtnl,
5128 		 * therefore calling __dev_set_promiscuity here is safe.
5129 		 */
5130 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5131 			__dev_set_promiscuity(dev, 1, false);
5132 			dev->uc_promisc = true;
5133 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5134 			__dev_set_promiscuity(dev, -1, false);
5135 			dev->uc_promisc = false;
5136 		}
5137 	}
5138 
5139 	if (ops->ndo_set_rx_mode)
5140 		ops->ndo_set_rx_mode(dev);
5141 }
5142 
/* Locked wrapper around __dev_set_rx_mode(): the call is made between
 * netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
5149 
5150 /**
5151  *	dev_get_flags - get flags reported to userspace
5152  *	@dev: device
5153  *
5154  *	Get the combination of flag bits exported through APIs to userspace.
5155  */
5156 unsigned int dev_get_flags(const struct net_device *dev)
5157 {
5158 	unsigned int flags;
5159 
5160 	flags = (dev->flags & ~(IFF_PROMISC |
5161 				IFF_ALLMULTI |
5162 				IFF_RUNNING |
5163 				IFF_LOWER_UP |
5164 				IFF_DORMANT)) |
5165 		(dev->gflags & (IFF_PROMISC |
5166 				IFF_ALLMULTI));
5167 
5168 	if (netif_running(dev)) {
5169 		if (netif_oper_up(dev))
5170 			flags |= IFF_RUNNING;
5171 		if (netif_carrier_ok(dev))
5172 			flags |= IFF_LOWER_UP;
5173 		if (netif_dormant(dev))
5174 			flags |= IFF_DORMANT;
5175 	}
5176 
5177 	return flags;
5178 }
5179 EXPORT_SYMBOL(dev_get_flags);
5180 
5181 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5182 {
5183 	unsigned int old_flags = dev->flags;
5184 	int ret;
5185 
5186 	ASSERT_RTNL();
5187 
5188 	/*
5189 	 *	Set the flags on our device.
5190 	 */
5191 
5192 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5193 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5194 			       IFF_AUTOMEDIA)) |
5195 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5196 				    IFF_ALLMULTI));
5197 
5198 	/*
5199 	 *	Load in the correct multicast list now the flags have changed.
5200 	 */
5201 
5202 	if ((old_flags ^ flags) & IFF_MULTICAST)
5203 		dev_change_rx_flags(dev, IFF_MULTICAST);
5204 
5205 	dev_set_rx_mode(dev);
5206 
5207 	/*
5208 	 *	Have we downed the interface. We handle IFF_UP ourselves
5209 	 *	according to user attempts to set it, rather than blindly
5210 	 *	setting it.
5211 	 */
5212 
5213 	ret = 0;
5214 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
5215 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5216 
5217 		if (!ret)
5218 			dev_set_rx_mode(dev);
5219 	}
5220 
5221 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
5222 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
5223 		unsigned int old_flags = dev->flags;
5224 
5225 		dev->gflags ^= IFF_PROMISC;
5226 
5227 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
5228 			if (dev->flags != old_flags)
5229 				dev_set_rx_mode(dev);
5230 	}
5231 
5232 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5233 	   is important. Some (broken) drivers set IFF_PROMISC, when
5234 	   IFF_ALLMULTI is requested not asking us and not reporting.
5235 	 */
5236 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5237 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5238 
5239 		dev->gflags ^= IFF_ALLMULTI;
5240 		__dev_set_allmulti(dev, inc, false);
5241 	}
5242 
5243 	return ret;
5244 }
5245 
5246 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5247 			unsigned int gchanges)
5248 {
5249 	unsigned int changes = dev->flags ^ old_flags;
5250 
5251 	if (gchanges)
5252 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5253 
5254 	if (changes & IFF_UP) {
5255 		if (dev->flags & IFF_UP)
5256 			call_netdevice_notifiers(NETDEV_UP, dev);
5257 		else
5258 			call_netdevice_notifiers(NETDEV_DOWN, dev);
5259 	}
5260 
5261 	if (dev->flags & IFF_UP &&
5262 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5263 		struct netdev_notifier_change_info change_info;
5264 
5265 		change_info.flags_changed = changes;
5266 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5267 					      &change_info.info);
5268 	}
5269 }
5270 
5271 /**
5272  *	dev_change_flags - change device settings
5273  *	@dev: device
5274  *	@flags: device state flags
5275  *
5276  *	Change settings on device based state flags. The flags are
5277  *	in the userspace exported format.
5278  */
5279 int dev_change_flags(struct net_device *dev, unsigned int flags)
5280 {
5281 	int ret;
5282 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5283 
5284 	ret = __dev_change_flags(dev, flags);
5285 	if (ret < 0)
5286 		return ret;
5287 
5288 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5289 	__dev_notify_flags(dev, old_flags, changes);
5290 	return ret;
5291 }
5292 EXPORT_SYMBOL(dev_change_flags);
5293 
5294 /**
5295  *	dev_set_mtu - Change maximum transfer unit
5296  *	@dev: device
5297  *	@new_mtu: new transfer unit
5298  *
5299  *	Change the maximum transfer size of the network device.
5300  */
5301 int dev_set_mtu(struct net_device *dev, int new_mtu)
5302 {
5303 	const struct net_device_ops *ops = dev->netdev_ops;
5304 	int err;
5305 
5306 	if (new_mtu == dev->mtu)
5307 		return 0;
5308 
5309 	/*	MTU must be positive.	 */
5310 	if (new_mtu < 0)
5311 		return -EINVAL;
5312 
5313 	if (!netif_device_present(dev))
5314 		return -ENODEV;
5315 
5316 	err = 0;
5317 	if (ops->ndo_change_mtu)
5318 		err = ops->ndo_change_mtu(dev, new_mtu);
5319 	else
5320 		dev->mtu = new_mtu;
5321 
5322 	if (!err)
5323 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5324 	return err;
5325 }
5326 EXPORT_SYMBOL(dev_set_mtu);
5327 
5328 /**
5329  *	dev_set_group - Change group this device belongs to
5330  *	@dev: device
5331  *	@new_group: group this device should belong to
5332  */
5333 void dev_set_group(struct net_device *dev, int new_group)
5334 {
5335 	dev->group = new_group;
5336 }
5337 EXPORT_SYMBOL(dev_set_group);
5338 
5339 /**
5340  *	dev_set_mac_address - Change Media Access Control Address
5341  *	@dev: device
5342  *	@sa: new address
5343  *
5344  *	Change the hardware (MAC) address of the device
5345  */
5346 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5347 {
5348 	const struct net_device_ops *ops = dev->netdev_ops;
5349 	int err;
5350 
5351 	if (!ops->ndo_set_mac_address)
5352 		return -EOPNOTSUPP;
5353 	if (sa->sa_family != dev->type)
5354 		return -EINVAL;
5355 	if (!netif_device_present(dev))
5356 		return -ENODEV;
5357 	err = ops->ndo_set_mac_address(dev, sa);
5358 	if (err)
5359 		return err;
5360 	dev->addr_assign_type = NET_ADDR_SET;
5361 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5362 	add_device_randomness(dev->dev_addr, dev->addr_len);
5363 	return 0;
5364 }
5365 EXPORT_SYMBOL(dev_set_mac_address);
5366 
5367 /**
5368  *	dev_change_carrier - Change device carrier
5369  *	@dev: device
5370  *	@new_carrier: new value
5371  *
5372  *	Change device carrier
5373  */
5374 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5375 {
5376 	const struct net_device_ops *ops = dev->netdev_ops;
5377 
5378 	if (!ops->ndo_change_carrier)
5379 		return -EOPNOTSUPP;
5380 	if (!netif_device_present(dev))
5381 		return -ENODEV;
5382 	return ops->ndo_change_carrier(dev, new_carrier);
5383 }
5384 EXPORT_SYMBOL(dev_change_carrier);
5385 
5386 /**
5387  *	dev_get_phys_port_id - Get device physical port ID
5388  *	@dev: device
5389  *	@ppid: port ID
5390  *
5391  *	Get device physical port ID
5392  */
5393 int dev_get_phys_port_id(struct net_device *dev,
5394 			 struct netdev_phys_port_id *ppid)
5395 {
5396 	const struct net_device_ops *ops = dev->netdev_ops;
5397 
5398 	if (!ops->ndo_get_phys_port_id)
5399 		return -EOPNOTSUPP;
5400 	return ops->ndo_get_phys_port_id(dev, ppid);
5401 }
5402 EXPORT_SYMBOL(dev_get_phys_port_id);
5403 
5404 /**
5405  *	dev_new_index	-	allocate an ifindex
5406  *	@net: the applicable net namespace
5407  *
5408  *	Returns a suitable unique value for a new device interface
5409  *	number.  The caller must hold the rtnl semaphore or the
5410  *	dev_base_lock to be sure it remains unique.
5411  */
5412 static int dev_new_index(struct net *net)
5413 {
5414 	int ifindex = net->ifindex;
5415 	for (;;) {
5416 		if (++ifindex <= 0)
5417 			ifindex = 1;
5418 		if (!__dev_get_by_index(net, ifindex))
5419 			return net->ifindex = ifindex;
5420 	}
5421 }
5422 
5423 /* Delayed registration/unregisteration */
5424 static LIST_HEAD(net_todo_list);
5425 static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5426 
5427 static void net_set_todo(struct net_device *dev)
5428 {
5429 	list_add_tail(&dev->todo_list, &net_todo_list);
5430 	dev_net(dev)->dev_unreg_count++;
5431 }
5432 
5433 static void rollback_registered_many(struct list_head *head)
5434 {
5435 	struct net_device *dev, *tmp;
5436 	LIST_HEAD(close_head);
5437 
5438 	BUG_ON(dev_boot_phase);
5439 	ASSERT_RTNL();
5440 
5441 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5442 		/* Some devices call without registering
5443 		 * for initialization unwind. Remove those
5444 		 * devices and proceed with the remaining.
5445 		 */
5446 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5447 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5448 				 dev->name, dev);
5449 
5450 			WARN_ON(1);
5451 			list_del(&dev->unreg_list);
5452 			continue;
5453 		}
5454 		dev->dismantle = true;
5455 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5456 	}
5457 
5458 	/* If device is running, close it first. */
5459 	list_for_each_entry(dev, head, unreg_list)
5460 		list_add_tail(&dev->close_list, &close_head);
5461 	dev_close_many(&close_head);
5462 
5463 	list_for_each_entry(dev, head, unreg_list) {
5464 		/* And unlink it from device chain. */
5465 		unlist_netdevice(dev);
5466 
5467 		dev->reg_state = NETREG_UNREGISTERING;
5468 	}
5469 
5470 	synchronize_net();
5471 
5472 	list_for_each_entry(dev, head, unreg_list) {
5473 		/* Shutdown queueing discipline. */
5474 		dev_shutdown(dev);
5475 
5476 
5477 		/* Notify protocols, that we are about to destroy
5478 		   this device. They should clean all the things.
5479 		*/
5480 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5481 
5482 		if (!dev->rtnl_link_ops ||
5483 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5484 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5485 
5486 		/*
5487 		 *	Flush the unicast and multicast chains
5488 		 */
5489 		dev_uc_flush(dev);
5490 		dev_mc_flush(dev);
5491 
5492 		if (dev->netdev_ops->ndo_uninit)
5493 			dev->netdev_ops->ndo_uninit(dev);
5494 
5495 		/* Notifier chain MUST detach us all upper devices. */
5496 		WARN_ON(netdev_has_any_upper_dev(dev));
5497 
5498 		/* Remove entries from kobject tree */
5499 		netdev_unregister_kobject(dev);
5500 #ifdef CONFIG_XPS
5501 		/* Remove XPS queueing entries */
5502 		netif_reset_xps_queues_gt(dev, 0);
5503 #endif
5504 	}
5505 
5506 	synchronize_net();
5507 
5508 	list_for_each_entry(dev, head, unreg_list)
5509 		dev_put(dev);
5510 }
5511 
5512 static void rollback_registered(struct net_device *dev)
5513 {
5514 	LIST_HEAD(single);
5515 
5516 	list_add(&dev->unreg_list, &single);
5517 	rollback_registered_many(&single);
5518 	list_del(&single);
5519 }
5520 
5521 static netdev_features_t netdev_fix_features(struct net_device *dev,
5522 	netdev_features_t features)
5523 {
5524 	/* Fix illegal checksum combinations */
5525 	if ((features & NETIF_F_HW_CSUM) &&
5526 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5527 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5528 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5529 	}
5530 
5531 	/* TSO requires that SG is present as well. */
5532 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5533 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5534 		features &= ~NETIF_F_ALL_TSO;
5535 	}
5536 
5537 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5538 					!(features & NETIF_F_IP_CSUM)) {
5539 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5540 		features &= ~NETIF_F_TSO;
5541 		features &= ~NETIF_F_TSO_ECN;
5542 	}
5543 
5544 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5545 					 !(features & NETIF_F_IPV6_CSUM)) {
5546 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5547 		features &= ~NETIF_F_TSO6;
5548 	}
5549 
5550 	/* TSO ECN requires that TSO is present as well. */
5551 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5552 		features &= ~NETIF_F_TSO_ECN;
5553 
5554 	/* Software GSO depends on SG. */
5555 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5556 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5557 		features &= ~NETIF_F_GSO;
5558 	}
5559 
5560 	/* UFO needs SG and checksumming */
5561 	if (features & NETIF_F_UFO) {
5562 		/* maybe split UFO into V4 and V6? */
5563 		if (!((features & NETIF_F_GEN_CSUM) ||
5564 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5565 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5566 			netdev_dbg(dev,
5567 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5568 			features &= ~NETIF_F_UFO;
5569 		}
5570 
5571 		if (!(features & NETIF_F_SG)) {
5572 			netdev_dbg(dev,
5573 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5574 			features &= ~NETIF_F_UFO;
5575 		}
5576 	}
5577 
5578 	return features;
5579 }
5580 
5581 int __netdev_update_features(struct net_device *dev)
5582 {
5583 	netdev_features_t features;
5584 	int err = 0;
5585 
5586 	ASSERT_RTNL();
5587 
5588 	features = netdev_get_wanted_features(dev);
5589 
5590 	if (dev->netdev_ops->ndo_fix_features)
5591 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5592 
5593 	/* driver might be less strict about feature dependencies */
5594 	features = netdev_fix_features(dev, features);
5595 
5596 	if (dev->features == features)
5597 		return 0;
5598 
5599 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5600 		&dev->features, &features);
5601 
5602 	if (dev->netdev_ops->ndo_set_features)
5603 		err = dev->netdev_ops->ndo_set_features(dev, features);
5604 
5605 	if (unlikely(err < 0)) {
5606 		netdev_err(dev,
5607 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5608 			err, &features, &dev->features);
5609 		return -1;
5610 	}
5611 
5612 	if (!err)
5613 		dev->features = features;
5614 
5615 	return 1;
5616 }
5617 
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate the dev->features set and call
 *	netdev_features_change() whenever __netdev_update_features()
 *	returns non-zero.  Should be called after driver or hardware
 *	dependent conditions that influence the features might have
 *	changed.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev) == 0)
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5632 
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate the dev->features set and call
 *	netdev_features_change() unconditionally, even if nothing
 *	changed.  Use this instead of netdev_update_features() when
 *	dev->vlan_features might have changed too, so that the change
 *	can be propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately unused: we notify either way. */
	(void)__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5649 
5650 /**
5651  *	netif_stacked_transfer_operstate -	transfer operstate
5652  *	@rootdev: the root or lower level device to transfer state from
5653  *	@dev: the device to transfer operstate to
5654  *
5655  *	Transfer operational state from root to device. This is normally
5656  *	called when a stacking relationship exists between the root
5657  *	device and the device(a leaf device).
5658  */
5659 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5660 					struct net_device *dev)
5661 {
5662 	if (rootdev->operstate == IF_OPER_DORMANT)
5663 		netif_dormant_on(dev);
5664 	else
5665 		netif_dormant_off(dev);
5666 
5667 	if (netif_carrier_ok(rootdev)) {
5668 		if (!netif_carrier_ok(dev))
5669 			netif_carrier_on(dev);
5670 	} else {
5671 		if (netif_carrier_ok(dev))
5672 			netif_carrier_off(dev);
5673 	}
5674 }
5675 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5676 
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * point each entry back at @dev and publish the array in dev->_rx.
 * Returns 0 on success or -ENOMEM.  A zero queue count is a BUG.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(nqueues == 0);

	queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
	if (queues == NULL)
		return -ENOMEM;

	for (q = queues; q < queues + nqueues; q++)
		q->dev = dev;

	dev->_rx = queues;
	return 0;
}
#endif
5696 
5697 static void netdev_init_one_queue(struct net_device *dev,
5698 				  struct netdev_queue *queue, void *_unused)
5699 {
5700 	/* Initialize queue lock */
5701 	spin_lock_init(&queue->_xmit_lock);
5702 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5703 	queue->xmit_lock_owner = -1;
5704 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5705 	queue->dev = dev;
5706 #ifdef CONFIG_BQL
5707 	dql_init(&queue->dql, HZ);
5708 #endif
5709 }
5710 
5711 static void netif_free_tx_queues(struct net_device *dev)
5712 {
5713 	if (is_vmalloc_addr(dev->_tx))
5714 		vfree(dev->_tx);
5715 	else
5716 		kfree(dev->_tx);
5717 }
5718 
5719 static int netif_alloc_netdev_queues(struct net_device *dev)
5720 {
5721 	unsigned int count = dev->num_tx_queues;
5722 	struct netdev_queue *tx;
5723 	size_t sz = count * sizeof(*tx);
5724 
5725 	BUG_ON(count < 1 || count > 0xffff);
5726 
5727 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5728 	if (!tx) {
5729 		tx = vzalloc(sz);
5730 		if (!tx)
5731 			return -ENOMEM;
5732 	}
5733 	dev->_tx = tx;
5734 
5735 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5736 	spin_lock_init(&dev->tx_global_lock);
5737 
5738 	return 0;
5739 }
5740 
5741 /**
5742  *	register_netdevice	- register a network device
5743  *	@dev: device to register
5744  *
5745  *	Take a completed network device structure and add it to the kernel
5746  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5747  *	chain. 0 is returned on success. A negative errno code is returned
5748  *	on a failure to set up the device, or if the name is a duplicate.
5749  *
5750  *	Callers must hold the rtnl semaphore. You may want
5751  *	register_netdev() instead of this.
5752  *
5753  *	BUGS:
5754  *	The locking appears insufficient to guarantee two parallel registers
5755  *	will not get the same name.
5756  */
5757 
5758 int register_netdevice(struct net_device *dev)
5759 {
5760 	int ret;
5761 	struct net *net = dev_net(dev);
5762 
5763 	BUG_ON(dev_boot_phase);
5764 	ASSERT_RTNL();
5765 
5766 	might_sleep();
5767 
5768 	/* When net_device's are persistent, this will be fatal. */
5769 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5770 	BUG_ON(!net);
5771 
5772 	spin_lock_init(&dev->addr_list_lock);
5773 	netdev_set_addr_lockdep_class(dev);
5774 
5775 	dev->iflink = -1;
5776 
5777 	ret = dev_get_valid_name(net, dev, dev->name);
5778 	if (ret < 0)
5779 		goto out;
5780 
5781 	/* Init, if this function is available */
5782 	if (dev->netdev_ops->ndo_init) {
5783 		ret = dev->netdev_ops->ndo_init(dev);
5784 		if (ret) {
5785 			if (ret > 0)
5786 				ret = -EIO;
5787 			goto out;
5788 		}
5789 	}
5790 
5791 	if (((dev->hw_features | dev->features) &
5792 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
5793 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5794 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5795 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5796 		ret = -EINVAL;
5797 		goto err_uninit;
5798 	}
5799 
5800 	ret = -EBUSY;
5801 	if (!dev->ifindex)
5802 		dev->ifindex = dev_new_index(net);
5803 	else if (__dev_get_by_index(net, dev->ifindex))
5804 		goto err_uninit;
5805 
5806 	if (dev->iflink == -1)
5807 		dev->iflink = dev->ifindex;
5808 
5809 	/* Transfer changeable features to wanted_features and enable
5810 	 * software offloads (GSO and GRO).
5811 	 */
5812 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5813 	dev->features |= NETIF_F_SOFT_FEATURES;
5814 	dev->wanted_features = dev->features & dev->hw_features;
5815 
5816 	/* Turn on no cache copy if HW is doing checksum */
5817 	if (!(dev->flags & IFF_LOOPBACK)) {
5818 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5819 		if (dev->features & NETIF_F_ALL_CSUM) {
5820 			dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5821 			dev->features |= NETIF_F_NOCACHE_COPY;
5822 		}
5823 	}
5824 
5825 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5826 	 */
5827 	dev->vlan_features |= NETIF_F_HIGHDMA;
5828 
5829 	/* Make NETIF_F_SG inheritable to tunnel devices.
5830 	 */
5831 	dev->hw_enc_features |= NETIF_F_SG;
5832 
5833 	/* Make NETIF_F_SG inheritable to MPLS.
5834 	 */
5835 	dev->mpls_features |= NETIF_F_SG;
5836 
5837 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5838 	ret = notifier_to_errno(ret);
5839 	if (ret)
5840 		goto err_uninit;
5841 
5842 	ret = netdev_register_kobject(dev);
5843 	if (ret)
5844 		goto err_uninit;
5845 	dev->reg_state = NETREG_REGISTERED;
5846 
5847 	__netdev_update_features(dev);
5848 
5849 	/*
5850 	 *	Default initial state at registry is that the
5851 	 *	device is present.
5852 	 */
5853 
5854 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5855 
5856 	linkwatch_init_dev(dev);
5857 
5858 	dev_init_scheduler(dev);
5859 	dev_hold(dev);
5860 	list_netdevice(dev);
5861 	add_device_randomness(dev->dev_addr, dev->addr_len);
5862 
5863 	/* If the device has permanent device address, driver should
5864 	 * set dev_addr and also addr_assign_type should be set to
5865 	 * NET_ADDR_PERM (default value).
5866 	 */
5867 	if (dev->addr_assign_type == NET_ADDR_PERM)
5868 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5869 
5870 	/* Notify protocols, that a new device appeared. */
5871 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5872 	ret = notifier_to_errno(ret);
5873 	if (ret) {
5874 		rollback_registered(dev);
5875 		dev->reg_state = NETREG_UNREGISTERED;
5876 	}
5877 	/*
5878 	 *	Prevent userspace races by waiting until the network
5879 	 *	device is fully setup before sending notifications.
5880 	 */
5881 	if (!dev->rtnl_link_ops ||
5882 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5883 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
5884 
5885 out:
5886 	return ret;
5887 
5888 err_uninit:
5889 	if (dev->netdev_ops->ndo_uninit)
5890 		dev->netdev_ops->ndo_uninit(dev);
5891 	goto out;
5892 }
5893 EXPORT_SYMBOL(register_netdevice);
5894 
5895 /**
5896  *	init_dummy_netdev	- init a dummy network device for NAPI
5897  *	@dev: device to init
5898  *
5899  *	This takes a network device structure and initialize the minimum
5900  *	amount of fields so it can be used to schedule NAPI polls without
5901  *	registering a full blown interface. This is to be used by drivers
5902  *	that need to tie several hardware interfaces to a single NAPI
5903  *	poll scheduler due to HW limitations.
5904  */
5905 int init_dummy_netdev(struct net_device *dev)
5906 {
5907 	/* Clear everything. Note we don't initialize spinlocks
5908 	 * are they aren't supposed to be taken by any of the
5909 	 * NAPI code and this dummy netdev is supposed to be
5910 	 * only ever used for NAPI polls
5911 	 */
5912 	memset(dev, 0, sizeof(struct net_device));
5913 
5914 	/* make sure we BUG if trying to hit standard
5915 	 * register/unregister code path
5916 	 */
5917 	dev->reg_state = NETREG_DUMMY;
5918 
5919 	/* NAPI wants this */
5920 	INIT_LIST_HEAD(&dev->napi_list);
5921 
5922 	/* a dummy interface is started by default */
5923 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5924 	set_bit(__LINK_STATE_START, &dev->state);
5925 
5926 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5927 	 * because users of this 'device' dont need to change
5928 	 * its refcount.
5929 	 */
5930 
5931 	return 0;
5932 }
5933 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5934 
5935 
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock();
 *	the device name is expanded if a format string was passed to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
5959 
5960 int netdev_refcnt_read(const struct net_device *dev)
5961 {
5962 	int i, refcnt = 0;
5963 
5964 	for_each_possible_cpu(i)
5965 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5966 	return refcnt;
5967 }
5968 EXPORT_SYMBOL(netdev_refcnt_read);
5969 
5970 /**
5971  * netdev_wait_allrefs - wait until all references are gone.
5972  * @dev: target net_device
5973  *
5974  * This is called when unregistering network devices.
5975  *
5976  * Any protocol or device that holds a reference should register
5977  * for netdevice notification, and cleanup and put back the
5978  * reference if they receive an UNREGISTER event.
5979  * We can get stuck here if buggy protocols don't correctly
5980  * call dev_put.
5981  */
5982 static void netdev_wait_allrefs(struct net_device *dev)
5983 {
5984 	unsigned long rebroadcast_time, warning_time;
5985 	int refcnt;
5986 
5987 	linkwatch_forget_dev(dev);
5988 
5989 	rebroadcast_time = warning_time = jiffies;
5990 	refcnt = netdev_refcnt_read(dev);
5991 
5992 	while (refcnt != 0) {
5993 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5994 			rtnl_lock();
5995 
5996 			/* Rebroadcast unregister notification */
5997 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5998 
5999 			__rtnl_unlock();
6000 			rcu_barrier();
6001 			rtnl_lock();
6002 
6003 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6004 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6005 				     &dev->state)) {
6006 				/* We must not have linkwatch events
6007 				 * pending on unregister. If this
6008 				 * happens, we simply run the queue
6009 				 * unscheduled, resulting in a noop
6010 				 * for this device.
6011 				 */
6012 				linkwatch_run_queue();
6013 			}
6014 
6015 			__rtnl_unlock();
6016 
6017 			rebroadcast_time = jiffies;
6018 		}
6019 
6020 		msleep(250);
6021 
6022 		refcnt = netdev_refcnt_read(dev);
6023 
6024 		if (time_after(jiffies, warning_time + 10 * HZ)) {
6025 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6026 				 dev->name, refcnt);
6027 			warning_time = jiffies;
6028 		}
6029 	}
6030 }
6031 
6032 /* The sequence is:
6033  *
6034  *	rtnl_lock();
6035  *	...
6036  *	register_netdevice(x1);
6037  *	register_netdevice(x2);
6038  *	...
6039  *	unregister_netdevice(y1);
6040  *	unregister_netdevice(y2);
6041  *      ...
6042  *	rtnl_unlock();
6043  *	free_netdev(y1);
6044  *	free_netdev(y2);
6045  *
6046  * We are invoked by rtnl_unlock().
6047  * This allows us to deal with problems:
6048  * 1) We can delete sysfs objects which invoke hotplug
6049  *    without deadlocking with linkwatch via keventd.
6050  * 2) Since we run with the RTNL semaphore not held, we can sleep
6051  *    safely in order to wait for the netdev refcnt to drop to zero.
6052  *
6053  * We must not return until all unregister events added during
6054  * the interval the lock was held have been completed.
6055  */
6056 void netdev_run_todo(void)
6057 {
6058 	struct list_head list;
6059 
6060 	/* Snapshot list, allow later requests */
6061 	list_replace_init(&net_todo_list, &list);
6062 
6063 	__rtnl_unlock();
6064 
6065 
6066 	/* Wait for rcu callbacks to finish before next phase */
6067 	if (!list_empty(&list))
6068 		rcu_barrier();
6069 
6070 	while (!list_empty(&list)) {
6071 		struct net_device *dev
6072 			= list_first_entry(&list, struct net_device, todo_list);
6073 		list_del(&dev->todo_list);
6074 
6075 		rtnl_lock();
6076 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6077 		__rtnl_unlock();
6078 
6079 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6080 			pr_err("network todo '%s' but state %d\n",
6081 			       dev->name, dev->reg_state);
6082 			dump_stack();
6083 			continue;
6084 		}
6085 
6086 		dev->reg_state = NETREG_UNREGISTERED;
6087 
6088 		on_each_cpu(flush_backlog, dev, 1);
6089 
6090 		netdev_wait_allrefs(dev);
6091 
6092 		/* paranoia */
6093 		BUG_ON(netdev_refcnt_read(dev));
6094 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6095 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6096 		WARN_ON(dev->dn_ptr);
6097 
6098 		if (dev->destructor)
6099 			dev->destructor(dev);
6100 
6101 		/* Report a network device has been unregistered */
6102 		rtnl_lock();
6103 		dev_net(dev)->dev_unreg_count--;
6104 		__rtnl_unlock();
6105 		wake_up(&netdev_unregistering_wq);
6106 
6107 		/* Free network device */
6108 		kobject_put(&dev->dev.kobj);
6109 	}
6110 }
6111 
6112 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6113  * fields in the same order, with only the type differing.
6114  */
6115 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6116 			     const struct net_device_stats *netdev_stats)
6117 {
6118 #if BITS_PER_LONG == 64
6119 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6120 	memcpy(stats64, netdev_stats, sizeof(*stats64));
6121 #else
6122 	size_t i, n = sizeof(*stats64) / sizeof(u64);
6123 	const unsigned long *src = (const unsigned long *)netdev_stats;
6124 	u64 *dst = (u64 *)stats64;
6125 
6126 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6127 		     sizeof(*stats64) / sizeof(u64));
6128 	for (i = 0; i < n; i++)
6129 		dst[i] = src[i];
6130 #endif
6131 }
6132 EXPORT_SYMBOL(netdev_stats_to_stats64);
6133 
6134 /**
6135  *	dev_get_stats	- get network device statistics
6136  *	@dev: device to get statistics from
6137  *	@storage: place to store stats
6138  *
6139  *	Get network statistics from device. Return @storage.
6140  *	The device driver may provide its own method by setting
6141  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6142  *	otherwise the internal statistics structure is used.
6143  */
6144 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6145 					struct rtnl_link_stats64 *storage)
6146 {
6147 	const struct net_device_ops *ops = dev->netdev_ops;
6148 
6149 	if (ops->ndo_get_stats64) {
6150 		memset(storage, 0, sizeof(*storage));
6151 		ops->ndo_get_stats64(dev, storage);
6152 	} else if (ops->ndo_get_stats) {
6153 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6154 	} else {
6155 		netdev_stats_to_stats64(storage, &dev->stats);
6156 	}
6157 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6158 	return storage;
6159 }
6160 EXPORT_SYMBOL(dev_get_stats);
6161 
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current queue is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
6179 
6180 static const struct ethtool_ops default_ethtool_ops;
6181 
6182 void netdev_set_default_ethtool_ops(struct net_device *dev,
6183 				    const struct ethtool_ops *ops)
6184 {
6185 	if (dev->ethtool_ops == &default_ethtool_ops)
6186 		dev->ethtool_ops = ops;
6187 }
6188 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6189 
6190 void netdev_freemem(struct net_device *dev)
6191 {
6192 	char *addr = (char *)dev - dev->padded;
6193 
6194 	if (is_vmalloc_addr(addr))
6195 		vfree(addr);
6196 	else
6197 		kfree(addr);
6198 }
6199 
6200 /**
6201  *	alloc_netdev_mqs - allocate network device
6202  *	@sizeof_priv:	size of private data to allocate space for
6203  *	@name:		device name format string
6204  *	@setup:		callback to initialize device
6205  *	@txqs:		the number of TX subqueues to allocate
6206  *	@rxqs:		the number of RX subqueues to allocate
6207  *
6208  *	Allocates a struct net_device with private data area for driver use
6209  *	and performs basic initialization.  Also allocates subquue structs
6210  *	for each queue on the device.
6211  */
6212 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6213 		void (*setup)(struct net_device *),
6214 		unsigned int txqs, unsigned int rxqs)
6215 {
6216 	struct net_device *dev;
6217 	size_t alloc_size;
6218 	struct net_device *p;
6219 
6220 	BUG_ON(strlen(name) >= sizeof(dev->name));
6221 
6222 	if (txqs < 1) {
6223 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6224 		return NULL;
6225 	}
6226 
6227 #ifdef CONFIG_RPS
6228 	if (rxqs < 1) {
6229 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6230 		return NULL;
6231 	}
6232 #endif
6233 
6234 	alloc_size = sizeof(struct net_device);
6235 	if (sizeof_priv) {
6236 		/* ensure 32-byte alignment of private area */
6237 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6238 		alloc_size += sizeof_priv;
6239 	}
6240 	/* ensure 32-byte alignment of whole construct */
6241 	alloc_size += NETDEV_ALIGN - 1;
6242 
6243 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6244 	if (!p)
6245 		p = vzalloc(alloc_size);
6246 	if (!p)
6247 		return NULL;
6248 
6249 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6250 	dev->padded = (char *)dev - (char *)p;
6251 
6252 	dev->pcpu_refcnt = alloc_percpu(int);
6253 	if (!dev->pcpu_refcnt)
6254 		goto free_dev;
6255 
6256 	if (dev_addr_init(dev))
6257 		goto free_pcpu;
6258 
6259 	dev_mc_init(dev);
6260 	dev_uc_init(dev);
6261 
6262 	dev_net_set(dev, &init_net);
6263 
6264 	dev->gso_max_size = GSO_MAX_SIZE;
6265 	dev->gso_max_segs = GSO_MAX_SEGS;
6266 
6267 	INIT_LIST_HEAD(&dev->napi_list);
6268 	INIT_LIST_HEAD(&dev->unreg_list);
6269 	INIT_LIST_HEAD(&dev->close_list);
6270 	INIT_LIST_HEAD(&dev->link_watch_list);
6271 	INIT_LIST_HEAD(&dev->adj_list.upper);
6272 	INIT_LIST_HEAD(&dev->adj_list.lower);
6273 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
6274 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
6275 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
6276 	setup(dev);
6277 
6278 	dev->num_tx_queues = txqs;
6279 	dev->real_num_tx_queues = txqs;
6280 	if (netif_alloc_netdev_queues(dev))
6281 		goto free_all;
6282 
6283 #ifdef CONFIG_RPS
6284 	dev->num_rx_queues = rxqs;
6285 	dev->real_num_rx_queues = rxqs;
6286 	if (netif_alloc_rx_queues(dev))
6287 		goto free_all;
6288 #endif
6289 
6290 	strcpy(dev->name, name);
6291 	dev->group = INIT_NETDEV_GROUP;
6292 	if (!dev->ethtool_ops)
6293 		dev->ethtool_ops = &default_ethtool_ops;
6294 	return dev;
6295 
6296 free_all:
6297 	free_netdev(dev);
6298 	return NULL;
6299 
6300 free_pcpu:
6301 	free_percpu(dev->pcpu_refcnt);
6302 	netif_free_tx_queues(dev);
6303 #ifdef CONFIG_RPS
6304 	kfree(dev->_rx);
6305 #endif
6306 
6307 free_dev:
6308 	netdev_freemem(dev);
6309 	return NULL;
6310 }
6311 EXPORT_SYMBOL(alloc_netdev_mqs);
6312 
6313 /**
6314  *	free_netdev - free network device
6315  *	@dev: device
6316  *
6317  *	This function does the last stage of destroying an allocated device
6318  * 	interface. The reference to the device object is released.
6319  *	If this is the last reference then it will be freed.
6320  */
6321 void free_netdev(struct net_device *dev)
6322 {
6323 	struct napi_struct *p, *n;
6324 
6325 	release_net(dev_net(dev));
6326 
6327 	netif_free_tx_queues(dev);
6328 #ifdef CONFIG_RPS
6329 	kfree(dev->_rx);
6330 #endif
6331 
6332 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6333 
6334 	/* Flush device addresses */
6335 	dev_addr_flush(dev);
6336 
6337 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6338 		netif_napi_del(p);
6339 
6340 	free_percpu(dev->pcpu_refcnt);
6341 	dev->pcpu_refcnt = NULL;
6342 
6343 	/*  Compatibility with error handling in drivers */
6344 	if (dev->reg_state == NETREG_UNINITIALIZED) {
6345 		netdev_freemem(dev);
6346 		return;
6347 	}
6348 
6349 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6350 	dev->reg_state = NETREG_RELEASED;
6351 
6352 	/* will free via device release */
6353 	put_device(&dev->dev);
6354 }
6355 EXPORT_SYMBOL(free_netdev);
6356 
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits until packets that are being received right now have been
 *	dealt with.  Packets that start being received afterwards are not
 *	held back.  The expedited RCU grace period is used when the rtnl
 *	lock is currently held, the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6372 
6373 /**
6374  *	unregister_netdevice_queue - remove device from the kernel
6375  *	@dev: device
6376  *	@head: list
6377  *
6378  *	This function shuts down a device interface and removes it
6379  *	from the kernel tables.
6380  *	If head not NULL, device is queued to be unregistered later.
6381  *
6382  *	Callers must hold the rtnl semaphore.  You may want
6383  *	unregister_netdev() instead of this.
6384  */
6385 
6386 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6387 {
6388 	ASSERT_RTNL();
6389 
6390 	if (head) {
6391 		list_move_tail(&dev->unreg_list, head);
6392 	} else {
6393 		rollback_registered(dev);
6394 		/* Finish processing unregister after unlock */
6395 		net_set_todo(dev);
6396 	}
6397 }
6398 EXPORT_SYMBOL(unregister_netdevice_queue);
6399 
6400 /**
6401  *	unregister_netdevice_many - unregister many devices
6402  *	@head: list of devices
6403  */
6404 void unregister_netdevice_many(struct list_head *head)
6405 {
6406 	struct net_device *dev;
6407 
6408 	if (!list_empty(head)) {
6409 		rollback_registered_many(head);
6410 		list_for_each_entry(dev, head, unreg_list)
6411 			net_set_todo(dev);
6412 	}
6413 }
6414 EXPORT_SYMBOL(unregister_netdevice_many);
6415 
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Convenience wrapper: takes the rtnl semaphore, calls
 *	unregister_netdevice() and releases the semaphore again.  This is
 *	normally the function to use rather than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6434 
6435 /**
6436  *	dev_change_net_namespace - move device to different nethost namespace
6437  *	@dev: device
6438  *	@net: network namespace
6439  *	@pat: If not NULL name pattern to try if the current device name
6440  *	      is already taken in the destination network namespace.
6441  *
6442  *	This function shuts down a device interface and moves it
6443  *	to a new network namespace. On success 0 is returned, on
6444  *	a failure a netagive errno code is returned.
6445  *
6446  *	Callers must hold the rtnl semaphore.
6447  */
6448 
6449 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6450 {
6451 	int err;
6452 
6453 	ASSERT_RTNL();
6454 
6455 	/* Don't allow namespace local devices to be moved. */
6456 	err = -EINVAL;
6457 	if (dev->features & NETIF_F_NETNS_LOCAL)
6458 		goto out;
6459 
6460 	/* Ensure the device has been registrered */
6461 	if (dev->reg_state != NETREG_REGISTERED)
6462 		goto out;
6463 
6464 	/* Get out if there is nothing todo */
6465 	err = 0;
6466 	if (net_eq(dev_net(dev), net))
6467 		goto out;
6468 
6469 	/* Pick the destination device name, and ensure
6470 	 * we can use it in the destination network namespace.
6471 	 */
6472 	err = -EEXIST;
6473 	if (__dev_get_by_name(net, dev->name)) {
6474 		/* We get here if we can't use the current device name */
6475 		if (!pat)
6476 			goto out;
6477 		if (dev_get_valid_name(net, dev, pat) < 0)
6478 			goto out;
6479 	}
6480 
6481 	/*
6482 	 * And now a mini version of register_netdevice unregister_netdevice.
6483 	 */
6484 
6485 	/* If device is running close it first. */
6486 	dev_close(dev);
6487 
6488 	/* And unlink it from device chain */
6489 	err = -ENODEV;
6490 	unlist_netdevice(dev);
6491 
6492 	synchronize_net();
6493 
6494 	/* Shutdown queueing discipline. */
6495 	dev_shutdown(dev);
6496 
6497 	/* Notify protocols, that we are about to destroy
6498 	   this device. They should clean all the things.
6499 
6500 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6501 	   This is wanted because this way 8021q and macvlan know
6502 	   the device is just moving and can keep their slaves up.
6503 	*/
6504 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6505 	rcu_barrier();
6506 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6507 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6508 
6509 	/*
6510 	 *	Flush the unicast and multicast chains
6511 	 */
6512 	dev_uc_flush(dev);
6513 	dev_mc_flush(dev);
6514 
6515 	/* Send a netdev-removed uevent to the old namespace */
6516 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6517 
6518 	/* Actually switch the network namespace */
6519 	dev_net_set(dev, net);
6520 
6521 	/* If there is an ifindex conflict assign a new one */
6522 	if (__dev_get_by_index(net, dev->ifindex)) {
6523 		int iflink = (dev->iflink == dev->ifindex);
6524 		dev->ifindex = dev_new_index(net);
6525 		if (iflink)
6526 			dev->iflink = dev->ifindex;
6527 	}
6528 
6529 	/* Send a netdev-add uevent to the new namespace */
6530 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6531 
6532 	/* Fixup kobjects */
6533 	err = device_rename(&dev->dev, dev->name);
6534 	WARN_ON(err);
6535 
6536 	/* Add the device back in the hashes */
6537 	list_netdevice(dev);
6538 
6539 	/* Notify protocols, that a new device appeared. */
6540 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6541 
6542 	/*
6543 	 *	Prevent userspace races by waiting until the network
6544 	 *	device is fully setup before sending notifications.
6545 	 */
6546 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6547 
6548 	synchronize_net();
6549 	err = 0;
6550 out:
6551 	return err;
6552 }
6553 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6554 
6555 static int dev_cpu_callback(struct notifier_block *nfb,
6556 			    unsigned long action,
6557 			    void *ocpu)
6558 {
6559 	struct sk_buff **list_skb;
6560 	struct sk_buff *skb;
6561 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6562 	struct softnet_data *sd, *oldsd;
6563 
6564 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6565 		return NOTIFY_OK;
6566 
6567 	local_irq_disable();
6568 	cpu = smp_processor_id();
6569 	sd = &per_cpu(softnet_data, cpu);
6570 	oldsd = &per_cpu(softnet_data, oldcpu);
6571 
6572 	/* Find end of our completion_queue. */
6573 	list_skb = &sd->completion_queue;
6574 	while (*list_skb)
6575 		list_skb = &(*list_skb)->next;
6576 	/* Append completion queue from offline CPU. */
6577 	*list_skb = oldsd->completion_queue;
6578 	oldsd->completion_queue = NULL;
6579 
6580 	/* Append output queue from offline CPU. */
6581 	if (oldsd->output_queue) {
6582 		*sd->output_queue_tailp = oldsd->output_queue;
6583 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6584 		oldsd->output_queue = NULL;
6585 		oldsd->output_queue_tailp = &oldsd->output_queue;
6586 	}
6587 	/* Append NAPI poll list from offline CPU. */
6588 	if (!list_empty(&oldsd->poll_list)) {
6589 		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6590 		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6591 	}
6592 
6593 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6594 	local_irq_enable();
6595 
6596 	/* Process offline CPU's input_pkt_queue */
6597 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6598 		netif_rx(skb);
6599 		input_queue_head_incr(oldsd);
6600 	}
6601 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6602 		netif_rx(skb);
6603 		input_queue_head_incr(oldsd);
6604 	}
6605 
6606 	return NOTIFY_OK;
6607 }
6608 
6609 
6610 /**
6611  *	netdev_increment_features - increment feature set by one
6612  *	@all: current feature set
6613  *	@one: new feature set
6614  *	@mask: mask feature set
6615  *
6616  *	Computes a new feature set after adding a device with feature set
6617  *	@one to the master device with current feature set @all.  Will not
6618  *	enable anything that is off in @mask. Returns the new feature set.
6619  */
6620 netdev_features_t netdev_increment_features(netdev_features_t all,
6621 	netdev_features_t one, netdev_features_t mask)
6622 {
6623 	if (mask & NETIF_F_GEN_CSUM)
6624 		mask |= NETIF_F_ALL_CSUM;
6625 	mask |= NETIF_F_VLAN_CHALLENGED;
6626 
6627 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6628 	all &= one | ~NETIF_F_ALL_FOR_ALL;
6629 
6630 	/* If one device supports hw checksumming, set for all. */
6631 	if (all & NETIF_F_GEN_CSUM)
6632 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6633 
6634 	return all;
6635 }
6636 EXPORT_SYMBOL(netdev_increment_features);
6637 
6638 static struct hlist_head * __net_init netdev_create_hash(void)
6639 {
6640 	int i;
6641 	struct hlist_head *hash;
6642 
6643 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6644 	if (hash != NULL)
6645 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6646 			INIT_HLIST_HEAD(&hash[i]);
6647 
6648 	return hash;
6649 }
6650 
6651 /* Initialize per network namespace state */
6652 static int __net_init netdev_init(struct net *net)
6653 {
6654 	if (net != &init_net)
6655 		INIT_LIST_HEAD(&net->dev_base_head);
6656 
6657 	net->dev_name_head = netdev_create_hash();
6658 	if (net->dev_name_head == NULL)
6659 		goto err_name;
6660 
6661 	net->dev_index_head = netdev_create_hash();
6662 	if (net->dev_index_head == NULL)
6663 		goto err_idx;
6664 
6665 	return 0;
6666 
6667 err_idx:
6668 	kfree(net->dev_name_head);
6669 err_name:
6670 	return -ENOMEM;
6671 }
6672 
6673 /**
6674  *	netdev_drivername - network driver for the device
6675  *	@dev: network device
6676  *
6677  *	Determine network driver for device.
6678  */
6679 const char *netdev_drivername(const struct net_device *dev)
6680 {
6681 	const struct device_driver *driver;
6682 	const struct device *parent;
6683 	const char *empty = "";
6684 
6685 	parent = dev->dev.parent;
6686 	if (!parent)
6687 		return empty;
6688 
6689 	driver = parent->driver;
6690 	if (driver && driver->name)
6691 		return driver->name;
6692 	return empty;
6693 }
6694 
6695 static int __netdev_printk(const char *level, const struct net_device *dev,
6696 			   struct va_format *vaf)
6697 {
6698 	int r;
6699 
6700 	if (dev && dev->dev.parent) {
6701 		r = dev_printk_emit(level[1] - '0',
6702 				    dev->dev.parent,
6703 				    "%s %s %s: %pV",
6704 				    dev_driver_string(dev->dev.parent),
6705 				    dev_name(dev->dev.parent),
6706 				    netdev_name(dev), vaf);
6707 	} else if (dev) {
6708 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6709 	} else {
6710 		r = printk("%s(NULL net_device): %pV", level, vaf);
6711 	}
6712 
6713 	return r;
6714 }
6715 
6716 int netdev_printk(const char *level, const struct net_device *dev,
6717 		  const char *format, ...)
6718 {
6719 	struct va_format vaf;
6720 	va_list args;
6721 	int r;
6722 
6723 	va_start(args, format);
6724 
6725 	vaf.fmt = format;
6726 	vaf.va = &args;
6727 
6728 	r = __netdev_printk(level, dev, &vaf);
6729 
6730 	va_end(args);
6731 
6732 	return r;
6733 }
6734 EXPORT_SYMBOL(netdev_printk);
6735 
6736 #define define_netdev_printk_level(func, level)			\
6737 int func(const struct net_device *dev, const char *fmt, ...)	\
6738 {								\
6739 	int r;							\
6740 	struct va_format vaf;					\
6741 	va_list args;						\
6742 								\
6743 	va_start(args, fmt);					\
6744 								\
6745 	vaf.fmt = fmt;						\
6746 	vaf.va = &args;						\
6747 								\
6748 	r = __netdev_printk(level, dev, &vaf);			\
6749 								\
6750 	va_end(args);						\
6751 								\
6752 	return r;						\
6753 }								\
6754 EXPORT_SYMBOL(func);
6755 
6756 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6757 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6758 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6759 define_netdev_printk_level(netdev_err, KERN_ERR);
6760 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6761 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6762 define_netdev_printk_level(netdev_info, KERN_INFO);
6763 
6764 static void __net_exit netdev_exit(struct net *net)
6765 {
6766 	kfree(net->dev_name_head);
6767 	kfree(net->dev_index_head);
6768 }
6769 
6770 static struct pernet_operations __net_initdata netdev_net_ops = {
6771 	.init = netdev_init,
6772 	.exit = netdev_exit,
6773 };
6774 
6775 static void __net_exit default_device_exit(struct net *net)
6776 {
6777 	struct net_device *dev, *aux;
6778 	/*
6779 	 * Push all migratable network devices back to the
6780 	 * initial network namespace
6781 	 */
6782 	rtnl_lock();
6783 	for_each_netdev_safe(net, dev, aux) {
6784 		int err;
6785 		char fb_name[IFNAMSIZ];
6786 
6787 		/* Ignore unmoveable devices (i.e. loopback) */
6788 		if (dev->features & NETIF_F_NETNS_LOCAL)
6789 			continue;
6790 
6791 		/* Leave virtual devices for the generic cleanup */
6792 		if (dev->rtnl_link_ops)
6793 			continue;
6794 
6795 		/* Push remaining network devices to init_net */
6796 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6797 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6798 		if (err) {
6799 			pr_emerg("%s: failed to move %s to init_net: %d\n",
6800 				 __func__, dev->name, err);
6801 			BUG();
6802 		}
6803 	}
6804 	rtnl_unlock();
6805 }
6806 
6807 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
6808 {
6809 	/* Return with the rtnl_lock held when there are no network
6810 	 * devices unregistering in any network namespace in net_list.
6811 	 */
6812 	struct net *net;
6813 	bool unregistering;
6814 	DEFINE_WAIT(wait);
6815 
6816 	for (;;) {
6817 		prepare_to_wait(&netdev_unregistering_wq, &wait,
6818 				TASK_UNINTERRUPTIBLE);
6819 		unregistering = false;
6820 		rtnl_lock();
6821 		list_for_each_entry(net, net_list, exit_list) {
6822 			if (net->dev_unreg_count > 0) {
6823 				unregistering = true;
6824 				break;
6825 			}
6826 		}
6827 		if (!unregistering)
6828 			break;
6829 		__rtnl_unlock();
6830 		schedule();
6831 	}
6832 	finish_wait(&netdev_unregistering_wq, &wait);
6833 }
6834 
6835 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6836 {
6837 	/* At exit all network devices most be removed from a network
6838 	 * namespace.  Do this in the reverse order of registration.
6839 	 * Do this across as many network namespaces as possible to
6840 	 * improve batching efficiency.
6841 	 */
6842 	struct net_device *dev;
6843 	struct net *net;
6844 	LIST_HEAD(dev_kill_list);
6845 
6846 	/* To prevent network device cleanup code from dereferencing
6847 	 * loopback devices or network devices that have been freed
6848 	 * wait here for all pending unregistrations to complete,
6849 	 * before unregistring the loopback device and allowing the
6850 	 * network namespace be freed.
6851 	 *
6852 	 * The netdev todo list containing all network devices
6853 	 * unregistrations that happen in default_device_exit_batch
6854 	 * will run in the rtnl_unlock() at the end of
6855 	 * default_device_exit_batch.
6856 	 */
6857 	rtnl_lock_unregistering(net_list);
6858 	list_for_each_entry(net, net_list, exit_list) {
6859 		for_each_netdev_reverse(net, dev) {
6860 			if (dev->rtnl_link_ops)
6861 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6862 			else
6863 				unregister_netdevice_queue(dev, &dev_kill_list);
6864 		}
6865 	}
6866 	unregister_netdevice_many(&dev_kill_list);
6867 	list_del(&dev_kill_list);
6868 	rtnl_unlock();
6869 }
6870 
6871 static struct pernet_operations __net_initdata default_device_ops = {
6872 	.exit = default_device_exit,
6873 	.exit_batch = default_device_exit_batch,
6874 };
6875 
6876 /*
6877  *	Initialize the DEV module. At boot time this walks the device list and
6878  *	unhooks any devices that fail to initialise (normally hardware not
6879  *	present) and leaves us with a valid list of present and active devices.
6880  *
6881  */
6882 
6883 /*
6884  *       This is called single threaded during boot, so no need
6885  *       to take the rtnl semaphore.
6886  */
6887 static int __init net_dev_init(void)
6888 {
6889 	int i, rc = -ENOMEM;
6890 
6891 	BUG_ON(!dev_boot_phase);
6892 
6893 	if (dev_proc_init())
6894 		goto out;
6895 
6896 	if (netdev_kobject_init())
6897 		goto out;
6898 
6899 	INIT_LIST_HEAD(&ptype_all);
6900 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6901 		INIT_LIST_HEAD(&ptype_base[i]);
6902 
6903 	INIT_LIST_HEAD(&offload_base);
6904 
6905 	if (register_pernet_subsys(&netdev_net_ops))
6906 		goto out;
6907 
6908 	/*
6909 	 *	Initialise the packet receive queues.
6910 	 */
6911 
6912 	for_each_possible_cpu(i) {
6913 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6914 
6915 		memset(sd, 0, sizeof(*sd));
6916 		skb_queue_head_init(&sd->input_pkt_queue);
6917 		skb_queue_head_init(&sd->process_queue);
6918 		sd->completion_queue = NULL;
6919 		INIT_LIST_HEAD(&sd->poll_list);
6920 		sd->output_queue = NULL;
6921 		sd->output_queue_tailp = &sd->output_queue;
6922 #ifdef CONFIG_RPS
6923 		sd->csd.func = rps_trigger_softirq;
6924 		sd->csd.info = sd;
6925 		sd->csd.flags = 0;
6926 		sd->cpu = i;
6927 #endif
6928 
6929 		sd->backlog.poll = process_backlog;
6930 		sd->backlog.weight = weight_p;
6931 		sd->backlog.gro_list = NULL;
6932 		sd->backlog.gro_count = 0;
6933 
6934 #ifdef CONFIG_NET_FLOW_LIMIT
6935 		sd->flow_limit = NULL;
6936 #endif
6937 	}
6938 
6939 	dev_boot_phase = 0;
6940 
6941 	/* The loopback device is special if any other network devices
6942 	 * is present in a network namespace the loopback device must
6943 	 * be present. Since we now dynamically allocate and free the
6944 	 * loopback device ensure this invariant is maintained by
6945 	 * keeping the loopback device as the first device on the
6946 	 * list of network devices.  Ensuring the loopback devices
6947 	 * is the first device that appears and the last network device
6948 	 * that disappears.
6949 	 */
6950 	if (register_pernet_device(&loopback_net_ops))
6951 		goto out;
6952 
6953 	if (register_pernet_device(&default_device_ops))
6954 		goto out;
6955 
6956 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6957 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6958 
6959 	hotcpu_notifier(dev_cpu_callback, 0);
6960 	dst_init();
6961 	rc = 0;
6962 out:
6963 	return rc;
6964 }
6965 
6966 subsys_initcall(net_dev_init);
6967