xref: /linux/net/core/dev.c (revision 89e47d3b8a273b0eac21e4bf6d7fdb86b654fa16)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <linux/ipv6.h>
122 #include <linux/in.h>
123 #include <linux/jhash.h>
124 #include <linux/random.h>
125 #include <trace/events/napi.h>
126 #include <trace/events/net.h>
127 #include <trace/events/skb.h>
128 #include <linux/pci.h>
129 #include <linux/inetdevice.h>
130 #include <linux/cpu_rmap.h>
131 #include <linux/static_key.h>
132 #include <linux/hashtable.h>
133 #include <linux/vmalloc.h>
134 #include <linux/if_macvlan.h>
135 
136 #include "net-sysfs.h"
137 
138 /* Instead of increasing this, you should create a hash table. */
139 #define MAX_GRO_SKBS 8
140 
141 /* This should be increased if a protocol with a bigger head is added. */
142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
143 
144 static DEFINE_SPINLOCK(ptype_lock);
145 static DEFINE_SPINLOCK(offload_lock);
146 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
147 struct list_head ptype_all __read_mostly;	/* Taps */
148 static struct list_head offload_base __read_mostly;
149 
150 /*
151  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
152  * semaphore.
153  *
154  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
155  *
156  * Writers must hold the rtnl semaphore while they loop through the
157  * dev_base_head list, and hold dev_base_lock for writing when they do the
158  * actual updates.  This allows pure readers to access the list even
159  * while a writer is preparing to update it.
160  *
161  * To put it another way, dev_base_lock is held for writing only to
162  * protect against pure readers; the rtnl semaphore provides the
163  * protection against other writers.
164  *
165  * See, for example usages, register_netdevice() and
166  * unregister_netdevice(), which must be called with the rtnl
167  * semaphore held.
168  */
169 DEFINE_RWLOCK(dev_base_lock);
170 EXPORT_SYMBOL(dev_base_lock);
171 
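/*
 * Illustrative sketch (not part of the original file): the pure-reader
 * pattern described above.  A hypothetical helper walks the device list
 * while holding dev_base_lock for reading; writers hold it for writing
 * and serialize against each other on the rtnl semaphore.
 */
static int example_count_devices(struct net *net)
{
	struct net_device *dev;
	int count = 0;

	read_lock(&dev_base_lock);
	for_each_netdev(net, dev)
		count++;
	read_unlock(&dev_base_lock);

	return count;
}
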
172 /* protects napi_hash addition/deletion and napi_gen_id */
173 static DEFINE_SPINLOCK(napi_hash_lock);
174 
175 static unsigned int napi_gen_id;
176 static DEFINE_HASHTABLE(napi_hash, 8);
177 
178 static seqcount_t devnet_rename_seq;
179 
180 static inline void dev_base_seq_inc(struct net *net)
181 {
182 	while (++net->dev_base_seq == 0);
183 }
184 
185 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
186 {
187 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
188 
189 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
190 }
191 
192 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
193 {
194 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
195 }
196 
197 static inline void rps_lock(struct softnet_data *sd)
198 {
199 #ifdef CONFIG_RPS
200 	spin_lock(&sd->input_pkt_queue.lock);
201 #endif
202 }
203 
204 static inline void rps_unlock(struct softnet_data *sd)
205 {
206 #ifdef CONFIG_RPS
207 	spin_unlock(&sd->input_pkt_queue.lock);
208 #endif
209 }
210 
211 /* Device list insertion */
212 static void list_netdevice(struct net_device *dev)
213 {
214 	struct net *net = dev_net(dev);
215 
216 	ASSERT_RTNL();
217 
218 	write_lock_bh(&dev_base_lock);
219 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
220 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
221 	hlist_add_head_rcu(&dev->index_hlist,
222 			   dev_index_hash(net, dev->ifindex));
223 	write_unlock_bh(&dev_base_lock);
224 
225 	dev_base_seq_inc(net);
226 }
227 
228 /* Device list removal
229  * caller must respect a RCU grace period before freeing/reusing dev
230  */
231 static void unlist_netdevice(struct net_device *dev)
232 {
233 	ASSERT_RTNL();
234 
235 	/* Unlink dev from the device chain */
236 	write_lock_bh(&dev_base_lock);
237 	list_del_rcu(&dev->dev_list);
238 	hlist_del_rcu(&dev->name_hlist);
239 	hlist_del_rcu(&dev->index_hlist);
240 	write_unlock_bh(&dev_base_lock);
241 
242 	dev_base_seq_inc(dev_net(dev));
243 }
244 
245 /*
246  *	Our notifier list
247  */
248 
249 static RAW_NOTIFIER_HEAD(netdev_chain);
250 
251 /*
252  *	Device drivers call our routines to queue packets here. We empty the
253  *	queue in the local softnet handler.
254  */
255 
256 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
257 EXPORT_PER_CPU_SYMBOL(softnet_data);
258 
259 #ifdef CONFIG_LOCKDEP
260 /*
261  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
262  * according to dev->type
263  */
264 static const unsigned short netdev_lock_type[] =
265 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
266 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
267 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
268 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
269 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
270 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
271 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
272 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
273 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
274 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
275 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
276 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
277 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
278 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
279 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
280 
281 static const char *const netdev_lock_name[] =
282 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
283 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
284 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
285 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
286 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
287 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
288 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
289 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
290 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
291 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
292 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
293 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
294 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
295 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
296 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
297 
298 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
299 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
300 
301 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
302 {
303 	int i;
304 
305 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
306 		if (netdev_lock_type[i] == dev_type)
307 			return i;
308 	/* the last key is used by default */
309 	return ARRAY_SIZE(netdev_lock_type) - 1;
310 }
311 
312 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
313 						 unsigned short dev_type)
314 {
315 	int i;
316 
317 	i = netdev_lock_pos(dev_type);
318 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
319 				   netdev_lock_name[i]);
320 }
321 
322 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
323 {
324 	int i;
325 
326 	i = netdev_lock_pos(dev->type);
327 	lockdep_set_class_and_name(&dev->addr_list_lock,
328 				   &netdev_addr_lock_key[i],
329 				   netdev_lock_name[i]);
330 }
331 #else
332 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
333 						 unsigned short dev_type)
334 {
335 }
336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
337 {
338 }
339 #endif
340 
341 /*******************************************************************************
342 
343 		Protocol management and registration routines
344 
345 *******************************************************************************/
346 
347 /*
348  *	Add a protocol ID to the list. Now that the input handler is
349  *	smarter we can dispense with all the messy stuff that used to be
350  *	here.
351  *
352  *	BEWARE!!! Protocol handlers that mangle input packets
353  *	MUST BE last in the hash buckets, and protocol handlers that
354  *	only inspect packets MUST start from the promiscuous ptype_all
355  *	chain in net_bh. That is true today; do not change it.
356  *	Explanation: if a packet-mangling protocol handler were first
357  *	on the list, it could not tell that the packet is cloned and
358  *	must be copied-on-write, so it would modify the clone in place
359  *	and subsequent readers would see a broken packet.
360  *							--ANK (980803)
361  */
362 
363 static inline struct list_head *ptype_head(const struct packet_type *pt)
364 {
365 	if (pt->type == htons(ETH_P_ALL))
366 		return &ptype_all;
367 	else
368 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
369 }
370 
371 /**
372  *	dev_add_pack - add packet handler
373  *	@pt: packet type declaration
374  *
375  *	Add a protocol handler to the networking stack. The passed &packet_type
376  *	is linked into kernel lists and may not be freed until it has been
377  *	removed from the kernel lists.
378  *
379  *	This call does not sleep, therefore it cannot
380  *	guarantee that all CPUs that are in the middle of receiving packets
381  *	will see the new packet type (until the next received packet).
382  */
383 
384 void dev_add_pack(struct packet_type *pt)
385 {
386 	struct list_head *head = ptype_head(pt);
387 
388 	spin_lock(&ptype_lock);
389 	list_add_rcu(&pt->list, head);
390 	spin_unlock(&ptype_lock);
391 }
392 EXPORT_SYMBOL(dev_add_pack);
393 
394 /**
395  *	__dev_remove_pack	 - remove packet handler
396  *	@pt: packet type declaration
397  *
398  *	Remove a protocol handler that was previously added to the kernel
399  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
400  *	from the kernel lists and can be freed or reused once this function
401  *	returns.
402  *
403  *	The packet type might still be in use by receivers
404  *	and must not be freed until after all the CPUs have gone
405  *	through a quiescent state.
406  */
407 void __dev_remove_pack(struct packet_type *pt)
408 {
409 	struct list_head *head = ptype_head(pt);
410 	struct packet_type *pt1;
411 
412 	spin_lock(&ptype_lock);
413 
414 	list_for_each_entry(pt1, head, list) {
415 		if (pt == pt1) {
416 			list_del_rcu(&pt->list);
417 			goto out;
418 		}
419 	}
420 
421 	pr_warn("dev_remove_pack: %p not found\n", pt);
422 out:
423 	spin_unlock(&ptype_lock);
424 }
425 EXPORT_SYMBOL(__dev_remove_pack);
426 
427 /**
428  *	dev_remove_pack	 - remove packet handler
429  *	@pt: packet type declaration
430  *
431  *	Remove a protocol handler that was previously added to the kernel
432  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
433  *	from the kernel lists and can be freed or reused once this function
434  *	returns.
435  *
436  *	This call sleeps to guarantee that no CPU is looking at the packet
437  *	type after return.
438  */
439 void dev_remove_pack(struct packet_type *pt)
440 {
441 	__dev_remove_pack(pt);
442 
443 	synchronize_net();
444 }
445 EXPORT_SYMBOL(dev_remove_pack);
446 
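/*
 * Illustrative sketch (not part of the original file): a minimal tap
 * registered with dev_add_pack().  The handler and variable names are
 * hypothetical; ETH_P_ALL places it on the ptype_all chain used for taps
 * (see ptype_head() above), and the handler must consume the skb it is
 * given.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),
	.func = example_tap_rcv,
};

/* Paired calls: dev_add_pack(&example_tap); ... dev_remove_pack(&example_tap); */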
447 
448 /**
449  *	dev_add_offload - register offload handlers
450  *	@po: protocol offload declaration
451  *
452  *	Add protocol offload handlers to the networking stack. The passed
453  *	&proto_offload is linked into kernel lists and may not be freed until
454  *	it has been removed from the kernel lists.
455  *
456  *	This call does not sleep, therefore it cannot
457  *	guarantee that all CPUs that are in the middle of receiving packets
458  *	will see the new offload handlers (until the next received packet).
459  */
460 void dev_add_offload(struct packet_offload *po)
461 {
462 	struct list_head *head = &offload_base;
463 
464 	spin_lock(&offload_lock);
465 	list_add_rcu(&po->list, head);
466 	spin_unlock(&offload_lock);
467 }
468 EXPORT_SYMBOL(dev_add_offload);
469 
470 /**
471  *	__dev_remove_offload	 - remove offload handler
472  *	@po: packet offload declaration
473  *
474  *	Remove a protocol offload handler that was previously added to the
475  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
476  *	is removed from the kernel lists and can be freed or reused once this
477  *	function returns.
478  *
479  *	The packet type might still be in use by receivers
480  *	and must not be freed until after all the CPUs have gone
481  *	through a quiescent state.
482  */
483 void __dev_remove_offload(struct packet_offload *po)
484 {
485 	struct list_head *head = &offload_base;
486 	struct packet_offload *po1;
487 
488 	spin_lock(&offload_lock);
489 
490 	list_for_each_entry(po1, head, list) {
491 		if (po == po1) {
492 			list_del_rcu(&po->list);
493 			goto out;
494 		}
495 	}
496 
497 	pr_warn("dev_remove_offload: %p not found\n", po);
498 out:
499 	spin_unlock(&offload_lock);
500 }
501 EXPORT_SYMBOL(__dev_remove_offload);
502 
503 /**
504  *	dev_remove_offload	 - remove packet offload handler
505  *	@po: packet offload declaration
506  *
507  *	Remove a packet offload handler that was previously added to the kernel
508  *	offload handlers by dev_add_offload(). The passed &offload_type is
509  *	removed from the kernel lists and can be freed or reused once this
510  *	function returns.
511  *
512  *	This call sleeps to guarantee that no CPU is looking at the packet
513  *	type after return.
514  */
515 void dev_remove_offload(struct packet_offload *po)
516 {
517 	__dev_remove_offload(po);
518 
519 	synchronize_net();
520 }
521 EXPORT_SYMBOL(dev_remove_offload);
522 
523 /******************************************************************************
524 
525 		      Device Boot-time Settings Routines
526 
527 *******************************************************************************/
528 
529 /* Boot time configuration table */
530 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
531 
532 /**
533  *	netdev_boot_setup_add	- add new setup entry
534  *	@name: name of the device
535  *	@map: configured settings for the device
536  *
537  *	Adds a new setup entry to the dev_boot_setup list.  The function
538  *	returns 0 on error and 1 on success.  This is a generic routine for
539  *	all netdevices.
540  */
541 static int netdev_boot_setup_add(char *name, struct ifmap *map)
542 {
543 	struct netdev_boot_setup *s;
544 	int i;
545 
546 	s = dev_boot_setup;
547 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
548 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
549 			memset(s[i].name, 0, sizeof(s[i].name));
550 			strlcpy(s[i].name, name, IFNAMSIZ);
551 			memcpy(&s[i].map, map, sizeof(s[i].map));
552 			break;
553 		}
554 	}
555 
556 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
557 }
558 
559 /**
560  *	netdev_boot_setup_check	- check boot time settings
561  *	@dev: the netdevice
562  *
563  * 	Check boot time settings for the device.
564  *	Any settings found are applied to the device so they can be used
565  *	later during device probing.
566  *	Returns 1 if settings were found, 0 otherwise.
567  */
568 int netdev_boot_setup_check(struct net_device *dev)
569 {
570 	struct netdev_boot_setup *s = dev_boot_setup;
571 	int i;
572 
573 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
574 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
575 		    !strcmp(dev->name, s[i].name)) {
576 			dev->irq 	= s[i].map.irq;
577 			dev->base_addr 	= s[i].map.base_addr;
578 			dev->mem_start 	= s[i].map.mem_start;
579 			dev->mem_end 	= s[i].map.mem_end;
580 			return 1;
581 		}
582 	}
583 	return 0;
584 }
585 EXPORT_SYMBOL(netdev_boot_setup_check);
586 
587 
588 /**
589  *	netdev_boot_base	- get address from boot time settings
590  *	@prefix: prefix for network device
591  *	@unit: id for network device
592  *
593  * 	Check the boot time settings for the base address of the device.
594  *	Returns the configured base address, 1 if a device with that name
595  *	is already registered (and so should not be probed), or 0 if no
596  *	settings were found.
597  */
598 unsigned long netdev_boot_base(const char *prefix, int unit)
599 {
600 	const struct netdev_boot_setup *s = dev_boot_setup;
601 	char name[IFNAMSIZ];
602 	int i;
603 
604 	sprintf(name, "%s%d", prefix, unit);
605 
606 	/*
607 	 * If the device is already registered then return a base of 1
608 	 * to indicate that this interface should not be probed
609 	 */
610 	if (__dev_get_by_name(&init_net, name))
611 		return 1;
612 
613 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
614 		if (!strcmp(name, s[i].name))
615 			return s[i].map.base_addr;
616 	return 0;
617 }
618 
619 /*
620  * Saves the settings configured at boot time for any netdevice.
621  */
622 int __init netdev_boot_setup(char *str)
623 {
624 	int ints[5];
625 	struct ifmap map;
626 
627 	str = get_options(str, ARRAY_SIZE(ints), ints);
628 	if (!str || !*str)
629 		return 0;
630 
631 	/* Save settings */
632 	memset(&map, 0, sizeof(map));
633 	if (ints[0] > 0)
634 		map.irq = ints[1];
635 	if (ints[0] > 1)
636 		map.base_addr = ints[2];
637 	if (ints[0] > 2)
638 		map.mem_start = ints[3];
639 	if (ints[0] > 3)
640 		map.mem_end = ints[4];
641 
642 	/* Add new entry to the list */
643 	return netdev_boot_setup_add(str, &map);
644 }
645 
646 __setup("netdev=", netdev_boot_setup);
647 
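/*
 * Illustrative note (not in the original file): with the parsing above, a
 * boot command line such as
 *
 *	netdev=5,0x340,0,0,eth0
 *
 * records irq 5, I/O base 0x340 and zero mem_start/mem_end for "eth0",
 * to be picked up later by netdev_boot_setup_check().
 */
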
648 /*******************************************************************************
649 
650 			    Device Interface Subroutines
651 
652 *******************************************************************************/
653 
654 /**
655  *	__dev_get_by_name	- find a device by its name
656  *	@net: the applicable net namespace
657  *	@name: name to find
658  *
659  *	Find an interface by name. Must be called under RTNL semaphore
660  *	or @dev_base_lock. If the name is found a pointer to the device
661  *	is returned. If the name is not found then %NULL is returned. The
662  *	reference counters are not incremented so the caller must be
663  *	careful with locks.
664  */
665 
666 struct net_device *__dev_get_by_name(struct net *net, const char *name)
667 {
668 	struct net_device *dev;
669 	struct hlist_head *head = dev_name_hash(net, name);
670 
671 	hlist_for_each_entry(dev, head, name_hlist)
672 		if (!strncmp(dev->name, name, IFNAMSIZ))
673 			return dev;
674 
675 	return NULL;
676 }
677 EXPORT_SYMBOL(__dev_get_by_name);
678 
679 /**
680  *	dev_get_by_name_rcu	- find a device by its name
681  *	@net: the applicable net namespace
682  *	@name: name to find
683  *
684  *	Find an interface by name.
685  *	If the name is found a pointer to the device is returned.
686  * 	If the name is not found then %NULL is returned.
687  *	The reference counters are not incremented so the caller must be
688  *	careful with locks. The caller must hold RCU lock.
689  */
690 
691 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
692 {
693 	struct net_device *dev;
694 	struct hlist_head *head = dev_name_hash(net, name);
695 
696 	hlist_for_each_entry_rcu(dev, head, name_hlist)
697 		if (!strncmp(dev->name, name, IFNAMSIZ))
698 			return dev;
699 
700 	return NULL;
701 }
702 EXPORT_SYMBOL(dev_get_by_name_rcu);
703 
704 /**
705  *	dev_get_by_name		- find a device by its name
706  *	@net: the applicable net namespace
707  *	@name: name to find
708  *
709  *	Find an interface by name. This can be called from any
710  *	context and does its own locking. The returned handle has
711  *	the usage count incremented and the caller must use dev_put() to
712  *	release it when it is no longer needed. %NULL is returned if no
713  *	matching device is found.
714  */
715 
716 struct net_device *dev_get_by_name(struct net *net, const char *name)
717 {
718 	struct net_device *dev;
719 
720 	rcu_read_lock();
721 	dev = dev_get_by_name_rcu(net, name);
722 	if (dev)
723 		dev_hold(dev);
724 	rcu_read_unlock();
725 	return dev;
726 }
727 EXPORT_SYMBOL(dev_get_by_name);
728 
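/*
 * Illustrative sketch (not part of the original file): typical use of the
 * reference-counted lookup above.  The device name is hypothetical; the
 * point is the mandatory dev_put() once the device is no longer needed.
 */
static void example_use_device(struct net *net)
{
	struct net_device *dev = dev_get_by_name(net, "eth0");

	if (!dev)
		return;
	/* ... use dev while the reference is held ... */
	dev_put(dev);
}
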
729 /**
730  *	__dev_get_by_index - find a device by its ifindex
731  *	@net: the applicable net namespace
732  *	@ifindex: index of device
733  *
734  *	Search for an interface by index. Returns a pointer to the device,
735  *	or %NULL if the device is not found. The device has not
736  *	had its reference counter increased so the caller must be careful
737  *	about locking. The caller must hold either the RTNL semaphore
738  *	or @dev_base_lock.
739  */
740 
741 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
742 {
743 	struct net_device *dev;
744 	struct hlist_head *head = dev_index_hash(net, ifindex);
745 
746 	hlist_for_each_entry(dev, head, index_hlist)
747 		if (dev->ifindex == ifindex)
748 			return dev;
749 
750 	return NULL;
751 }
752 EXPORT_SYMBOL(__dev_get_by_index);
753 
754 /**
755  *	dev_get_by_index_rcu - find a device by its ifindex
756  *	@net: the applicable net namespace
757  *	@ifindex: index of device
758  *
759  *	Search for an interface by index. Returns a pointer to the device,
760  *	or %NULL if the device is not found. The device has not
761  *	had its reference counter increased so the caller must be careful
762  *	about locking. The caller must hold RCU lock.
763  */
764 
765 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
766 {
767 	struct net_device *dev;
768 	struct hlist_head *head = dev_index_hash(net, ifindex);
769 
770 	hlist_for_each_entry_rcu(dev, head, index_hlist)
771 		if (dev->ifindex == ifindex)
772 			return dev;
773 
774 	return NULL;
775 }
776 EXPORT_SYMBOL(dev_get_by_index_rcu);
777 
778 
779 /**
780  *	dev_get_by_index - find a device by its ifindex
781  *	@net: the applicable net namespace
782  *	@ifindex: index of device
783  *
784  *	Search for an interface by index. Returns a pointer to the device,
785  *	or NULL if the device is not found. The device returned has
786  *	had a reference added and the pointer is safe until the user calls
787  *	dev_put to indicate they have finished with it.
788  */
789 
790 struct net_device *dev_get_by_index(struct net *net, int ifindex)
791 {
792 	struct net_device *dev;
793 
794 	rcu_read_lock();
795 	dev = dev_get_by_index_rcu(net, ifindex);
796 	if (dev)
797 		dev_hold(dev);
798 	rcu_read_unlock();
799 	return dev;
800 }
801 EXPORT_SYMBOL(dev_get_by_index);
802 
803 /**
804  *	netdev_get_name - get a netdevice name, knowing its ifindex.
805  *	@net: network namespace
806  *	@name: a pointer to the buffer where the name will be stored.
807  *	@ifindex: the ifindex of the interface to get the name from.
808  *
809  *	The use of raw_seqcount_begin() and cond_resched() before
810  *	retrying is required as we want to give the writers a chance
811  *	to complete when CONFIG_PREEMPT is not set.
812  */
813 int netdev_get_name(struct net *net, char *name, int ifindex)
814 {
815 	struct net_device *dev;
816 	unsigned int seq;
817 
818 retry:
819 	seq = raw_seqcount_begin(&devnet_rename_seq);
820 	rcu_read_lock();
821 	dev = dev_get_by_index_rcu(net, ifindex);
822 	if (!dev) {
823 		rcu_read_unlock();
824 		return -ENODEV;
825 	}
826 
827 	strcpy(name, dev->name);
828 	rcu_read_unlock();
829 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
830 		cond_resched();
831 		goto retry;
832 	}
833 
834 	return 0;
835 }
836 
837 /**
838  *	dev_getbyhwaddr_rcu - find a device by its hardware address
839  *	@net: the applicable net namespace
840  *	@type: media type of device
841  *	@ha: hardware address
842  *
843  *	Search for an interface by MAC address. Returns a pointer to the
844  *	device, or NULL if the device is not found.
845  *	The caller must hold RCU or RTNL.
846  *	The returned device has not had its ref count increased
847  *	and the caller must therefore be careful about locking
848  *
849  */
850 
851 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
852 				       const char *ha)
853 {
854 	struct net_device *dev;
855 
856 	for_each_netdev_rcu(net, dev)
857 		if (dev->type == type &&
858 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
859 			return dev;
860 
861 	return NULL;
862 }
863 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
864 
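/*
 * Illustrative sketch (not part of the original file): looking up an
 * Ethernet device by MAC address under RCU.  The address bytes are made
 * up; taking a reference with dev_hold() before leaving the RCU section
 * mirrors dev_get_by_index() above.
 */
static struct net_device *example_find_by_mac(struct net *net)
{
	static const char mac[ETH_ALEN] = { 0x00, 0x16, 0x3e, 0x00, 0x00, 0x01 };
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();

	return dev;
}
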
865 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
866 {
867 	struct net_device *dev;
868 
869 	ASSERT_RTNL();
870 	for_each_netdev(net, dev)
871 		if (dev->type == type)
872 			return dev;
873 
874 	return NULL;
875 }
876 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
877 
878 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
879 {
880 	struct net_device *dev, *ret = NULL;
881 
882 	rcu_read_lock();
883 	for_each_netdev_rcu(net, dev)
884 		if (dev->type == type) {
885 			dev_hold(dev);
886 			ret = dev;
887 			break;
888 		}
889 	rcu_read_unlock();
890 	return ret;
891 }
892 EXPORT_SYMBOL(dev_getfirstbyhwtype);
893 
894 /**
895  *	dev_get_by_flags_rcu - find any device with given flags
896  *	@net: the applicable net namespace
897  *	@if_flags: IFF_* values
898  *	@mask: bitmask of bits in if_flags to check
899  *
900  *	Search for any interface with the given flags. Returns a pointer to
901  *	the device, or NULL if no device is found. Must be called inside
902  *	rcu_read_lock(); the result's refcount is unchanged.
903  */
904 
905 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
906 				    unsigned short mask)
907 {
908 	struct net_device *dev, *ret;
909 
910 	ret = NULL;
911 	for_each_netdev_rcu(net, dev) {
912 		if (((dev->flags ^ if_flags) & mask) == 0) {
913 			ret = dev;
914 			break;
915 		}
916 	}
917 	return ret;
918 }
919 EXPORT_SYMBOL(dev_get_by_flags_rcu);
920 
921 /**
922  *	dev_valid_name - check if name is okay for network device
923  *	@name: name string
924  *
925  *	Network device names need to be valid file names
926  *	to allow sysfs to work.  We also disallow any kind of
927  *	whitespace.
928  */
929 bool dev_valid_name(const char *name)
930 {
931 	if (*name == '\0')
932 		return false;
933 	if (strlen(name) >= IFNAMSIZ)
934 		return false;
935 	if (!strcmp(name, ".") || !strcmp(name, ".."))
936 		return false;
937 
938 	while (*name) {
939 		if (*name == '/' || isspace(*name))
940 			return false;
941 		name++;
942 	}
943 	return true;
944 }
945 EXPORT_SYMBOL(dev_valid_name);
946 
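/*
 * Illustrative note (not in the original file): with the checks above,
 * dev_valid_name("eth0") is true, while "", ".", "..", "bad/name",
 * "has space" and any name of IFNAMSIZ characters or more are rejected.
 */
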
947 /**
948  *	__dev_alloc_name - allocate a name for a device
949  *	@net: network namespace to allocate the device name in
950  *	@name: name format string
951  *	@buf:  scratch buffer and result name string
952  *
953  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
954  *	id. It scans the list of devices to build up a free map, then chooses
955  *	the first empty slot. The caller must hold the dev_base or rtnl lock
956  *	while allocating the name and adding the device in order to avoid
957  *	duplicates.
958  *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
959  *	Returns the number of the unit assigned or a negative errno code.
960  */
961 
962 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
963 {
964 	int i = 0;
965 	const char *p;
966 	const int max_netdevices = 8*PAGE_SIZE;
967 	unsigned long *inuse;
968 	struct net_device *d;
969 
970 	p = strnchr(name, IFNAMSIZ-1, '%');
971 	if (p) {
972 		/*
973 		 * Verify the string as this thing may have come from
974 		 * the user.  There must be either one "%d" and no other "%"
975 		 * characters.
976 		 */
977 		if (p[1] != 'd' || strchr(p + 2, '%'))
978 			return -EINVAL;
979 
980 		/* Use one page as a bit array of possible slots */
981 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
982 		if (!inuse)
983 			return -ENOMEM;
984 
985 		for_each_netdev(net, d) {
986 			if (!sscanf(d->name, name, &i))
987 				continue;
988 			if (i < 0 || i >= max_netdevices)
989 				continue;
990 
991 			/*  avoid cases where sscanf is not exact inverse of printf */
992 			snprintf(buf, IFNAMSIZ, name, i);
993 			if (!strncmp(buf, d->name, IFNAMSIZ))
994 				set_bit(i, inuse);
995 		}
996 
997 		i = find_first_zero_bit(inuse, max_netdevices);
998 		free_page((unsigned long) inuse);
999 	}
1000 
1001 	if (buf != name)
1002 		snprintf(buf, IFNAMSIZ, name, i);
1003 	if (!__dev_get_by_name(net, buf))
1004 		return i;
1005 
1006 	/* It is possible to run out of possible slots
1007 	 * when the name is long and there isn't enough space left
1008 	 * for the digits, or if all bits are used.
1009 	 */
1010 	return -ENFILE;
1011 }
1012 
1013 /**
1014  *	dev_alloc_name - allocate a name for a device
1015  *	@dev: device
1016  *	@name: name format string
1017  *
1018  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
1019  *	id. It scans the list of devices to build up a free map, then chooses
1020  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1021  *	while allocating the name and adding the device in order to avoid
1022  *	duplicates.
1023  *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
1024  *	Returns the number of the unit assigned or a negative errno code.
1025  */
1026 
1027 int dev_alloc_name(struct net_device *dev, const char *name)
1028 {
1029 	char buf[IFNAMSIZ];
1030 	struct net *net;
1031 	int ret;
1032 
1033 	BUG_ON(!dev_net(dev));
1034 	net = dev_net(dev);
1035 	ret = __dev_alloc_name(net, name, buf);
1036 	if (ret >= 0)
1037 		strlcpy(dev->name, buf, IFNAMSIZ);
1038 	return ret;
1039 }
1040 EXPORT_SYMBOL(dev_alloc_name);
1041 
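/*
 * Illustrative note (not in the original file): passing the format "eth%d"
 * to dev_alloc_name() scans the existing devices, picks the lowest free
 * unit, writes e.g. "eth2" into dev->name and returns 2.
 */
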
1042 static int dev_alloc_name_ns(struct net *net,
1043 			     struct net_device *dev,
1044 			     const char *name)
1045 {
1046 	char buf[IFNAMSIZ];
1047 	int ret;
1048 
1049 	ret = __dev_alloc_name(net, name, buf);
1050 	if (ret >= 0)
1051 		strlcpy(dev->name, buf, IFNAMSIZ);
1052 	return ret;
1053 }
1054 
1055 static int dev_get_valid_name(struct net *net,
1056 			      struct net_device *dev,
1057 			      const char *name)
1058 {
1059 	BUG_ON(!net);
1060 
1061 	if (!dev_valid_name(name))
1062 		return -EINVAL;
1063 
1064 	if (strchr(name, '%'))
1065 		return dev_alloc_name_ns(net, dev, name);
1066 	else if (__dev_get_by_name(net, name))
1067 		return -EEXIST;
1068 	else if (dev->name != name)
1069 		strlcpy(dev->name, name, IFNAMSIZ);
1070 
1071 	return 0;
1072 }
1073 
1074 /**
1075  *	dev_change_name - change name of a device
1076  *	@dev: device
1077  *	@newname: name (or format string) must be at least IFNAMSIZ
1078  *
1079  *	Change the name of a device. A format string such as "eth%d"
1080  *	can be passed for wildcarding.
1081  */
1082 int dev_change_name(struct net_device *dev, const char *newname)
1083 {
1084 	char oldname[IFNAMSIZ];
1085 	int err = 0;
1086 	int ret;
1087 	struct net *net;
1088 
1089 	ASSERT_RTNL();
1090 	BUG_ON(!dev_net(dev));
1091 
1092 	net = dev_net(dev);
1093 	if (dev->flags & IFF_UP)
1094 		return -EBUSY;
1095 
1096 	write_seqcount_begin(&devnet_rename_seq);
1097 
1098 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1099 		write_seqcount_end(&devnet_rename_seq);
1100 		return 0;
1101 	}
1102 
1103 	memcpy(oldname, dev->name, IFNAMSIZ);
1104 
1105 	err = dev_get_valid_name(net, dev, newname);
1106 	if (err < 0) {
1107 		write_seqcount_end(&devnet_rename_seq);
1108 		return err;
1109 	}
1110 
1111 rollback:
1112 	ret = device_rename(&dev->dev, dev->name);
1113 	if (ret) {
1114 		memcpy(dev->name, oldname, IFNAMSIZ);
1115 		write_seqcount_end(&devnet_rename_seq);
1116 		return ret;
1117 	}
1118 
1119 	write_seqcount_end(&devnet_rename_seq);
1120 
1121 	write_lock_bh(&dev_base_lock);
1122 	hlist_del_rcu(&dev->name_hlist);
1123 	write_unlock_bh(&dev_base_lock);
1124 
1125 	synchronize_rcu();
1126 
1127 	write_lock_bh(&dev_base_lock);
1128 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1129 	write_unlock_bh(&dev_base_lock);
1130 
1131 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1132 	ret = notifier_to_errno(ret);
1133 
1134 	if (ret) {
1135 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1136 		if (err >= 0) {
1137 			err = ret;
1138 			write_seqcount_begin(&devnet_rename_seq);
1139 			memcpy(dev->name, oldname, IFNAMSIZ);
1140 			goto rollback;
1141 		} else {
1142 			pr_err("%s: name change rollback failed: %d\n",
1143 			       dev->name, ret);
1144 		}
1145 	}
1146 
1147 	return err;
1148 }
1149 
1150 /**
1151  *	dev_set_alias - change ifalias of a device
1152  *	@dev: device
1153  *	@alias: name up to IFALIASZ
1154  *	@len: limit of bytes to copy from @alias
1155  *
1156  *	Set ifalias for a device.
1157  */
1158 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1159 {
1160 	char *new_ifalias;
1161 
1162 	ASSERT_RTNL();
1163 
1164 	if (len >= IFALIASZ)
1165 		return -EINVAL;
1166 
1167 	if (!len) {
1168 		kfree(dev->ifalias);
1169 		dev->ifalias = NULL;
1170 		return 0;
1171 	}
1172 
1173 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1174 	if (!new_ifalias)
1175 		return -ENOMEM;
1176 	dev->ifalias = new_ifalias;
1177 
1178 	strlcpy(dev->ifalias, alias, len+1);
1179 	return len;
1180 }
1181 
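/*
 * Illustrative note (not in the original file): under rtnl_lock(), a call
 * such as dev_set_alias(dev, "uplink", 6) stores a NUL-terminated copy in
 * dev->ifalias and returns the stored length; a zero length frees any
 * existing alias.
 */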
1182 
1183 /**
1184  *	netdev_features_change - device changes features
1185  *	@dev: device to cause notification
1186  *
1187  *	Called to indicate a device has changed features.
1188  */
1189 void netdev_features_change(struct net_device *dev)
1190 {
1191 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1192 }
1193 EXPORT_SYMBOL(netdev_features_change);
1194 
1195 /**
1196  *	netdev_state_change - device changes state
1197  *	@dev: device to cause notification
1198  *
1199  *	Called to indicate a device has changed state. This function calls
1200  *	the notifier chains for netdev_chain and sends a NEWLINK message
1201  *	to the routing socket.
1202  */
1203 void netdev_state_change(struct net_device *dev)
1204 {
1205 	if (dev->flags & IFF_UP) {
1206 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1207 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1208 	}
1209 }
1210 EXPORT_SYMBOL(netdev_state_change);
1211 
1212 /**
1213  * 	netdev_notify_peers - notify network peers about existence of @dev
1214  * 	@dev: network device
1215  *
1216  * Generate traffic such that interested network peers are aware of
1217  * @dev, such as by generating a gratuitous ARP. This may be used when
1218  * a device wants to inform the rest of the network about some sort of
1219  * reconfiguration such as a failover event or virtual machine
1220  * migration.
1221  */
1222 void netdev_notify_peers(struct net_device *dev)
1223 {
1224 	rtnl_lock();
1225 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1226 	rtnl_unlock();
1227 }
1228 EXPORT_SYMBOL(netdev_notify_peers);
1229 
1230 static int __dev_open(struct net_device *dev)
1231 {
1232 	const struct net_device_ops *ops = dev->netdev_ops;
1233 	int ret;
1234 
1235 	ASSERT_RTNL();
1236 
1237 	if (!netif_device_present(dev))
1238 		return -ENODEV;
1239 
1240 	/* Block netpoll from trying to do any rx path servicing.
1241 	 * If we don't do this there is a chance ndo_poll_controller
1242 	 * or ndo_poll may be running while we open the device
1243 	 */
1244 	netpoll_rx_disable(dev);
1245 
1246 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1247 	ret = notifier_to_errno(ret);
1248 	if (ret)
1249 		return ret;
1250 
1251 	set_bit(__LINK_STATE_START, &dev->state);
1252 
1253 	if (ops->ndo_validate_addr)
1254 		ret = ops->ndo_validate_addr(dev);
1255 
1256 	if (!ret && ops->ndo_open)
1257 		ret = ops->ndo_open(dev);
1258 
1259 	netpoll_rx_enable(dev);
1260 
1261 	if (ret)
1262 		clear_bit(__LINK_STATE_START, &dev->state);
1263 	else {
1264 		dev->flags |= IFF_UP;
1265 		net_dmaengine_get();
1266 		dev_set_rx_mode(dev);
1267 		dev_activate(dev);
1268 		add_device_randomness(dev->dev_addr, dev->addr_len);
1269 	}
1270 
1271 	return ret;
1272 }
1273 
1274 /**
1275  *	dev_open	- prepare an interface for use.
1276  *	@dev:	device to open
1277  *
1278  *	Takes a device from down to up state. The device's private open
1279  *	function is invoked and then the multicast lists are loaded. Finally
1280  *	the device is moved into the up state and a %NETDEV_UP message is
1281  *	sent to the netdev notifier chain.
1282  *
1283  *	Calling this function on an active interface is a nop. On a failure
1284  *	a negative errno code is returned.
1285  */
1286 int dev_open(struct net_device *dev)
1287 {
1288 	int ret;
1289 
1290 	if (dev->flags & IFF_UP)
1291 		return 0;
1292 
1293 	ret = __dev_open(dev);
1294 	if (ret < 0)
1295 		return ret;
1296 
1297 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1298 	call_netdevice_notifiers(NETDEV_UP, dev);
1299 
1300 	return ret;
1301 }
1302 EXPORT_SYMBOL(dev_open);
1303 
1304 static int __dev_close_many(struct list_head *head)
1305 {
1306 	struct net_device *dev;
1307 
1308 	ASSERT_RTNL();
1309 	might_sleep();
1310 
1311 	list_for_each_entry(dev, head, close_list) {
1312 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1313 
1314 		clear_bit(__LINK_STATE_START, &dev->state);
1315 
1316 		/* Synchronize to the scheduled poll. We cannot touch the poll
1317 		 * list; it may even be on a different cpu. So just clear netif_running().
1318 		 *
1319 		 * dev->stop() will invoke napi_disable() on all of its
1320 		 * napi_struct instances on this device.
1321 		 */
1322 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1323 	}
1324 
1325 	dev_deactivate_many(head);
1326 
1327 	list_for_each_entry(dev, head, close_list) {
1328 		const struct net_device_ops *ops = dev->netdev_ops;
1329 
1330 		/*
1331 		 *	Call the device specific close. This cannot fail.
1332 		 *	Only if device is UP
1333 		 *
1334 		 *	We allow it to be called even after a DETACH hot-plug
1335 		 *	event.
1336 		 */
1337 		if (ops->ndo_stop)
1338 			ops->ndo_stop(dev);
1339 
1340 		dev->flags &= ~IFF_UP;
1341 		net_dmaengine_put();
1342 	}
1343 
1344 	return 0;
1345 }
1346 
1347 static int __dev_close(struct net_device *dev)
1348 {
1349 	int retval;
1350 	LIST_HEAD(single);
1351 
1352 	/* Temporarily disable netpoll until the interface is down */
1353 	netpoll_rx_disable(dev);
1354 
1355 	list_add(&dev->close_list, &single);
1356 	retval = __dev_close_many(&single);
1357 	list_del(&single);
1358 
1359 	netpoll_rx_enable(dev);
1360 	return retval;
1361 }
1362 
1363 static int dev_close_many(struct list_head *head)
1364 {
1365 	struct net_device *dev, *tmp;
1366 
1367 	/* Remove the devices that don't need to be closed */
1368 	list_for_each_entry_safe(dev, tmp, head, close_list)
1369 		if (!(dev->flags & IFF_UP))
1370 			list_del_init(&dev->close_list);
1371 
1372 	__dev_close_many(head);
1373 
1374 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1375 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1377 		list_del_init(&dev->close_list);
1378 	}
1379 
1380 	return 0;
1381 }
1382 
1383 /**
1384  *	dev_close - shutdown an interface.
1385  *	@dev: device to shutdown
1386  *
1387  *	This function moves an active device into down state. A
1388  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1389  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1390  *	chain.
1391  */
1392 int dev_close(struct net_device *dev)
1393 {
1394 	if (dev->flags & IFF_UP) {
1395 		LIST_HEAD(single);
1396 
1397 		/* Block netpoll rx while the interface is going down */
1398 		netpoll_rx_disable(dev);
1399 
1400 		list_add(&dev->close_list, &single);
1401 		dev_close_many(&single);
1402 		list_del(&single);
1403 
1404 		netpoll_rx_enable(dev);
1405 	}
1406 	return 0;
1407 }
1408 EXPORT_SYMBOL(dev_close);
1409 
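/*
 * Illustrative sketch (not part of the original file): dev_open() and
 * dev_close() both expect the rtnl semaphore to be held (see the
 * ASSERT_RTNL() checks above), so a hypothetical in-kernel caller
 * brackets them with rtnl_lock()/rtnl_unlock().
 */
static int example_bounce_device(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);
	if (!err)
		dev_close(dev);
	rtnl_unlock();

	return err;
}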
1410 
1411 /**
1412  *	dev_disable_lro - disable Large Receive Offload on a device
1413  *	@dev: device
1414  *
1415  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1416  *	called under RTNL.  This is needed if received packets may be
1417  *	forwarded to another interface.
1418  */
1419 void dev_disable_lro(struct net_device *dev)
1420 {
1421 	/*
1422 	 * If we're trying to disable lro on a vlan device
1423 	 * use the underlying physical device instead
1424 	 */
1425 	if (is_vlan_dev(dev))
1426 		dev = vlan_dev_real_dev(dev);
1427 
1428 	/* the same for macvlan devices */
1429 	if (netif_is_macvlan(dev))
1430 		dev = macvlan_dev_real_dev(dev);
1431 
1432 	dev->wanted_features &= ~NETIF_F_LRO;
1433 	netdev_update_features(dev);
1434 
1435 	if (unlikely(dev->features & NETIF_F_LRO))
1436 		netdev_WARN(dev, "failed to disable LRO!\n");
1437 }
1438 EXPORT_SYMBOL(dev_disable_lro);
1439 
1440 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1441 				   struct net_device *dev)
1442 {
1443 	struct netdev_notifier_info info;
1444 
1445 	netdev_notifier_info_init(&info, dev);
1446 	return nb->notifier_call(nb, val, &info);
1447 }
1448 
1449 static int dev_boot_phase = 1;
1450 
1451 /**
1452  *	register_netdevice_notifier - register a network notifier block
1453  *	@nb: notifier
1454  *
1455  *	Register a notifier to be called when network device events occur.
1456  *	The notifier passed is linked into the kernel structures and must
1457  *	not be reused until it has been unregistered. A negative errno code
1458  *	is returned on a failure.
1459  *
1460  * 	When registered, all registration and up events are replayed
1461  *	to the new notifier to allow the notifier to have a race-free
1462  *	view of the network device list.
1463  */
1464 
1465 int register_netdevice_notifier(struct notifier_block *nb)
1466 {
1467 	struct net_device *dev;
1468 	struct net_device *last;
1469 	struct net *net;
1470 	int err;
1471 
1472 	rtnl_lock();
1473 	err = raw_notifier_chain_register(&netdev_chain, nb);
1474 	if (err)
1475 		goto unlock;
1476 	if (dev_boot_phase)
1477 		goto unlock;
1478 	for_each_net(net) {
1479 		for_each_netdev(net, dev) {
1480 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1481 			err = notifier_to_errno(err);
1482 			if (err)
1483 				goto rollback;
1484 
1485 			if (!(dev->flags & IFF_UP))
1486 				continue;
1487 
1488 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1489 		}
1490 	}
1491 
1492 unlock:
1493 	rtnl_unlock();
1494 	return err;
1495 
1496 rollback:
1497 	last = dev;
1498 	for_each_net(net) {
1499 		for_each_netdev(net, dev) {
1500 			if (dev == last)
1501 				goto outroll;
1502 
1503 			if (dev->flags & IFF_UP) {
1504 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1505 							dev);
1506 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1507 			}
1508 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1509 		}
1510 	}
1511 
1512 outroll:
1513 	raw_notifier_chain_unregister(&netdev_chain, nb);
1514 	goto unlock;
1515 }
1516 EXPORT_SYMBOL(register_netdevice_notifier);
1517 
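/*
 * Illustrative sketch (not part of the original file): a minimal notifier
 * block as registered above.  The names are hypothetical;
 * netdev_notifier_info_to_dev() recovers the device from the info pointer
 * passed by call_netdevice_notifier().
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UP)
		pr_info("%s is up\n", dev->name);

	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_netdev_nb); */
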
1518 /**
1519  *	unregister_netdevice_notifier - unregister a network notifier block
1520  *	@nb: notifier
1521  *
1522  *	Unregister a notifier previously registered by
1523  *	register_netdevice_notifier(). The notifier is unlinked from the
1524  *	kernel structures and may then be reused. A negative errno code
1525  *	is returned on a failure.
1526  *
1527  * 	After unregistering, unregister and down device events are synthesized
1528  *	for all devices on the device list and sent to the removed notifier,
1529  *	removing the need for special case cleanup code.
1530  */
1531 
1532 int unregister_netdevice_notifier(struct notifier_block *nb)
1533 {
1534 	struct net_device *dev;
1535 	struct net *net;
1536 	int err;
1537 
1538 	rtnl_lock();
1539 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1540 	if (err)
1541 		goto unlock;
1542 
1543 	for_each_net(net) {
1544 		for_each_netdev(net, dev) {
1545 			if (dev->flags & IFF_UP) {
1546 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1547 							dev);
1548 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1549 			}
1550 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1551 		}
1552 	}
1553 unlock:
1554 	rtnl_unlock();
1555 	return err;
1556 }
1557 EXPORT_SYMBOL(unregister_netdevice_notifier);
1558 
1559 /**
1560  *	call_netdevice_notifiers_info - call all network notifier blocks
1561  *	@val: value passed unmodified to notifier function
1562  *	@dev: net_device pointer passed unmodified to notifier function
1563  *	@info: notifier information data
1564  *
1565  *	Call all network notifier blocks.  Parameters and return value
1566  *	are as for raw_notifier_call_chain().
1567  */
1568 
1569 int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1570 				  struct netdev_notifier_info *info)
1571 {
1572 	ASSERT_RTNL();
1573 	netdev_notifier_info_init(info, dev);
1574 	return raw_notifier_call_chain(&netdev_chain, val, info);
1575 }
1576 EXPORT_SYMBOL(call_netdevice_notifiers_info);
1577 
1578 /**
1579  *	call_netdevice_notifiers - call all network notifier blocks
1580  *      @val: value passed unmodified to notifier function
1581  *      @dev: net_device pointer passed unmodified to notifier function
1582  *
1583  *	Call all network notifier blocks.  Parameters and return value
1584  *	are as for raw_notifier_call_chain().
1585  */
1586 
1587 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1588 {
1589 	struct netdev_notifier_info info;
1590 
1591 	return call_netdevice_notifiers_info(val, dev, &info);
1592 }
1593 EXPORT_SYMBOL(call_netdevice_notifiers);
1594 
1595 static struct static_key netstamp_needed __read_mostly;
1596 #ifdef HAVE_JUMP_LABEL
1597 /* We are not allowed to call static_key_slow_dec() from irq context.
1598  * If net_disable_timestamp() is called from irq context, defer the
1599  * static_key_slow_dec() calls.
1600  */
1601 static atomic_t netstamp_needed_deferred;
1602 #endif
1603 
1604 void net_enable_timestamp(void)
1605 {
1606 #ifdef HAVE_JUMP_LABEL
1607 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1608 
1609 	if (deferred) {
1610 		while (--deferred)
1611 			static_key_slow_dec(&netstamp_needed);
1612 		return;
1613 	}
1614 #endif
1615 	static_key_slow_inc(&netstamp_needed);
1616 }
1617 EXPORT_SYMBOL(net_enable_timestamp);
1618 
1619 void net_disable_timestamp(void)
1620 {
1621 #ifdef HAVE_JUMP_LABEL
1622 	if (in_interrupt()) {
1623 		atomic_inc(&netstamp_needed_deferred);
1624 		return;
1625 	}
1626 #endif
1627 	static_key_slow_dec(&netstamp_needed);
1628 }
1629 EXPORT_SYMBOL(net_disable_timestamp);
1630 
1631 static inline void net_timestamp_set(struct sk_buff *skb)
1632 {
1633 	skb->tstamp.tv64 = 0;
1634 	if (static_key_false(&netstamp_needed))
1635 		__net_timestamp(skb);
1636 }
1637 
1638 #define net_timestamp_check(COND, SKB)			\
1639 	if (static_key_false(&netstamp_needed)) {		\
1640 		if ((COND) && !(SKB)->tstamp.tv64)	\
1641 			__net_timestamp(SKB);		\
1642 	}						\
1643 
1644 static inline bool is_skb_forwardable(struct net_device *dev,
1645 				      struct sk_buff *skb)
1646 {
1647 	unsigned int len;
1648 
1649 	if (!(dev->flags & IFF_UP))
1650 		return false;
1651 
1652 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1653 	if (skb->len <= len)
1654 		return true;
1655 
1656 	/* if TSO is enabled, we don't care about the length as the packet
1657 	 * could be forwarded without having been segmented beforehand
1658 	 */
1659 	if (skb_is_gso(skb))
1660 		return true;
1661 
1662 	return false;
1663 }
1664 
1665 /**
1666  * dev_forward_skb - loopback an skb to another netif
1667  *
1668  * @dev: destination network device
1669  * @skb: buffer to forward
1670  *
1671  * return values:
1672  *	NET_RX_SUCCESS	(no congestion)
1673  *	NET_RX_DROP     (packet was dropped, but freed)
1674  *
1675  * dev_forward_skb can be used for injecting an skb from the
1676  * start_xmit function of one device into the receive queue
1677  * of another device.
1678  *
1679  * The receiving device may be in another namespace, so
1680  * we have to clear all information in the skb that could
1681  * impact namespace isolation.
1682  */
1683 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1684 {
1685 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1686 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1687 			atomic_long_inc(&dev->rx_dropped);
1688 			kfree_skb(skb);
1689 			return NET_RX_DROP;
1690 		}
1691 	}
1692 
1693 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1694 		atomic_long_inc(&dev->rx_dropped);
1695 		kfree_skb(skb);
1696 		return NET_RX_DROP;
1697 	}
1698 
1699 	skb_scrub_packet(skb, true);
1700 	skb->protocol = eth_type_trans(skb, dev);
1701 
1702 	return netif_rx(skb);
1703 }
1704 EXPORT_SYMBOL_GPL(dev_forward_skb);
1705 
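/*
 * Illustrative sketch (not part of the original file): how a hypothetical
 * pair-style virtual driver could hand frames from its own start_xmit to
 * a peer device with dev_forward_skb().  Using ml_priv to store the peer
 * pointer is an assumption made only for this example.
 */
static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	struct net_device *peer = dev->ml_priv;

	dev_forward_skb(peer, skb);
	return NETDEV_TX_OK;
}
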
1706 static inline int deliver_skb(struct sk_buff *skb,
1707 			      struct packet_type *pt_prev,
1708 			      struct net_device *orig_dev)
1709 {
1710 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1711 		return -ENOMEM;
1712 	atomic_inc(&skb->users);
1713 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1714 }
1715 
1716 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1717 {
1718 	if (!ptype->af_packet_priv || !skb->sk)
1719 		return false;
1720 
1721 	if (ptype->id_match)
1722 		return ptype->id_match(ptype, skb->sk);
1723 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1724 		return true;
1725 
1726 	return false;
1727 }
1728 
1729 /*
1730  *	Support routine. Sends outgoing frames to any network
1731  *	taps currently in use.
1732  */
1733 
1734 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1735 {
1736 	struct packet_type *ptype;
1737 	struct sk_buff *skb2 = NULL;
1738 	struct packet_type *pt_prev = NULL;
1739 
1740 	rcu_read_lock();
1741 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1742 		/* Never send packets back to the socket
1743 		 * they originated from - MvS (miquels@drinkel.ow.org)
1744 		 */
1745 		if ((ptype->dev == dev || !ptype->dev) &&
1746 		    (!skb_loop_sk(ptype, skb))) {
1747 			if (pt_prev) {
1748 				deliver_skb(skb2, pt_prev, skb->dev);
1749 				pt_prev = ptype;
1750 				continue;
1751 			}
1752 
1753 			skb2 = skb_clone(skb, GFP_ATOMIC);
1754 			if (!skb2)
1755 				break;
1756 
1757 			net_timestamp_set(skb2);
1758 
1759 			/* The network header should be correctly
1760 			   set by the sender, so the check below is
1761 			   just protection against buggy protocols.
1762 			 */
1763 			skb_reset_mac_header(skb2);
1764 
1765 			if (skb_network_header(skb2) < skb2->data ||
1766 			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1767 				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1768 						     ntohs(skb2->protocol),
1769 						     dev->name);
1770 				skb_reset_network_header(skb2);
1771 			}
1772 
1773 			skb2->transport_header = skb2->network_header;
1774 			skb2->pkt_type = PACKET_OUTGOING;
1775 			pt_prev = ptype;
1776 		}
1777 	}
1778 	if (pt_prev)
1779 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1780 	rcu_read_unlock();
1781 }
1782 
1783 /**
1784  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1785  * @dev: Network device
1786  * @txq: number of queues available
1787  *
1788  * If real_num_tx_queues is changed the tc mappings may no longer be
1789  * valid. To resolve this, verify that each tc mapping remains valid and,
1790  * if not, zero the mapping. With no priorities mapping to an
1791  * offset/count pair, that pair will no longer be used. In the worst case,
1792  * when TC0 is invalid, nothing can be done, so priority mappings are
1793  * disabled entirely. It is expected that drivers will fix this mapping
1794  * if they can before calling netif_set_real_num_tx_queues.
1795  */
1796 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1797 {
1798 	int i;
1799 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1800 
1801 	/* If TC0 is invalidated disable TC mapping */
1802 	if (tc->offset + tc->count > txq) {
1803 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1804 		dev->num_tc = 0;
1805 		return;
1806 	}
1807 
1808 	/* Invalidated prio to tc mappings set to TC0 */
1809 	for (i = 1; i < TC_BITMASK + 1; i++) {
1810 		int q = netdev_get_prio_tc_map(dev, i);
1811 
1812 		tc = &dev->tc_to_txq[q];
1813 		if (tc->offset + tc->count > txq) {
1814 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1815 				i, q);
1816 			netdev_set_prio_tc_map(dev, i, 0);
1817 		}
1818 	}
1819 }
1820 
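/*
 * Illustrative note (not in the original file): if TC0 had been set up to
 * cover queues 0-3 (offset 0, count 4) and real_num_tx_queues then shrinks
 * to 2, the check above sees offset + count > txq and disables priority
 * mappings entirely.
 */
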
1821 #ifdef CONFIG_XPS
1822 static DEFINE_MUTEX(xps_map_mutex);
1823 #define xmap_dereference(P)		\
1824 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1825 
1826 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1827 					int cpu, u16 index)
1828 {
1829 	struct xps_map *map = NULL;
1830 	int pos;
1831 
1832 	if (dev_maps)
1833 		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1834 
1835 	for (pos = 0; map && pos < map->len; pos++) {
1836 		if (map->queues[pos] == index) {
1837 			if (map->len > 1) {
1838 				map->queues[pos] = map->queues[--map->len];
1839 			} else {
1840 				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1841 				kfree_rcu(map, rcu);
1842 				map = NULL;
1843 			}
1844 			break;
1845 		}
1846 	}
1847 
1848 	return map;
1849 }
1850 
1851 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1852 {
1853 	struct xps_dev_maps *dev_maps;
1854 	int cpu, i;
1855 	bool active = false;
1856 
1857 	mutex_lock(&xps_map_mutex);
1858 	dev_maps = xmap_dereference(dev->xps_maps);
1859 
1860 	if (!dev_maps)
1861 		goto out_no_maps;
1862 
1863 	for_each_possible_cpu(cpu) {
1864 		for (i = index; i < dev->num_tx_queues; i++) {
1865 			if (!remove_xps_queue(dev_maps, cpu, i))
1866 				break;
1867 		}
1868 		if (i == dev->num_tx_queues)
1869 			active = true;
1870 	}
1871 
1872 	if (!active) {
1873 		RCU_INIT_POINTER(dev->xps_maps, NULL);
1874 		kfree_rcu(dev_maps, rcu);
1875 	}
1876 
1877 	for (i = index; i < dev->num_tx_queues; i++)
1878 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1879 					     NUMA_NO_NODE);
1880 
1881 out_no_maps:
1882 	mutex_unlock(&xps_map_mutex);
1883 }
1884 
1885 static struct xps_map *expand_xps_map(struct xps_map *map,
1886 				      int cpu, u16 index)
1887 {
1888 	struct xps_map *new_map;
1889 	int alloc_len = XPS_MIN_MAP_ALLOC;
1890 	int i, pos;
1891 
1892 	for (pos = 0; map && pos < map->len; pos++) {
1893 		if (map->queues[pos] != index)
1894 			continue;
1895 		return map;
1896 	}
1897 
1898 	/* Need to add queue to this CPU's existing map */
1899 	if (map) {
1900 		if (pos < map->alloc_len)
1901 			return map;
1902 
1903 		alloc_len = map->alloc_len * 2;
1904 	}
1905 
1906 	/* Need to allocate new map to store queue on this CPU's map */
1907 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1908 			       cpu_to_node(cpu));
1909 	if (!new_map)
1910 		return NULL;
1911 
1912 	for (i = 0; i < pos; i++)
1913 		new_map->queues[i] = map->queues[i];
1914 	new_map->alloc_len = alloc_len;
1915 	new_map->len = pos;
1916 
1917 	return new_map;
1918 }
1919 
1920 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1921 			u16 index)
1922 {
1923 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1924 	struct xps_map *map, *new_map;
1925 	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1926 	int cpu, numa_node_id = -2;
1927 	bool active = false;
1928 
1929 	mutex_lock(&xps_map_mutex);
1930 
1931 	dev_maps = xmap_dereference(dev->xps_maps);
1932 
1933 	/* allocate memory for queue storage */
1934 	for_each_online_cpu(cpu) {
1935 		if (!cpumask_test_cpu(cpu, mask))
1936 			continue;
1937 
1938 		if (!new_dev_maps)
1939 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1940 		if (!new_dev_maps) {
1941 			mutex_unlock(&xps_map_mutex);
1942 			return -ENOMEM;
1943 		}
1944 
1945 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1946 				 NULL;
1947 
1948 		map = expand_xps_map(map, cpu, index);
1949 		if (!map)
1950 			goto error;
1951 
1952 		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1953 	}
1954 
1955 	if (!new_dev_maps)
1956 		goto out_no_new_maps;
1957 
1958 	for_each_possible_cpu(cpu) {
1959 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1960 			/* add queue to CPU maps */
1961 			int pos = 0;
1962 
1963 			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1964 			while ((pos < map->len) && (map->queues[pos] != index))
1965 				pos++;
1966 
1967 			if (pos == map->len)
1968 				map->queues[map->len++] = index;
1969 #ifdef CONFIG_NUMA
1970 			if (numa_node_id == -2)
1971 				numa_node_id = cpu_to_node(cpu);
1972 			else if (numa_node_id != cpu_to_node(cpu))
1973 				numa_node_id = -1;
1974 #endif
1975 		} else if (dev_maps) {
1976 			/* fill in the new device map from the old device map */
1977 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
1978 			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1979 		}
1980 
1981 	}
1982 
1983 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1984 
1985 	/* Cleanup old maps */
1986 	if (dev_maps) {
1987 		for_each_possible_cpu(cpu) {
1988 			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1989 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
1990 			if (map && map != new_map)
1991 				kfree_rcu(map, rcu);
1992 		}
1993 
1994 		kfree_rcu(dev_maps, rcu);
1995 	}
1996 
1997 	dev_maps = new_dev_maps;
1998 	active = true;
1999 
2000 out_no_new_maps:
2001 	/* update Tx queue numa node */
2002 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2003 				     (numa_node_id >= 0) ? numa_node_id :
2004 				     NUMA_NO_NODE);
2005 
2006 	if (!dev_maps)
2007 		goto out_no_maps;
2008 
2009 	/* removes queue from unused CPUs */
2010 	for_each_possible_cpu(cpu) {
2011 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2012 			continue;
2013 
2014 		if (remove_xps_queue(dev_maps, cpu, index))
2015 			active = true;
2016 	}
2017 
2018 	/* free map if not active */
2019 	if (!active) {
2020 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2021 		kfree_rcu(dev_maps, rcu);
2022 	}
2023 
2024 out_no_maps:
2025 	mutex_unlock(&xps_map_mutex);
2026 
2027 	return 0;
2028 error:
2029 	/* remove any maps that we added */
2030 	for_each_possible_cpu(cpu) {
2031 		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2032 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2033 				 NULL;
2034 		if (new_map && new_map != map)
2035 			kfree(new_map);
2036 	}
2037 
2038 	mutex_unlock(&xps_map_mutex);
2039 
2040 	kfree(new_dev_maps);
2041 	return -ENOMEM;
2042 }
2043 EXPORT_SYMBOL(netif_set_xps_queue);
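
/*
 * Usage sketch (illustrative only, not part of this file): a multiqueue
 * driver spreading its TX queues across online CPUs once its queue count
 * is known. "my_dev" is a hypothetical struct net_device pointer; error
 * handling is omitted for brevity.
 *
 *	unsigned int cpu;
 *	u16 qid = 0;
 *
 *	for_each_online_cpu(cpu) {
 *		if (qid >= my_dev->real_num_tx_queues)
 *			break;
 *		netif_set_xps_queue(my_dev, cpumask_of(cpu), qid++);
 *	}
 */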
2044 
2045 #endif
2046 /*
2047  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2048  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2049  */
2050 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2051 {
2052 	int rc;
2053 
2054 	if (txq < 1 || txq > dev->num_tx_queues)
2055 		return -EINVAL;
2056 
2057 	if (dev->reg_state == NETREG_REGISTERED ||
2058 	    dev->reg_state == NETREG_UNREGISTERING) {
2059 		ASSERT_RTNL();
2060 
2061 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2062 						  txq);
2063 		if (rc)
2064 			return rc;
2065 
2066 		if (dev->num_tc)
2067 			netif_setup_tc(dev, txq);
2068 
2069 		if (txq < dev->real_num_tx_queues) {
2070 			qdisc_reset_all_tx_gt(dev, txq);
2071 #ifdef CONFIG_XPS
2072 			netif_reset_xps_queues_gt(dev, txq);
2073 #endif
2074 		}
2075 	}
2076 
2077 	dev->real_num_tx_queues = txq;
2078 	return 0;
2079 }
2080 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
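
/*
 * Usage sketch (illustrative only): resizing the active TX queue set, for
 * example from an ethtool ->set_channels() handler where RTNL is already
 * held. "my_dev" and "new_txq" are hypothetical.
 *
 *	int err;
 *
 *	ASSERT_RTNL();
 *	err = netif_set_real_num_tx_queues(my_dev, new_txq);
 *	if (err)
 *		return err;
 */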
2081 
2082 #ifdef CONFIG_RPS
2083 /**
2084  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2085  *	@dev: Network device
2086  *	@rxq: Actual number of RX queues
2087  *
2088  *	This must be called either with the rtnl_lock held or before
2089  *	registration of the net device.  Returns 0 on success, or a
2090  *	negative error code.  If called before registration, it always
2091  *	succeeds.
2092  */
2093 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2094 {
2095 	int rc;
2096 
2097 	if (rxq < 1 || rxq > dev->num_rx_queues)
2098 		return -EINVAL;
2099 
2100 	if (dev->reg_state == NETREG_REGISTERED) {
2101 		ASSERT_RTNL();
2102 
2103 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2104 						  rxq);
2105 		if (rc)
2106 			return rc;
2107 	}
2108 
2109 	dev->real_num_rx_queues = rxq;
2110 	return 0;
2111 }
2112 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2113 #endif
2114 
2115 /**
2116  * netif_get_num_default_rss_queues - default number of RSS queues
2117  *
2118  * This routine should set an upper limit on the number of RSS queues
2119  * used by default by multiqueue devices.
2120  */
2121 int netif_get_num_default_rss_queues(void)
2122 {
2123 	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2124 }
2125 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
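
/*
 * Usage sketch (illustrative only): a driver choosing its queue count at
 * probe time, capped by both its hardware limit and the default RSS
 * recommendation above. "MY_HW_MAX_QUEUES" and "my_dev" are hypothetical.
 *
 *	unsigned int nq = min_t(unsigned int, MY_HW_MAX_QUEUES,
 *				netif_get_num_default_rss_queues());
 *	int err;
 *
 *	err = netif_set_real_num_tx_queues(my_dev, nq);
 *	if (!err)
 *		err = netif_set_real_num_rx_queues(my_dev, nq);
 */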
2126 
2127 static inline void __netif_reschedule(struct Qdisc *q)
2128 {
2129 	struct softnet_data *sd;
2130 	unsigned long flags;
2131 
2132 	local_irq_save(flags);
2133 	sd = &__get_cpu_var(softnet_data);
2134 	q->next_sched = NULL;
2135 	*sd->output_queue_tailp = q;
2136 	sd->output_queue_tailp = &q->next_sched;
2137 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2138 	local_irq_restore(flags);
2139 }
2140 
2141 void __netif_schedule(struct Qdisc *q)
2142 {
2143 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2144 		__netif_reschedule(q);
2145 }
2146 EXPORT_SYMBOL(__netif_schedule);
2147 
2148 struct dev_kfree_skb_cb {
2149 	enum skb_free_reason reason;
2150 };
2151 
2152 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2153 {
2154 	return (struct dev_kfree_skb_cb *)skb->cb;
2155 }
2156 
2157 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2158 {
2159 	unsigned long flags;
2160 
2161 	if (likely(atomic_read(&skb->users) == 1)) {
2162 		smp_rmb();
2163 		atomic_set(&skb->users, 0);
2164 	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2165 		return;
2166 	}
2167 	get_kfree_skb_cb(skb)->reason = reason;
2168 	local_irq_save(flags);
2169 	skb->next = __this_cpu_read(softnet_data.completion_queue);
2170 	__this_cpu_write(softnet_data.completion_queue, skb);
2171 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2172 	local_irq_restore(flags);
2173 }
2174 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2175 
2176 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2177 {
2178 	if (in_irq() || irqs_disabled())
2179 		__dev_kfree_skb_irq(skb, reason);
2180 	else
2181 		dev_kfree_skb(skb);
2182 }
2183 EXPORT_SYMBOL(__dev_kfree_skb_any);
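
/*
 * Usage sketch (illustrative only): a TX completion handler that may run
 * in hard-IRQ or process context frees buffers through the context-safe
 * helper above, keeping drop and consume accounting distinct. "skb" and
 * "tx_error" come from the hypothetical surrounding completion loop.
 *
 *	if (unlikely(tx_error))
 *		__dev_kfree_skb_any(skb, SKB_REASON_DROPPED);
 *	else
 *		__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
 */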
2184 
2185 
2186 /**
2187  * netif_device_detach - mark device as removed
2188  * @dev: network device
2189  *
2190  * Mark device as removed from system and therefore no longer available.
2191  */
2192 void netif_device_detach(struct net_device *dev)
2193 {
2194 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2195 	    netif_running(dev)) {
2196 		netif_tx_stop_all_queues(dev);
2197 	}
2198 }
2199 EXPORT_SYMBOL(netif_device_detach);
2200 
2201 /**
2202  * netif_device_attach - mark device as attached
2203  * @dev: network device
2204  *
2205  * Mark the device as attached to the system and restart it if needed.
2206  */
2207 void netif_device_attach(struct net_device *dev)
2208 {
2209 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2210 	    netif_running(dev)) {
2211 		netif_tx_wake_all_queues(dev);
2212 		__netdev_watchdog_up(dev);
2213 	}
2214 }
2215 EXPORT_SYMBOL(netif_device_attach);
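
/*
 * Usage sketch (illustrative only): typical suspend/resume pairing in a
 * driver; hardware quiescing and state restore are omitted. "my_suspend",
 * "my_resume" and the drvdata layout are hypothetical.
 *
 *	static int my_suspend(struct device *d)
 *	{
 *		struct net_device *ndev = dev_get_drvdata(d);
 *
 *		netif_device_detach(ndev);
 *		return 0;
 *	}
 *
 *	static int my_resume(struct device *d)
 *	{
 *		struct net_device *ndev = dev_get_drvdata(d);
 *
 *		netif_device_attach(ndev);
 *		return 0;
 *	}
 */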
2216 
2217 static void skb_warn_bad_offload(const struct sk_buff *skb)
2218 {
2219 	static const netdev_features_t null_features = 0;
2220 	struct net_device *dev = skb->dev;
2221 	const char *driver = "";
2222 
2223 	if (!net_ratelimit())
2224 		return;
2225 
2226 	if (dev && dev->dev.parent)
2227 		driver = dev_driver_string(dev->dev.parent);
2228 
2229 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2230 	     "gso_type=%d ip_summed=%d\n",
2231 	     driver, dev ? &dev->features : &null_features,
2232 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2233 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2234 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2235 }
2236 
2237 /*
2238  * Invalidate hardware checksum when packet is to be mangled, and
2239  * complete checksum manually on outgoing path.
2240  */
2241 int skb_checksum_help(struct sk_buff *skb)
2242 {
2243 	__wsum csum;
2244 	int ret = 0, offset;
2245 
2246 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2247 		goto out_set_summed;
2248 
2249 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2250 		skb_warn_bad_offload(skb);
2251 		return -EINVAL;
2252 	}
2253 
2254 	/* Before computing a checksum, we should make sure no frag could
2255 	 * be modified by an external entity: otherwise the checksum could be wrong.
2256 	 */
2257 	if (skb_has_shared_frag(skb)) {
2258 		ret = __skb_linearize(skb);
2259 		if (ret)
2260 			goto out;
2261 	}
2262 
2263 	offset = skb_checksum_start_offset(skb);
2264 	BUG_ON(offset >= skb_headlen(skb));
2265 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2266 
2267 	offset += skb->csum_offset;
2268 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2269 
2270 	if (skb_cloned(skb) &&
2271 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2272 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2273 		if (ret)
2274 			goto out;
2275 	}
2276 
2277 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2278 out_set_summed:
2279 	skb->ip_summed = CHECKSUM_NONE;
2280 out:
2281 	return ret;
2282 }
2283 EXPORT_SYMBOL(skb_checksum_help);
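
/*
 * Usage sketch (illustrative only): an ndo_start_xmit() path falling back
 * to a software checksum when the hardware cannot offload this protocol.
 * "my_hw_can_csum" and the enclosing function are hypothetical.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !my_hw_can_csum(skb) && skb_checksum_help(skb)) {
 *		dev_kfree_skb_any(skb);
 *		return NETDEV_TX_OK;
 *	}
 */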
2284 
2285 __be16 skb_network_protocol(struct sk_buff *skb)
2286 {
2287 	__be16 type = skb->protocol;
2288 	int vlan_depth = ETH_HLEN;
2289 
2290 	/* Tunnel gso handlers can set protocol to ethernet. */
2291 	if (type == htons(ETH_P_TEB)) {
2292 		struct ethhdr *eth;
2293 
2294 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2295 			return 0;
2296 
2297 		eth = (struct ethhdr *)skb_mac_header(skb);
2298 		type = eth->h_proto;
2299 	}
2300 
2301 	while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2302 		struct vlan_hdr *vh;
2303 
2304 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2305 			return 0;
2306 
2307 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2308 		type = vh->h_vlan_encapsulated_proto;
2309 		vlan_depth += VLAN_HLEN;
2310 	}
2311 
2312 	return type;
2313 }
2314 
2315 /**
2316  *	skb_mac_gso_segment - mac layer segmentation handler.
2317  *	@skb: buffer to segment
2318  *	@features: features for the output path (see dev->features)
2319  */
2320 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2321 				    netdev_features_t features)
2322 {
2323 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2324 	struct packet_offload *ptype;
2325 	__be16 type = skb_network_protocol(skb);
2326 
2327 	if (unlikely(!type))
2328 		return ERR_PTR(-EINVAL);
2329 
2330 	__skb_pull(skb, skb->mac_len);
2331 
2332 	rcu_read_lock();
2333 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2334 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2335 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2336 				int err;
2337 
2338 				err = ptype->callbacks.gso_send_check(skb);
2339 				segs = ERR_PTR(err);
2340 				if (err || skb_gso_ok(skb, features))
2341 					break;
2342 				__skb_push(skb, (skb->data -
2343 						 skb_network_header(skb)));
2344 			}
2345 			segs = ptype->callbacks.gso_segment(skb, features);
2346 			break;
2347 		}
2348 	}
2349 	rcu_read_unlock();
2350 
2351 	__skb_push(skb, skb->data - skb_mac_header(skb));
2352 
2353 	return segs;
2354 }
2355 EXPORT_SYMBOL(skb_mac_gso_segment);
2356 
2357 
2358 /* openvswitch calls this on rx path, so we need a different check.
2359  */
2360 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2361 {
2362 	if (tx_path)
2363 		return skb->ip_summed != CHECKSUM_PARTIAL;
2364 	else
2365 		return skb->ip_summed == CHECKSUM_NONE;
2366 }
2367 
2368 /**
2369  *	__skb_gso_segment - Perform segmentation on skb.
2370  *	@skb: buffer to segment
2371  *	@features: features for the output path (see dev->features)
2372  *	@tx_path: whether it is called in TX path
2373  *
2374  *	This function segments the given skb and returns a list of segments.
2375  *
2376  *	It may return NULL if the skb requires no segmentation.  This is
2377  *	only possible when GSO is used for verifying header integrity.
2378  */
2379 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2380 				  netdev_features_t features, bool tx_path)
2381 {
2382 	if (unlikely(skb_needs_check(skb, tx_path))) {
2383 		int err;
2384 
2385 		skb_warn_bad_offload(skb);
2386 
2387 		if (skb_header_cloned(skb) &&
2388 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2389 			return ERR_PTR(err);
2390 	}
2391 
2392 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2393 	SKB_GSO_CB(skb)->encap_level = 0;
2394 
2395 	skb_reset_mac_header(skb);
2396 	skb_reset_mac_len(skb);
2397 
2398 	return skb_mac_gso_segment(skb, features);
2399 }
2400 EXPORT_SYMBOL(__skb_gso_segment);
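
/*
 * Usage sketch (illustrative only): software segmentation of a GSO skb via
 * the skb_gso_segment() wrapper when the output path cannot handle GSO,
 * then transmitting each resulting segment. "skb", "features", the "drop"
 * label and "my_xmit_one" are hypothetical context.
 *
 *	struct sk_buff *segs, *nskb;
 *
 *	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 *	if (IS_ERR(segs))
 *		goto drop;
 *	if (segs) {
 *		consume_skb(skb);
 *		skb = segs;
 *	}
 *	while (skb) {
 *		nskb = skb;
 *		skb = skb->next;
 *		nskb->next = NULL;
 *		my_xmit_one(nskb);
 *	}
 */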
2401 
2402 /* Take action when hardware reception checksum errors are detected. */
2403 #ifdef CONFIG_BUG
2404 void netdev_rx_csum_fault(struct net_device *dev)
2405 {
2406 	if (net_ratelimit()) {
2407 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2408 		dump_stack();
2409 	}
2410 }
2411 EXPORT_SYMBOL(netdev_rx_csum_fault);
2412 #endif
2413 
2414 /* Actually, we should eliminate this check as soon as we know that:
2415  * 1. An IOMMU is present and can map all of the memory.
2416  * 2. No high memory really exists on this machine.
2417  */
2418 
2419 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2420 {
2421 #ifdef CONFIG_HIGHMEM
2422 	int i;
2423 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2424 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2425 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2426 			if (PageHighMem(skb_frag_page(frag)))
2427 				return 1;
2428 		}
2429 	}
2430 
2431 	if (PCI_DMA_BUS_IS_PHYS) {
2432 		struct device *pdev = dev->dev.parent;
2433 
2434 		if (!pdev)
2435 			return 0;
2436 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2437 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2438 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2439 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2440 				return 1;
2441 		}
2442 	}
2443 #endif
2444 	return 0;
2445 }
2446 
2447 struct dev_gso_cb {
2448 	void (*destructor)(struct sk_buff *skb);
2449 };
2450 
2451 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2452 
2453 static void dev_gso_skb_destructor(struct sk_buff *skb)
2454 {
2455 	struct dev_gso_cb *cb;
2456 
2457 	do {
2458 		struct sk_buff *nskb = skb->next;
2459 
2460 		skb->next = nskb->next;
2461 		nskb->next = NULL;
2462 		kfree_skb(nskb);
2463 	} while (skb->next);
2464 
2465 	cb = DEV_GSO_CB(skb);
2466 	if (cb->destructor)
2467 		cb->destructor(skb);
2468 }
2469 
2470 /**
2471  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2472  *	@skb: buffer to segment
2473  *	@features: device features as applicable to this skb
2474  *
2475  *	This function segments the given skb and stores the list of segments
2476  *	in skb->next.
2477  */
2478 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2479 {
2480 	struct sk_buff *segs;
2481 
2482 	segs = skb_gso_segment(skb, features);
2483 
2484 	/* Verifying header integrity only. */
2485 	if (!segs)
2486 		return 0;
2487 
2488 	if (IS_ERR(segs))
2489 		return PTR_ERR(segs);
2490 
2491 	skb->next = segs;
2492 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2493 	skb->destructor = dev_gso_skb_destructor;
2494 
2495 	return 0;
2496 }
2497 
2498 static netdev_features_t harmonize_features(struct sk_buff *skb,
2499 	netdev_features_t features)
2500 {
2501 	if (skb->ip_summed != CHECKSUM_NONE &&
2502 	    !can_checksum_protocol(features, skb_network_protocol(skb))) {
2503 		features &= ~NETIF_F_ALL_CSUM;
2504 	} else if (illegal_highdma(skb->dev, skb)) {
2505 		features &= ~NETIF_F_SG;
2506 	}
2507 
2508 	return features;
2509 }
2510 
2511 netdev_features_t netif_skb_features(struct sk_buff *skb)
2512 {
2513 	__be16 protocol = skb->protocol;
2514 	netdev_features_t features = skb->dev->features;
2515 
2516 	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2517 		features &= ~NETIF_F_GSO_MASK;
2518 
2519 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2520 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2521 		protocol = veh->h_vlan_encapsulated_proto;
2522 	} else if (!vlan_tx_tag_present(skb)) {
2523 		return harmonize_features(skb, features);
2524 	}
2525 
2526 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2527 					       NETIF_F_HW_VLAN_STAG_TX);
2528 
2529 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2530 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2531 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2532 				NETIF_F_HW_VLAN_STAG_TX;
2533 
2534 	return harmonize_features(skb, features);
2535 }
2536 EXPORT_SYMBOL(netif_skb_features);
2537 
2538 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2539 			struct netdev_queue *txq, void *accel_priv)
2540 {
2541 	const struct net_device_ops *ops = dev->netdev_ops;
2542 	int rc = NETDEV_TX_OK;
2543 	unsigned int skb_len;
2544 
2545 	if (likely(!skb->next)) {
2546 		netdev_features_t features;
2547 
2548 		/*
2549 		 * If the device doesn't need skb->dst, release it right now while
2550 		 * it's still hot in this CPU's cache.
2551 		 */
2552 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2553 			skb_dst_drop(skb);
2554 
2555 		features = netif_skb_features(skb);
2556 
2557 		if (vlan_tx_tag_present(skb) &&
2558 		    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2559 			skb = __vlan_put_tag(skb, skb->vlan_proto,
2560 					     vlan_tx_tag_get(skb));
2561 			if (unlikely(!skb))
2562 				goto out;
2563 
2564 			skb->vlan_tci = 0;
2565 		}
2566 
2567 		/* If this is an encapsulation offload request, verify we are
2568 		 * testing hardware encapsulation features instead of the
2569 		 * standard features for the netdev.
2570 		 */
2571 		if (skb->encapsulation)
2572 			features &= dev->hw_enc_features;
2573 
2574 		if (netif_needs_gso(skb, features)) {
2575 			if (unlikely(dev_gso_segment(skb, features)))
2576 				goto out_kfree_skb;
2577 			if (skb->next)
2578 				goto gso;
2579 		} else {
2580 			if (skb_needs_linearize(skb, features) &&
2581 			    __skb_linearize(skb))
2582 				goto out_kfree_skb;
2583 
2584 			/* If packet is not checksummed and device does not
2585 			 * support checksumming for this protocol, complete
2586 			 * checksumming here.
2587 			 */
2588 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2589 				if (skb->encapsulation)
2590 					skb_set_inner_transport_header(skb,
2591 						skb_checksum_start_offset(skb));
2592 				else
2593 					skb_set_transport_header(skb,
2594 						skb_checksum_start_offset(skb));
2595 				if (!(features & NETIF_F_ALL_CSUM) &&
2596 				     skb_checksum_help(skb))
2597 					goto out_kfree_skb;
2598 			}
2599 		}
2600 
2601 		if (!list_empty(&ptype_all))
2602 			dev_queue_xmit_nit(skb, dev);
2603 
2604 		skb_len = skb->len;
2605 		if (accel_priv)
2606 			rc = ops->ndo_dfwd_start_xmit(skb, dev, accel_priv);
2607 		else
2608 			rc = ops->ndo_start_xmit(skb, dev);
2609 
2610 		trace_net_dev_xmit(skb, rc, dev, skb_len);
2611 		if (rc == NETDEV_TX_OK && txq)
2612 			txq_trans_update(txq);
2613 		return rc;
2614 	}
2615 
2616 gso:
2617 	do {
2618 		struct sk_buff *nskb = skb->next;
2619 
2620 		skb->next = nskb->next;
2621 		nskb->next = NULL;
2622 
2623 		if (!list_empty(&ptype_all))
2624 			dev_queue_xmit_nit(nskb, dev);
2625 
2626 		skb_len = nskb->len;
2627 		if (accel_priv)
2628 			rc = ops->ndo_dfwd_start_xmit(nskb, dev, accel_priv);
2629 		else
2630 			rc = ops->ndo_start_xmit(nskb, dev);
2631 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2632 		if (unlikely(rc != NETDEV_TX_OK)) {
2633 			if (rc & ~NETDEV_TX_MASK)
2634 				goto out_kfree_gso_skb;
2635 			nskb->next = skb->next;
2636 			skb->next = nskb;
2637 			return rc;
2638 		}
2639 		txq_trans_update(txq);
2640 		if (unlikely(netif_xmit_stopped(txq) && skb->next))
2641 			return NETDEV_TX_BUSY;
2642 	} while (skb->next);
2643 
2644 out_kfree_gso_skb:
2645 	if (likely(skb->next == NULL)) {
2646 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2647 		consume_skb(skb);
2648 		return rc;
2649 	}
2650 out_kfree_skb:
2651 	kfree_skb(skb);
2652 out:
2653 	return rc;
2654 }
2655 EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2656 
2657 static void qdisc_pkt_len_init(struct sk_buff *skb)
2658 {
2659 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2660 
2661 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2662 
2663 	/* To get a more precise estimate of the bytes sent on the wire,
2664 	 * we add the header size of all segments to pkt_len.
2665 	 */
2666 	if (shinfo->gso_size)  {
2667 		unsigned int hdr_len;
2668 		u16 gso_segs = shinfo->gso_segs;
2669 
2670 		/* mac layer + network layer */
2671 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2672 
2673 		/* + transport layer */
2674 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2675 			hdr_len += tcp_hdrlen(skb);
2676 		else
2677 			hdr_len += sizeof(struct udphdr);
2678 
2679 		if (shinfo->gso_type & SKB_GSO_DODGY)
2680 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2681 						shinfo->gso_size);
2682 
2683 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2684 	}
2685 }
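
/*
 * Worked example (illustrative): for a TCP GSO skb carrying three segments
 * behind a 14-byte Ethernet header, a 20-byte IPv4 header and a 20-byte TCP
 * header, hdr_len is 54, so the code above accounts for the replicated
 * headers of the two extra segments: pkt_len = skb->len + (3 - 1) * 54.
 */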
2686 
2687 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2688 				 struct net_device *dev,
2689 				 struct netdev_queue *txq)
2690 {
2691 	spinlock_t *root_lock = qdisc_lock(q);
2692 	bool contended;
2693 	int rc;
2694 
2695 	qdisc_pkt_len_init(skb);
2696 	qdisc_calculate_pkt_len(skb, q);
2697 	/*
2698 	 * Heuristic to force contended enqueues to serialize on a
2699 	 * separate lock before trying to get the qdisc's main lock.
2700 	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2701 	 * and dequeue packets faster.
2702 	 */
2703 	contended = qdisc_is_running(q);
2704 	if (unlikely(contended))
2705 		spin_lock(&q->busylock);
2706 
2707 	spin_lock(root_lock);
2708 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2709 		kfree_skb(skb);
2710 		rc = NET_XMIT_DROP;
2711 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2712 		   qdisc_run_begin(q)) {
2713 		/*
2714 		 * This is a work-conserving queue; there are no old skbs
2715 		 * waiting to be sent out; and the qdisc is not running -
2716 		 * xmit the skb directly.
2717 		 */
2718 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2719 			skb_dst_force(skb);
2720 
2721 		qdisc_bstats_update(q, skb);
2722 
2723 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2724 			if (unlikely(contended)) {
2725 				spin_unlock(&q->busylock);
2726 				contended = false;
2727 			}
2728 			__qdisc_run(q);
2729 		} else
2730 			qdisc_run_end(q);
2731 
2732 		rc = NET_XMIT_SUCCESS;
2733 	} else {
2734 		skb_dst_force(skb);
2735 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2736 		if (qdisc_run_begin(q)) {
2737 			if (unlikely(contended)) {
2738 				spin_unlock(&q->busylock);
2739 				contended = false;
2740 			}
2741 			__qdisc_run(q);
2742 		}
2743 	}
2744 	spin_unlock(root_lock);
2745 	if (unlikely(contended))
2746 		spin_unlock(&q->busylock);
2747 	return rc;
2748 }
2749 
2750 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2751 static void skb_update_prio(struct sk_buff *skb)
2752 {
2753 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2754 
2755 	if (!skb->priority && skb->sk && map) {
2756 		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2757 
2758 		if (prioidx < map->priomap_len)
2759 			skb->priority = map->priomap[prioidx];
2760 	}
2761 }
2762 #else
2763 #define skb_update_prio(skb)
2764 #endif
2765 
2766 static DEFINE_PER_CPU(int, xmit_recursion);
2767 #define RECURSION_LIMIT 10
2768 
2769 /**
2770  *	dev_loopback_xmit - loop back @skb
2771  *	@skb: buffer to transmit
2772  */
2773 int dev_loopback_xmit(struct sk_buff *skb)
2774 {
2775 	skb_reset_mac_header(skb);
2776 	__skb_pull(skb, skb_network_offset(skb));
2777 	skb->pkt_type = PACKET_LOOPBACK;
2778 	skb->ip_summed = CHECKSUM_UNNECESSARY;
2779 	WARN_ON(!skb_dst(skb));
2780 	skb_dst_force(skb);
2781 	netif_rx_ni(skb);
2782 	return 0;
2783 }
2784 EXPORT_SYMBOL(dev_loopback_xmit);
2785 
2786 /**
2787  *	dev_queue_xmit - transmit a buffer
2788  *	@skb: buffer to transmit
2789  *
2790  *	Queue a buffer for transmission to a network device. The caller must
2791  *	have set the device and priority and built the buffer before calling
2792  *	this function. The function can be called from an interrupt.
2793  *
2794  *	A negative errno code is returned on a failure. A success does not
2795  *	guarantee the frame will be transmitted as it may be dropped due
2796  *	to congestion or traffic shaping.
2797  *
2798  * -----------------------------------------------------------------------------------
2799  *      I notice this method can also return errors from the queue disciplines,
2800  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2801  *      be positive.
2802  *
2803  *      Regardless of the return value, the skb is consumed, so it is currently
2804  *      difficult to retry a send to this method.  (You can bump the ref count
2805  *      before sending to hold a reference for retry if you are careful.)
2806  *
2807  *      When calling this method, interrupts MUST be enabled.  This is because
2808  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2809  *          --BLG
2810  */
2811 int dev_queue_xmit(struct sk_buff *skb)
2812 {
2813 	struct net_device *dev = skb->dev;
2814 	struct netdev_queue *txq;
2815 	struct Qdisc *q;
2816 	int rc = -ENOMEM;
2817 
2818 	skb_reset_mac_header(skb);
2819 
2820 	/* Disable soft irqs for various locks below. Also
2821 	 * stops preemption for RCU.
2822 	 */
2823 	rcu_read_lock_bh();
2824 
2825 	skb_update_prio(skb);
2826 
2827 	txq = netdev_pick_tx(dev, skb);
2828 	q = rcu_dereference_bh(txq->qdisc);
2829 
2830 #ifdef CONFIG_NET_CLS_ACT
2831 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2832 #endif
2833 	trace_net_dev_queue(skb);
2834 	if (q->enqueue) {
2835 		rc = __dev_xmit_skb(skb, q, dev, txq);
2836 		goto out;
2837 	}
2838 
2839 	/* The device has no queue. This is the common case for software
2840 	   devices: loopback, all sorts of tunnels...
2841 
2842 	   Really, it is unlikely that netif_tx_lock protection is necessary
2843 	   here.  (Loopback and IP tunnels, for example, are clean, ignoring
2844 	   statistics counters.)
2845 	   However, it is possible that they rely on the protection
2846 	   we provide here.
2847 
2848 	   Check this and take the lock; it is not prone to deadlocks.
2849 	   Or take the noqueue-qdisc path, which is even simpler 8)
2850 	 */
2851 	if (dev->flags & IFF_UP) {
2852 		int cpu = smp_processor_id(); /* ok because BHs are off */
2853 
2854 		if (txq->xmit_lock_owner != cpu) {
2855 
2856 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2857 				goto recursion_alert;
2858 
2859 			HARD_TX_LOCK(dev, txq, cpu);
2860 
2861 			if (!netif_xmit_stopped(txq)) {
2862 				__this_cpu_inc(xmit_recursion);
2863 				rc = dev_hard_start_xmit(skb, dev, txq, NULL);
2864 				__this_cpu_dec(xmit_recursion);
2865 				if (dev_xmit_complete(rc)) {
2866 					HARD_TX_UNLOCK(dev, txq);
2867 					goto out;
2868 				}
2869 			}
2870 			HARD_TX_UNLOCK(dev, txq);
2871 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2872 					     dev->name);
2873 		} else {
2874 			/* Recursion is detected! It is possible,
2875 			 * unfortunately
2876 			 */
2877 recursion_alert:
2878 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2879 					     dev->name);
2880 		}
2881 	}
2882 
2883 	rc = -ENETDOWN;
2884 	rcu_read_unlock_bh();
2885 
2886 	kfree_skb(skb);
2887 	return rc;
2888 out:
2889 	rcu_read_unlock_bh();
2890 	return rc;
2891 }
2892 EXPORT_SYMBOL(dev_queue_xmit);
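
/*
 * Usage sketch (illustrative only): a kernel subsystem injecting a frame it
 * built itself. "my_dev", "dest_mac", "payload", "len" and the protocol
 * value ETH_P_MY_PROTO are hypothetical; error handling is abbreviated.
 *
 *	struct sk_buff *skb;
 *
 *	skb = alloc_skb(LL_RESERVED_SPACE(my_dev) + len, GFP_ATOMIC);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, LL_RESERVED_SPACE(my_dev));
 *	skb_reset_network_header(skb);
 *	memcpy(skb_put(skb, len), payload, len);
 *	skb->dev = my_dev;
 *	skb->protocol = htons(ETH_P_MY_PROTO);
 *	if (dev_hard_header(skb, my_dev, ETH_P_MY_PROTO, dest_mac,
 *			    NULL, skb->len) < 0) {
 *		kfree_skb(skb);
 *		return -EINVAL;
 *	}
 *	return dev_queue_xmit(skb);
 */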
2893 
2894 
2895 /*=======================================================================
2896 			Receiver routines
2897   =======================================================================*/
2898 
2899 int netdev_max_backlog __read_mostly = 1000;
2900 EXPORT_SYMBOL(netdev_max_backlog);
2901 
2902 int netdev_tstamp_prequeue __read_mostly = 1;
2903 int netdev_budget __read_mostly = 300;
2904 int weight_p __read_mostly = 64;            /* old backlog weight */
2905 
2906 /* Called with irq disabled */
2907 static inline void ____napi_schedule(struct softnet_data *sd,
2908 				     struct napi_struct *napi)
2909 {
2910 	list_add_tail(&napi->poll_list, &sd->poll_list);
2911 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2912 }
2913 
2914 #ifdef CONFIG_RPS
2915 
2916 /* One global table that all flow-based protocols share. */
2917 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2918 EXPORT_SYMBOL(rps_sock_flow_table);
2919 
2920 struct static_key rps_needed __read_mostly;
2921 
2922 static struct rps_dev_flow *
2923 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2924 	    struct rps_dev_flow *rflow, u16 next_cpu)
2925 {
2926 	if (next_cpu != RPS_NO_CPU) {
2927 #ifdef CONFIG_RFS_ACCEL
2928 		struct netdev_rx_queue *rxqueue;
2929 		struct rps_dev_flow_table *flow_table;
2930 		struct rps_dev_flow *old_rflow;
2931 		u32 flow_id;
2932 		u16 rxq_index;
2933 		int rc;
2934 
2935 		/* Should we steer this flow to a different hardware queue? */
2936 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2937 		    !(dev->features & NETIF_F_NTUPLE))
2938 			goto out;
2939 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2940 		if (rxq_index == skb_get_rx_queue(skb))
2941 			goto out;
2942 
2943 		rxqueue = dev->_rx + rxq_index;
2944 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2945 		if (!flow_table)
2946 			goto out;
2947 		flow_id = skb->rxhash & flow_table->mask;
2948 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2949 							rxq_index, flow_id);
2950 		if (rc < 0)
2951 			goto out;
2952 		old_rflow = rflow;
2953 		rflow = &flow_table->flows[flow_id];
2954 		rflow->filter = rc;
2955 		if (old_rflow->filter == rflow->filter)
2956 			old_rflow->filter = RPS_NO_FILTER;
2957 	out:
2958 #endif
2959 		rflow->last_qtail =
2960 			per_cpu(softnet_data, next_cpu).input_queue_head;
2961 	}
2962 
2963 	rflow->cpu = next_cpu;
2964 	return rflow;
2965 }
2966 
2967 /*
2968  * get_rps_cpu is called from netif_receive_skb and returns the target
2969  * CPU from the RPS map of the receiving queue for a given skb.
2970  * rcu_read_lock must be held on entry.
2971  */
2972 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2973 		       struct rps_dev_flow **rflowp)
2974 {
2975 	struct netdev_rx_queue *rxqueue;
2976 	struct rps_map *map;
2977 	struct rps_dev_flow_table *flow_table;
2978 	struct rps_sock_flow_table *sock_flow_table;
2979 	int cpu = -1;
2980 	u16 tcpu;
2981 
2982 	if (skb_rx_queue_recorded(skb)) {
2983 		u16 index = skb_get_rx_queue(skb);
2984 		if (unlikely(index >= dev->real_num_rx_queues)) {
2985 			WARN_ONCE(dev->real_num_rx_queues > 1,
2986 				  "%s received packet on queue %u, but number "
2987 				  "of RX queues is %u\n",
2988 				  dev->name, index, dev->real_num_rx_queues);
2989 			goto done;
2990 		}
2991 		rxqueue = dev->_rx + index;
2992 	} else
2993 		rxqueue = dev->_rx;
2994 
2995 	map = rcu_dereference(rxqueue->rps_map);
2996 	if (map) {
2997 		if (map->len == 1 &&
2998 		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2999 			tcpu = map->cpus[0];
3000 			if (cpu_online(tcpu))
3001 				cpu = tcpu;
3002 			goto done;
3003 		}
3004 	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3005 		goto done;
3006 	}
3007 
3008 	skb_reset_network_header(skb);
3009 	if (!skb_get_rxhash(skb))
3010 		goto done;
3011 
3012 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3013 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3014 	if (flow_table && sock_flow_table) {
3015 		u16 next_cpu;
3016 		struct rps_dev_flow *rflow;
3017 
3018 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3019 		tcpu = rflow->cpu;
3020 
3021 		next_cpu = sock_flow_table->ents[skb->rxhash &
3022 		    sock_flow_table->mask];
3023 
3024 		/*
3025 		 * If the desired CPU (where last recvmsg was done) is
3026 		 * different from current CPU (one in the rx-queue flow
3027 		 * table entry), switch if one of the following holds:
3028 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
3029 		 *   - Current CPU is offline.
3030 		 *   - The current CPU's queue tail has advanced beyond the
3031 		 *     last packet that was enqueued using this table entry.
3032 		 *     This guarantees that all previous packets for the flow
3033 		 *     have been dequeued, thus preserving in order delivery.
3034 		 */
3035 		if (unlikely(tcpu != next_cpu) &&
3036 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3037 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3038 		      rflow->last_qtail)) >= 0)) {
3039 			tcpu = next_cpu;
3040 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3041 		}
3042 
3043 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3044 			*rflowp = rflow;
3045 			cpu = tcpu;
3046 			goto done;
3047 		}
3048 	}
3049 
3050 	if (map) {
3051 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3052 
3053 		if (cpu_online(tcpu)) {
3054 			cpu = tcpu;
3055 			goto done;
3056 		}
3057 	}
3058 
3059 done:
3060 	return cpu;
3061 }
3062 
3063 #ifdef CONFIG_RFS_ACCEL
3064 
3065 /**
3066  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3067  * @dev: Device on which the filter was set
3068  * @rxq_index: RX queue index
3069  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3070  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3071  *
3072  * Drivers that implement ndo_rx_flow_steer() should periodically call
3073  * this function for each installed filter and remove the filters for
3074  * which it returns %true.
3075  */
3076 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3077 			 u32 flow_id, u16 filter_id)
3078 {
3079 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3080 	struct rps_dev_flow_table *flow_table;
3081 	struct rps_dev_flow *rflow;
3082 	bool expire = true;
3083 	int cpu;
3084 
3085 	rcu_read_lock();
3086 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3087 	if (flow_table && flow_id <= flow_table->mask) {
3088 		rflow = &flow_table->flows[flow_id];
3089 		cpu = ACCESS_ONCE(rflow->cpu);
3090 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3091 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3092 			   rflow->last_qtail) <
3093 		     (int)(10 * flow_table->mask)))
3094 			expire = false;
3095 	}
3096 	rcu_read_unlock();
3097 	return expire;
3098 }
3099 EXPORT_SYMBOL(rps_may_expire_flow);
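
/*
 * Usage sketch (illustrative only): a driver that implements
 * ndo_rx_flow_steer() scanning its installed filters from a periodic work
 * item and removing the expired ones. "my_filters", "my_nfilters",
 * "my_hw_remove_filter" and the per-filter fields are hypothetical; the
 * array index doubles as the filter_id the driver returned earlier.
 *
 *	int i;
 *
 *	for (i = 0; i < my_nfilters; i++) {
 *		struct my_filter *f = &my_filters[i];
 *
 *		if (!f->installed)
 *			continue;
 *		if (rps_may_expire_flow(my_dev, f->rxq_index,
 *					f->flow_id, i)) {
 *			my_hw_remove_filter(f);
 *			f->installed = false;
 *		}
 *	}
 */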
3100 
3101 #endif /* CONFIG_RFS_ACCEL */
3102 
3103 /* Called from hardirq (IPI) context */
3104 static void rps_trigger_softirq(void *data)
3105 {
3106 	struct softnet_data *sd = data;
3107 
3108 	____napi_schedule(sd, &sd->backlog);
3109 	sd->received_rps++;
3110 }
3111 
3112 #endif /* CONFIG_RPS */
3113 
3114 /*
3115  * Check if this softnet_data structure belongs to another CPU.
3116  * If so, queue it on our IPI list and return 1.
3117  * If not, return 0.
3118  */
3119 static int rps_ipi_queued(struct softnet_data *sd)
3120 {
3121 #ifdef CONFIG_RPS
3122 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3123 
3124 	if (sd != mysd) {
3125 		sd->rps_ipi_next = mysd->rps_ipi_list;
3126 		mysd->rps_ipi_list = sd;
3127 
3128 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3129 		return 1;
3130 	}
3131 #endif /* CONFIG_RPS */
3132 	return 0;
3133 }
3134 
3135 #ifdef CONFIG_NET_FLOW_LIMIT
3136 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3137 #endif
3138 
3139 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3140 {
3141 #ifdef CONFIG_NET_FLOW_LIMIT
3142 	struct sd_flow_limit *fl;
3143 	struct softnet_data *sd;
3144 	unsigned int old_flow, new_flow;
3145 
3146 	if (qlen < (netdev_max_backlog >> 1))
3147 		return false;
3148 
3149 	sd = &__get_cpu_var(softnet_data);
3150 
3151 	rcu_read_lock();
3152 	fl = rcu_dereference(sd->flow_limit);
3153 	if (fl) {
3154 		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3155 		old_flow = fl->history[fl->history_head];
3156 		fl->history[fl->history_head] = new_flow;
3157 
3158 		fl->history_head++;
3159 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3160 
3161 		if (likely(fl->buckets[old_flow]))
3162 			fl->buckets[old_flow]--;
3163 
3164 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3165 			fl->count++;
3166 			rcu_read_unlock();
3167 			return true;
3168 		}
3169 	}
3170 	rcu_read_unlock();
3171 #endif
3172 	return false;
3173 }
3174 
3175 /*
3176  * enqueue_to_backlog is called to queue an skb on a per-CPU backlog
3177  * queue (which may belong to a remote CPU).
3178  */
3179 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3180 			      unsigned int *qtail)
3181 {
3182 	struct softnet_data *sd;
3183 	unsigned long flags;
3184 	unsigned int qlen;
3185 
3186 	sd = &per_cpu(softnet_data, cpu);
3187 
3188 	local_irq_save(flags);
3189 
3190 	rps_lock(sd);
3191 	qlen = skb_queue_len(&sd->input_pkt_queue);
3192 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3193 		if (skb_queue_len(&sd->input_pkt_queue)) {
3194 enqueue:
3195 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3196 			input_queue_tail_incr_save(sd, qtail);
3197 			rps_unlock(sd);
3198 			local_irq_restore(flags);
3199 			return NET_RX_SUCCESS;
3200 		}
3201 
3202 		/* Schedule NAPI for the backlog device.
3203 		 * We can use a non-atomic operation since we own the queue lock.
3204 		 */
3205 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3206 			if (!rps_ipi_queued(sd))
3207 				____napi_schedule(sd, &sd->backlog);
3208 		}
3209 		goto enqueue;
3210 	}
3211 
3212 	sd->dropped++;
3213 	rps_unlock(sd);
3214 
3215 	local_irq_restore(flags);
3216 
3217 	atomic_long_inc(&skb->dev->rx_dropped);
3218 	kfree_skb(skb);
3219 	return NET_RX_DROP;
3220 }
3221 
3222 /**
3223  *	netif_rx	-	post buffer to the network code
3224  *	@skb: buffer to post
3225  *
3226  *	This function receives a packet from a device driver and queues it for
3227  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3228  *	may be dropped during processing for congestion control or by the
3229  *	protocol layers.
3230  *
3231  *	return values:
3232  *	NET_RX_SUCCESS	(no congestion)
3233  *	NET_RX_DROP     (packet was dropped)
3234  *
3235  */
3236 
3237 int netif_rx(struct sk_buff *skb)
3238 {
3239 	int ret;
3240 
3241 	/* if netpoll wants it, pretend we never saw it */
3242 	if (netpoll_rx(skb))
3243 		return NET_RX_DROP;
3244 
3245 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3246 
3247 	trace_netif_rx(skb);
3248 #ifdef CONFIG_RPS
3249 	if (static_key_false(&rps_needed)) {
3250 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3251 		int cpu;
3252 
3253 		preempt_disable();
3254 		rcu_read_lock();
3255 
3256 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3257 		if (cpu < 0)
3258 			cpu = smp_processor_id();
3259 
3260 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3261 
3262 		rcu_read_unlock();
3263 		preempt_enable();
3264 	} else
3265 #endif
3266 	{
3267 		unsigned int qtail;
3268 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3269 		put_cpu();
3270 	}
3271 	return ret;
3272 }
3273 EXPORT_SYMBOL(netif_rx);
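
/*
 * Usage sketch (illustrative only): a legacy (non-NAPI) driver handing a
 * received frame to the stack from its interrupt handler. "my_dev",
 * "rx_buf" and "pkt_len" are hypothetical.
 *
 *	struct sk_buff *skb;
 *
 *	skb = netdev_alloc_skb_ip_align(my_dev, pkt_len);
 *	if (!skb) {
 *		my_dev->stats.rx_dropped++;
 *		return;
 *	}
 *	memcpy(skb_put(skb, pkt_len), rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, my_dev);
 *	netif_rx(skb);
 */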
3274 
3275 int netif_rx_ni(struct sk_buff *skb)
3276 {
3277 	int err;
3278 
3279 	preempt_disable();
3280 	err = netif_rx(skb);
3281 	if (local_softirq_pending())
3282 		do_softirq();
3283 	preempt_enable();
3284 
3285 	return err;
3286 }
3287 EXPORT_SYMBOL(netif_rx_ni);
3288 
3289 static void net_tx_action(struct softirq_action *h)
3290 {
3291 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3292 
3293 	if (sd->completion_queue) {
3294 		struct sk_buff *clist;
3295 
3296 		local_irq_disable();
3297 		clist = sd->completion_queue;
3298 		sd->completion_queue = NULL;
3299 		local_irq_enable();
3300 
3301 		while (clist) {
3302 			struct sk_buff *skb = clist;
3303 			clist = clist->next;
3304 
3305 			WARN_ON(atomic_read(&skb->users));
3306 			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3307 				trace_consume_skb(skb);
3308 			else
3309 				trace_kfree_skb(skb, net_tx_action);
3310 			__kfree_skb(skb);
3311 		}
3312 	}
3313 
3314 	if (sd->output_queue) {
3315 		struct Qdisc *head;
3316 
3317 		local_irq_disable();
3318 		head = sd->output_queue;
3319 		sd->output_queue = NULL;
3320 		sd->output_queue_tailp = &sd->output_queue;
3321 		local_irq_enable();
3322 
3323 		while (head) {
3324 			struct Qdisc *q = head;
3325 			spinlock_t *root_lock;
3326 
3327 			head = head->next_sched;
3328 
3329 			root_lock = qdisc_lock(q);
3330 			if (spin_trylock(root_lock)) {
3331 				smp_mb__before_clear_bit();
3332 				clear_bit(__QDISC_STATE_SCHED,
3333 					  &q->state);
3334 				qdisc_run(q);
3335 				spin_unlock(root_lock);
3336 			} else {
3337 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3338 					      &q->state)) {
3339 					__netif_reschedule(q);
3340 				} else {
3341 					smp_mb__before_clear_bit();
3342 					clear_bit(__QDISC_STATE_SCHED,
3343 						  &q->state);
3344 				}
3345 			}
3346 		}
3347 	}
3348 }
3349 
3350 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3351     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3352 /* This hook is defined here for ATM LANE */
3353 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3354 			     unsigned char *addr) __read_mostly;
3355 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3356 #endif
3357 
3358 #ifdef CONFIG_NET_CLS_ACT
3359 /* TODO: Maybe we should just force sch_ingress to be compiled in
3360  * when CONFIG_NET_CLS_ACT is? Otherwise we currently pay for some
3361  * useless instructions (a compare and two extra stores) when
3362  * CONFIG_NET_CLS_ACT is enabled but the ingress scheduler is not.
3363  * NOTE: This doesn't remove any functionality; if you don't have
3364  * the ingress scheduler, you just can't add policies on ingress.
3365  *
3366  */
3367 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3368 {
3369 	struct net_device *dev = skb->dev;
3370 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3371 	int result = TC_ACT_OK;
3372 	struct Qdisc *q;
3373 
3374 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3375 		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3376 				     skb->skb_iif, dev->ifindex);
3377 		return TC_ACT_SHOT;
3378 	}
3379 
3380 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3381 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3382 
3383 	q = rxq->qdisc;
3384 	if (q != &noop_qdisc) {
3385 		spin_lock(qdisc_lock(q));
3386 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3387 			result = qdisc_enqueue_root(skb, q);
3388 		spin_unlock(qdisc_lock(q));
3389 	}
3390 
3391 	return result;
3392 }
3393 
3394 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3395 					 struct packet_type **pt_prev,
3396 					 int *ret, struct net_device *orig_dev)
3397 {
3398 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3399 
3400 	if (!rxq || rxq->qdisc == &noop_qdisc)
3401 		goto out;
3402 
3403 	if (*pt_prev) {
3404 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3405 		*pt_prev = NULL;
3406 	}
3407 
3408 	switch (ing_filter(skb, rxq)) {
3409 	case TC_ACT_SHOT:
3410 	case TC_ACT_STOLEN:
3411 		kfree_skb(skb);
3412 		return NULL;
3413 	}
3414 
3415 out:
3416 	skb->tc_verd = 0;
3417 	return skb;
3418 }
3419 #endif
3420 
3421 /**
3422  *	netdev_rx_handler_register - register receive handler
3423  *	@dev: device to register a handler for
3424  *	@rx_handler: receive handler to register
3425  *	@rx_handler_data: data pointer that is used by rx handler
3426  *
3427  *	Register a receive handler for a device. This handler will then be
3428  *	called from __netif_receive_skb. A negative errno code is returned
3429  *	on a failure.
3430  *
3431  *	The caller must hold the rtnl_mutex.
3432  *
3433  *	For a general description of rx_handler, see enum rx_handler_result.
3434  */
3435 int netdev_rx_handler_register(struct net_device *dev,
3436 			       rx_handler_func_t *rx_handler,
3437 			       void *rx_handler_data)
3438 {
3439 	ASSERT_RTNL();
3440 
3441 	if (dev->rx_handler)
3442 		return -EBUSY;
3443 
3444 	/* Note: rx_handler_data must be set before rx_handler */
3445 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3446 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3447 
3448 	return 0;
3449 }
3450 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
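
/*
 * Usage sketch (illustrative only): how a virtual device (in the spirit of
 * bridge or macvlan) might claim frames from a lower device. "struct
 * my_port", "my_steal_frame", "port" and "lower_dev" are hypothetical;
 * registration is done with RTNL held, as required above.
 *
 *	static rx_handler_result_t my_rx_handler(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct my_port *port =
 *			rcu_dereference(skb->dev->rx_handler_data);
 *
 *		if (my_steal_frame(port, skb))
 *			return RX_HANDLER_CONSUMED;
 *		return RX_HANDLER_PASS;
 *	}
 *
 *	err = netdev_rx_handler_register(lower_dev, my_rx_handler, port);
 */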
3451 
3452 /**
3453  *	netdev_rx_handler_unregister - unregister receive handler
3454  *	@dev: device to unregister a handler from
3455  *
3456  *	Unregister a receive handler from a device.
3457  *
3458  *	The caller must hold the rtnl_mutex.
3459  */
3460 void netdev_rx_handler_unregister(struct net_device *dev)
3461 {
3462 
3463 	ASSERT_RTNL();
3464 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3465 	/* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
3466 	 * section is guaranteed to see a non-NULL rx_handler_data
3467 	 * as well.
3468 	 */
3469 	synchronize_net();
3470 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3471 }
3472 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3473 
3474 /*
3475  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3476  * the special handling of PFMEMALLOC skbs.
3477  */
3478 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3479 {
3480 	switch (skb->protocol) {
3481 	case __constant_htons(ETH_P_ARP):
3482 	case __constant_htons(ETH_P_IP):
3483 	case __constant_htons(ETH_P_IPV6):
3484 	case __constant_htons(ETH_P_8021Q):
3485 	case __constant_htons(ETH_P_8021AD):
3486 		return true;
3487 	default:
3488 		return false;
3489 	}
3490 }
3491 
3492 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3493 {
3494 	struct packet_type *ptype, *pt_prev;
3495 	rx_handler_func_t *rx_handler;
3496 	struct net_device *orig_dev;
3497 	struct net_device *null_or_dev;
3498 	bool deliver_exact = false;
3499 	int ret = NET_RX_DROP;
3500 	__be16 type;
3501 
3502 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3503 
3504 	trace_netif_receive_skb(skb);
3505 
3506 	/* if we've gotten here through NAPI, check netpoll */
3507 	if (netpoll_receive_skb(skb))
3508 		goto out;
3509 
3510 	orig_dev = skb->dev;
3511 
3512 	skb_reset_network_header(skb);
3513 	if (!skb_transport_header_was_set(skb))
3514 		skb_reset_transport_header(skb);
3515 	skb_reset_mac_len(skb);
3516 
3517 	pt_prev = NULL;
3518 
3519 	rcu_read_lock();
3520 
3521 another_round:
3522 	skb->skb_iif = skb->dev->ifindex;
3523 
3524 	__this_cpu_inc(softnet_data.processed);
3525 
3526 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3527 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3528 		skb = vlan_untag(skb);
3529 		if (unlikely(!skb))
3530 			goto unlock;
3531 	}
3532 
3533 #ifdef CONFIG_NET_CLS_ACT
3534 	if (skb->tc_verd & TC_NCLS) {
3535 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3536 		goto ncls;
3537 	}
3538 #endif
3539 
3540 	if (pfmemalloc)
3541 		goto skip_taps;
3542 
3543 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3544 		if (!ptype->dev || ptype->dev == skb->dev) {
3545 			if (pt_prev)
3546 				ret = deliver_skb(skb, pt_prev, orig_dev);
3547 			pt_prev = ptype;
3548 		}
3549 	}
3550 
3551 skip_taps:
3552 #ifdef CONFIG_NET_CLS_ACT
3553 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3554 	if (!skb)
3555 		goto unlock;
3556 ncls:
3557 #endif
3558 
3559 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3560 		goto drop;
3561 
3562 	if (vlan_tx_tag_present(skb)) {
3563 		if (pt_prev) {
3564 			ret = deliver_skb(skb, pt_prev, orig_dev);
3565 			pt_prev = NULL;
3566 		}
3567 		if (vlan_do_receive(&skb))
3568 			goto another_round;
3569 		else if (unlikely(!skb))
3570 			goto unlock;
3571 	}
3572 
3573 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3574 	if (rx_handler) {
3575 		if (pt_prev) {
3576 			ret = deliver_skb(skb, pt_prev, orig_dev);
3577 			pt_prev = NULL;
3578 		}
3579 		switch (rx_handler(&skb)) {
3580 		case RX_HANDLER_CONSUMED:
3581 			ret = NET_RX_SUCCESS;
3582 			goto unlock;
3583 		case RX_HANDLER_ANOTHER:
3584 			goto another_round;
3585 		case RX_HANDLER_EXACT:
3586 			deliver_exact = true;
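			/* fall through */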
3587 		case RX_HANDLER_PASS:
3588 			break;
3589 		default:
3590 			BUG();
3591 		}
3592 	}
3593 
3594 	if (unlikely(vlan_tx_tag_present(skb))) {
3595 		if (vlan_tx_tag_get_id(skb))
3596 			skb->pkt_type = PACKET_OTHERHOST;
3597 		/* Note: we might in the future use prio bits
3598 		 * and set skb->priority like in vlan_do_receive()
3599 		 * For the time being, just ignore Priority Code Point
3600 		 */
3601 		skb->vlan_tci = 0;
3602 	}
3603 
3604 	/* deliver only exact match when indicated */
3605 	null_or_dev = deliver_exact ? skb->dev : NULL;
3606 
3607 	type = skb->protocol;
3608 	list_for_each_entry_rcu(ptype,
3609 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3610 		if (ptype->type == type &&
3611 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3612 		     ptype->dev == orig_dev)) {
3613 			if (pt_prev)
3614 				ret = deliver_skb(skb, pt_prev, orig_dev);
3615 			pt_prev = ptype;
3616 		}
3617 	}
3618 
3619 	if (pt_prev) {
3620 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3621 			goto drop;
3622 		else
3623 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3624 	} else {
3625 drop:
3626 		atomic_long_inc(&skb->dev->rx_dropped);
3627 		kfree_skb(skb);
3628 		/* Jamal, now you will not be able to escape explaining
3629 		 * to me how you were going to use this. :-)
3630 		 */
3631 		ret = NET_RX_DROP;
3632 	}
3633 
3634 unlock:
3635 	rcu_read_unlock();
3636 out:
3637 	return ret;
3638 }
3639 
3640 static int __netif_receive_skb(struct sk_buff *skb)
3641 {
3642 	int ret;
3643 
3644 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3645 		unsigned long pflags = current->flags;
3646 
3647 		/*
3648 		 * PFMEMALLOC skbs are special, they should
3649 		 * - be delivered to SOCK_MEMALLOC sockets only
3650 		 * - stay away from userspace
3651 		 * - have bounded memory usage
3652 		 *
3653 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3654 		 * context down to all allocation sites.
3655 		 */
3656 		current->flags |= PF_MEMALLOC;
3657 		ret = __netif_receive_skb_core(skb, true);
3658 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3659 	} else
3660 		ret = __netif_receive_skb_core(skb, false);
3661 
3662 	return ret;
3663 }
3664 
3665 /**
3666  *	netif_receive_skb - process receive buffer from network
3667  *	@skb: buffer to process
3668  *
3669  *	netif_receive_skb() is the main receive data processing function.
3670  *	It always succeeds. The buffer may be dropped during processing
3671  *	for congestion control or by the protocol layers.
3672  *
3673  *	This function may only be called from softirq context and interrupts
3674  *	should be enabled.
3675  *
3676  *	Return values (usually ignored):
3677  *	NET_RX_SUCCESS: no congestion
3678  *	NET_RX_DROP: packet was dropped
3679  */
3680 int netif_receive_skb(struct sk_buff *skb)
3681 {
3682 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3683 
3684 	if (skb_defer_rx_timestamp(skb))
3685 		return NET_RX_SUCCESS;
3686 
3687 #ifdef CONFIG_RPS
3688 	if (static_key_false(&rps_needed)) {
3689 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3690 		int cpu, ret;
3691 
3692 		rcu_read_lock();
3693 
3694 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3695 
3696 		if (cpu >= 0) {
3697 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3698 			rcu_read_unlock();
3699 			return ret;
3700 		}
3701 		rcu_read_unlock();
3702 	}
3703 #endif
3704 	return __netif_receive_skb(skb);
3705 }
3706 EXPORT_SYMBOL(netif_receive_skb);
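
/*
 * Usage sketch (illustrative only): a NAPI driver's poll routine delivering
 * frames in softirq context, where netif_receive_skb() (or napi_gro_receive())
 * is the appropriate entry point rather than netif_rx(). "my_rx_one", "priv"
 * and the budget handling are hypothetical and abbreviated.
 *
 *	struct sk_buff *skb;
 *	int work_done = 0;
 *
 *	while (work_done < budget && (skb = my_rx_one(priv))) {
 *		skb->protocol = eth_type_trans(skb, priv->ndev);
 *		netif_receive_skb(skb);
 *		work_done++;
 *	}
 */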
3707 
3708 /* Network device is going away, flush any packets still pending
3709  * Called with irqs disabled.
3710  */
3711 static void flush_backlog(void *arg)
3712 {
3713 	struct net_device *dev = arg;
3714 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3715 	struct sk_buff *skb, *tmp;
3716 
3717 	rps_lock(sd);
3718 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3719 		if (skb->dev == dev) {
3720 			__skb_unlink(skb, &sd->input_pkt_queue);
3721 			kfree_skb(skb);
3722 			input_queue_head_incr(sd);
3723 		}
3724 	}
3725 	rps_unlock(sd);
3726 
3727 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3728 		if (skb->dev == dev) {
3729 			__skb_unlink(skb, &sd->process_queue);
3730 			kfree_skb(skb);
3731 			input_queue_head_incr(sd);
3732 		}
3733 	}
3734 }
3735 
3736 static int napi_gro_complete(struct sk_buff *skb)
3737 {
3738 	struct packet_offload *ptype;
3739 	__be16 type = skb->protocol;
3740 	struct list_head *head = &offload_base;
3741 	int err = -ENOENT;
3742 
3743 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3744 
3745 	if (NAPI_GRO_CB(skb)->count == 1) {
3746 		skb_shinfo(skb)->gso_size = 0;
3747 		goto out;
3748 	}
3749 
3750 	rcu_read_lock();
3751 	list_for_each_entry_rcu(ptype, head, list) {
3752 		if (ptype->type != type || !ptype->callbacks.gro_complete)
3753 			continue;
3754 
3755 		err = ptype->callbacks.gro_complete(skb, 0);
3756 		break;
3757 	}
3758 	rcu_read_unlock();
3759 
3760 	if (err) {
3761 		WARN_ON(&ptype->list == head);
3762 		kfree_skb(skb);
3763 		return NET_RX_SUCCESS;
3764 	}
3765 
3766 out:
3767 	return netif_receive_skb(skb);
3768 }
3769 
3770 /* napi->gro_list contains packets ordered by age;
3771  * the youngest packets are at its head.
3772  * Complete skbs in reverse order to reduce latencies.
3773  */
3774 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3775 {
3776 	struct sk_buff *skb, *prev = NULL;
3777 
3778 	/* scan list and build reverse chain */
3779 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3780 		skb->prev = prev;
3781 		prev = skb;
3782 	}
3783 
3784 	for (skb = prev; skb; skb = prev) {
3785 		skb->next = NULL;
3786 
3787 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3788 			return;
3789 
3790 		prev = skb->prev;
3791 		napi_gro_complete(skb);
3792 		napi->gro_count--;
3793 	}
3794 
3795 	napi->gro_list = NULL;
3796 }
3797 EXPORT_SYMBOL(napi_gro_flush);
3798 
3799 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3800 {
3801 	struct sk_buff *p;
3802 	unsigned int maclen = skb->dev->hard_header_len;
3803 
3804 	for (p = napi->gro_list; p; p = p->next) {
3805 		unsigned long diffs;
3806 
3807 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3808 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3809 		if (maclen == ETH_HLEN)
3810 			diffs |= compare_ether_header(skb_mac_header(p),
3811 						      skb_gro_mac_header(skb));
3812 		else if (!diffs)
3813 			diffs = memcmp(skb_mac_header(p),
3814 				       skb_gro_mac_header(skb),
3815 				       maclen);
3816 		NAPI_GRO_CB(p)->same_flow = !diffs;
3817 		NAPI_GRO_CB(p)->flush = 0;
3818 	}
3819 }
3820 
3821 static void skb_gro_reset_offset(struct sk_buff *skb)
3822 {
3823 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3824 	const skb_frag_t *frag0 = &pinfo->frags[0];
3825 
3826 	NAPI_GRO_CB(skb)->data_offset = 0;
3827 	NAPI_GRO_CB(skb)->frag0 = NULL;
3828 	NAPI_GRO_CB(skb)->frag0_len = 0;
3829 
3830 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3831 	    pinfo->nr_frags &&
3832 	    !PageHighMem(skb_frag_page(frag0))) {
3833 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3834 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3835 	}
3836 }
3837 
3838 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3839 {
3840 	struct sk_buff **pp = NULL;
3841 	struct packet_offload *ptype;
3842 	__be16 type = skb->protocol;
3843 	struct list_head *head = &offload_base;
3844 	int same_flow;
3845 	enum gro_result ret;
3846 
3847 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3848 		goto normal;
3849 
3850 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3851 		goto normal;
3852 
3853 	skb_gro_reset_offset(skb);
3854 	gro_list_prepare(napi, skb);
3855 
3856 	rcu_read_lock();
3857 	list_for_each_entry_rcu(ptype, head, list) {
3858 		if (ptype->type != type || !ptype->callbacks.gro_receive)
3859 			continue;
3860 
3861 		skb_set_network_header(skb, skb_gro_offset(skb));
3862 		skb_reset_mac_len(skb);
3863 		NAPI_GRO_CB(skb)->same_flow = 0;
3864 		NAPI_GRO_CB(skb)->flush = 0;
3865 		NAPI_GRO_CB(skb)->free = 0;
3866 
3867 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3868 		break;
3869 	}
3870 	rcu_read_unlock();
3871 
3872 	if (&ptype->list == head)
3873 		goto normal;
3874 
3875 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3876 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3877 
3878 	if (pp) {
3879 		struct sk_buff *nskb = *pp;
3880 
3881 		*pp = nskb->next;
3882 		nskb->next = NULL;
3883 		napi_gro_complete(nskb);
3884 		napi->gro_count--;
3885 	}
3886 
3887 	if (same_flow)
3888 		goto ok;
3889 
3890 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3891 		goto normal;
3892 
3893 	napi->gro_count++;
3894 	NAPI_GRO_CB(skb)->count = 1;
3895 	NAPI_GRO_CB(skb)->age = jiffies;
3896 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3897 	skb->next = napi->gro_list;
3898 	napi->gro_list = skb;
3899 	ret = GRO_HELD;
3900 
3901 pull:
3902 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3903 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3904 
3905 		BUG_ON(skb->end - skb->tail < grow);
3906 
3907 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3908 
3909 		skb->tail += grow;
3910 		skb->data_len -= grow;
3911 
3912 		skb_shinfo(skb)->frags[0].page_offset += grow;
3913 		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3914 
3915 		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3916 			skb_frag_unref(skb, 0);
3917 			memmove(skb_shinfo(skb)->frags,
3918 				skb_shinfo(skb)->frags + 1,
3919 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3920 		}
3921 	}
3922 
3923 ok:
3924 	return ret;
3925 
3926 normal:
3927 	ret = GRO_NORMAL;
3928 	goto pull;
3929 }
3930 
3931 
3932 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3933 {
3934 	switch (ret) {
3935 	case GRO_NORMAL:
3936 		if (netif_receive_skb(skb))
3937 			ret = GRO_DROP;
3938 		break;
3939 
3940 	case GRO_DROP:
3941 		kfree_skb(skb);
3942 		break;
3943 
3944 	case GRO_MERGED_FREE:
3945 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3946 			kmem_cache_free(skbuff_head_cache, skb);
3947 		else
3948 			__kfree_skb(skb);
3949 		break;
3950 
3951 	case GRO_HELD:
3952 	case GRO_MERGED:
3953 		break;
3954 	}
3955 
3956 	return ret;
3957 }
3958 
3959 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3960 {
3961 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3962 }
3963 EXPORT_SYMBOL(napi_gro_receive);
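
/* Illustrative sketch (not part of this file): a driver that wants GRO simply
 * substitutes napi_gro_receive() for netif_receive_skb() in its poll loop and
 * passes its own napi context.  "priv" and "skb" are hypothetical driver state.
 *
 *	skb->protocol = eth_type_trans(skb, priv->netdev);
 *	napi_gro_receive(&priv->napi, skb);
 */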
3964 
3965 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3966 {
3967 	__skb_pull(skb, skb_headlen(skb));
3968 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
3969 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3970 	skb->vlan_tci = 0;
3971 	skb->dev = napi->dev;
3972 	skb->skb_iif = 0;
3973 
3974 	napi->skb = skb;
3975 }
3976 
3977 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3978 {
3979 	struct sk_buff *skb = napi->skb;
3980 
3981 	if (!skb) {
3982 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3983 		napi->skb = skb;
3984 	}
3985 	return skb;
3986 }
3987 EXPORT_SYMBOL(napi_get_frags);
3988 
3989 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3990 			       gro_result_t ret)
3991 {
3992 	switch (ret) {
3993 	case GRO_NORMAL:
3994 		if (netif_receive_skb(skb))
3995 			ret = GRO_DROP;
3996 		break;
3997 
3998 	case GRO_DROP:
3999 	case GRO_MERGED_FREE:
4000 		napi_reuse_skb(napi, skb);
4001 		break;
4002 
4003 	case GRO_HELD:
4004 	case GRO_MERGED:
4005 		break;
4006 	}
4007 
4008 	return ret;
4009 }
4010 
4011 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4012 {
4013 	struct sk_buff *skb = napi->skb;
4014 
4015 	napi->skb = NULL;
4016 
4017 	if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) {
4018 		napi_reuse_skb(napi, skb);
4019 		return NULL;
4020 	}
4021 	skb->protocol = eth_type_trans(skb, skb->dev);
4022 
4023 	return skb;
4024 }
4025 
4026 gro_result_t napi_gro_frags(struct napi_struct *napi)
4027 {
4028 	struct sk_buff *skb = napi_frags_skb(napi);
4029 
4030 	if (!skb)
4031 		return GRO_DROP;
4032 
4033 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4034 }
4035 EXPORT_SYMBOL(napi_gro_frags);
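
/* Illustrative sketch (not part of this file): the napi_get_frags() /
 * napi_gro_frags() pattern used by drivers that receive directly into pages.
 * "rx_page", "rx_off" and "rx_len" are hypothetical values describing where
 * the hardware placed the frame.
 *
 *	struct sk_buff *skb = napi_get_frags(napi);
 *
 *	if (!skb)
 *		return;		(allocation failed, frame is dropped)
 *	skb_add_rx_frag(skb, 0, rx_page, rx_off, rx_len, PAGE_SIZE);
 *	napi_gro_frags(napi);
 */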
4036 
4037 /*
4038  * net_rps_action sends any pending IPIs for RPS.
4039  * Note: called with local irq disabled, but exits with local irq enabled.
4040  */
4041 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4042 {
4043 #ifdef CONFIG_RPS
4044 	struct softnet_data *remsd = sd->rps_ipi_list;
4045 
4046 	if (remsd) {
4047 		sd->rps_ipi_list = NULL;
4048 
4049 		local_irq_enable();
4050 
4051 		/* Send pending IPIs to kick RPS processing on remote cpus. */
4052 		while (remsd) {
4053 			struct softnet_data *next = remsd->rps_ipi_next;
4054 
4055 			if (cpu_online(remsd->cpu))
4056 				__smp_call_function_single(remsd->cpu,
4057 							   &remsd->csd, 0);
4058 			remsd = next;
4059 		}
4060 	} else
4061 #endif
4062 		local_irq_enable();
4063 }
4064 
4065 static int process_backlog(struct napi_struct *napi, int quota)
4066 {
4067 	int work = 0;
4068 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4069 
4070 #ifdef CONFIG_RPS
4071 	/* Check if we have pending IPIs; it's better to send them now
4072 	 * rather than waiting for net_rx_action() to end.
4073 	 */
4074 	if (sd->rps_ipi_list) {
4075 		local_irq_disable();
4076 		net_rps_action_and_irq_enable(sd);
4077 	}
4078 #endif
4079 	napi->weight = weight_p;
4080 	local_irq_disable();
4081 	while (work < quota) {
4082 		struct sk_buff *skb;
4083 		unsigned int qlen;
4084 
4085 		while ((skb = __skb_dequeue(&sd->process_queue))) {
4086 			local_irq_enable();
4087 			__netif_receive_skb(skb);
4088 			local_irq_disable();
4089 			input_queue_head_incr(sd);
4090 			if (++work >= quota) {
4091 				local_irq_enable();
4092 				return work;
4093 			}
4094 		}
4095 
4096 		rps_lock(sd);
4097 		qlen = skb_queue_len(&sd->input_pkt_queue);
4098 		if (qlen)
4099 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
4100 						   &sd->process_queue);
4101 
4102 		if (qlen < quota - work) {
4103 			/*
4104 			 * Inline a custom version of __napi_complete().
4105 			 * Only the current cpu owns and manipulates this napi,
4106 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
4107 			 * so we can use a plain write instead of clear_bit(),
4108 			 * and we don't need an smp_mb() memory barrier.
4109 			 */
4110 			list_del(&napi->poll_list);
4111 			napi->state = 0;
4112 
4113 			quota = work + qlen;
4114 		}
4115 		rps_unlock(sd);
4116 	}
4117 	local_irq_enable();
4118 
4119 	return work;
4120 }
4121 
4122 /**
4123  * __napi_schedule - schedule for receive
4124  * @n: entry to schedule
4125  *
4126  * The entry's receive function will be scheduled to run
4127  */
4128 void __napi_schedule(struct napi_struct *n)
4129 {
4130 	unsigned long flags;
4131 
4132 	local_irq_save(flags);
4133 	____napi_schedule(&__get_cpu_var(softnet_data), n);
4134 	local_irq_restore(flags);
4135 }
4136 EXPORT_SYMBOL(__napi_schedule);
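
/* Illustrative sketch (not part of this file): a typical rx interrupt handler
 * defers work to NAPI.  Most drivers use the napi_schedule() wrapper, which
 * pairs napi_schedule_prep() with __napi_schedule().  "my_interrupt",
 * "my_disable_rx_irq" and "struct my_priv" are hypothetical.
 *
 *	static irqreturn_t my_interrupt(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			my_disable_rx_irq(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */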
4137 
4138 void __napi_complete(struct napi_struct *n)
4139 {
4140 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4141 	BUG_ON(n->gro_list);
4142 
4143 	list_del(&n->poll_list);
4144 	smp_mb__before_clear_bit();
4145 	clear_bit(NAPI_STATE_SCHED, &n->state);
4146 }
4147 EXPORT_SYMBOL(__napi_complete);
4148 
4149 void napi_complete(struct napi_struct *n)
4150 {
4151 	unsigned long flags;
4152 
4153 	/*
4154 	 * don't let napi dequeue from the cpu poll list
4155 	 * just in case it's running on a different cpu
4156 	 */
4157 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4158 		return;
4159 
4160 	napi_gro_flush(n, false);
4161 	local_irq_save(flags);
4162 	__napi_complete(n);
4163 	local_irq_restore(flags);
4164 }
4165 EXPORT_SYMBOL(napi_complete);
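
/* Illustrative sketch (not part of this file): the usual end of a driver's
 * poll routine.  When less than the full budget was consumed, the driver
 * calls napi_complete() and re-enables its rx interrupt; otherwise it returns
 * the full budget and stays on the poll list.  "my_enable_rx_irq" and "priv"
 * are hypothetical.
 *
 *	if (work < budget) {
 *		napi_complete(napi);
 *		my_enable_rx_irq(priv);
 *	}
 *	return work;
 */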
4166 
4167 /* must be called under rcu_read_lock(), as we dont take a reference */
4168 struct napi_struct *napi_by_id(unsigned int napi_id)
4169 {
4170 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4171 	struct napi_struct *napi;
4172 
4173 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4174 		if (napi->napi_id == napi_id)
4175 			return napi;
4176 
4177 	return NULL;
4178 }
4179 EXPORT_SYMBOL_GPL(napi_by_id);
4180 
4181 void napi_hash_add(struct napi_struct *napi)
4182 {
4183 	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4184 
4185 		spin_lock(&napi_hash_lock);
4186 
4187 		/* 0 is not a valid id, and we also skip an id that is already taken;
4188 		 * we expect both events to be extremely rare.
4189 		 */
4190 		napi->napi_id = 0;
4191 		while (!napi->napi_id) {
4192 			napi->napi_id = ++napi_gen_id;
4193 			if (napi_by_id(napi->napi_id))
4194 				napi->napi_id = 0;
4195 		}
4196 
4197 		hlist_add_head_rcu(&napi->napi_hash_node,
4198 			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4199 
4200 		spin_unlock(&napi_hash_lock);
4201 	}
4202 }
4203 EXPORT_SYMBOL_GPL(napi_hash_add);
4204 
4205 /* Warning: the caller is responsible for making sure an rcu grace period
4206  * is respected before freeing the memory containing @napi.
4207  */
4208 void napi_hash_del(struct napi_struct *napi)
4209 {
4210 	spin_lock(&napi_hash_lock);
4211 
4212 	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4213 		hlist_del_rcu(&napi->napi_hash_node);
4214 
4215 	spin_unlock(&napi_hash_lock);
4216 }
4217 EXPORT_SYMBOL_GPL(napi_hash_del);
4218 
4219 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4220 		    int (*poll)(struct napi_struct *, int), int weight)
4221 {
4222 	INIT_LIST_HEAD(&napi->poll_list);
4223 	napi->gro_count = 0;
4224 	napi->gro_list = NULL;
4225 	napi->skb = NULL;
4226 	napi->poll = poll;
4227 	if (weight > NAPI_POLL_WEIGHT)
4228 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4229 			    weight, dev->name);
4230 	napi->weight = weight;
4231 	list_add(&napi->dev_list, &dev->napi_list);
4232 	napi->dev = dev;
4233 #ifdef CONFIG_NETPOLL
4234 	spin_lock_init(&napi->poll_lock);
4235 	napi->poll_owner = -1;
4236 #endif
4237 	set_bit(NAPI_STATE_SCHED, &napi->state);
4238 }
4239 EXPORT_SYMBOL(netif_napi_add);
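
/* Illustrative sketch (not part of this file): drivers register their NAPI
 * context at probe time and pair it with napi_enable()/napi_disable() in
 * their ndo_open/ndo_stop paths.  "netdev", "priv" and "my_poll" are
 * hypothetical.
 *
 *	netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 *	...
 *	napi_enable(&priv->napi);	(in ndo_open, before enabling rx irqs)
 *	napi_disable(&priv->napi);	(in ndo_stop, before freeing rx resources)
 */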
4240 
4241 void netif_napi_del(struct napi_struct *napi)
4242 {
4243 	struct sk_buff *skb, *next;
4244 
4245 	list_del_init(&napi->dev_list);
4246 	napi_free_frags(napi);
4247 
4248 	for (skb = napi->gro_list; skb; skb = next) {
4249 		next = skb->next;
4250 		skb->next = NULL;
4251 		kfree_skb(skb);
4252 	}
4253 
4254 	napi->gro_list = NULL;
4255 	napi->gro_count = 0;
4256 }
4257 EXPORT_SYMBOL(netif_napi_del);
4258 
4259 static void net_rx_action(struct softirq_action *h)
4260 {
4261 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
4262 	unsigned long time_limit = jiffies + 2;
4263 	int budget = netdev_budget;
4264 	void *have;
4265 
4266 	local_irq_disable();
4267 
4268 	while (!list_empty(&sd->poll_list)) {
4269 		struct napi_struct *n;
4270 		int work, weight;
4271 
4272 		/* If the softirq window is exhausted then punt.
4273 		 * Allow this to run for 2 jiffies, which allows
4274 		 * an average latency of 1.5/HZ.
4275 		 */
4276 		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4277 			goto softnet_break;
4278 
4279 		local_irq_enable();
4280 
4281 		/* Even though interrupts have been re-enabled, this
4282 		 * access is safe because interrupts can only add new
4283 		 * entries to the tail of this list, and only ->poll()
4284 		 * calls can remove this head entry from the list.
4285 		 */
4286 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4287 
4288 		have = netpoll_poll_lock(n);
4289 
4290 		weight = n->weight;
4291 
4292 		/* This NAPI_STATE_SCHED test is for avoiding a race
4293 		 * with netpoll's poll_napi().  Only the entity which
4294 		 * obtains the lock and sees NAPI_STATE_SCHED set will
4295 		 * actually make the ->poll() call.  Therefore we avoid
4296 		 * accidentally calling ->poll() when NAPI is not scheduled.
4297 		 */
4298 		work = 0;
4299 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4300 			work = n->poll(n, weight);
4301 			trace_napi_poll(n);
4302 		}
4303 
4304 		WARN_ON_ONCE(work > weight);
4305 
4306 		budget -= work;
4307 
4308 		local_irq_disable();
4309 
4310 		/* Drivers must not modify the NAPI state if they
4311 		 * consume the entire weight.  In such cases this code
4312 		 * still "owns" the NAPI instance and therefore can
4313 		 * move the instance around on the list at-will.
4314 		 */
4315 		if (unlikely(work == weight)) {
4316 			if (unlikely(napi_disable_pending(n))) {
4317 				local_irq_enable();
4318 				napi_complete(n);
4319 				local_irq_disable();
4320 			} else {
4321 				if (n->gro_list) {
4322 					/* flush too old packets
4323 					 * If HZ < 1000, flush all packets.
4324 					 */
4325 					local_irq_enable();
4326 					napi_gro_flush(n, HZ >= 1000);
4327 					local_irq_disable();
4328 				}
4329 				list_move_tail(&n->poll_list, &sd->poll_list);
4330 			}
4331 		}
4332 
4333 		netpoll_poll_unlock(have);
4334 	}
4335 out:
4336 	net_rps_action_and_irq_enable(sd);
4337 
4338 #ifdef CONFIG_NET_DMA
4339 	/*
4340 	 * There may not be any more sk_buffs coming right now, so push
4341 	 * any pending DMA copies to hardware
4342 	 */
4343 	dma_issue_pending_all();
4344 #endif
4345 
4346 	return;
4347 
4348 softnet_break:
4349 	sd->time_squeeze++;
4350 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4351 	goto out;
4352 }
4353 
4354 struct netdev_adjacent {
4355 	struct net_device *dev;
4356 
4357 	/* upper master flag, there can only be one master device per list */
4358 	bool master;
4359 
4360 	/* counter for the number of times this device was added to us */
4361 	u16 ref_nr;
4362 
4363 	/* private field for the users */
4364 	void *private;
4365 
4366 	struct list_head list;
4367 	struct rcu_head rcu;
4368 };
4369 
4370 static struct netdev_adjacent *__netdev_find_adj_rcu(struct net_device *dev,
4371 						     struct net_device *adj_dev,
4372 						     struct list_head *adj_list)
4373 {
4374 	struct netdev_adjacent *adj;
4375 
4376 	list_for_each_entry_rcu(adj, adj_list, list) {
4377 		if (adj->dev == adj_dev)
4378 			return adj;
4379 	}
4380 	return NULL;
4381 }
4382 
4383 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4384 						 struct net_device *adj_dev,
4385 						 struct list_head *adj_list)
4386 {
4387 	struct netdev_adjacent *adj;
4388 
4389 	list_for_each_entry(adj, adj_list, list) {
4390 		if (adj->dev == adj_dev)
4391 			return adj;
4392 	}
4393 	return NULL;
4394 }
4395 
4396 /**
4397  * netdev_has_upper_dev - Check if device is linked to an upper device
4398  * @dev: device
4399  * @upper_dev: upper device to check
4400  *
4401  * Find out if a device is linked to the specified upper device and return true
4402  * in case it is. Note that this checks only the immediate upper device,
4403  * not through a complete stack of devices. The caller must hold the RTNL lock.
4404  */
4405 bool netdev_has_upper_dev(struct net_device *dev,
4406 			  struct net_device *upper_dev)
4407 {
4408 	ASSERT_RTNL();
4409 
4410 	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4411 }
4412 EXPORT_SYMBOL(netdev_has_upper_dev);
4413 
4414 /**
4415  * netdev_has_any_upper_dev - Check if device is linked to some device
4416  * @dev: device
4417  *
4418  * Find out if a device is linked to an upper device and return true in case
4419  * it is. The caller must hold the RTNL lock.
4420  */
4421 bool netdev_has_any_upper_dev(struct net_device *dev)
4422 {
4423 	ASSERT_RTNL();
4424 
4425 	return !list_empty(&dev->all_adj_list.upper);
4426 }
4427 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4428 
4429 /**
4430  * netdev_master_upper_dev_get - Get master upper device
4431  * @dev: device
4432  *
4433  * Find a master upper device and return pointer to it or NULL in case
4434  * it's not there. The caller must hold the RTNL lock.
4435  */
4436 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4437 {
4438 	struct netdev_adjacent *upper;
4439 
4440 	ASSERT_RTNL();
4441 
4442 	if (list_empty(&dev->adj_list.upper))
4443 		return NULL;
4444 
4445 	upper = list_first_entry(&dev->adj_list.upper,
4446 				 struct netdev_adjacent, list);
4447 	if (likely(upper->master))
4448 		return upper->dev;
4449 	return NULL;
4450 }
4451 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4452 
4453 void *netdev_adjacent_get_private(struct list_head *adj_list)
4454 {
4455 	struct netdev_adjacent *adj;
4456 
4457 	adj = list_entry(adj_list, struct netdev_adjacent, list);
4458 
4459 	return adj->private;
4460 }
4461 EXPORT_SYMBOL(netdev_adjacent_get_private);
4462 
4463 /**
4464  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4465  * @dev: device
4466  * @iter: list_head ** of the current position
4467  *
4468  * Gets the next device from the dev's upper list, starting from iter
4469  * position. The caller must hold RCU read lock.
4470  */
4471 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4472 						     struct list_head **iter)
4473 {
4474 	struct netdev_adjacent *upper;
4475 
4476 	WARN_ON_ONCE(!rcu_read_lock_held());
4477 
4478 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4479 
4480 	if (&upper->list == &dev->all_adj_list.upper)
4481 		return NULL;
4482 
4483 	*iter = &upper->list;
4484 
4485 	return upper->dev;
4486 }
4487 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
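
/* Illustrative sketch (not part of this file): walking the whole upper-device
 * graph with this iterator.  The iter starts at the list head itself; each
 * call advances it and the function returns NULL once the head is reached
 * again.  "handle_upper" is a hypothetical callback.
 *
 *	struct list_head *iter = &dev->all_adj_list.upper;
 *	struct net_device *upper;
 *
 *	rcu_read_lock();
 *	while ((upper = netdev_all_upper_get_next_dev_rcu(dev, &iter)))
 *		handle_upper(upper);
 *	rcu_read_unlock();
 */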
4488 
4489 /**
4490  * netdev_lower_get_next_private - Get the next ->private from the
4491  *				   lower neighbour list
4492  * @dev: device
4493  * @iter: list_head ** of the current position
4494  *
4495  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4496  * list, starting from the iter position. The caller must either hold the
4497  * RTNL lock or its own locking that guarantees that the neighbour lower
4498  * list will remain unchanged.
4499  */
4500 void *netdev_lower_get_next_private(struct net_device *dev,
4501 				    struct list_head **iter)
4502 {
4503 	struct netdev_adjacent *lower;
4504 
4505 	lower = list_entry(*iter, struct netdev_adjacent, list);
4506 
4507 	if (&lower->list == &dev->adj_list.lower)
4508 		return NULL;
4509 
4510 	if (iter)
4511 		*iter = lower->list.next;
4512 
4513 	return lower->private;
4514 }
4515 EXPORT_SYMBOL(netdev_lower_get_next_private);
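
/* Illustrative sketch (not part of this file): iterating over the ->private
 * pointers of all directly linked lower devices.  Note that, unlike the _rcu
 * variant below, iter here starts at the first element rather than at the
 * list head.  "handle_private" is a hypothetical callback; the caller must
 * hold the RTNL lock (or equivalent) as documented above.
 *
 *	struct list_head *iter = dev->adj_list.lower.next;
 *	void *priv;
 *
 *	while ((priv = netdev_lower_get_next_private(dev, &iter)))
 *		handle_private(priv);
 */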
4516 
4517 /**
4518  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4519  *				       lower neighbour list, RCU
4520  *				       variant
4521  * @dev: device
4522  * @iter: list_head ** of the current position
4523  *
4524  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4525  * list, starting from iter position. The caller must hold RCU read lock.
4526  */
4527 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4528 					struct list_head **iter)
4529 {
4530 	struct netdev_adjacent *lower;
4531 
4532 	WARN_ON_ONCE(!rcu_read_lock_held());
4533 
4534 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4535 
4536 	if (&lower->list == &dev->adj_list.lower)
4537 		return NULL;
4538 
4539 	if (iter)
4540 		*iter = &lower->list;
4541 
4542 	return lower->private;
4543 }
4544 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4545 
4546 /**
4547  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4548  *				       lower neighbour list, RCU
4549  *				       variant
4550  * @dev: device
4551  *
4552  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4553  * list. The caller must hold RCU read lock.
4554  */
4555 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4556 {
4557 	struct netdev_adjacent *lower;
4558 
4559 	lower = list_first_or_null_rcu(&dev->adj_list.lower,
4560 			struct netdev_adjacent, list);
4561 	if (lower)
4562 		return lower->private;
4563 	return NULL;
4564 }
4565 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4566 
4567 /**
4568  * netdev_master_upper_dev_get_rcu - Get master upper device
4569  * @dev: device
4570  *
4571  * Find a master upper device and return pointer to it or NULL in case
4572  * it's not there. The caller must hold the RCU read lock.
4573  */
4574 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4575 {
4576 	struct netdev_adjacent *upper;
4577 
4578 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
4579 				       struct netdev_adjacent, list);
4580 	if (upper && likely(upper->master))
4581 		return upper->dev;
4582 	return NULL;
4583 }
4584 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4585 
4586 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4587 					struct net_device *adj_dev,
4588 					struct list_head *dev_list,
4589 					void *private, bool master)
4590 {
4591 	struct netdev_adjacent *adj;
4592 	char linkname[IFNAMSIZ+7];
4593 	int ret;
4594 
4595 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4596 
4597 	if (adj) {
4598 		adj->ref_nr++;
4599 		return 0;
4600 	}
4601 
4602 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4603 	if (!adj)
4604 		return -ENOMEM;
4605 
4606 	adj->dev = adj_dev;
4607 	adj->master = master;
4608 	adj->ref_nr = 1;
4609 	adj->private = private;
4610 	dev_hold(adj_dev);
4611 
4612 	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4613 		 adj_dev->name, dev->name, adj_dev->name);
4614 
4615 	if (dev_list == &dev->adj_list.lower) {
4616 		sprintf(linkname, "lower_%s", adj_dev->name);
4617 		ret = sysfs_create_link(&(dev->dev.kobj),
4618 					&(adj_dev->dev.kobj), linkname);
4619 		if (ret)
4620 			goto free_adj;
4621 	} else if (dev_list == &dev->adj_list.upper) {
4622 		sprintf(linkname, "upper_%s", adj_dev->name);
4623 		ret = sysfs_create_link(&(dev->dev.kobj),
4624 					&(adj_dev->dev.kobj), linkname);
4625 		if (ret)
4626 			goto free_adj;
4627 	}
4628 
4629 	/* Ensure that master link is always the first item in list. */
4630 	if (master) {
4631 		ret = sysfs_create_link(&(dev->dev.kobj),
4632 					&(adj_dev->dev.kobj), "master");
4633 		if (ret)
4634 			goto remove_symlinks;
4635 
4636 		list_add_rcu(&adj->list, dev_list);
4637 	} else {
4638 		list_add_tail_rcu(&adj->list, dev_list);
4639 	}
4640 
4641 	return 0;
4642 
4643 remove_symlinks:
4644 	if (dev_list == &dev->adj_list.lower) {
4645 		sprintf(linkname, "lower_%s", adj_dev->name);
4646 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4647 	} else if (dev_list == &dev->adj_list.upper) {
4648 		sprintf(linkname, "upper_%s", adj_dev->name);
4649 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4650 	}
4651 
4652 free_adj:
4653 	kfree(adj);
4654 	dev_put(adj_dev);
4655 
4656 	return ret;
4657 }
4658 
4659 void __netdev_adjacent_dev_remove(struct net_device *dev,
4660 				  struct net_device *adj_dev,
4661 				  struct list_head *dev_list)
4662 {
4663 	struct netdev_adjacent *adj;
4664 	char linkname[IFNAMSIZ+7];
4665 
4666 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4667 
4668 	if (!adj) {
4669 		pr_err("tried to remove device %s from %s\n",
4670 		       dev->name, adj_dev->name);
4671 		BUG();
4672 	}
4673 
4674 	if (adj->ref_nr > 1) {
4675 		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4676 			 adj->ref_nr-1);
4677 		adj->ref_nr--;
4678 		return;
4679 	}
4680 
4681 	if (adj->master)
4682 		sysfs_remove_link(&(dev->dev.kobj), "master");
4683 
4684 	if (dev_list == &dev->adj_list.lower) {
4685 		sprintf(linkname, "lower_%s", adj_dev->name);
4686 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4687 	} else if (dev_list == &dev->adj_list.upper) {
4688 		sprintf(linkname, "upper_%s", adj_dev->name);
4689 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4690 	}
4691 
4692 	list_del_rcu(&adj->list);
4693 	pr_debug("dev_put for %s, because link removed from %s to %s\n",
4694 		 adj_dev->name, dev->name, adj_dev->name);
4695 	dev_put(adj_dev);
4696 	kfree_rcu(adj, rcu);
4697 }
4698 
4699 int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4700 				     struct net_device *upper_dev,
4701 				     struct list_head *up_list,
4702 				     struct list_head *down_list,
4703 				     void *private, bool master)
4704 {
4705 	int ret;
4706 
4707 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4708 					   master);
4709 	if (ret)
4710 		return ret;
4711 
4712 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4713 					   false);
4714 	if (ret) {
4715 		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4716 		return ret;
4717 	}
4718 
4719 	return 0;
4720 }
4721 
4722 int __netdev_adjacent_dev_link(struct net_device *dev,
4723 			       struct net_device *upper_dev)
4724 {
4725 	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4726 						&dev->all_adj_list.upper,
4727 						&upper_dev->all_adj_list.lower,
4728 						NULL, false);
4729 }
4730 
4731 void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
4732 					struct net_device *upper_dev,
4733 					struct list_head *up_list,
4734 					struct list_head *down_list)
4735 {
4736 	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4737 	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
4738 }
4739 
4740 void __netdev_adjacent_dev_unlink(struct net_device *dev,
4741 				  struct net_device *upper_dev)
4742 {
4743 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4744 					   &dev->all_adj_list.upper,
4745 					   &upper_dev->all_adj_list.lower);
4746 }
4747 
4748 int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4749 					 struct net_device *upper_dev,
4750 					 void *private, bool master)
4751 {
4752 	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4753 
4754 	if (ret)
4755 		return ret;
4756 
4757 	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4758 					       &dev->adj_list.upper,
4759 					       &upper_dev->adj_list.lower,
4760 					       private, master);
4761 	if (ret) {
4762 		__netdev_adjacent_dev_unlink(dev, upper_dev);
4763 		return ret;
4764 	}
4765 
4766 	return 0;
4767 }
4768 
4769 void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4770 					    struct net_device *upper_dev)
4771 {
4772 	__netdev_adjacent_dev_unlink(dev, upper_dev);
4773 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4774 					   &dev->adj_list.upper,
4775 					   &upper_dev->adj_list.lower);
4776 }
4777 
4778 static int __netdev_upper_dev_link(struct net_device *dev,
4779 				   struct net_device *upper_dev, bool master,
4780 				   void *private)
4781 {
4782 	struct netdev_adjacent *i, *j, *to_i, *to_j;
4783 	int ret = 0;
4784 
4785 	ASSERT_RTNL();
4786 
4787 	if (dev == upper_dev)
4788 		return -EBUSY;
4789 
4790 	/* To prevent loops, check if dev is not upper device to upper_dev. */
4791 	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
4792 		return -EBUSY;
4793 
4794 	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
4795 		return -EEXIST;
4796 
4797 	if (master && netdev_master_upper_dev_get(dev))
4798 		return -EBUSY;
4799 
4800 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
4801 						   master);
4802 	if (ret)
4803 		return ret;
4804 
4805 	/* Now that we linked these devs, make all the upper_dev's
4806 	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
4807 	 * vice versa, and don't forget the devices themselves. All of these
4808 	 * links are non-neighbours.
4809 	 */
4810 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4811 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
4812 			pr_debug("Interlinking %s with %s, non-neighbour\n",
4813 				 i->dev->name, j->dev->name);
4814 			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
4815 			if (ret)
4816 				goto rollback_mesh;
4817 		}
4818 	}
4819 
4820 	/* add dev to every upper_dev's upper device */
4821 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
4822 		pr_debug("linking %s's upper device %s with %s\n",
4823 			 upper_dev->name, i->dev->name, dev->name);
4824 		ret = __netdev_adjacent_dev_link(dev, i->dev);
4825 		if (ret)
4826 			goto rollback_upper_mesh;
4827 	}
4828 
4829 	/* add upper_dev to every dev's lower device */
4830 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4831 		pr_debug("linking %s's lower device %s with %s\n", dev->name,
4832 			 i->dev->name, upper_dev->name);
4833 		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
4834 		if (ret)
4835 			goto rollback_lower_mesh;
4836 	}
4837 
4838 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4839 	return 0;
4840 
4841 rollback_lower_mesh:
4842 	to_i = i;
4843 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4844 		if (i == to_i)
4845 			break;
4846 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
4847 	}
4848 
4849 	i = NULL;
4850 
4851 rollback_upper_mesh:
4852 	to_i = i;
4853 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
4854 		if (i == to_i)
4855 			break;
4856 		__netdev_adjacent_dev_unlink(dev, i->dev);
4857 	}
4858 
4859 	i = j = NULL;
4860 
4861 rollback_mesh:
4862 	to_i = i;
4863 	to_j = j;
4864 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4865 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
4866 			if (i == to_i && j == to_j)
4867 				break;
4868 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
4869 		}
4870 		if (i == to_i)
4871 			break;
4872 	}
4873 
4874 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
4875 
4876 	return ret;
4877 }
4878 
4879 /**
4880  * netdev_upper_dev_link - Add a link to the upper device
4881  * @dev: device
4882  * @upper_dev: new upper device
4883  *
4884  * Adds a link to device which is upper to this one. The caller must hold
4885  * the RTNL lock. On a failure a negative errno code is returned.
4886  * On success the reference counts are adjusted and the function
4887  * returns zero.
4888  */
4889 int netdev_upper_dev_link(struct net_device *dev,
4890 			  struct net_device *upper_dev)
4891 {
4892 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
4893 }
4894 EXPORT_SYMBOL(netdev_upper_dev_link);
4895 
4896 /**
4897  * netdev_master_upper_dev_link - Add a master link to the upper device
4898  * @dev: device
4899  * @upper_dev: new upper device
4900  *
4901  * Adds a link to device which is upper to this one. In this case, only
4902  * one master upper device can be linked, although other non-master devices
4903  * might be linked as well. The caller must hold the RTNL lock.
4904  * On a failure a negative errno code is returned. On success the reference
4905  * counts are adjusted and the function returns zero.
4906  */
4907 int netdev_master_upper_dev_link(struct net_device *dev,
4908 				 struct net_device *upper_dev)
4909 {
4910 	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
4911 }
4912 EXPORT_SYMBOL(netdev_master_upper_dev_link);
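
/* Illustrative sketch (not part of this file): how a bonding-style driver
 * would attach and later detach a slave, under the RTNL lock.  "bond_dev" and
 * "slave_dev" are hypothetical net_device pointers already held by the caller.
 *
 *	ASSERT_RTNL();
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev);
 *	if (err)
 *		goto unwind;
 *	...
 *	netdev_upper_dev_unlink(slave_dev, bond_dev);
 */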
4913 
4914 int netdev_master_upper_dev_link_private(struct net_device *dev,
4915 					 struct net_device *upper_dev,
4916 					 void *private)
4917 {
4918 	return __netdev_upper_dev_link(dev, upper_dev, true, private);
4919 }
4920 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
4921 
4922 /**
4923  * netdev_upper_dev_unlink - Removes a link to upper device
4924  * @dev: device
4925  * @upper_dev: new upper device
4926  *
4927  * Removes a link to device which is upper to this one. The caller must hold
4928  * the RTNL lock.
4929  */
4930 void netdev_upper_dev_unlink(struct net_device *dev,
4931 			     struct net_device *upper_dev)
4932 {
4933 	struct netdev_adjacent *i, *j;
4934 	ASSERT_RTNL();
4935 
4936 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
4937 
4938 	/* Here is the tricky part. We must remove all dev's lower
4939 	 * devices from all upper_dev's upper devices and vice
4940 	 * versa, to maintain the graph relationship.
4941 	 */
4942 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
4943 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
4944 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
4945 
4946 	/* also remove the devices themselves from the lower/upper device
4947 	 * lists
4948 	 */
4949 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
4950 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
4951 
4952 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
4953 		__netdev_adjacent_dev_unlink(dev, i->dev);
4954 
4955 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4956 }
4957 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4958 
4959 void *netdev_lower_dev_get_private_rcu(struct net_device *dev,
4960 				       struct net_device *lower_dev)
4961 {
4962 	struct netdev_adjacent *lower;
4963 
4964 	if (!lower_dev)
4965 		return NULL;
4966 	lower = __netdev_find_adj_rcu(dev, lower_dev, &dev->adj_list.lower);
4967 	if (!lower)
4968 		return NULL;
4969 
4970 	return lower->private;
4971 }
4972 EXPORT_SYMBOL(netdev_lower_dev_get_private_rcu);
4973 
4974 void *netdev_lower_dev_get_private(struct net_device *dev,
4975 				   struct net_device *lower_dev)
4976 {
4977 	struct netdev_adjacent *lower;
4978 
4979 	if (!lower_dev)
4980 		return NULL;
4981 	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
4982 	if (!lower)
4983 		return NULL;
4984 
4985 	return lower->private;
4986 }
4987 EXPORT_SYMBOL(netdev_lower_dev_get_private);
4988 
4989 static void dev_change_rx_flags(struct net_device *dev, int flags)
4990 {
4991 	const struct net_device_ops *ops = dev->netdev_ops;
4992 
4993 	if (ops->ndo_change_rx_flags)
4994 		ops->ndo_change_rx_flags(dev, flags);
4995 }
4996 
4997 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
4998 {
4999 	unsigned int old_flags = dev->flags;
5000 	kuid_t uid;
5001 	kgid_t gid;
5002 
5003 	ASSERT_RTNL();
5004 
5005 	dev->flags |= IFF_PROMISC;
5006 	dev->promiscuity += inc;
5007 	if (dev->promiscuity == 0) {
5008 		/*
5009 		 * Avoid overflow.
5010 		 * If inc causes overflow, untouch promisc and return error.
5011 		 */
5012 		if (inc < 0)
5013 			dev->flags &= ~IFF_PROMISC;
5014 		else {
5015 			dev->promiscuity -= inc;
5016 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5017 				dev->name);
5018 			return -EOVERFLOW;
5019 		}
5020 	}
5021 	if (dev->flags != old_flags) {
5022 		pr_info("device %s %s promiscuous mode\n",
5023 			dev->name,
5024 			dev->flags & IFF_PROMISC ? "entered" : "left");
5025 		if (audit_enabled) {
5026 			current_uid_gid(&uid, &gid);
5027 			audit_log(current->audit_context, GFP_ATOMIC,
5028 				AUDIT_ANOM_PROMISCUOUS,
5029 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5030 				dev->name, (dev->flags & IFF_PROMISC),
5031 				(old_flags & IFF_PROMISC),
5032 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
5033 				from_kuid(&init_user_ns, uid),
5034 				from_kgid(&init_user_ns, gid),
5035 				audit_get_sessionid(current));
5036 		}
5037 
5038 		dev_change_rx_flags(dev, IFF_PROMISC);
5039 	}
5040 	if (notify)
5041 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
5042 	return 0;
5043 }
5044 
5045 /**
5046  *	dev_set_promiscuity	- update promiscuity count on a device
5047  *	@dev: device
5048  *	@inc: modifier
5049  *
5050  *	Add or remove promiscuity from a device. While the count in the device
5051  *	remains above zero the interface remains promiscuous. Once it hits zero
5052  *	the device reverts back to normal filtering operation. A negative inc
5053  *	value is used to drop promiscuity on the device.
5054  *	Return 0 if successful or a negative errno code on error.
5055  */
5056 int dev_set_promiscuity(struct net_device *dev, int inc)
5057 {
5058 	unsigned int old_flags = dev->flags;
5059 	int err;
5060 
5061 	err = __dev_set_promiscuity(dev, inc, true);
5062 	if (err < 0)
5063 		return err;
5064 	if (dev->flags != old_flags)
5065 		dev_set_rx_mode(dev);
5066 	return err;
5067 }
5068 EXPORT_SYMBOL(dev_set_promiscuity);
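
/* Illustrative sketch (not part of this file): a packet-capture style user
 * takes a promiscuity reference while active and drops it when done, both
 * under the RTNL lock.  The count is per-caller, so nested users simply stack
 * their increments.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	(enter promiscuous mode)
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);		(drop our reference)
 *	rtnl_unlock();
 */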
5069 
5070 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5071 {
5072 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5073 
5074 	ASSERT_RTNL();
5075 
5076 	dev->flags |= IFF_ALLMULTI;
5077 	dev->allmulti += inc;
5078 	if (dev->allmulti == 0) {
5079 		/*
5080 		 * Avoid overflow.
5081 		 * If inc causes overflow, untouch allmulti and return error.
5082 		 */
5083 		if (inc < 0)
5084 			dev->flags &= ~IFF_ALLMULTI;
5085 		else {
5086 			dev->allmulti -= inc;
5087 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5088 				dev->name);
5089 			return -EOVERFLOW;
5090 		}
5091 	}
5092 	if (dev->flags ^ old_flags) {
5093 		dev_change_rx_flags(dev, IFF_ALLMULTI);
5094 		dev_set_rx_mode(dev);
5095 		if (notify)
5096 			__dev_notify_flags(dev, old_flags,
5097 					   dev->gflags ^ old_gflags);
5098 	}
5099 	return 0;
5100 }
5101 
5102 /**
5103  *	dev_set_allmulti	- update allmulti count on a device
5104  *	@dev: device
5105  *	@inc: modifier
5106  *
5107  *	Add or remove reception of all multicast frames to a device. While the
5108  *	count in the device remains above zero the interface remains listening
5109  *	to all multicast frames. Once it hits zero the device reverts back to normal
5110  *	filtering operation. A negative @inc value is used to drop the counter
5111  *	when releasing a resource needing all multicasts.
5112  *	Return 0 if successful or a negative errno code on error.
5113  */
5114 
5115 int dev_set_allmulti(struct net_device *dev, int inc)
5116 {
5117 	return __dev_set_allmulti(dev, inc, true);
5118 }
5119 EXPORT_SYMBOL(dev_set_allmulti);
5120 
5121 /*
5122  *	Upload unicast and multicast address lists to device and
5123  *	configure RX filtering. When the device doesn't support unicast
5124  *	filtering it is put in promiscuous mode while unicast addresses
5125  *	are present.
5126  */
5127 void __dev_set_rx_mode(struct net_device *dev)
5128 {
5129 	const struct net_device_ops *ops = dev->netdev_ops;
5130 
5131 	/* dev_open will call this function so the list will stay sane. */
5132 	if (!(dev->flags&IFF_UP))
5133 		return;
5134 
5135 	if (!netif_device_present(dev))
5136 		return;
5137 
5138 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5139 		/* Unicast address changes may only happen under the rtnl,
5140 		 * therefore calling __dev_set_promiscuity here is safe.
5141 		 */
5142 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5143 			__dev_set_promiscuity(dev, 1, false);
5144 			dev->uc_promisc = true;
5145 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5146 			__dev_set_promiscuity(dev, -1, false);
5147 			dev->uc_promisc = false;
5148 		}
5149 	}
5150 
5151 	if (ops->ndo_set_rx_mode)
5152 		ops->ndo_set_rx_mode(dev);
5153 }
5154 
5155 void dev_set_rx_mode(struct net_device *dev)
5156 {
5157 	netif_addr_lock_bh(dev);
5158 	__dev_set_rx_mode(dev);
5159 	netif_addr_unlock_bh(dev);
5160 }
5161 
5162 /**
5163  *	dev_get_flags - get flags reported to userspace
5164  *	@dev: device
5165  *
5166  *	Get the combination of flag bits exported through APIs to userspace.
5167  */
5168 unsigned int dev_get_flags(const struct net_device *dev)
5169 {
5170 	unsigned int flags;
5171 
5172 	flags = (dev->flags & ~(IFF_PROMISC |
5173 				IFF_ALLMULTI |
5174 				IFF_RUNNING |
5175 				IFF_LOWER_UP |
5176 				IFF_DORMANT)) |
5177 		(dev->gflags & (IFF_PROMISC |
5178 				IFF_ALLMULTI));
5179 
5180 	if (netif_running(dev)) {
5181 		if (netif_oper_up(dev))
5182 			flags |= IFF_RUNNING;
5183 		if (netif_carrier_ok(dev))
5184 			flags |= IFF_LOWER_UP;
5185 		if (netif_dormant(dev))
5186 			flags |= IFF_DORMANT;
5187 	}
5188 
5189 	return flags;
5190 }
5191 EXPORT_SYMBOL(dev_get_flags);
5192 
5193 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5194 {
5195 	unsigned int old_flags = dev->flags;
5196 	int ret;
5197 
5198 	ASSERT_RTNL();
5199 
5200 	/*
5201 	 *	Set the flags on our device.
5202 	 */
5203 
5204 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5205 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5206 			       IFF_AUTOMEDIA)) |
5207 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5208 				    IFF_ALLMULTI));
5209 
5210 	/*
5211 	 *	Load in the correct multicast list now that the flags have changed.
5212 	 */
5213 
5214 	if ((old_flags ^ flags) & IFF_MULTICAST)
5215 		dev_change_rx_flags(dev, IFF_MULTICAST);
5216 
5217 	dev_set_rx_mode(dev);
5218 
5219 	/*
5220 	 *	Have we downed the interface? We handle IFF_UP ourselves
5221 	 *	according to user attempts to set it, rather than blindly
5222 	 *	setting it.
5223 	 */
5224 
5225 	ret = 0;
5226 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
5227 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5228 
5229 		if (!ret)
5230 			dev_set_rx_mode(dev);
5231 	}
5232 
5233 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
5234 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
5235 		unsigned int old_flags = dev->flags;
5236 
5237 		dev->gflags ^= IFF_PROMISC;
5238 
5239 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
5240 			if (dev->flags != old_flags)
5241 				dev_set_rx_mode(dev);
5242 	}
5243 
5244 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5245 	   is important. Some (broken) drivers set IFF_PROMISC when
5246 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
5247 	 */
5248 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5249 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5250 
5251 		dev->gflags ^= IFF_ALLMULTI;
5252 		__dev_set_allmulti(dev, inc, false);
5253 	}
5254 
5255 	return ret;
5256 }
5257 
5258 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5259 			unsigned int gchanges)
5260 {
5261 	unsigned int changes = dev->flags ^ old_flags;
5262 
5263 	if (gchanges)
5264 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5265 
5266 	if (changes & IFF_UP) {
5267 		if (dev->flags & IFF_UP)
5268 			call_netdevice_notifiers(NETDEV_UP, dev);
5269 		else
5270 			call_netdevice_notifiers(NETDEV_DOWN, dev);
5271 	}
5272 
5273 	if (dev->flags & IFF_UP &&
5274 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5275 		struct netdev_notifier_change_info change_info;
5276 
5277 		change_info.flags_changed = changes;
5278 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5279 					      &change_info.info);
5280 	}
5281 }
5282 
5283 /**
5284  *	dev_change_flags - change device settings
5285  *	@dev: device
5286  *	@flags: device state flags
5287  *
5288  *	Change settings on a device based on the supplied state flags. The
5289  *	flags are in the format exported to userspace.
5290  */
5291 int dev_change_flags(struct net_device *dev, unsigned int flags)
5292 {
5293 	int ret;
5294 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5295 
5296 	ret = __dev_change_flags(dev, flags);
5297 	if (ret < 0)
5298 		return ret;
5299 
5300 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5301 	__dev_notify_flags(dev, old_flags, changes);
5302 	return ret;
5303 }
5304 EXPORT_SYMBOL(dev_change_flags);
5305 
5306 /**
5307  *	dev_set_mtu - Change maximum transfer unit
5308  *	@dev: device
5309  *	@new_mtu: new transfer unit
5310  *
5311  *	Change the maximum transfer size of the network device.
5312  */
5313 int dev_set_mtu(struct net_device *dev, int new_mtu)
5314 {
5315 	const struct net_device_ops *ops = dev->netdev_ops;
5316 	int err;
5317 
5318 	if (new_mtu == dev->mtu)
5319 		return 0;
5320 
5321 	/*	MTU must be positive.	 */
5322 	if (new_mtu < 0)
5323 		return -EINVAL;
5324 
5325 	if (!netif_device_present(dev))
5326 		return -ENODEV;
5327 
5328 	err = 0;
5329 	if (ops->ndo_change_mtu)
5330 		err = ops->ndo_change_mtu(dev, new_mtu);
5331 	else
5332 		dev->mtu = new_mtu;
5333 
5334 	if (!err)
5335 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5336 	return err;
5337 }
5338 EXPORT_SYMBOL(dev_set_mtu);
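
/* Illustrative sketch (not part of this file): changing the MTU from a kernel
 * caller; by convention this is done under the RTNL lock, as in the ioctl
 * path.  A negative return value is an errno from the driver's
 * ndo_change_mtu (or from the checks above); on success NETDEV_CHANGEMTU has
 * already been signalled.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 */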
5339 
5340 /**
5341  *	dev_set_group - Change group this device belongs to
5342  *	@dev: device
5343  *	@new_group: group this device should belong to
5344  */
5345 void dev_set_group(struct net_device *dev, int new_group)
5346 {
5347 	dev->group = new_group;
5348 }
5349 EXPORT_SYMBOL(dev_set_group);
5350 
5351 /**
5352  *	dev_set_mac_address - Change Media Access Control Address
5353  *	@dev: device
5354  *	@sa: new address
5355  *
5356  *	Change the hardware (MAC) address of the device
5357  */
5358 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5359 {
5360 	const struct net_device_ops *ops = dev->netdev_ops;
5361 	int err;
5362 
5363 	if (!ops->ndo_set_mac_address)
5364 		return -EOPNOTSUPP;
5365 	if (sa->sa_family != dev->type)
5366 		return -EINVAL;
5367 	if (!netif_device_present(dev))
5368 		return -ENODEV;
5369 	err = ops->ndo_set_mac_address(dev, sa);
5370 	if (err)
5371 		return err;
5372 	dev->addr_assign_type = NET_ADDR_SET;
5373 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5374 	add_device_randomness(dev->dev_addr, dev->addr_len);
5375 	return 0;
5376 }
5377 EXPORT_SYMBOL(dev_set_mac_address);
5378 
5379 /**
5380  *	dev_change_carrier - Change device carrier
5381  *	@dev: device
5382  *	@new_carrier: new value
5383  *
5384  *	Change device carrier
5385  */
5386 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5387 {
5388 	const struct net_device_ops *ops = dev->netdev_ops;
5389 
5390 	if (!ops->ndo_change_carrier)
5391 		return -EOPNOTSUPP;
5392 	if (!netif_device_present(dev))
5393 		return -ENODEV;
5394 	return ops->ndo_change_carrier(dev, new_carrier);
5395 }
5396 EXPORT_SYMBOL(dev_change_carrier);
5397 
5398 /**
5399  *	dev_get_phys_port_id - Get device physical port ID
5400  *	@dev: device
5401  *	@ppid: port ID
5402  *
5403  *	Get device physical port ID
5404  */
5405 int dev_get_phys_port_id(struct net_device *dev,
5406 			 struct netdev_phys_port_id *ppid)
5407 {
5408 	const struct net_device_ops *ops = dev->netdev_ops;
5409 
5410 	if (!ops->ndo_get_phys_port_id)
5411 		return -EOPNOTSUPP;
5412 	return ops->ndo_get_phys_port_id(dev, ppid);
5413 }
5414 EXPORT_SYMBOL(dev_get_phys_port_id);
5415 
5416 /**
5417  *	dev_new_index	-	allocate an ifindex
5418  *	@net: the applicable net namespace
5419  *
5420  *	Returns a suitable unique value for a new device interface
5421  *	number.  The caller must hold the rtnl semaphore or the
5422  *	dev_base_lock to be sure it remains unique.
5423  */
5424 static int dev_new_index(struct net *net)
5425 {
5426 	int ifindex = net->ifindex;
5427 	for (;;) {
5428 		if (++ifindex <= 0)
5429 			ifindex = 1;
5430 		if (!__dev_get_by_index(net, ifindex))
5431 			return net->ifindex = ifindex;
5432 	}
5433 }
5434 
5435 /* Delayed registration/unregistration */
5436 static LIST_HEAD(net_todo_list);
5437 static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5438 
5439 static void net_set_todo(struct net_device *dev)
5440 {
5441 	list_add_tail(&dev->todo_list, &net_todo_list);
5442 	dev_net(dev)->dev_unreg_count++;
5443 }
5444 
5445 static void rollback_registered_many(struct list_head *head)
5446 {
5447 	struct net_device *dev, *tmp;
5448 	LIST_HEAD(close_head);
5449 
5450 	BUG_ON(dev_boot_phase);
5451 	ASSERT_RTNL();
5452 
5453 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5454 		/* Some devices call this without ever having been registered,
5455 		 * as part of initialization unwind. Remove those
5456 		 * devices and proceed with the remaining.
5457 		 */
5458 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5459 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5460 				 dev->name, dev);
5461 
5462 			WARN_ON(1);
5463 			list_del(&dev->unreg_list);
5464 			continue;
5465 		}
5466 		dev->dismantle = true;
5467 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5468 	}
5469 
5470 	/* If device is running, close it first. */
5471 	list_for_each_entry(dev, head, unreg_list)
5472 		list_add_tail(&dev->close_list, &close_head);
5473 	dev_close_many(&close_head);
5474 
5475 	list_for_each_entry(dev, head, unreg_list) {
5476 		/* And unlink it from device chain. */
5477 		unlist_netdevice(dev);
5478 
5479 		dev->reg_state = NETREG_UNREGISTERING;
5480 	}
5481 
5482 	synchronize_net();
5483 
5484 	list_for_each_entry(dev, head, unreg_list) {
5485 		/* Shutdown queueing discipline. */
5486 		dev_shutdown(dev);
5487 
5488 
5489 		/* Notify protocols that we are about to destroy
5490 		   this device. They should clean up all their state.
5491 		*/
5492 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5493 
5494 		if (!dev->rtnl_link_ops ||
5495 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5496 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5497 
5498 		/*
5499 		 *	Flush the unicast and multicast chains
5500 		 */
5501 		dev_uc_flush(dev);
5502 		dev_mc_flush(dev);
5503 
5504 		if (dev->netdev_ops->ndo_uninit)
5505 			dev->netdev_ops->ndo_uninit(dev);
5506 
5507 		/* The notifier chain MUST have detached all upper devices from us. */
5508 		WARN_ON(netdev_has_any_upper_dev(dev));
5509 
5510 		/* Remove entries from kobject tree */
5511 		netdev_unregister_kobject(dev);
5512 #ifdef CONFIG_XPS
5513 		/* Remove XPS queueing entries */
5514 		netif_reset_xps_queues_gt(dev, 0);
5515 #endif
5516 	}
5517 
5518 	synchronize_net();
5519 
5520 	list_for_each_entry(dev, head, unreg_list)
5521 		dev_put(dev);
5522 }
5523 
5524 static void rollback_registered(struct net_device *dev)
5525 {
5526 	LIST_HEAD(single);
5527 
5528 	list_add(&dev->unreg_list, &single);
5529 	rollback_registered_many(&single);
5530 	list_del(&single);
5531 }
5532 
5533 static netdev_features_t netdev_fix_features(struct net_device *dev,
5534 	netdev_features_t features)
5535 {
5536 	/* Fix illegal checksum combinations */
5537 	if ((features & NETIF_F_HW_CSUM) &&
5538 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5539 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5540 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5541 	}
5542 
5543 	/* TSO requires that SG is present as well. */
5544 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5545 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5546 		features &= ~NETIF_F_ALL_TSO;
5547 	}
5548 
5549 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5550 					!(features & NETIF_F_IP_CSUM)) {
5551 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5552 		features &= ~NETIF_F_TSO;
5553 		features &= ~NETIF_F_TSO_ECN;
5554 	}
5555 
5556 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5557 					 !(features & NETIF_F_IPV6_CSUM)) {
5558 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5559 		features &= ~NETIF_F_TSO6;
5560 	}
5561 
5562 	/* TSO ECN requires that TSO is present as well. */
5563 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5564 		features &= ~NETIF_F_TSO_ECN;
5565 
5566 	/* Software GSO depends on SG. */
5567 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5568 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5569 		features &= ~NETIF_F_GSO;
5570 	}
5571 
5572 	/* UFO needs SG and checksumming */
5573 	if (features & NETIF_F_UFO) {
5574 		/* maybe split UFO into V4 and V6? */
5575 		if (!((features & NETIF_F_GEN_CSUM) ||
5576 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5577 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5578 			netdev_dbg(dev,
5579 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5580 			features &= ~NETIF_F_UFO;
5581 		}
5582 
5583 		if (!(features & NETIF_F_SG)) {
5584 			netdev_dbg(dev,
5585 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5586 			features &= ~NETIF_F_UFO;
5587 		}
5588 	}
5589 
5590 	return features;
5591 }
5592 
5593 int __netdev_update_features(struct net_device *dev)
5594 {
5595 	netdev_features_t features;
5596 	int err = 0;
5597 
5598 	ASSERT_RTNL();
5599 
5600 	features = netdev_get_wanted_features(dev);
5601 
5602 	if (dev->netdev_ops->ndo_fix_features)
5603 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5604 
5605 	/* driver might be less strict about feature dependencies */
5606 	features = netdev_fix_features(dev, features);
5607 
5608 	if (dev->features == features)
5609 		return 0;
5610 
5611 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5612 		&dev->features, &features);
5613 
5614 	if (dev->netdev_ops->ndo_set_features)
5615 		err = dev->netdev_ops->ndo_set_features(dev, features);
5616 
5617 	if (unlikely(err < 0)) {
5618 		netdev_err(dev,
5619 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5620 			err, &features, &dev->features);
5621 		return -1;
5622 	}
5623 
5624 	if (!err)
5625 		dev->features = features;
5626 
5627 	return 1;
5628 }
5629 
5630 /**
5631  *	netdev_update_features - recalculate device features
5632  *	@dev: the device to check
5633  *
5634  *	Recalculate dev->features set and send notifications if it
5635  *	has changed. Should be called after driver or hardware dependent
5636  *	conditions might have changed that influence the features.
5637  */
5638 void netdev_update_features(struct net_device *dev)
5639 {
5640 	if (__netdev_update_features(dev))
5641 		netdev_features_change(dev);
5642 }
5643 EXPORT_SYMBOL(netdev_update_features);
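
/* Illustrative sketch (not part of this file): a driver whose offload
 * capabilities depend on runtime state (e.g. a changed MTU or firmware
 * setting) re-evaluates dev->features by calling this under the RTNL lock.
 * "my_fixup_hw_state", "priv" and "netdev" are hypothetical.
 *
 *	ASSERT_RTNL();
 *	my_fixup_hw_state(priv);
 *	netdev_update_features(netdev);
 */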
5644 
5645 /**
5646  *	netdev_change_features - recalculate device features
5647  *	@dev: the device to check
5648  *
5649  *	Recalculate dev->features set and send notifications even
5650  *	if they have not changed. Should be called instead of
5651  *	netdev_update_features() if also dev->vlan_features might
5652  *	have changed to allow the changes to be propagated to stacked
5653  *	VLAN devices.
5654  */
5655 void netdev_change_features(struct net_device *dev)
5656 {
5657 	__netdev_update_features(dev);
5658 	netdev_features_change(dev);
5659 }
5660 EXPORT_SYMBOL(netdev_change_features);
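/*
 * Illustrative sketch (not from the kernel sources): how a driver might use
 * netdev_update_features() after a hardware-dependent condition changes.
 * The "example_priv" structure, the jumbo-frame restriction and the helper
 * name are hypothetical; netdev_priv(), ASSERT_RTNL() and the feature flags
 * are the real interfaces used.
 */
struct example_priv {
	bool jumbo_enabled;		/* hypothetical hardware state */
};

static void example_set_jumbo(struct net_device *dev, bool enable)
{
	struct example_priv *priv = netdev_priv(dev);

	ASSERT_RTNL();			/* feature updates require RTNL */

	priv->jumbo_enabled = enable;

	/* Pretend the hardware cannot do TSO on jumbo frames: adjust
	 * hw_features and let the core re-run ndo_fix_features() /
	 * ndo_set_features() and notify userspace if anything changed.
	 */
	if (enable)
		dev->hw_features &= ~NETIF_F_ALL_TSO;
	else
		dev->hw_features |= NETIF_F_ALL_TSO;

	netdev_update_features(dev);
}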
5661 
5662 /**
5663  *	netif_stacked_transfer_operstate -	transfer operstate
5664  *	@rootdev: the root or lower level device to transfer state from
5665  *	@dev: the device to transfer operstate to
5666  *
5667  *	Transfer operational state from root to device. This is normally
5668  *	called when a stacking relationship exists between the root
5669  *	device and the device (a leaf device).
5670  */
5671 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5672 					struct net_device *dev)
5673 {
5674 	if (rootdev->operstate == IF_OPER_DORMANT)
5675 		netif_dormant_on(dev);
5676 	else
5677 		netif_dormant_off(dev);
5678 
5679 	if (netif_carrier_ok(rootdev)) {
5680 		if (!netif_carrier_ok(dev))
5681 			netif_carrier_on(dev);
5682 	} else {
5683 		if (netif_carrier_ok(dev))
5684 			netif_carrier_off(dev);
5685 	}
5686 }
5687 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5688 
5689 #ifdef CONFIG_RPS
5690 static int netif_alloc_rx_queues(struct net_device *dev)
5691 {
5692 	unsigned int i, count = dev->num_rx_queues;
5693 	struct netdev_rx_queue *rx;
5694 
5695 	BUG_ON(count < 1);
5696 
5697 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5698 	if (!rx)
5699 		return -ENOMEM;
5700 
5701 	dev->_rx = rx;
5702 
5703 	for (i = 0; i < count; i++)
5704 		rx[i].dev = dev;
5705 	return 0;
5706 }
5707 #endif
5708 
5709 static void netdev_init_one_queue(struct net_device *dev,
5710 				  struct netdev_queue *queue, void *_unused)
5711 {
5712 	/* Initialize queue lock */
5713 	spin_lock_init(&queue->_xmit_lock);
5714 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5715 	queue->xmit_lock_owner = -1;
5716 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5717 	queue->dev = dev;
5718 #ifdef CONFIG_BQL
5719 	dql_init(&queue->dql, HZ);
5720 #endif
5721 }
5722 
5723 static void netif_free_tx_queues(struct net_device *dev)
5724 {
5725 	if (is_vmalloc_addr(dev->_tx))
5726 		vfree(dev->_tx);
5727 	else
5728 		kfree(dev->_tx);
5729 }
5730 
5731 static int netif_alloc_netdev_queues(struct net_device *dev)
5732 {
5733 	unsigned int count = dev->num_tx_queues;
5734 	struct netdev_queue *tx;
5735 	size_t sz = count * sizeof(*tx);
5736 
5737 	BUG_ON(count < 1 || count > 0xffff);
5738 
5739 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5740 	if (!tx) {
5741 		tx = vzalloc(sz);
5742 		if (!tx)
5743 			return -ENOMEM;
5744 	}
5745 	dev->_tx = tx;
5746 
5747 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5748 	spin_lock_init(&dev->tx_global_lock);
5749 
5750 	return 0;
5751 }
5752 
5753 /**
5754  *	register_netdevice	- register a network device
5755  *	@dev: device to register
5756  *
5757  *	Take a completed network device structure and add it to the kernel
5758  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5759  *	chain. 0 is returned on success. A negative errno code is returned
5760  *	on a failure to set up the device, or if the name is a duplicate.
5761  *
5762  *	Callers must hold the rtnl semaphore. You may want
5763  *	register_netdev() instead of this.
5764  *
5765  *	BUGS:
5766  *	The locking appears insufficient to guarantee two parallel registers
5767  *	will not get the same name.
5768  */
5769 
5770 int register_netdevice(struct net_device *dev)
5771 {
5772 	int ret;
5773 	struct net *net = dev_net(dev);
5774 
5775 	BUG_ON(dev_boot_phase);
5776 	ASSERT_RTNL();
5777 
5778 	might_sleep();
5779 
5780 	/* When net_devices are persistent, this will be fatal. */
5781 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5782 	BUG_ON(!net);
5783 
5784 	spin_lock_init(&dev->addr_list_lock);
5785 	netdev_set_addr_lockdep_class(dev);
5786 
5787 	dev->iflink = -1;
5788 
5789 	ret = dev_get_valid_name(net, dev, dev->name);
5790 	if (ret < 0)
5791 		goto out;
5792 
5793 	/* Init, if this function is available */
5794 	if (dev->netdev_ops->ndo_init) {
5795 		ret = dev->netdev_ops->ndo_init(dev);
5796 		if (ret) {
5797 			if (ret > 0)
5798 				ret = -EIO;
5799 			goto out;
5800 		}
5801 	}
5802 
5803 	if (((dev->hw_features | dev->features) &
5804 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
5805 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5806 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5807 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5808 		ret = -EINVAL;
5809 		goto err_uninit;
5810 	}
5811 
5812 	ret = -EBUSY;
5813 	if (!dev->ifindex)
5814 		dev->ifindex = dev_new_index(net);
5815 	else if (__dev_get_by_index(net, dev->ifindex))
5816 		goto err_uninit;
5817 
5818 	if (dev->iflink == -1)
5819 		dev->iflink = dev->ifindex;
5820 
5821 	/* Transfer changeable features to wanted_features and enable
5822 	 * software offloads (GSO and GRO).
5823 	 */
5824 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5825 	dev->features |= NETIF_F_SOFT_FEATURES;
5826 	dev->wanted_features = dev->features & dev->hw_features;
5827 
5828 	/* Turn on no cache copy if HW is doing checksum */
5829 	if (!(dev->flags & IFF_LOOPBACK)) {
5830 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5831 		if (dev->features & NETIF_F_ALL_CSUM) {
5832 			dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5833 			dev->features |= NETIF_F_NOCACHE_COPY;
5834 		}
5835 	}
5836 
5837 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5838 	 */
5839 	dev->vlan_features |= NETIF_F_HIGHDMA;
5840 
5841 	/* Make NETIF_F_SG inheritable to tunnel devices.
5842 	 */
5843 	dev->hw_enc_features |= NETIF_F_SG;
5844 
5845 	/* Make NETIF_F_SG inheritable to MPLS.
5846 	 */
5847 	dev->mpls_features |= NETIF_F_SG;
5848 
5849 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5850 	ret = notifier_to_errno(ret);
5851 	if (ret)
5852 		goto err_uninit;
5853 
5854 	ret = netdev_register_kobject(dev);
5855 	if (ret)
5856 		goto err_uninit;
5857 	dev->reg_state = NETREG_REGISTERED;
5858 
5859 	__netdev_update_features(dev);
5860 
5861 	/*
5862 	 *	Default initial state at registration is that the
5863 	 *	device is present.
5864 	 */
5865 
5866 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5867 
5868 	linkwatch_init_dev(dev);
5869 
5870 	dev_init_scheduler(dev);
5871 	dev_hold(dev);
5872 	list_netdevice(dev);
5873 	add_device_randomness(dev->dev_addr, dev->addr_len);
5874 
5875 	/* If the device has a permanent device address, the driver should
5876 	 * set dev_addr, and addr_assign_type should be set to
5877 	 * NET_ADDR_PERM (the default value).
5878 	 */
5879 	if (dev->addr_assign_type == NET_ADDR_PERM)
5880 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5881 
5882 	/* Notify protocols that a new device appeared. */
5883 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5884 	ret = notifier_to_errno(ret);
5885 	if (ret) {
5886 		rollback_registered(dev);
5887 		dev->reg_state = NETREG_UNREGISTERED;
5888 	}
5889 	/*
5890 	 *	Prevent userspace races by waiting until the network
5891 	 *	device is fully setup before sending notifications.
5892 	 */
5893 	if (!dev->rtnl_link_ops ||
5894 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5895 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
5896 
5897 out:
5898 	return ret;
5899 
5900 err_uninit:
5901 	if (dev->netdev_ops->ndo_uninit)
5902 		dev->netdev_ops->ndo_uninit(dev);
5903 	goto out;
5904 }
5905 EXPORT_SYMBOL(register_netdevice);
5906 
5907 /**
5908  *	init_dummy_netdev	- init a dummy network device for NAPI
5909  *	@dev: device to init
5910  *
5911  *	This takes a network device structure and initializes the minimum
5912  *	number of fields so it can be used to schedule NAPI polls without
5913  *	registering a full blown interface. This is to be used by drivers
5914  *	that need to tie several hardware interfaces to a single NAPI
5915  *	poll scheduler due to HW limitations.
5916  */
5917 int init_dummy_netdev(struct net_device *dev)
5918 {
5919 	/* Clear everything. Note we don't initialize spinlocks
5920 	 * as they aren't supposed to be taken by any of the
5921 	 * NAPI code and this dummy netdev is supposed to be
5922 	 * only ever used for NAPI polls
5923 	 */
5924 	memset(dev, 0, sizeof(struct net_device));
5925 
5926 	/* make sure we BUG if trying to hit standard
5927 	 * register/unregister code path
5928 	 */
5929 	dev->reg_state = NETREG_DUMMY;
5930 
5931 	/* NAPI wants this */
5932 	INIT_LIST_HEAD(&dev->napi_list);
5933 
5934 	/* a dummy interface is started by default */
5935 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5936 	set_bit(__LINK_STATE_START, &dev->state);
5937 
5938 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
5939 	 * because users of this 'device' don't need to change
5940 	 * its refcount.
5941 	 */
5942 
5943 	return 0;
5944 }
5945 EXPORT_SYMBOL_GPL(init_dummy_netdev);
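/*
 * Illustrative sketch (not from the kernel sources): the intended use of
 * init_dummy_netdev() by a driver that only needs a NAPI context and has no
 * real net_device to register.  The "example_dev" container, poll routine
 * and setup helper are hypothetical; netif_napi_add(), napi_enable() and
 * NAPI_POLL_WEIGHT are the real interfaces.
 */
struct example_dev {
	struct net_device napi_dev;	/* dummy device, never registered */
	struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
	int work_done = 0;

	/* ... process up to @budget completions from the hardware ... */

	if (work_done < budget)
		napi_complete(napi);
	return work_done;
}

static void example_setup_napi(struct example_dev *edev)
{
	init_dummy_netdev(&edev->napi_dev);
	netif_napi_add(&edev->napi_dev, &edev->napi, example_poll,
		       NAPI_POLL_WEIGHT);
	napi_enable(&edev->napi);
}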
5946 
5947 
5948 /**
5949  *	register_netdev	- register a network device
5950  *	@dev: device to register
5951  *
5952  *	Take a completed network device structure and add it to the kernel
5953  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5954  *	chain. 0 is returned on success. A negative errno code is returned
5955  *	on a failure to set up the device, or if the name is a duplicate.
5956  *
5957  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5958  *	and expands the device name if you passed a format string to
5959  *	alloc_netdev.
5960  */
5961 int register_netdev(struct net_device *dev)
5962 {
5963 	int err;
5964 
5965 	rtnl_lock();
5966 	err = register_netdevice(dev);
5967 	rtnl_unlock();
5968 	return err;
5969 }
5970 EXPORT_SYMBOL(register_netdev);
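/*
 * Illustrative sketch (not from the kernel sources): the usual
 * register/unregister pairing as seen from a driver probe/remove path.
 * "example_probe"/"example_remove" and the zero-sized private area are
 * hypothetical; alloc_etherdev() (from <linux/etherdevice.h>),
 * SET_NETDEV_DEV(), register_netdev(), unregister_netdev() and
 * free_netdev() are the real entry points.  register_netdev() takes the
 * rtnl lock itself, so the caller must not already hold it (use
 * register_netdevice() in that case).
 */
static int example_probe(struct device *parent,
			 const struct net_device_ops *ops)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(0);	/* no private area in this sketch */
	if (!dev)
		return -ENOMEM;

	dev->netdev_ops = ops;
	SET_NETDEV_DEV(dev, parent);	/* link the sysfs entry to the bus device */

	err = register_netdev(dev);	/* takes rtnl_lock() internally */
	if (err) {
		free_netdev(dev);
		return err;
	}
	return 0;
}

static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* also takes rtnl_lock() */
	free_netdev(dev);		/* legal once unregistration has completed */
}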
5971 
5972 int netdev_refcnt_read(const struct net_device *dev)
5973 {
5974 	int i, refcnt = 0;
5975 
5976 	for_each_possible_cpu(i)
5977 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5978 	return refcnt;
5979 }
5980 EXPORT_SYMBOL(netdev_refcnt_read);
5981 
5982 /**
5983  * netdev_wait_allrefs - wait until all references are gone.
5984  * @dev: target net_device
5985  *
5986  * This is called when unregistering network devices.
5987  *
5988  * Any protocol or device that holds a reference should register
5989  * for netdevice notification, and cleanup and put back the
5990  * reference if they receive an UNREGISTER event.
5991  * We can get stuck here if buggy protocols don't correctly
5992  * call dev_put.
5993  */
5994 static void netdev_wait_allrefs(struct net_device *dev)
5995 {
5996 	unsigned long rebroadcast_time, warning_time;
5997 	int refcnt;
5998 
5999 	linkwatch_forget_dev(dev);
6000 
6001 	rebroadcast_time = warning_time = jiffies;
6002 	refcnt = netdev_refcnt_read(dev);
6003 
6004 	while (refcnt != 0) {
6005 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6006 			rtnl_lock();
6007 
6008 			/* Rebroadcast unregister notification */
6009 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6010 
6011 			__rtnl_unlock();
6012 			rcu_barrier();
6013 			rtnl_lock();
6014 
6015 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6016 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6017 				     &dev->state)) {
6018 				/* We must not have linkwatch events
6019 				 * pending on unregister. If this
6020 				 * happens, we simply run the queue
6021 				 * unscheduled, resulting in a noop
6022 				 * for this device.
6023 				 */
6024 				linkwatch_run_queue();
6025 			}
6026 
6027 			__rtnl_unlock();
6028 
6029 			rebroadcast_time = jiffies;
6030 		}
6031 
6032 		msleep(250);
6033 
6034 		refcnt = netdev_refcnt_read(dev);
6035 
6036 		if (time_after(jiffies, warning_time + 10 * HZ)) {
6037 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6038 				 dev->name, refcnt);
6039 			warning_time = jiffies;
6040 		}
6041 	}
6042 }
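/*
 * Illustrative sketch (not from the kernel sources): the discipline the
 * comment above asks for.  A subsystem that caches a net_device pointer
 * takes a reference with dev_hold() and drops it from its netdevice
 * notifier on NETDEV_UNREGISTER, so netdev_wait_allrefs() is not left
 * waiting forever.  The cached pointer and notifier names are hypothetical;
 * the notifier block would be registered with register_netdevice_notifier()
 * at module init.
 */
static struct net_device *example_cached_dev;

static void example_cache_dev(struct net_device *dev)
{
	dev_hold(dev);
	example_cached_dev = dev;
}

static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UNREGISTER && dev == example_cached_dev) {
		example_cached_dev = NULL;
		dev_put(dev);		/* release the reference taken above */
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};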
6043 
6044 /* The sequence is:
6045  *
6046  *	rtnl_lock();
6047  *	...
6048  *	register_netdevice(x1);
6049  *	register_netdevice(x2);
6050  *	...
6051  *	unregister_netdevice(y1);
6052  *	unregister_netdevice(y2);
6053  *      ...
6054  *	rtnl_unlock();
6055  *	free_netdev(y1);
6056  *	free_netdev(y2);
6057  *
6058  * We are invoked by rtnl_unlock().
6059  * This allows us to deal with problems:
6060  * 1) We can delete sysfs objects which invoke hotplug
6061  *    without deadlocking with linkwatch via keventd.
6062  * 2) Since we run with the RTNL semaphore not held, we can sleep
6063  *    safely in order to wait for the netdev refcnt to drop to zero.
6064  *
6065  * We must not return until all unregister events added during
6066  * the interval the lock was held have been completed.
6067  */
6068 void netdev_run_todo(void)
6069 {
6070 	struct list_head list;
6071 
6072 	/* Snapshot list, allow later requests */
6073 	list_replace_init(&net_todo_list, &list);
6074 
6075 	__rtnl_unlock();
6076 
6077 
6078 	/* Wait for rcu callbacks to finish before next phase */
6079 	if (!list_empty(&list))
6080 		rcu_barrier();
6081 
6082 	while (!list_empty(&list)) {
6083 		struct net_device *dev
6084 			= list_first_entry(&list, struct net_device, todo_list);
6085 		list_del(&dev->todo_list);
6086 
6087 		rtnl_lock();
6088 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6089 		__rtnl_unlock();
6090 
6091 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6092 			pr_err("network todo '%s' but state %d\n",
6093 			       dev->name, dev->reg_state);
6094 			dump_stack();
6095 			continue;
6096 		}
6097 
6098 		dev->reg_state = NETREG_UNREGISTERED;
6099 
6100 		on_each_cpu(flush_backlog, dev, 1);
6101 
6102 		netdev_wait_allrefs(dev);
6103 
6104 		/* paranoia */
6105 		BUG_ON(netdev_refcnt_read(dev));
6106 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6107 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6108 		WARN_ON(dev->dn_ptr);
6109 
6110 		if (dev->destructor)
6111 			dev->destructor(dev);
6112 
6113 		/* Report a network device has been unregistered */
6114 		rtnl_lock();
6115 		dev_net(dev)->dev_unreg_count--;
6116 		__rtnl_unlock();
6117 		wake_up(&netdev_unregistering_wq);
6118 
6119 		/* Free network device */
6120 		kobject_put(&dev->dev.kobj);
6121 	}
6122 }
6123 
6124 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6125  * fields in the same order, with only the type differing.
6126  */
6127 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6128 			     const struct net_device_stats *netdev_stats)
6129 {
6130 #if BITS_PER_LONG == 64
6131 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6132 	memcpy(stats64, netdev_stats, sizeof(*stats64));
6133 #else
6134 	size_t i, n = sizeof(*stats64) / sizeof(u64);
6135 	const unsigned long *src = (const unsigned long *)netdev_stats;
6136 	u64 *dst = (u64 *)stats64;
6137 
6138 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6139 		     sizeof(*stats64) / sizeof(u64));
6140 	for (i = 0; i < n; i++)
6141 		dst[i] = src[i];
6142 #endif
6143 }
6144 EXPORT_SYMBOL(netdev_stats_to_stats64);
6145 
6146 /**
6147  *	dev_get_stats	- get network device statistics
6148  *	@dev: device to get statistics from
6149  *	@storage: place to store stats
6150  *
6151  *	Get network statistics from device. Return @storage.
6152  *	The device driver may provide its own method by setting
6153  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6154  *	otherwise the internal statistics structure is used.
6155  */
6156 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6157 					struct rtnl_link_stats64 *storage)
6158 {
6159 	const struct net_device_ops *ops = dev->netdev_ops;
6160 
6161 	if (ops->ndo_get_stats64) {
6162 		memset(storage, 0, sizeof(*storage));
6163 		ops->ndo_get_stats64(dev, storage);
6164 	} else if (ops->ndo_get_stats) {
6165 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6166 	} else {
6167 		netdev_stats_to_stats64(storage, &dev->stats);
6168 	}
6169 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6170 	return storage;
6171 }
6172 EXPORT_SYMBOL(dev_get_stats);
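/*
 * Illustrative sketch (not from the kernel sources): a stacked driver
 * pulling a lower device's counters through dev_get_stats() from its own
 * ndo_get_stats64() implementation.  Only a few fields are copied to keep
 * the sketch short; the "example_stacked_priv" layout and callback name are
 * hypothetical.  @stats arrives zeroed (see dev_get_stats() above), so the
 * additions start from a clean slate.
 */
struct example_stacked_priv {
	struct net_device *lower;	/* hypothetical lower device */
};

static struct rtnl_link_stats64 *
example_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
	struct example_stacked_priv *priv = netdev_priv(dev);
	struct rtnl_link_stats64 temp;

	if (priv->lower) {
		const struct rtnl_link_stats64 *lstats;

		lstats = dev_get_stats(priv->lower, &temp);
		stats->rx_packets += lstats->rx_packets;
		stats->rx_bytes   += lstats->rx_bytes;
		stats->tx_packets += lstats->tx_packets;
		stats->tx_bytes   += lstats->tx_bytes;
	}
	return stats;
}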
6173 
6174 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6175 {
6176 	struct netdev_queue *queue = dev_ingress_queue(dev);
6177 
6178 #ifdef CONFIG_NET_CLS_ACT
6179 	if (queue)
6180 		return queue;
6181 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6182 	if (!queue)
6183 		return NULL;
6184 	netdev_init_one_queue(dev, queue, NULL);
6185 	queue->qdisc = &noop_qdisc;
6186 	queue->qdisc_sleeping = &noop_qdisc;
6187 	rcu_assign_pointer(dev->ingress_queue, queue);
6188 #endif
6189 	return queue;
6190 }
6191 
6192 static const struct ethtool_ops default_ethtool_ops;
6193 
6194 void netdev_set_default_ethtool_ops(struct net_device *dev,
6195 				    const struct ethtool_ops *ops)
6196 {
6197 	if (dev->ethtool_ops == &default_ethtool_ops)
6198 		dev->ethtool_ops = ops;
6199 }
6200 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6201 
6202 void netdev_freemem(struct net_device *dev)
6203 {
6204 	char *addr = (char *)dev - dev->padded;
6205 
6206 	if (is_vmalloc_addr(addr))
6207 		vfree(addr);
6208 	else
6209 		kfree(addr);
6210 }
6211 
6212 /**
6213  *	alloc_netdev_mqs - allocate network device
6214  *	@sizeof_priv:	size of private data to allocate space for
6215  *	@name:		device name format string
6216  *	@setup:		callback to initialize device
6217  *	@txqs:		the number of TX subqueues to allocate
6218  *	@rxqs:		the number of RX subqueues to allocate
6219  *
6220  *	Allocates a struct net_device with private data area for driver use
6221  *	and performs basic initialization.  Also allocates subqueue structs
6222  *	for each queue on the device.
6223  */
6224 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6225 		void (*setup)(struct net_device *),
6226 		unsigned int txqs, unsigned int rxqs)
6227 {
6228 	struct net_device *dev;
6229 	size_t alloc_size;
6230 	struct net_device *p;
6231 
6232 	BUG_ON(strlen(name) >= sizeof(dev->name));
6233 
6234 	if (txqs < 1) {
6235 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6236 		return NULL;
6237 	}
6238 
6239 #ifdef CONFIG_RPS
6240 	if (rxqs < 1) {
6241 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6242 		return NULL;
6243 	}
6244 #endif
6245 
6246 	alloc_size = sizeof(struct net_device);
6247 	if (sizeof_priv) {
6248 		/* ensure 32-byte alignment of private area */
6249 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6250 		alloc_size += sizeof_priv;
6251 	}
6252 	/* ensure 32-byte alignment of whole construct */
6253 	alloc_size += NETDEV_ALIGN - 1;
6254 
6255 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6256 	if (!p)
6257 		p = vzalloc(alloc_size);
6258 	if (!p)
6259 		return NULL;
6260 
6261 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6262 	dev->padded = (char *)dev - (char *)p;
6263 
6264 	dev->pcpu_refcnt = alloc_percpu(int);
6265 	if (!dev->pcpu_refcnt)
6266 		goto free_dev;
6267 
6268 	if (dev_addr_init(dev))
6269 		goto free_pcpu;
6270 
6271 	dev_mc_init(dev);
6272 	dev_uc_init(dev);
6273 
6274 	dev_net_set(dev, &init_net);
6275 
6276 	dev->gso_max_size = GSO_MAX_SIZE;
6277 	dev->gso_max_segs = GSO_MAX_SEGS;
6278 
6279 	INIT_LIST_HEAD(&dev->napi_list);
6280 	INIT_LIST_HEAD(&dev->unreg_list);
6281 	INIT_LIST_HEAD(&dev->close_list);
6282 	INIT_LIST_HEAD(&dev->link_watch_list);
6283 	INIT_LIST_HEAD(&dev->adj_list.upper);
6284 	INIT_LIST_HEAD(&dev->adj_list.lower);
6285 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
6286 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
6287 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
6288 	setup(dev);
6289 
6290 	dev->num_tx_queues = txqs;
6291 	dev->real_num_tx_queues = txqs;
6292 	if (netif_alloc_netdev_queues(dev))
6293 		goto free_all;
6294 
6295 #ifdef CONFIG_RPS
6296 	dev->num_rx_queues = rxqs;
6297 	dev->real_num_rx_queues = rxqs;
6298 	if (netif_alloc_rx_queues(dev))
6299 		goto free_all;
6300 #endif
6301 
6302 	strcpy(dev->name, name);
6303 	dev->group = INIT_NETDEV_GROUP;
6304 	if (!dev->ethtool_ops)
6305 		dev->ethtool_ops = &default_ethtool_ops;
6306 	return dev;
6307 
6308 free_all:
6309 	free_netdev(dev);
6310 	return NULL;
6311 
6312 free_pcpu:
6313 	free_percpu(dev->pcpu_refcnt);
6314 	netif_free_tx_queues(dev);
6315 #ifdef CONFIG_RPS
6316 	kfree(dev->_rx);
6317 #endif
6318 
6319 free_dev:
6320 	netdev_freemem(dev);
6321 	return NULL;
6322 }
6323 EXPORT_SYMBOL(alloc_netdev_mqs);
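/*
 * Illustrative sketch (not from the kernel sources): allocating a
 * multiqueue device with alloc_netdev_mqs() and releasing it with
 * free_netdev() if registration never happens.  The setup callback, the
 * "ex%d" name template and the queue counts are hypothetical;
 * ether_setup() is the stock Ethernet initializer from
 * <linux/etherdevice.h>.
 */
static void example_setup(struct net_device *dev)
{
	ether_setup(dev);	/* Ethernet defaults: MTU, broadcast, etc. */
}

static struct net_device *example_alloc(unsigned int nqueues)
{
	struct net_device *dev;

	/* "ex%d" lets register_netdev() pick the first free ex0, ex1, ... */
	dev = alloc_netdev_mqs(0, "ex%d", example_setup, nqueues, nqueues);
	if (!dev)
		return NULL;

	/* ... fill in netdev_ops, features and the MAC address here, then
	 * call register_netdev(dev); on failure, free_netdev(dev).
	 */
	return dev;
}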
6324 
6325 /**
6326  *	free_netdev - free network device
6327  *	@dev: device
6328  *
6329  *	This function does the last stage of destroying an allocated device
6330  * 	interface. The reference to the device object is released.
6331  *	If this is the last reference then it will be freed.
6332  */
6333 void free_netdev(struct net_device *dev)
6334 {
6335 	struct napi_struct *p, *n;
6336 
6337 	release_net(dev_net(dev));
6338 
6339 	netif_free_tx_queues(dev);
6340 #ifdef CONFIG_RPS
6341 	kfree(dev->_rx);
6342 #endif
6343 
6344 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6345 
6346 	/* Flush device addresses */
6347 	dev_addr_flush(dev);
6348 
6349 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6350 		netif_napi_del(p);
6351 
6352 	free_percpu(dev->pcpu_refcnt);
6353 	dev->pcpu_refcnt = NULL;
6354 
6355 	/*  Compatibility with error handling in drivers */
6356 	if (dev->reg_state == NETREG_UNINITIALIZED) {
6357 		netdev_freemem(dev);
6358 		return;
6359 	}
6360 
6361 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6362 	dev->reg_state = NETREG_RELEASED;
6363 
6364 	/* will free via device release */
6365 	put_device(&dev->dev);
6366 }
6367 EXPORT_SYMBOL(free_netdev);
6368 
6369 /**
6370  *	synchronize_net -  Synchronize with packet receive processing
6371  *
6372  *	Wait for packets currently being received to be done.
6373  *	Does not block later packets from starting.
6374  */
6375 void synchronize_net(void)
6376 {
6377 	might_sleep();
6378 	if (rtnl_is_locked())
6379 		synchronize_rcu_expedited();
6380 	else
6381 		synchronize_rcu();
6382 }
6383 EXPORT_SYMBOL(synchronize_net);
6384 
6385 /**
6386  *	unregister_netdevice_queue - remove device from the kernel
6387  *	@dev: device
6388  *	@head: list
6389  *
6390  *	This function shuts down a device interface and removes it
6391  *	from the kernel tables.
6392  *	If head is not NULL, the device is queued to be unregistered later.
6393  *
6394  *	Callers must hold the rtnl semaphore.  You may want
6395  *	unregister_netdev() instead of this.
6396  */
6397 
6398 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6399 {
6400 	ASSERT_RTNL();
6401 
6402 	if (head) {
6403 		list_move_tail(&dev->unreg_list, head);
6404 	} else {
6405 		rollback_registered(dev);
6406 		/* Finish processing unregister after unlock */
6407 		net_set_todo(dev);
6408 	}
6409 }
6410 EXPORT_SYMBOL(unregister_netdevice_queue);
6411 
6412 /**
6413  *	unregister_netdevice_many - unregister many devices
6414  *	@head: list of devices
6415  */
6416 void unregister_netdevice_many(struct list_head *head)
6417 {
6418 	struct net_device *dev;
6419 
6420 	if (!list_empty(head)) {
6421 		rollback_registered_many(head);
6422 		list_for_each_entry(dev, head, unreg_list)
6423 			net_set_todo(dev);
6424 	}
6425 }
6426 EXPORT_SYMBOL(unregister_netdevice_many);
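/*
 * Illustrative sketch (not from the kernel sources): tearing several
 * devices down in one RTNL critical section with
 * unregister_netdevice_queue() plus unregister_netdevice_many(), which
 * batches the expensive RCU grace periods.  The array-of-devices calling
 * convention is hypothetical.
 */
static void example_unregister_all(struct net_device **devs, int count)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < count; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();		/* netdev_run_todo() completes the teardown here */

	for (i = 0; i < count; i++)
		free_netdev(devs[i]);
}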
6427 
6428 /**
6429  *	unregister_netdev - remove device from the kernel
6430  *	@dev: device
6431  *
6432  *	This function shuts down a device interface and removes it
6433  *	from the kernel tables.
6434  *
6435  *	This is just a wrapper for unregister_netdevice that takes
6436  *	the rtnl semaphore.  In general you want to use this and not
6437  *	unregister_netdevice.
6438  */
6439 void unregister_netdev(struct net_device *dev)
6440 {
6441 	rtnl_lock();
6442 	unregister_netdevice(dev);
6443 	rtnl_unlock();
6444 }
6445 EXPORT_SYMBOL(unregister_netdev);
6446 
6447 /**
6448  *	dev_change_net_namespace - move device to a different network namespace
6449  *	@dev: device
6450  *	@net: network namespace
6451  *	@pat: If not NULL name pattern to try if the current device name
6452  *	      is already taken in the destination network namespace.
6453  *
6454  *	This function shuts down a device interface and moves it
6455  *	to a new network namespace. On success 0 is returned, on
6456  *	a failure a negative errno code is returned.
6457  *
6458  *	Callers must hold the rtnl semaphore.
6459  */
6460 
6461 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6462 {
6463 	int err;
6464 
6465 	ASSERT_RTNL();
6466 
6467 	/* Don't allow namespace local devices to be moved. */
6468 	err = -EINVAL;
6469 	if (dev->features & NETIF_F_NETNS_LOCAL)
6470 		goto out;
6471 
6472 	/* Ensure the device has been registered */
6473 	if (dev->reg_state != NETREG_REGISTERED)
6474 		goto out;
6475 
6476 	/* Get out if there is nothing to do */
6477 	err = 0;
6478 	if (net_eq(dev_net(dev), net))
6479 		goto out;
6480 
6481 	/* Pick the destination device name, and ensure
6482 	 * we can use it in the destination network namespace.
6483 	 */
6484 	err = -EEXIST;
6485 	if (__dev_get_by_name(net, dev->name)) {
6486 		/* We get here if we can't use the current device name */
6487 		if (!pat)
6488 			goto out;
6489 		if (dev_get_valid_name(net, dev, pat) < 0)
6490 			goto out;
6491 	}
6492 
6493 	/*
6494 	 * And now a mini version of register_netdevice and unregister_netdevice.
6495 	 */
6496 
6497 	/* If device is running close it first. */
6498 	dev_close(dev);
6499 
6500 	/* And unlink it from device chain */
6501 	err = -ENODEV;
6502 	unlist_netdevice(dev);
6503 
6504 	synchronize_net();
6505 
6506 	/* Shutdown queueing discipline. */
6507 	dev_shutdown(dev);
6508 
6509 	/* Notify protocols that we are about to destroy
6510 	   this device. They should clean up all their state.
6511 
6512 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6513 	   This is wanted because this way 8021q and macvlan know
6514 	   the device is just moving and can keep their slaves up.
6515 	*/
6516 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6517 	rcu_barrier();
6518 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6519 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6520 
6521 	/*
6522 	 *	Flush the unicast and multicast chains
6523 	 */
6524 	dev_uc_flush(dev);
6525 	dev_mc_flush(dev);
6526 
6527 	/* Send a netdev-removed uevent to the old namespace */
6528 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6529 
6530 	/* Actually switch the network namespace */
6531 	dev_net_set(dev, net);
6532 
6533 	/* If there is an ifindex conflict assign a new one */
6534 	if (__dev_get_by_index(net, dev->ifindex)) {
6535 		int iflink = (dev->iflink == dev->ifindex);
6536 		dev->ifindex = dev_new_index(net);
6537 		if (iflink)
6538 			dev->iflink = dev->ifindex;
6539 	}
6540 
6541 	/* Send a netdev-add uevent to the new namespace */
6542 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6543 
6544 	/* Fixup kobjects */
6545 	err = device_rename(&dev->dev, dev->name);
6546 	WARN_ON(err);
6547 
6548 	/* Add the device back in the hashes */
6549 	list_netdevice(dev);
6550 
6551 	/* Notify protocols that a new device appeared. */
6552 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6553 
6554 	/*
6555 	 *	Prevent userspace races by waiting until the network
6556 	 *	device is fully setup before sending notifications.
6557 	 */
6558 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6559 
6560 	synchronize_net();
6561 	err = 0;
6562 out:
6563 	return err;
6564 }
6565 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
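/*
 * Illustrative sketch (not from the kernel sources): moving a device into
 * another namespace roughly the way the rtnetlink code does it -- resolve
 * the target namespace, take the RTNL lock, and fall back to a "dev%d"
 * style name if the current name is already taken there.  The helper name
 * and the fd-based lookup are hypothetical choices; get_net_ns_by_fd(),
 * put_net() and dev_change_net_namespace() are real interfaces.
 */
static int example_move_to_netns(struct net_device *dev, int netns_fd)
{
	struct net *net;
	int err;

	net = get_net_ns_by_fd(netns_fd);
	if (IS_ERR(net))
		return PTR_ERR(net);

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();

	put_net(net);
	return err;
}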
6566 
6567 static int dev_cpu_callback(struct notifier_block *nfb,
6568 			    unsigned long action,
6569 			    void *ocpu)
6570 {
6571 	struct sk_buff **list_skb;
6572 	struct sk_buff *skb;
6573 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6574 	struct softnet_data *sd, *oldsd;
6575 
6576 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6577 		return NOTIFY_OK;
6578 
6579 	local_irq_disable();
6580 	cpu = smp_processor_id();
6581 	sd = &per_cpu(softnet_data, cpu);
6582 	oldsd = &per_cpu(softnet_data, oldcpu);
6583 
6584 	/* Find end of our completion_queue. */
6585 	list_skb = &sd->completion_queue;
6586 	while (*list_skb)
6587 		list_skb = &(*list_skb)->next;
6588 	/* Append completion queue from offline CPU. */
6589 	*list_skb = oldsd->completion_queue;
6590 	oldsd->completion_queue = NULL;
6591 
6592 	/* Append output queue from offline CPU. */
6593 	if (oldsd->output_queue) {
6594 		*sd->output_queue_tailp = oldsd->output_queue;
6595 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6596 		oldsd->output_queue = NULL;
6597 		oldsd->output_queue_tailp = &oldsd->output_queue;
6598 	}
6599 	/* Append NAPI poll list from offline CPU. */
6600 	if (!list_empty(&oldsd->poll_list)) {
6601 		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6602 		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6603 	}
6604 
6605 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6606 	local_irq_enable();
6607 
6608 	/* Process offline CPU's input_pkt_queue */
6609 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6610 		netif_rx(skb);
6611 		input_queue_head_incr(oldsd);
6612 	}
6613 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6614 		netif_rx(skb);
6615 		input_queue_head_incr(oldsd);
6616 	}
6617 
6618 	return NOTIFY_OK;
6619 }
6620 
6621 
6622 /**
6623  *	netdev_increment_features - increment feature set by one
6624  *	@all: current feature set
6625  *	@one: new feature set
6626  *	@mask: mask feature set
6627  *
6628  *	Computes a new feature set after adding a device with feature set
6629  *	@one to the master device with current feature set @all.  Will not
6630  *	enable anything that is off in @mask. Returns the new feature set.
6631  */
6632 netdev_features_t netdev_increment_features(netdev_features_t all,
6633 	netdev_features_t one, netdev_features_t mask)
6634 {
6635 	if (mask & NETIF_F_GEN_CSUM)
6636 		mask |= NETIF_F_ALL_CSUM;
6637 	mask |= NETIF_F_VLAN_CHALLENGED;
6638 
6639 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6640 	all &= one | ~NETIF_F_ALL_FOR_ALL;
6641 
6642 	/* If one device supports hw checksumming, set for all. */
6643 	if (all & NETIF_F_GEN_CSUM)
6644 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6645 
6646 	return all;
6647 }
6648 EXPORT_SYMBOL(netdev_increment_features);
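/*
 * Illustrative sketch (not from the kernel sources): an aggregating driver
 * (in the spirit of bonding or team) folding its slaves' feature sets
 * together with netdev_increment_features().  The slave array and the
 * EXAMPLE_SLAVE_FEATURES mask are hypothetical.
 */
#define EXAMPLE_SLAVE_FEATURES	(NETIF_F_ALL_CSUM | NETIF_F_SG | \
				 NETIF_F_FRAGLIST | NETIF_F_ALL_TSO)

static netdev_features_t example_compute_features(struct net_device **slaves,
						  int nslaves)
{
	netdev_features_t features = EXAMPLE_SLAVE_FEATURES;
	int i;

	for (i = 0; i < nslaves; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     EXAMPLE_SLAVE_FEATURES);

	/* NETIF_F_ALL_FOR_ALL bits survive only if every slave has them;
	 * NETIF_F_ONE_FOR_ALL bits are set as soon as one slave has them.
	 */
	return features;
}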
6649 
6650 static struct hlist_head * __net_init netdev_create_hash(void)
6651 {
6652 	int i;
6653 	struct hlist_head *hash;
6654 
6655 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6656 	if (hash != NULL)
6657 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6658 			INIT_HLIST_HEAD(&hash[i]);
6659 
6660 	return hash;
6661 }
6662 
6663 /* Initialize per network namespace state */
6664 static int __net_init netdev_init(struct net *net)
6665 {
6666 	if (net != &init_net)
6667 		INIT_LIST_HEAD(&net->dev_base_head);
6668 
6669 	net->dev_name_head = netdev_create_hash();
6670 	if (net->dev_name_head == NULL)
6671 		goto err_name;
6672 
6673 	net->dev_index_head = netdev_create_hash();
6674 	if (net->dev_index_head == NULL)
6675 		goto err_idx;
6676 
6677 	return 0;
6678 
6679 err_idx:
6680 	kfree(net->dev_name_head);
6681 err_name:
6682 	return -ENOMEM;
6683 }
6684 
6685 /**
6686  *	netdev_drivername - network driver for the device
6687  *	@dev: network device
6688  *
6689  *	Determine network driver for device.
6690  */
6691 const char *netdev_drivername(const struct net_device *dev)
6692 {
6693 	const struct device_driver *driver;
6694 	const struct device *parent;
6695 	const char *empty = "";
6696 
6697 	parent = dev->dev.parent;
6698 	if (!parent)
6699 		return empty;
6700 
6701 	driver = parent->driver;
6702 	if (driver && driver->name)
6703 		return driver->name;
6704 	return empty;
6705 }
6706 
6707 static int __netdev_printk(const char *level, const struct net_device *dev,
6708 			   struct va_format *vaf)
6709 {
6710 	int r;
6711 
6712 	if (dev && dev->dev.parent) {
6713 		r = dev_printk_emit(level[1] - '0',
6714 				    dev->dev.parent,
6715 				    "%s %s %s: %pV",
6716 				    dev_driver_string(dev->dev.parent),
6717 				    dev_name(dev->dev.parent),
6718 				    netdev_name(dev), vaf);
6719 	} else if (dev) {
6720 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6721 	} else {
6722 		r = printk("%s(NULL net_device): %pV", level, vaf);
6723 	}
6724 
6725 	return r;
6726 }
6727 
6728 int netdev_printk(const char *level, const struct net_device *dev,
6729 		  const char *format, ...)
6730 {
6731 	struct va_format vaf;
6732 	va_list args;
6733 	int r;
6734 
6735 	va_start(args, format);
6736 
6737 	vaf.fmt = format;
6738 	vaf.va = &args;
6739 
6740 	r = __netdev_printk(level, dev, &vaf);
6741 
6742 	va_end(args);
6743 
6744 	return r;
6745 }
6746 EXPORT_SYMBOL(netdev_printk);
6747 
6748 #define define_netdev_printk_level(func, level)			\
6749 int func(const struct net_device *dev, const char *fmt, ...)	\
6750 {								\
6751 	int r;							\
6752 	struct va_format vaf;					\
6753 	va_list args;						\
6754 								\
6755 	va_start(args, fmt);					\
6756 								\
6757 	vaf.fmt = fmt;						\
6758 	vaf.va = &args;						\
6759 								\
6760 	r = __netdev_printk(level, dev, &vaf);			\
6761 								\
6762 	va_end(args);						\
6763 								\
6764 	return r;						\
6765 }								\
6766 EXPORT_SYMBOL(func);
6767 
6768 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6769 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6770 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6771 define_netdev_printk_level(netdev_err, KERN_ERR);
6772 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6773 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6774 define_netdev_printk_level(netdev_info, KERN_INFO);
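/*
 * Illustrative sketch (not from the kernel sources): the level-specific
 * helpers defined above are used like printk(), but prefix the message with
 * the driver, bus and interface names.  The link-status reporter below is
 * hypothetical.
 */
static void example_report_link(struct net_device *dev, unsigned int mbps,
				bool full_duplex)
{
	if (mbps)
		netdev_info(dev, "link up, %u Mbps, %s duplex\n",
			    mbps, full_duplex ? "full" : "half");
	else
		netdev_warn(dev, "link down\n");
}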
6775 
6776 static void __net_exit netdev_exit(struct net *net)
6777 {
6778 	kfree(net->dev_name_head);
6779 	kfree(net->dev_index_head);
6780 }
6781 
6782 static struct pernet_operations __net_initdata netdev_net_ops = {
6783 	.init = netdev_init,
6784 	.exit = netdev_exit,
6785 };
6786 
6787 static void __net_exit default_device_exit(struct net *net)
6788 {
6789 	struct net_device *dev, *aux;
6790 	/*
6791 	 * Push all migratable network devices back to the
6792 	 * initial network namespace
6793 	 */
6794 	rtnl_lock();
6795 	for_each_netdev_safe(net, dev, aux) {
6796 		int err;
6797 		char fb_name[IFNAMSIZ];
6798 
6799 		/* Ignore unmovable devices (e.g. loopback) */
6800 		if (dev->features & NETIF_F_NETNS_LOCAL)
6801 			continue;
6802 
6803 		/* Leave virtual devices for the generic cleanup */
6804 		if (dev->rtnl_link_ops)
6805 			continue;
6806 
6807 		/* Push remaining network devices to init_net */
6808 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6809 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6810 		if (err) {
6811 			pr_emerg("%s: failed to move %s to init_net: %d\n",
6812 				 __func__, dev->name, err);
6813 			BUG();
6814 		}
6815 	}
6816 	rtnl_unlock();
6817 }
6818 
6819 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
6820 {
6821 	/* Return with the rtnl_lock held when there are no network
6822 	 * devices unregistering in any network namespace in net_list.
6823 	 */
6824 	struct net *net;
6825 	bool unregistering;
6826 	DEFINE_WAIT(wait);
6827 
6828 	for (;;) {
6829 		prepare_to_wait(&netdev_unregistering_wq, &wait,
6830 				TASK_UNINTERRUPTIBLE);
6831 		unregistering = false;
6832 		rtnl_lock();
6833 		list_for_each_entry(net, net_list, exit_list) {
6834 			if (net->dev_unreg_count > 0) {
6835 				unregistering = true;
6836 				break;
6837 			}
6838 		}
6839 		if (!unregistering)
6840 			break;
6841 		__rtnl_unlock();
6842 		schedule();
6843 	}
6844 	finish_wait(&netdev_unregistering_wq, &wait);
6845 }
6846 
6847 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6848 {
6849 	/* At exit all network devices must be removed from a network
6850 	 * namespace.  Do this in the reverse order of registration.
6851 	 * Do this across as many network namespaces as possible to
6852 	 * improve batching efficiency.
6853 	 */
6854 	struct net_device *dev;
6855 	struct net *net;
6856 	LIST_HEAD(dev_kill_list);
6857 
6858 	/* To prevent network device cleanup code from dereferencing
6859 	 * loopback devices or network devices that have been freed,
6860 	 * wait here for all pending unregistrations to complete
6861 	 * before unregistering the loopback device and allowing the
6862 	 * network namespace to be freed.
6863 	 *
6864 	 * The netdev todo list containing all network device
6865 	 * unregistrations that happen in default_device_exit_batch
6866 	 * will run in the rtnl_unlock() at the end of
6867 	 * default_device_exit_batch.
6868 	 */
6869 	rtnl_lock_unregistering(net_list);
6870 	list_for_each_entry(net, net_list, exit_list) {
6871 		for_each_netdev_reverse(net, dev) {
6872 			if (dev->rtnl_link_ops)
6873 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6874 			else
6875 				unregister_netdevice_queue(dev, &dev_kill_list);
6876 		}
6877 	}
6878 	unregister_netdevice_many(&dev_kill_list);
6879 	list_del(&dev_kill_list);
6880 	rtnl_unlock();
6881 }
6882 
6883 static struct pernet_operations __net_initdata default_device_ops = {
6884 	.exit = default_device_exit,
6885 	.exit_batch = default_device_exit_batch,
6886 };
6887 
6888 /*
6889  *	Initialize the DEV module. At boot time this walks the device list and
6890  *	unhooks any devices that fail to initialise (normally hardware not
6891  *	present) and leaves us with a valid list of present and active devices.
6892  *
6893  */
6894 
6895 /*
6896  *       This is called single threaded during boot, so no need
6897  *       to take the rtnl semaphore.
6898  */
6899 static int __init net_dev_init(void)
6900 {
6901 	int i, rc = -ENOMEM;
6902 
6903 	BUG_ON(!dev_boot_phase);
6904 
6905 	if (dev_proc_init())
6906 		goto out;
6907 
6908 	if (netdev_kobject_init())
6909 		goto out;
6910 
6911 	INIT_LIST_HEAD(&ptype_all);
6912 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6913 		INIT_LIST_HEAD(&ptype_base[i]);
6914 
6915 	INIT_LIST_HEAD(&offload_base);
6916 
6917 	if (register_pernet_subsys(&netdev_net_ops))
6918 		goto out;
6919 
6920 	/*
6921 	 *	Initialise the packet receive queues.
6922 	 */
6923 
6924 	for_each_possible_cpu(i) {
6925 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6926 
6927 		memset(sd, 0, sizeof(*sd));
6928 		skb_queue_head_init(&sd->input_pkt_queue);
6929 		skb_queue_head_init(&sd->process_queue);
6930 		sd->completion_queue = NULL;
6931 		INIT_LIST_HEAD(&sd->poll_list);
6932 		sd->output_queue = NULL;
6933 		sd->output_queue_tailp = &sd->output_queue;
6934 #ifdef CONFIG_RPS
6935 		sd->csd.func = rps_trigger_softirq;
6936 		sd->csd.info = sd;
6937 		sd->csd.flags = 0;
6938 		sd->cpu = i;
6939 #endif
6940 
6941 		sd->backlog.poll = process_backlog;
6942 		sd->backlog.weight = weight_p;
6943 		sd->backlog.gro_list = NULL;
6944 		sd->backlog.gro_count = 0;
6945 
6946 #ifdef CONFIG_NET_FLOW_LIMIT
6947 		sd->flow_limit = NULL;
6948 #endif
6949 	}
6950 
6951 	dev_boot_phase = 0;
6952 
6953 	/* The loopback device is special: if any other network device
6954 	 * is present in a network namespace, the loopback device must
6955 	 * be present too. Since we now dynamically allocate and free the
6956 	 * loopback device, ensure this invariant is maintained by
6957 	 * keeping the loopback device the first device on the
6958 	 * list of network devices, so that it is the first device
6959 	 * that appears and the last network device
6960 	 * that disappears.
6961 	 */
6962 	if (register_pernet_device(&loopback_net_ops))
6963 		goto out;
6964 
6965 	if (register_pernet_device(&default_device_ops))
6966 		goto out;
6967 
6968 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6969 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6970 
6971 	hotcpu_notifier(dev_cpu_callback, 0);
6972 	dst_init();
6973 	rc = 0;
6974 out:
6975 	return rc;
6976 }
6977 
6978 subsys_initcall(net_dev_init);
6979