xref: /linux/net/core/dev.c (revision 615f2e5c531bc57d5a190f321d697988e950ae4d)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <linux/ipv6.h>
122 #include <linux/in.h>
123 #include <linux/jhash.h>
124 #include <linux/random.h>
125 #include <trace/events/napi.h>
126 #include <trace/events/net.h>
127 #include <trace/events/skb.h>
128 #include <linux/pci.h>
129 #include <linux/inetdevice.h>
130 #include <linux/cpu_rmap.h>
131 #include <linux/static_key.h>
132 
133 #include "net-sysfs.h"
134 
135 /* Instead of increasing this, you should create a hash table. */
136 #define MAX_GRO_SKBS 8
137 
138 /* This should be increased if a protocol with a bigger head is added. */
139 #define GRO_MAX_HEAD (MAX_HEADER + 128)
140 
141 static DEFINE_SPINLOCK(ptype_lock);
142 static DEFINE_SPINLOCK(offload_lock);
143 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
144 struct list_head ptype_all __read_mostly;	/* Taps */
145 static struct list_head offload_base __read_mostly;
146 
147 /*
148  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
149  * semaphore.
150  *
151  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
152  *
153  * Writers must hold the rtnl semaphore while they loop through the
154  * dev_base_head list, and hold dev_base_lock for writing when they do the
155  * actual updates.  This allows pure readers to access the list even
156  * while a writer is preparing to update it.
157  *
158  * To put it another way, dev_base_lock is held for writing only to
159  * protect against pure readers; the rtnl semaphore provides the
160  * protection against other writers.
161  *
 * For example usages, see register_netdevice() and
 * unregister_netdevice(), both of which must be called with the rtnl
 * semaphore held.
165  */
166 DEFINE_RWLOCK(dev_base_lock);
167 EXPORT_SYMBOL(dev_base_lock);
168 
169 seqcount_t devnet_rename_seq;
170 
171 static inline void dev_base_seq_inc(struct net *net)
172 {
173 	while (++net->dev_base_seq == 0);
174 }
175 
176 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
177 {
178 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
179 
180 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
181 }
182 
183 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
184 {
185 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
186 }
187 
/*
 * rps_lock()/rps_unlock() take and release the spinlock embedded in
 * sd->input_pkt_queue.  Without CONFIG_RPS both helpers compile to nothing.
 */
#ifdef CONFIG_RPS
static inline void rps_lock(struct softnet_data *sd)
{
	spin_lock(&sd->input_pkt_queue.lock);
}

static inline void rps_unlock(struct softnet_data *sd)
{
	spin_unlock(&sd->input_pkt_queue.lock);
}
#else
static inline void rps_lock(struct softnet_data *sd)
{
}

static inline void rps_unlock(struct softnet_data *sd)
{
}
#endif
201 
202 /* Device list insertion */
203 static int list_netdevice(struct net_device *dev)
204 {
205 	struct net *net = dev_net(dev);
206 
207 	ASSERT_RTNL();
208 
209 	write_lock_bh(&dev_base_lock);
210 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
211 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
212 	hlist_add_head_rcu(&dev->index_hlist,
213 			   dev_index_hash(net, dev->ifindex));
214 	write_unlock_bh(&dev_base_lock);
215 
216 	dev_base_seq_inc(net);
217 
218 	return 0;
219 }
220 
221 /* Device list removal
222  * caller must respect a RCU grace period before freeing/reusing dev
223  */
224 static void unlist_netdevice(struct net_device *dev)
225 {
226 	ASSERT_RTNL();
227 
228 	/* Unlink dev from the device chain */
229 	write_lock_bh(&dev_base_lock);
230 	list_del_rcu(&dev->dev_list);
231 	hlist_del_rcu(&dev->name_hlist);
232 	hlist_del_rcu(&dev->index_hlist);
233 	write_unlock_bh(&dev_base_lock);
234 
235 	dev_base_seq_inc(dev_net(dev));
236 }
237 
238 /*
239  *	Our notifier list
240  */
241 
242 static RAW_NOTIFIER_HEAD(netdev_chain);
243 
244 /*
245  *	Device drivers call our routines to queue packets here. We empty the
246  *	queue in the local softnet handler.
247  */
248 
249 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
250 EXPORT_PER_CPU_SYMBOL(softnet_data);
251 
252 #ifdef CONFIG_LOCKDEP
253 /*
254  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
255  * according to dev->type
256  */
257 static const unsigned short netdev_lock_type[] =
258 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
259 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
260 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
261 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
262 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
263 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
264 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
265 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
266 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
267 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
268 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
269 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
270 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
271 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
272 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
273 
274 static const char *const netdev_lock_name[] =
275 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
276 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
277 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
278 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
279 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
280 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
281 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
282 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
283 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
284 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
285 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
286 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
287 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
288 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
289 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
290 
291 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
292 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
293 
294 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
295 {
296 	int i;
297 
298 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
299 		if (netdev_lock_type[i] == dev_type)
300 			return i;
301 	/* the last key is used by default */
302 	return ARRAY_SIZE(netdev_lock_type) - 1;
303 }
304 
305 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
306 						 unsigned short dev_type)
307 {
308 	int i;
309 
310 	i = netdev_lock_pos(dev_type);
311 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
312 				   netdev_lock_name[i]);
313 }
314 
315 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
316 {
317 	int i;
318 
319 	i = netdev_lock_pos(dev->type);
320 	lockdep_set_class_and_name(&dev->addr_list_lock,
321 				   &netdev_addr_lock_key[i],
322 				   netdev_lock_name[i]);
323 }
324 #else
325 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326 						 unsigned short dev_type)
327 {
328 }
329 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
330 {
331 }
332 #endif
333 
334 /*******************************************************************************
335 
336 		Protocol management and registration routines
337 
338 *******************************************************************************/
339 
340 /*
341  *	Add a protocol ID to the list. Now that the input handler is
342  *	smarter we can dispense with all the messy stuff that used to be
343  *	here.
344  *
345  *	BEWARE!!! Protocol handlers, mangling input packets,
346  *	MUST BE last in hash buckets and checking protocol handlers
347  *	MUST start from promiscuous ptype_all chain in net_bh.
348  *	It is true now, do not change it.
349  *	Explanation follows: if protocol handler, mangling packet, will
350  *	be the first on list, it is not able to sense, that packet
351  *	is cloned and should be copied-on-write, so that it will
352  *	change it and subsequent readers will get broken packet.
353  *							--ANK (980803)
354  */
355 
356 static inline struct list_head *ptype_head(const struct packet_type *pt)
357 {
358 	if (pt->type == htons(ETH_P_ALL))
359 		return &ptype_all;
360 	else
361 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
362 }
363 
364 /**
365  *	dev_add_pack - add packet handler
366  *	@pt: packet type declaration
367  *
368  *	Add a protocol handler to the networking stack. The passed &packet_type
369  *	is linked into kernel lists and may not be freed until it has been
370  *	removed from the kernel lists.
371  *
372  *	This call does not sleep therefore it can not
373  *	guarantee all CPU's that are in middle of receiving packets
374  *	will see the new packet type (until the next received packet).
375  */
376 
377 void dev_add_pack(struct packet_type *pt)
378 {
379 	struct list_head *head = ptype_head(pt);
380 
381 	spin_lock(&ptype_lock);
382 	list_add_rcu(&pt->list, head);
383 	spin_unlock(&ptype_lock);
384 }
385 EXPORT_SYMBOL(dev_add_pack);
386 
387 /**
388  *	__dev_remove_pack	 - remove packet handler
389  *	@pt: packet type declaration
390  *
391  *	Remove a protocol handler that was previously added to the kernel
392  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
393  *	from the kernel lists and can be freed or reused once this function
394  *	returns.
395  *
396  *      The packet type might still be in use by receivers
397  *	and must not be freed until after all the CPU's have gone
398  *	through a quiescent state.
399  */
400 void __dev_remove_pack(struct packet_type *pt)
401 {
402 	struct list_head *head = ptype_head(pt);
403 	struct packet_type *pt1;
404 
405 	spin_lock(&ptype_lock);
406 
407 	list_for_each_entry(pt1, head, list) {
408 		if (pt == pt1) {
409 			list_del_rcu(&pt->list);
410 			goto out;
411 		}
412 	}
413 
414 	pr_warn("dev_remove_pack: %p not found\n", pt);
415 out:
416 	spin_unlock(&ptype_lock);
417 }
418 EXPORT_SYMBOL(__dev_remove_pack);
419 
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Unlink a protocol handler that was previously added with
 *	dev_add_pack() and then call synchronize_net().  Once this function
 *	returns, @pt can be freed or reused.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	/* unlink first, then wait for the receivers still using it */
	__dev_remove_pack(pt);
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
439 
440 
441 /**
442  *	dev_add_offload - register offload handlers
443  *	@po: protocol offload declaration
444  *
445  *	Add protocol offload handlers to the networking stack. The passed
446  *	&proto_offload is linked into kernel lists and may not be freed until
447  *	it has been removed from the kernel lists.
448  *
449  *	This call does not sleep therefore it can not
450  *	guarantee all CPU's that are in middle of receiving packets
451  *	will see the new offload handlers (until the next received packet).
452  */
453 void dev_add_offload(struct packet_offload *po)
454 {
455 	struct list_head *head = &offload_base;
456 
457 	spin_lock(&offload_lock);
458 	list_add_rcu(&po->list, head);
459 	spin_unlock(&offload_lock);
460 }
461 EXPORT_SYMBOL(dev_add_offload);
462 
463 /**
464  *	__dev_remove_offload	 - remove offload handler
465  *	@po: packet offload declaration
466  *
467  *	Remove a protocol offload handler that was previously added to the
468  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
469  *	is removed from the kernel lists and can be freed or reused once this
470  *	function returns.
471  *
472  *      The packet type might still be in use by receivers
473  *	and must not be freed until after all the CPU's have gone
474  *	through a quiescent state.
475  */
476 void __dev_remove_offload(struct packet_offload *po)
477 {
478 	struct list_head *head = &offload_base;
479 	struct packet_offload *po1;
480 
481 	spin_lock(&offload_lock);
482 
483 	list_for_each_entry(po1, head, list) {
484 		if (po == po1) {
485 			list_del_rcu(&po->list);
486 			goto out;
487 		}
488 	}
489 
490 	pr_warn("dev_remove_offload: %p not found\n", po);
491 out:
492 	spin_unlock(&offload_lock);
493 }
494 EXPORT_SYMBOL(__dev_remove_offload);
495 
/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Unlink a packet offload handler that was previously added with
 *	dev_add_offload() and then call synchronize_net().  Once this
 *	function returns, @po can be freed or reused.
 *
 *	This call sleeps to guarantee that no CPU is looking at the offload
 *	handler after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	/* unlink first, then wait for the receivers still using it */
	__dev_remove_offload(po);
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
515 
516 /******************************************************************************
517 
518 		      Device Boot-time Settings Routines
519 
520 *******************************************************************************/
521 
522 /* Boot time configuration table */
523 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
524 
525 /**
526  *	netdev_boot_setup_add	- add new setup entry
527  *	@name: name of the device
528  *	@map: configured settings for the device
529  *
530  *	Adds new setup entry to the dev_boot_setup list.  The function
531  *	returns 0 on error and 1 on success.  This is a generic routine to
532  *	all netdevices.
533  */
534 static int netdev_boot_setup_add(char *name, struct ifmap *map)
535 {
536 	struct netdev_boot_setup *s;
537 	int i;
538 
539 	s = dev_boot_setup;
540 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
541 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
542 			memset(s[i].name, 0, sizeof(s[i].name));
543 			strlcpy(s[i].name, name, IFNAMSIZ);
544 			memcpy(&s[i].map, map, sizeof(s[i].map));
545 			break;
546 		}
547 	}
548 
549 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
550 }
551 
552 /**
553  *	netdev_boot_setup_check	- check boot time settings
554  *	@dev: the netdevice
555  *
556  * 	Check boot time settings for the device.
557  *	The found settings are set for the device to be used
558  *	later in the device probing.
559  *	Returns 0 if no settings found, 1 if they are.
560  */
561 int netdev_boot_setup_check(struct net_device *dev)
562 {
563 	struct netdev_boot_setup *s = dev_boot_setup;
564 	int i;
565 
566 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
567 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
568 		    !strcmp(dev->name, s[i].name)) {
569 			dev->irq 	= s[i].map.irq;
570 			dev->base_addr 	= s[i].map.base_addr;
571 			dev->mem_start 	= s[i].map.mem_start;
572 			dev->mem_end 	= s[i].map.mem_end;
573 			return 1;
574 		}
575 	}
576 	return 0;
577 }
578 EXPORT_SYMBOL(netdev_boot_setup_check);
579 
580 
581 /**
582  *	netdev_boot_base	- get address from boot time settings
583  *	@prefix: prefix for network device
584  *	@unit: id for network device
585  *
586  * 	Check boot time settings for the base address of device.
587  *	The found settings are set for the device to be used
588  *	later in the device probing.
589  *	Returns 0 if no settings found.
590  */
591 unsigned long netdev_boot_base(const char *prefix, int unit)
592 {
593 	const struct netdev_boot_setup *s = dev_boot_setup;
594 	char name[IFNAMSIZ];
595 	int i;
596 
597 	sprintf(name, "%s%d", prefix, unit);
598 
599 	/*
600 	 * If device already registered then return base of 1
601 	 * to indicate not to probe for this interface
602 	 */
603 	if (__dev_get_by_name(&init_net, name))
604 		return 1;
605 
606 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
607 		if (!strcmp(name, s[i].name))
608 			return s[i].map.base_addr;
609 	return 0;
610 }
611 
612 /*
613  * Saves at boot time configured settings for any netdevice.
614  */
615 int __init netdev_boot_setup(char *str)
616 {
617 	int ints[5];
618 	struct ifmap map;
619 
620 	str = get_options(str, ARRAY_SIZE(ints), ints);
621 	if (!str || !*str)
622 		return 0;
623 
624 	/* Save settings */
625 	memset(&map, 0, sizeof(map));
626 	if (ints[0] > 0)
627 		map.irq = ints[1];
628 	if (ints[0] > 1)
629 		map.base_addr = ints[2];
630 	if (ints[0] > 2)
631 		map.mem_start = ints[3];
632 	if (ints[0] > 3)
633 		map.mem_end = ints[4];
634 
635 	/* Add new entry to the list */
636 	return netdev_boot_setup_add(str, &map);
637 }
638 
639 __setup("netdev=", netdev_boot_setup);
640 
641 /*******************************************************************************
642 
643 			    Device Interface Subroutines
644 
645 *******************************************************************************/
646 
647 /**
648  *	__dev_get_by_name	- find a device by its name
649  *	@net: the applicable net namespace
650  *	@name: name to find
651  *
652  *	Find an interface by name. Must be called under RTNL semaphore
653  *	or @dev_base_lock. If the name is found a pointer to the device
654  *	is returned. If the name is not found then %NULL is returned. The
655  *	reference counters are not incremented so the caller must be
656  *	careful with locks.
657  */
658 
659 struct net_device *__dev_get_by_name(struct net *net, const char *name)
660 {
661 	struct hlist_node *p;
662 	struct net_device *dev;
663 	struct hlist_head *head = dev_name_hash(net, name);
664 
665 	hlist_for_each_entry(dev, p, head, name_hlist)
666 		if (!strncmp(dev->name, name, IFNAMSIZ))
667 			return dev;
668 
669 	return NULL;
670 }
671 EXPORT_SYMBOL(__dev_get_by_name);
672 
673 /**
674  *	dev_get_by_name_rcu	- find a device by its name
675  *	@net: the applicable net namespace
676  *	@name: name to find
677  *
678  *	Find an interface by name.
679  *	If the name is found a pointer to the device is returned.
680  * 	If the name is not found then %NULL is returned.
681  *	The reference counters are not incremented so the caller must be
682  *	careful with locks. The caller must hold RCU lock.
683  */
684 
685 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
686 {
687 	struct hlist_node *p;
688 	struct net_device *dev;
689 	struct hlist_head *head = dev_name_hash(net, name);
690 
691 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
692 		if (!strncmp(dev->name, name, IFNAMSIZ))
693 			return dev;
694 
695 	return NULL;
696 }
697 EXPORT_SYMBOL(dev_get_by_name_rcu);
698 
699 /**
700  *	dev_get_by_name		- find a device by its name
701  *	@net: the applicable net namespace
702  *	@name: name to find
703  *
704  *	Find an interface by name. This can be called from any
705  *	context and does its own locking. The returned handle has
706  *	the usage count incremented and the caller must use dev_put() to
707  *	release it when it is no longer needed. %NULL is returned if no
708  *	matching device is found.
709  */
710 
711 struct net_device *dev_get_by_name(struct net *net, const char *name)
712 {
713 	struct net_device *dev;
714 
715 	rcu_read_lock();
716 	dev = dev_get_by_name_rcu(net, name);
717 	if (dev)
718 		dev_hold(dev);
719 	rcu_read_unlock();
720 	return dev;
721 }
722 EXPORT_SYMBOL(dev_get_by_name);
723 
724 /**
725  *	__dev_get_by_index - find a device by its ifindex
726  *	@net: the applicable net namespace
727  *	@ifindex: index of device
728  *
729  *	Search for an interface by index. Returns %NULL if the device
730  *	is not found or a pointer to the device. The device has not
731  *	had its reference counter increased so the caller must be careful
732  *	about locking. The caller must hold either the RTNL semaphore
733  *	or @dev_base_lock.
734  */
735 
736 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
737 {
738 	struct hlist_node *p;
739 	struct net_device *dev;
740 	struct hlist_head *head = dev_index_hash(net, ifindex);
741 
742 	hlist_for_each_entry(dev, p, head, index_hlist)
743 		if (dev->ifindex == ifindex)
744 			return dev;
745 
746 	return NULL;
747 }
748 EXPORT_SYMBOL(__dev_get_by_index);
749 
750 /**
751  *	dev_get_by_index_rcu - find a device by its ifindex
752  *	@net: the applicable net namespace
753  *	@ifindex: index of device
754  *
755  *	Search for an interface by index. Returns %NULL if the device
756  *	is not found or a pointer to the device. The device has not
757  *	had its reference counter increased so the caller must be careful
758  *	about locking. The caller must hold RCU lock.
759  */
760 
761 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
762 {
763 	struct hlist_node *p;
764 	struct net_device *dev;
765 	struct hlist_head *head = dev_index_hash(net, ifindex);
766 
767 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
768 		if (dev->ifindex == ifindex)
769 			return dev;
770 
771 	return NULL;
772 }
773 EXPORT_SYMBOL(dev_get_by_index_rcu);
774 
775 
776 /**
777  *	dev_get_by_index - find a device by its ifindex
778  *	@net: the applicable net namespace
779  *	@ifindex: index of device
780  *
781  *	Search for an interface by index. Returns NULL if the device
782  *	is not found or a pointer to the device. The device returned has
783  *	had a reference added and the pointer is safe until the user calls
784  *	dev_put to indicate they have finished with it.
785  */
786 
787 struct net_device *dev_get_by_index(struct net *net, int ifindex)
788 {
789 	struct net_device *dev;
790 
791 	rcu_read_lock();
792 	dev = dev_get_by_index_rcu(net, ifindex);
793 	if (dev)
794 		dev_hold(dev);
795 	rcu_read_unlock();
796 	return dev;
797 }
798 EXPORT_SYMBOL(dev_get_by_index);
799 
800 /**
801  *	dev_getbyhwaddr_rcu - find a device by its hardware address
802  *	@net: the applicable net namespace
803  *	@type: media type of device
804  *	@ha: hardware address
805  *
806  *	Search for an interface by MAC address. Returns NULL if the device
807  *	is not found or a pointer to the device.
808  *	The caller must hold RCU or RTNL.
809  *	The returned device has not had its ref count increased
810  *	and the caller must therefore be careful about locking
811  *
812  */
813 
814 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
815 				       const char *ha)
816 {
817 	struct net_device *dev;
818 
819 	for_each_netdev_rcu(net, dev)
820 		if (dev->type == type &&
821 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
822 			return dev;
823 
824 	return NULL;
825 }
826 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
827 
828 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
829 {
830 	struct net_device *dev;
831 
832 	ASSERT_RTNL();
833 	for_each_netdev(net, dev)
834 		if (dev->type == type)
835 			return dev;
836 
837 	return NULL;
838 }
839 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
840 
841 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
842 {
843 	struct net_device *dev, *ret = NULL;
844 
845 	rcu_read_lock();
846 	for_each_netdev_rcu(net, dev)
847 		if (dev->type == type) {
848 			dev_hold(dev);
849 			ret = dev;
850 			break;
851 		}
852 	rcu_read_unlock();
853 	return ret;
854 }
855 EXPORT_SYMBOL(dev_getfirstbyhwtype);
856 
857 /**
858  *	dev_get_by_flags_rcu - find any device with given flags
859  *	@net: the applicable net namespace
860  *	@if_flags: IFF_* values
861  *	@mask: bitmask of bits in if_flags to check
862  *
863  *	Search for any interface with the given flags. Returns NULL if a device
864  *	is not found or a pointer to the device. Must be called inside
865  *	rcu_read_lock(), and result refcount is unchanged.
866  */
867 
868 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
869 				    unsigned short mask)
870 {
871 	struct net_device *dev, *ret;
872 
873 	ret = NULL;
874 	for_each_netdev_rcu(net, dev) {
875 		if (((dev->flags ^ if_flags) & mask) == 0) {
876 			ret = dev;
877 			break;
878 		}
879 	}
880 	return ret;
881 }
882 EXPORT_SYMBOL(dev_get_by_flags_rcu);
883 
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 *
 *	Returns false for an empty name, a name that does not fit in
 *	IFNAMSIZ bytes including its terminating NUL, the names "." and
 *	"..", and any name containing '/' or whitespace; true otherwise.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	/*
	 * strnlen() never reads more than IFNAMSIZ bytes, so an over-long
	 * or unterminated buffer is rejected without scanning past it.
	 */
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
908 EXPORT_SYMBOL(dev_valid_name);
909 
910 /**
911  *	__dev_alloc_name - allocate a name for a device
912  *	@net: network namespace to allocate the device name in
913  *	@name: name format string
914  *	@buf:  scratch buffer and result name string
915  *
916  *	Passed a format string - eg "lt%d" it will try and find a suitable
917  *	id. It scans list of devices to build up a free map, then chooses
918  *	the first empty slot. The caller must hold the dev_base or rtnl lock
919  *	while allocating the name and adding the device in order to avoid
920  *	duplicates.
921  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
922  *	Returns the number of the unit assigned or a negative errno code.
923  */
924 
925 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
926 {
927 	int i = 0;
928 	const char *p;
929 	const int max_netdevices = 8*PAGE_SIZE;
930 	unsigned long *inuse;
931 	struct net_device *d;
932 
933 	p = strnchr(name, IFNAMSIZ-1, '%');
934 	if (p) {
935 		/*
936 		 * Verify the string as this thing may have come from
937 		 * the user.  There must be either one "%d" and no other "%"
938 		 * characters.
939 		 */
940 		if (p[1] != 'd' || strchr(p + 2, '%'))
941 			return -EINVAL;
942 
943 		/* Use one page as a bit array of possible slots */
944 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
945 		if (!inuse)
946 			return -ENOMEM;
947 
948 		for_each_netdev(net, d) {
949 			if (!sscanf(d->name, name, &i))
950 				continue;
951 			if (i < 0 || i >= max_netdevices)
952 				continue;
953 
954 			/*  avoid cases where sscanf is not exact inverse of printf */
955 			snprintf(buf, IFNAMSIZ, name, i);
956 			if (!strncmp(buf, d->name, IFNAMSIZ))
957 				set_bit(i, inuse);
958 		}
959 
960 		i = find_first_zero_bit(inuse, max_netdevices);
961 		free_page((unsigned long) inuse);
962 	}
963 
964 	if (buf != name)
965 		snprintf(buf, IFNAMSIZ, name, i);
966 	if (!__dev_get_by_name(net, buf))
967 		return i;
968 
969 	/* It is possible to run out of possible slots
970 	 * when the name is long and there isn't enough space left
971 	 * for the digits, or if all bits are used.
972 	 */
973 	return -ENFILE;
974 }
975 
976 /**
977  *	dev_alloc_name - allocate a name for a device
978  *	@dev: device
979  *	@name: name format string
980  *
981  *	Passed a format string - eg "lt%d" it will try and find a suitable
982  *	id. It scans list of devices to build up a free map, then chooses
983  *	the first empty slot. The caller must hold the dev_base or rtnl lock
984  *	while allocating the name and adding the device in order to avoid
985  *	duplicates.
986  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
987  *	Returns the number of the unit assigned or a negative errno code.
988  */
989 
990 int dev_alloc_name(struct net_device *dev, const char *name)
991 {
992 	char buf[IFNAMSIZ];
993 	struct net *net;
994 	int ret;
995 
996 	BUG_ON(!dev_net(dev));
997 	net = dev_net(dev);
998 	ret = __dev_alloc_name(net, name, buf);
999 	if (ret >= 0)
1000 		strlcpy(dev->name, buf, IFNAMSIZ);
1001 	return ret;
1002 }
1003 EXPORT_SYMBOL(dev_alloc_name);
1004 
1005 static int dev_alloc_name_ns(struct net *net,
1006 			     struct net_device *dev,
1007 			     const char *name)
1008 {
1009 	char buf[IFNAMSIZ];
1010 	int ret;
1011 
1012 	ret = __dev_alloc_name(net, name, buf);
1013 	if (ret >= 0)
1014 		strlcpy(dev->name, buf, IFNAMSIZ);
1015 	return ret;
1016 }
1017 
1018 static int dev_get_valid_name(struct net *net,
1019 			      struct net_device *dev,
1020 			      const char *name)
1021 {
1022 	BUG_ON(!net);
1023 
1024 	if (!dev_valid_name(name))
1025 		return -EINVAL;
1026 
1027 	if (strchr(name, '%'))
1028 		return dev_alloc_name_ns(net, dev, name);
1029 	else if (__dev_get_by_name(net, name))
1030 		return -EEXIST;
1031 	else if (dev->name != name)
1032 		strlcpy(dev->name, name, IFNAMSIZ);
1033 
1034 	return 0;
1035 }
1036 
1037 /**
1038  *	dev_change_name - change name of a device
1039  *	@dev: device
1040  *	@newname: name (or format string) must be at least IFNAMSIZ
1041  *
1042  *	Change name of a device, can pass format strings "eth%d".
1043  *	for wildcarding.
1044  */
1045 int dev_change_name(struct net_device *dev, const char *newname)
1046 {
1047 	char oldname[IFNAMSIZ];
1048 	int err = 0;
1049 	int ret;
1050 	struct net *net;
1051 
1052 	ASSERT_RTNL();
1053 	BUG_ON(!dev_net(dev));
1054 
1055 	net = dev_net(dev);
1056 	if (dev->flags & IFF_UP)
1057 		return -EBUSY;
1058 
1059 	write_seqcount_begin(&devnet_rename_seq);
1060 
1061 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1062 		write_seqcount_end(&devnet_rename_seq);
1063 		return 0;
1064 	}
1065 
1066 	memcpy(oldname, dev->name, IFNAMSIZ);
1067 
1068 	err = dev_get_valid_name(net, dev, newname);
1069 	if (err < 0) {
1070 		write_seqcount_end(&devnet_rename_seq);
1071 		return err;
1072 	}
1073 
1074 rollback:
1075 	ret = device_rename(&dev->dev, dev->name);
1076 	if (ret) {
1077 		memcpy(dev->name, oldname, IFNAMSIZ);
1078 		write_seqcount_end(&devnet_rename_seq);
1079 		return ret;
1080 	}
1081 
1082 	write_seqcount_end(&devnet_rename_seq);
1083 
1084 	write_lock_bh(&dev_base_lock);
1085 	hlist_del_rcu(&dev->name_hlist);
1086 	write_unlock_bh(&dev_base_lock);
1087 
1088 	synchronize_rcu();
1089 
1090 	write_lock_bh(&dev_base_lock);
1091 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1092 	write_unlock_bh(&dev_base_lock);
1093 
1094 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1095 	ret = notifier_to_errno(ret);
1096 
1097 	if (ret) {
1098 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1099 		if (err >= 0) {
1100 			err = ret;
1101 			write_seqcount_begin(&devnet_rename_seq);
1102 			memcpy(dev->name, oldname, IFNAMSIZ);
1103 			goto rollback;
1104 		} else {
1105 			pr_err("%s: name change rollback failed: %d\n",
1106 			       dev->name, ret);
1107 		}
1108 	}
1109 
1110 	return err;
1111 }
1112 
1113 /**
1114  *	dev_set_alias - change ifalias of a device
1115  *	@dev: device
1116  *	@alias: name up to IFALIASZ
1117  *	@len: limit of bytes to copy from info
1118  *
1119  *	Set ifalias for a device,
1120  */
1121 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1122 {
1123 	char *new_ifalias;
1124 
1125 	ASSERT_RTNL();
1126 
1127 	if (len >= IFALIASZ)
1128 		return -EINVAL;
1129 
1130 	if (!len) {
1131 		kfree(dev->ifalias);
1132 		dev->ifalias = NULL;
1133 		return 0;
1134 	}
1135 
1136 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1137 	if (!new_ifalias)
1138 		return -ENOMEM;
1139 	dev->ifalias = new_ifalias;
1140 
1141 	strlcpy(dev->ifalias, alias, len+1);
1142 	return len;
1143 }
1144 
1145 
1146 /**
1147  *	netdev_features_change - device changes features
1148  *	@dev: device to cause notification
1149  *
1150  *	Called to indicate a device has changed features.
1151  */
1152 void netdev_features_change(struct net_device *dev)
1153 {
1154 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1155 }
1156 EXPORT_SYMBOL(netdev_features_change);
1157 
1158 /**
1159  *	netdev_state_change - device changes state
1160  *	@dev: device to cause notification
1161  *
1162  *	Called to indicate a device has changed state. This function calls
1163  *	the notifier chains for netdev_chain and sends a NEWLINK message
1164  *	to the routing socket.
1165  */
1166 void netdev_state_change(struct net_device *dev)
1167 {
1168 	if (dev->flags & IFF_UP) {
1169 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1170 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1171 	}
1172 }
1173 EXPORT_SYMBOL(netdev_state_change);
1174 
1175 /**
1176  * 	netdev_notify_peers - notify network peers about existence of @dev
1177  * 	@dev: network device
1178  *
1179  * Generate traffic such that interested network peers are aware of
1180  * @dev, such as by generating a gratuitous ARP. This may be used when
1181  * a device wants to inform the rest of the network about some sort of
1182  * reconfiguration such as a failover event or virtual machine
1183  * migration.
1184  */
1185 void netdev_notify_peers(struct net_device *dev)
1186 {
1187 	rtnl_lock();
1188 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1189 	rtnl_unlock();
1190 }
1191 EXPORT_SYMBOL(netdev_notify_peers);
1192 
1193 static int __dev_open(struct net_device *dev)
1194 {
1195 	const struct net_device_ops *ops = dev->netdev_ops;
1196 	int ret;
1197 
1198 	ASSERT_RTNL();
1199 
1200 	if (!netif_device_present(dev))
1201 		return -ENODEV;
1202 
1203 	/* Block netpoll from trying to do any rx path servicing.
1204 	 * If we don't do this there is a chance ndo_poll_controller
1205 	 * or ndo_poll may be running while we open the device
1206 	 */
1207 	ret = netpoll_rx_disable(dev);
1208 	if (ret)
1209 		return ret;
1210 
1211 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1212 	ret = notifier_to_errno(ret);
1213 	if (ret)
1214 		return ret;
1215 
1216 	set_bit(__LINK_STATE_START, &dev->state);
1217 
1218 	if (ops->ndo_validate_addr)
1219 		ret = ops->ndo_validate_addr(dev);
1220 
1221 	if (!ret && ops->ndo_open)
1222 		ret = ops->ndo_open(dev);
1223 
1224 	netpoll_rx_enable(dev);
1225 
1226 	if (ret)
1227 		clear_bit(__LINK_STATE_START, &dev->state);
1228 	else {
1229 		dev->flags |= IFF_UP;
1230 		net_dmaengine_get();
1231 		dev_set_rx_mode(dev);
1232 		dev_activate(dev);
1233 		add_device_randomness(dev->dev_addr, dev->addr_len);
1234 	}
1235 
1236 	return ret;
1237 }
1238 
1239 /**
1240  *	dev_open	- prepare an interface for use.
1241  *	@dev:	device to open
1242  *
1243  *	Takes a device from down to up state. The device's private open
1244  *	function is invoked and then the multicast lists are loaded. Finally
1245  *	the device is moved into the up state and a %NETDEV_UP message is
1246  *	sent to the netdev notifier chain.
1247  *
1248  *	Calling this function on an active interface is a nop. On a failure
1249  *	a negative errno code is returned.
1250  */
1251 int dev_open(struct net_device *dev)
1252 {
1253 	int ret;
1254 
1255 	if (dev->flags & IFF_UP)
1256 		return 0;
1257 
1258 	ret = __dev_open(dev);
1259 	if (ret < 0)
1260 		return ret;
1261 
1262 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1263 	call_netdevice_notifiers(NETDEV_UP, dev);
1264 
1265 	return ret;
1266 }
1267 EXPORT_SYMBOL(dev_open);
1268 
1269 static int __dev_close_many(struct list_head *head)
1270 {
1271 	struct net_device *dev;
1272 
1273 	ASSERT_RTNL();
1274 	might_sleep();
1275 
1276 	list_for_each_entry(dev, head, unreg_list) {
1277 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1278 
1279 		clear_bit(__LINK_STATE_START, &dev->state);
1280 
1281 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1282 		 * can be even on different cpu. So just clear netif_running().
1283 		 *
1284 		 * dev->stop() will invoke napi_disable() on all of it's
1285 		 * napi_struct instances on this device.
1286 		 */
1287 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1288 	}
1289 
1290 	dev_deactivate_many(head);
1291 
1292 	list_for_each_entry(dev, head, unreg_list) {
1293 		const struct net_device_ops *ops = dev->netdev_ops;
1294 
1295 		/*
1296 		 *	Call the device specific close. This cannot fail.
1297 		 *	Only if device is UP
1298 		 *
1299 		 *	We allow it to be called even after a DETACH hot-plug
1300 		 *	event.
1301 		 */
1302 		if (ops->ndo_stop)
1303 			ops->ndo_stop(dev);
1304 
1305 		dev->flags &= ~IFF_UP;
1306 		net_dmaengine_put();
1307 	}
1308 
1309 	return 0;
1310 }
1311 
1312 static int __dev_close(struct net_device *dev)
1313 {
1314 	int retval;
1315 	LIST_HEAD(single);
1316 
1317 	/* Temporarily disable netpoll until the interface is down */
1318 	retval = netpoll_rx_disable(dev);
1319 	if (retval)
1320 		return retval;
1321 
1322 	list_add(&dev->unreg_list, &single);
1323 	retval = __dev_close_many(&single);
1324 	list_del(&single);
1325 
1326 	netpoll_rx_enable(dev);
1327 	return retval;
1328 }
1329 
1330 static int dev_close_many(struct list_head *head)
1331 {
1332 	struct net_device *dev, *tmp;
1333 	LIST_HEAD(tmp_list);
1334 
1335 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1336 		if (!(dev->flags & IFF_UP))
1337 			list_move(&dev->unreg_list, &tmp_list);
1338 
1339 	__dev_close_many(head);
1340 
1341 	list_for_each_entry(dev, head, unreg_list) {
1342 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1343 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1344 	}
1345 
1346 	/* rollback_registered_many needs the complete original list */
1347 	list_splice(&tmp_list, head);
1348 	return 0;
1349 }
1350 
1351 /**
1352  *	dev_close - shutdown an interface.
1353  *	@dev: device to shutdown
1354  *
1355  *	This function moves an active device into down state. A
1356  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1357  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1358  *	chain.
1359  */
1360 int dev_close(struct net_device *dev)
1361 {
1362 	int ret = 0;
1363 	if (dev->flags & IFF_UP) {
1364 		LIST_HEAD(single);
1365 
1366 		/* Block netpoll rx while the interface is going down */
1367 		ret = netpoll_rx_disable(dev);
1368 		if (ret)
1369 			return ret;
1370 
1371 		list_add(&dev->unreg_list, &single);
1372 		dev_close_many(&single);
1373 		list_del(&single);
1374 
1375 		netpoll_rx_enable(dev);
1376 	}
1377 	return ret;
1378 }
1379 EXPORT_SYMBOL(dev_close);
1380 
1381 
1382 /**
1383  *	dev_disable_lro - disable Large Receive Offload on a device
1384  *	@dev: device
1385  *
1386  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1387  *	called under RTNL.  This is needed if received packets may be
1388  *	forwarded to another interface.
1389  */
1390 void dev_disable_lro(struct net_device *dev)
1391 {
1392 	/*
1393 	 * If we're trying to disable lro on a vlan device
1394 	 * use the underlying physical device instead
1395 	 */
1396 	if (is_vlan_dev(dev))
1397 		dev = vlan_dev_real_dev(dev);
1398 
1399 	dev->wanted_features &= ~NETIF_F_LRO;
1400 	netdev_update_features(dev);
1401 
1402 	if (unlikely(dev->features & NETIF_F_LRO))
1403 		netdev_WARN(dev, "failed to disable LRO!\n");
1404 }
1405 EXPORT_SYMBOL(dev_disable_lro);
1406 
1407 
1408 static int dev_boot_phase = 1;
1409 
1410 /**
1411  *	register_netdevice_notifier - register a network notifier block
1412  *	@nb: notifier
1413  *
1414  *	Register a notifier to be called when network device events occur.
1415  *	The notifier passed is linked into the kernel structures and must
1416  *	not be reused until it has been unregistered. A negative errno code
1417  *	is returned on a failure.
1418  *
1419  * 	When registered all registration and up events are replayed
1420  *	to the new notifier to allow device to have a race free
1421  *	view of the network device list.
1422  */
1423 
1424 int register_netdevice_notifier(struct notifier_block *nb)
1425 {
1426 	struct net_device *dev;
1427 	struct net_device *last;
1428 	struct net *net;
1429 	int err;
1430 
1431 	rtnl_lock();
1432 	err = raw_notifier_chain_register(&netdev_chain, nb);
1433 	if (err)
1434 		goto unlock;
1435 	if (dev_boot_phase)
1436 		goto unlock;
1437 	for_each_net(net) {
1438 		for_each_netdev(net, dev) {
1439 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1440 			err = notifier_to_errno(err);
1441 			if (err)
1442 				goto rollback;
1443 
1444 			if (!(dev->flags & IFF_UP))
1445 				continue;
1446 
1447 			nb->notifier_call(nb, NETDEV_UP, dev);
1448 		}
1449 	}
1450 
1451 unlock:
1452 	rtnl_unlock();
1453 	return err;
1454 
1455 rollback:
1456 	last = dev;
1457 	for_each_net(net) {
1458 		for_each_netdev(net, dev) {
1459 			if (dev == last)
1460 				goto outroll;
1461 
1462 			if (dev->flags & IFF_UP) {
1463 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1464 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1465 			}
1466 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1467 		}
1468 	}
1469 
1470 outroll:
1471 	raw_notifier_chain_unregister(&netdev_chain, nb);
1472 	goto unlock;
1473 }
1474 EXPORT_SYMBOL(register_netdevice_notifier);
1475 
1476 /**
1477  *	unregister_netdevice_notifier - unregister a network notifier block
1478  *	@nb: notifier
1479  *
1480  *	Unregister a notifier previously registered by
1481  *	register_netdevice_notifier(). The notifier is unlinked into the
1482  *	kernel structures and may then be reused. A negative errno code
1483  *	is returned on a failure.
1484  *
1485  * 	After unregistering unregister and down device events are synthesized
1486  *	for all devices on the device list to the removed notifier to remove
1487  *	the need for special case cleanup code.
1488  */
1489 
1490 int unregister_netdevice_notifier(struct notifier_block *nb)
1491 {
1492 	struct net_device *dev;
1493 	struct net *net;
1494 	int err;
1495 
1496 	rtnl_lock();
1497 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1498 	if (err)
1499 		goto unlock;
1500 
1501 	for_each_net(net) {
1502 		for_each_netdev(net, dev) {
1503 			if (dev->flags & IFF_UP) {
1504 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1505 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1506 			}
1507 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1508 		}
1509 	}
1510 unlock:
1511 	rtnl_unlock();
1512 	return err;
1513 }
1514 EXPORT_SYMBOL(unregister_netdevice_notifier);
1515 
1516 /**
1517  *	call_netdevice_notifiers - call all network notifier blocks
1518  *      @val: value passed unmodified to notifier function
1519  *      @dev: net_device pointer passed unmodified to notifier function
1520  *
1521  *	Call all network notifier blocks.  Parameters and return value
1522  *	are as for raw_notifier_call_chain().
1523  */
1524 
1525 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1526 {
1527 	ASSERT_RTNL();
1528 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1529 }
1530 EXPORT_SYMBOL(call_netdevice_notifiers);
1531 
1532 static struct static_key netstamp_needed __read_mostly;
1533 #ifdef HAVE_JUMP_LABEL
1534 /* We are not allowed to call static_key_slow_dec() from irq context
1535  * If net_disable_timestamp() is called from irq context, defer the
1536  * static_key_slow_dec() calls.
1537  */
1538 static atomic_t netstamp_needed_deferred;
1539 #endif
1540 
1541 void net_enable_timestamp(void)
1542 {
1543 #ifdef HAVE_JUMP_LABEL
1544 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1545 
1546 	if (deferred) {
1547 		while (--deferred)
1548 			static_key_slow_dec(&netstamp_needed);
1549 		return;
1550 	}
1551 #endif
1552 	WARN_ON(in_interrupt());
1553 	static_key_slow_inc(&netstamp_needed);
1554 }
1555 EXPORT_SYMBOL(net_enable_timestamp);
1556 
1557 void net_disable_timestamp(void)
1558 {
1559 #ifdef HAVE_JUMP_LABEL
1560 	if (in_interrupt()) {
1561 		atomic_inc(&netstamp_needed_deferred);
1562 		return;
1563 	}
1564 #endif
1565 	static_key_slow_dec(&netstamp_needed);
1566 }
1567 EXPORT_SYMBOL(net_disable_timestamp);
1568 
1569 static inline void net_timestamp_set(struct sk_buff *skb)
1570 {
1571 	skb->tstamp.tv64 = 0;
1572 	if (static_key_false(&netstamp_needed))
1573 		__net_timestamp(skb);
1574 }
1575 
/*
 * Stamp SKB if timestamping is enabled, COND holds and SKB carries no
 * timestamp yet. SKB is evaluated more than once.
 *
 * Wrapped in do { } while (0) so that the macro behaves like a single
 * statement and can be used as the body of an unbraced if/else.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1581 
1582 static inline bool is_skb_forwardable(struct net_device *dev,
1583 				      struct sk_buff *skb)
1584 {
1585 	unsigned int len;
1586 
1587 	if (!(dev->flags & IFF_UP))
1588 		return false;
1589 
1590 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1591 	if (skb->len <= len)
1592 		return true;
1593 
1594 	/* if TSO is enabled, we don't care about the length as the packet
1595 	 * could be forwarded without being segmented before
1596 	 */
1597 	if (skb_is_gso(skb))
1598 		return true;
1599 
1600 	return false;
1601 }
1602 
1603 /**
1604  * dev_forward_skb - loopback an skb to another netif
1605  *
1606  * @dev: destination network device
1607  * @skb: buffer to forward
1608  *
1609  * return values:
1610  *	NET_RX_SUCCESS	(no congestion)
1611  *	NET_RX_DROP     (packet was dropped, but freed)
1612  *
1613  * dev_forward_skb can be used for injecting an skb from the
1614  * start_xmit function of one device into the receive queue
1615  * of another device.
1616  *
1617  * The receiving device may be in another namespace, so
1618  * we have to clear all information in the skb that could
1619  * impact namespace isolation.
1620  */
1621 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1622 {
1623 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1624 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1625 			atomic_long_inc(&dev->rx_dropped);
1626 			kfree_skb(skb);
1627 			return NET_RX_DROP;
1628 		}
1629 	}
1630 
1631 	skb_orphan(skb);
1632 	nf_reset(skb);
1633 
1634 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1635 		atomic_long_inc(&dev->rx_dropped);
1636 		kfree_skb(skb);
1637 		return NET_RX_DROP;
1638 	}
1639 	skb->skb_iif = 0;
1640 	skb->dev = dev;
1641 	skb_dst_drop(skb);
1642 	skb->tstamp.tv64 = 0;
1643 	skb->pkt_type = PACKET_HOST;
1644 	skb->protocol = eth_type_trans(skb, dev);
1645 	skb->mark = 0;
1646 	secpath_reset(skb);
1647 	nf_reset(skb);
1648 	return netif_rx(skb);
1649 }
1650 EXPORT_SYMBOL_GPL(dev_forward_skb);
1651 
1652 static inline int deliver_skb(struct sk_buff *skb,
1653 			      struct packet_type *pt_prev,
1654 			      struct net_device *orig_dev)
1655 {
1656 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1657 		return -ENOMEM;
1658 	atomic_inc(&skb->users);
1659 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1660 }
1661 
1662 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1663 {
1664 	if (!ptype->af_packet_priv || !skb->sk)
1665 		return false;
1666 
1667 	if (ptype->id_match)
1668 		return ptype->id_match(ptype, skb->sk);
1669 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1670 		return true;
1671 
1672 	return false;
1673 }
1674 
1675 /*
1676  *	Support routine. Sends outgoing frames to any network
1677  *	taps currently in use.
1678  */
1679 
1680 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1681 {
1682 	struct packet_type *ptype;
1683 	struct sk_buff *skb2 = NULL;
1684 	struct packet_type *pt_prev = NULL;
1685 
1686 	rcu_read_lock();
1687 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1688 		/* Never send packets back to the socket
1689 		 * they originated from - MvS (miquels@drinkel.ow.org)
1690 		 */
1691 		if ((ptype->dev == dev || !ptype->dev) &&
1692 		    (!skb_loop_sk(ptype, skb))) {
1693 			if (pt_prev) {
1694 				deliver_skb(skb2, pt_prev, skb->dev);
1695 				pt_prev = ptype;
1696 				continue;
1697 			}
1698 
1699 			skb2 = skb_clone(skb, GFP_ATOMIC);
1700 			if (!skb2)
1701 				break;
1702 
1703 			net_timestamp_set(skb2);
1704 
1705 			/* skb->nh should be correctly
1706 			   set by sender, so that the second statement is
1707 			   just protection against buggy protocols.
1708 			 */
1709 			skb_reset_mac_header(skb2);
1710 
1711 			if (skb_network_header(skb2) < skb2->data ||
1712 			    skb2->network_header > skb2->tail) {
1713 				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1714 						     ntohs(skb2->protocol),
1715 						     dev->name);
1716 				skb_reset_network_header(skb2);
1717 			}
1718 
1719 			skb2->transport_header = skb2->network_header;
1720 			skb2->pkt_type = PACKET_OUTGOING;
1721 			pt_prev = ptype;
1722 		}
1723 	}
1724 	if (pt_prev)
1725 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1726 	rcu_read_unlock();
1727 }
1728 
1729 /**
1730  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1731  * @dev: Network device
1732  * @txq: number of queues available
1733  *
1734  * If real_num_tx_queues is changed the tc mappings may no longer be
1735  * valid. To resolve this verify the tc mapping remains valid and if
1736  * not NULL the mapping. With no priorities mapping to this
1737  * offset/count pair it will no longer be used. In the worst case TC0
1738  * is invalid nothing can be done so disable priority mappings. If is
1739  * expected that drivers will fix this mapping if they can before
1740  * calling netif_set_real_num_tx_queues.
1741  */
1742 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1743 {
1744 	int i;
1745 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1746 
1747 	/* If TC0 is invalidated disable TC mapping */
1748 	if (tc->offset + tc->count > txq) {
1749 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1750 		dev->num_tc = 0;
1751 		return;
1752 	}
1753 
1754 	/* Invalidated prio to tc mappings set to TC0 */
1755 	for (i = 1; i < TC_BITMASK + 1; i++) {
1756 		int q = netdev_get_prio_tc_map(dev, i);
1757 
1758 		tc = &dev->tc_to_txq[q];
1759 		if (tc->offset + tc->count > txq) {
1760 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1761 				i, q);
1762 			netdev_set_prio_tc_map(dev, i, 0);
1763 		}
1764 	}
1765 }
1766 
1767 #ifdef CONFIG_XPS
1768 static DEFINE_MUTEX(xps_map_mutex);
1769 #define xmap_dereference(P)		\
1770 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1771 
1772 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1773 					int cpu, u16 index)
1774 {
1775 	struct xps_map *map = NULL;
1776 	int pos;
1777 
1778 	if (dev_maps)
1779 		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1780 
1781 	for (pos = 0; map && pos < map->len; pos++) {
1782 		if (map->queues[pos] == index) {
1783 			if (map->len > 1) {
1784 				map->queues[pos] = map->queues[--map->len];
1785 			} else {
1786 				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1787 				kfree_rcu(map, rcu);
1788 				map = NULL;
1789 			}
1790 			break;
1791 		}
1792 	}
1793 
1794 	return map;
1795 }
1796 
1797 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1798 {
1799 	struct xps_dev_maps *dev_maps;
1800 	int cpu, i;
1801 	bool active = false;
1802 
1803 	mutex_lock(&xps_map_mutex);
1804 	dev_maps = xmap_dereference(dev->xps_maps);
1805 
1806 	if (!dev_maps)
1807 		goto out_no_maps;
1808 
1809 	for_each_possible_cpu(cpu) {
1810 		for (i = index; i < dev->num_tx_queues; i++) {
1811 			if (!remove_xps_queue(dev_maps, cpu, i))
1812 				break;
1813 		}
1814 		if (i == dev->num_tx_queues)
1815 			active = true;
1816 	}
1817 
1818 	if (!active) {
1819 		RCU_INIT_POINTER(dev->xps_maps, NULL);
1820 		kfree_rcu(dev_maps, rcu);
1821 	}
1822 
1823 	for (i = index; i < dev->num_tx_queues; i++)
1824 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1825 					     NUMA_NO_NODE);
1826 
1827 out_no_maps:
1828 	mutex_unlock(&xps_map_mutex);
1829 }
1830 
1831 static struct xps_map *expand_xps_map(struct xps_map *map,
1832 				      int cpu, u16 index)
1833 {
1834 	struct xps_map *new_map;
1835 	int alloc_len = XPS_MIN_MAP_ALLOC;
1836 	int i, pos;
1837 
1838 	for (pos = 0; map && pos < map->len; pos++) {
1839 		if (map->queues[pos] != index)
1840 			continue;
1841 		return map;
1842 	}
1843 
1844 	/* Need to add queue to this CPU's existing map */
1845 	if (map) {
1846 		if (pos < map->alloc_len)
1847 			return map;
1848 
1849 		alloc_len = map->alloc_len * 2;
1850 	}
1851 
1852 	/* Need to allocate new map to store queue on this CPU's map */
1853 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1854 			       cpu_to_node(cpu));
1855 	if (!new_map)
1856 		return NULL;
1857 
1858 	for (i = 0; i < pos; i++)
1859 		new_map->queues[i] = map->queues[i];
1860 	new_map->alloc_len = alloc_len;
1861 	new_map->len = pos;
1862 
1863 	return new_map;
1864 }
1865 
1866 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1867 {
1868 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1869 	struct xps_map *map, *new_map;
1870 	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1871 	int cpu, numa_node_id = -2;
1872 	bool active = false;
1873 
1874 	mutex_lock(&xps_map_mutex);
1875 
1876 	dev_maps = xmap_dereference(dev->xps_maps);
1877 
1878 	/* allocate memory for queue storage */
1879 	for_each_online_cpu(cpu) {
1880 		if (!cpumask_test_cpu(cpu, mask))
1881 			continue;
1882 
1883 		if (!new_dev_maps)
1884 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1885 		if (!new_dev_maps) {
1886 			mutex_unlock(&xps_map_mutex);
1887 			return -ENOMEM;
1888 		}
1889 
1890 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1891 				 NULL;
1892 
1893 		map = expand_xps_map(map, cpu, index);
1894 		if (!map)
1895 			goto error;
1896 
1897 		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1898 	}
1899 
1900 	if (!new_dev_maps)
1901 		goto out_no_new_maps;
1902 
1903 	for_each_possible_cpu(cpu) {
1904 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1905 			/* add queue to CPU maps */
1906 			int pos = 0;
1907 
1908 			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1909 			while ((pos < map->len) && (map->queues[pos] != index))
1910 				pos++;
1911 
1912 			if (pos == map->len)
1913 				map->queues[map->len++] = index;
1914 #ifdef CONFIG_NUMA
1915 			if (numa_node_id == -2)
1916 				numa_node_id = cpu_to_node(cpu);
1917 			else if (numa_node_id != cpu_to_node(cpu))
1918 				numa_node_id = -1;
1919 #endif
1920 		} else if (dev_maps) {
1921 			/* fill in the new device map from the old device map */
1922 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
1923 			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1924 		}
1925 
1926 	}
1927 
1928 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1929 
1930 	/* Cleanup old maps */
1931 	if (dev_maps) {
1932 		for_each_possible_cpu(cpu) {
1933 			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1934 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
1935 			if (map && map != new_map)
1936 				kfree_rcu(map, rcu);
1937 		}
1938 
1939 		kfree_rcu(dev_maps, rcu);
1940 	}
1941 
1942 	dev_maps = new_dev_maps;
1943 	active = true;
1944 
1945 out_no_new_maps:
1946 	/* update Tx queue numa node */
1947 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1948 				     (numa_node_id >= 0) ? numa_node_id :
1949 				     NUMA_NO_NODE);
1950 
1951 	if (!dev_maps)
1952 		goto out_no_maps;
1953 
1954 	/* removes queue from unused CPUs */
1955 	for_each_possible_cpu(cpu) {
1956 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1957 			continue;
1958 
1959 		if (remove_xps_queue(dev_maps, cpu, index))
1960 			active = true;
1961 	}
1962 
1963 	/* free map if not active */
1964 	if (!active) {
1965 		RCU_INIT_POINTER(dev->xps_maps, NULL);
1966 		kfree_rcu(dev_maps, rcu);
1967 	}
1968 
1969 out_no_maps:
1970 	mutex_unlock(&xps_map_mutex);
1971 
1972 	return 0;
1973 error:
1974 	/* remove any maps that we added */
1975 	for_each_possible_cpu(cpu) {
1976 		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1977 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1978 				 NULL;
1979 		if (new_map && new_map != map)
1980 			kfree(new_map);
1981 	}
1982 
1983 	mutex_unlock(&xps_map_mutex);
1984 
1985 	kfree(new_dev_maps);
1986 	return -ENOMEM;
1987 }
1988 EXPORT_SYMBOL(netif_set_xps_queue);
1989 
1990 #endif
1991 /*
1992  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1993  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1994  */
1995 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1996 {
1997 	int rc;
1998 
1999 	if (txq < 1 || txq > dev->num_tx_queues)
2000 		return -EINVAL;
2001 
2002 	if (dev->reg_state == NETREG_REGISTERED ||
2003 	    dev->reg_state == NETREG_UNREGISTERING) {
2004 		ASSERT_RTNL();
2005 
2006 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2007 						  txq);
2008 		if (rc)
2009 			return rc;
2010 
2011 		if (dev->num_tc)
2012 			netif_setup_tc(dev, txq);
2013 
2014 		if (txq < dev->real_num_tx_queues) {
2015 			qdisc_reset_all_tx_gt(dev, txq);
2016 #ifdef CONFIG_XPS
2017 			netif_reset_xps_queues_gt(dev, txq);
2018 #endif
2019 		}
2020 	}
2021 
2022 	dev->real_num_tx_queues = txq;
2023 	return 0;
2024 }
2025 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2026 
2027 #ifdef CONFIG_RPS
2028 /**
2029  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2030  *	@dev: Network device
2031  *	@rxq: Actual number of RX queues
2032  *
2033  *	This must be called either with the rtnl_lock held or before
2034  *	registration of the net device.  Returns 0 on success, or a
2035  *	negative error code.  If called before registration, it always
2036  *	succeeds.
2037  */
2038 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2039 {
2040 	int rc;
2041 
2042 	if (rxq < 1 || rxq > dev->num_rx_queues)
2043 		return -EINVAL;
2044 
2045 	if (dev->reg_state == NETREG_REGISTERED) {
2046 		ASSERT_RTNL();
2047 
2048 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2049 						  rxq);
2050 		if (rc)
2051 			return rc;
2052 	}
2053 
2054 	dev->real_num_rx_queues = rxq;
2055 	return 0;
2056 }
2057 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2058 #endif
2059 
2060 /**
2061  * netif_get_num_default_rss_queues - default number of RSS queues
2062  *
2063  * This routine should set an upper limit on the number of RSS queues
2064  * used by default by multiqueue devices.
2065  */
2066 int netif_get_num_default_rss_queues(void)
2067 {
2068 	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2069 }
2070 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2071 
2072 static inline void __netif_reschedule(struct Qdisc *q)
2073 {
2074 	struct softnet_data *sd;
2075 	unsigned long flags;
2076 
2077 	local_irq_save(flags);
2078 	sd = &__get_cpu_var(softnet_data);
2079 	q->next_sched = NULL;
2080 	*sd->output_queue_tailp = q;
2081 	sd->output_queue_tailp = &q->next_sched;
2082 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2083 	local_irq_restore(flags);
2084 }
2085 
2086 void __netif_schedule(struct Qdisc *q)
2087 {
2088 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2089 		__netif_reschedule(q);
2090 }
2091 EXPORT_SYMBOL(__netif_schedule);
2092 
2093 void dev_kfree_skb_irq(struct sk_buff *skb)
2094 {
2095 	if (atomic_dec_and_test(&skb->users)) {
2096 		struct softnet_data *sd;
2097 		unsigned long flags;
2098 
2099 		local_irq_save(flags);
2100 		sd = &__get_cpu_var(softnet_data);
2101 		skb->next = sd->completion_queue;
2102 		sd->completion_queue = skb;
2103 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
2104 		local_irq_restore(flags);
2105 	}
2106 }
2107 EXPORT_SYMBOL(dev_kfree_skb_irq);
2108 
/*
 * Free @skb from any context: uses dev_kfree_skb_irq() when running in
 * hard interrupt context or with interrupts disabled, dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
2117 
2118 
2119 /**
2120  * netif_device_detach - mark device as removed
2121  * @dev: network device
2122  *
2123  * Mark device as removed from system and therefore no longer available.
2124  */
2125 void netif_device_detach(struct net_device *dev)
2126 {
2127 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2128 	    netif_running(dev)) {
2129 		netif_tx_stop_all_queues(dev);
2130 	}
2131 }
2132 EXPORT_SYMBOL(netif_device_detach);
2133 
2134 /**
2135  * netif_device_attach - mark device as attached
2136  * @dev: network device
2137  *
2138  * Mark device as attached from system and restart if needed.
2139  */
2140 void netif_device_attach(struct net_device *dev)
2141 {
2142 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2143 	    netif_running(dev)) {
2144 		netif_tx_wake_all_queues(dev);
2145 		__netdev_watchdog_up(dev);
2146 	}
2147 }
2148 EXPORT_SYMBOL(netif_device_attach);
2149 
2150 static void skb_warn_bad_offload(const struct sk_buff *skb)
2151 {
2152 	static const netdev_features_t null_features = 0;
2153 	struct net_device *dev = skb->dev;
2154 	const char *driver = "";
2155 
2156 	if (dev && dev->dev.parent)
2157 		driver = dev_driver_string(dev->dev.parent);
2158 
2159 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2160 	     "gso_type=%d ip_summed=%d\n",
2161 	     driver, dev ? &dev->features : &null_features,
2162 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2163 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2164 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2165 }
2166 
2167 /*
2168  * Invalidate hardware checksum when packet is to be mangled, and
2169  * complete checksum manually on outgoing path.
2170  */
2171 int skb_checksum_help(struct sk_buff *skb)
2172 {
2173 	__wsum csum;
2174 	int ret = 0, offset;
2175 
2176 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2177 		goto out_set_summed;
2178 
2179 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2180 		skb_warn_bad_offload(skb);
2181 		return -EINVAL;
2182 	}
2183 
2184 	/* Before computing a checksum, we should make sure no frag could
2185 	 * be modified by an external entity : checksum could be wrong.
2186 	 */
2187 	if (skb_has_shared_frag(skb)) {
2188 		ret = __skb_linearize(skb);
2189 		if (ret)
2190 			goto out;
2191 	}
2192 
2193 	offset = skb_checksum_start_offset(skb);
2194 	BUG_ON(offset >= skb_headlen(skb));
2195 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2196 
2197 	offset += skb->csum_offset;
2198 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2199 
2200 	if (skb_cloned(skb) &&
2201 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2202 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2203 		if (ret)
2204 			goto out;
2205 	}
2206 
2207 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2208 out_set_summed:
2209 	skb->ip_summed = CHECKSUM_NONE;
2210 out:
2211 	return ret;
2212 }
2213 EXPORT_SYMBOL(skb_checksum_help);
2214 
2215 /**
2216  *	skb_mac_gso_segment - mac layer segmentation handler.
2217  *	@skb: buffer to segment
2218  *	@features: features for the output path (see dev->features)
2219  */
2220 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2221 				    netdev_features_t features)
2222 {
2223 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2224 	struct packet_offload *ptype;
2225 	__be16 type = skb->protocol;
2226 
2227 	while (type == htons(ETH_P_8021Q)) {
2228 		int vlan_depth = ETH_HLEN;
2229 		struct vlan_hdr *vh;
2230 
2231 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2232 			return ERR_PTR(-EINVAL);
2233 
2234 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2235 		type = vh->h_vlan_encapsulated_proto;
2236 		vlan_depth += VLAN_HLEN;
2237 	}
2238 
2239 	__skb_pull(skb, skb->mac_len);
2240 
2241 	rcu_read_lock();
2242 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2243 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2244 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2245 				int err;
2246 
2247 				err = ptype->callbacks.gso_send_check(skb);
2248 				segs = ERR_PTR(err);
2249 				if (err || skb_gso_ok(skb, features))
2250 					break;
2251 				__skb_push(skb, (skb->data -
2252 						 skb_network_header(skb)));
2253 			}
2254 			segs = ptype->callbacks.gso_segment(skb, features);
2255 			break;
2256 		}
2257 	}
2258 	rcu_read_unlock();
2259 
2260 	__skb_push(skb, skb->data - skb_mac_header(skb));
2261 
2262 	return segs;
2263 }
2264 EXPORT_SYMBOL(skb_mac_gso_segment);
2265 
2266 
2267 /* openvswitch calls this on rx path, so we need a different check.
2268  */
2269 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2270 {
2271 	if (tx_path)
2272 		return skb->ip_summed != CHECKSUM_PARTIAL;
2273 	else
2274 		return skb->ip_summed == CHECKSUM_NONE;
2275 }
2276 
2277 /**
2278  *	__skb_gso_segment - Perform segmentation on skb.
2279  *	@skb: buffer to segment
2280  *	@features: features for the output path (see dev->features)
2281  *	@tx_path: whether it is called in TX path
2282  *
2283  *	This function segments the given skb and returns a list of segments.
2284  *
2285  *	It may return NULL if the skb requires no segmentation.  This is
2286  *	only possible when GSO is used for verifying header integrity.
2287  */
2288 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2289 				  netdev_features_t features, bool tx_path)
2290 {
2291 	if (unlikely(skb_needs_check(skb, tx_path))) {
2292 		int err;
2293 
2294 		skb_warn_bad_offload(skb);
2295 
2296 		if (skb_header_cloned(skb) &&
2297 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2298 			return ERR_PTR(err);
2299 	}
2300 
2301 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2302 	skb_reset_mac_header(skb);
2303 	skb_reset_mac_len(skb);
2304 
2305 	return skb_mac_gso_segment(skb, features);
2306 }
2307 EXPORT_SYMBOL(__skb_gso_segment);
2308 
/* Take action when hardware reception checksum errors are detected:
 * log the device name (rate limited) together with a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2320 
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when a page fragment of @skb cannot be DMA'd by @dev
 * (highmem page without NETIF_F_HIGHDMA, or page beyond the parent
 * device's DMA mask), 0 otherwise.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *f = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(f)))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
		dma_addr_t addr = page_to_phys(skb_frag_page(f));

		/* last byte of the page must be reachable through the mask */
		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2353 
/* Control-block layout used while an skb carries its list of GSO
 * segments: remembers the destructor the skb had before
 * dev_gso_skb_destructor was installed in its place.
 */
2354 struct dev_gso_cb {
2355 	void (*destructor)(struct sk_buff *skb);
2356 };
2357 
/* View the skb's cb area as a struct dev_gso_cb. */
2358 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2359 
2360 static void dev_gso_skb_destructor(struct sk_buff *skb)
2361 {
2362 	struct dev_gso_cb *cb;
2363 
2364 	do {
2365 		struct sk_buff *nskb = skb->next;
2366 
2367 		skb->next = nskb->next;
2368 		nskb->next = NULL;
2369 		kfree_skb(nskb);
2370 	} while (skb->next);
2371 
2372 	cb = DEV_GSO_CB(skb);
2373 	if (cb->destructor)
2374 		cb->destructor(skb);
2375 }
2376 
2377 /**
2378  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2379  *	@skb: buffer to segment
2380  *	@features: device features as applicable to this skb
2381  *
2382  *	This function segments the given skb and stores the list of segments
2383  *	in skb->next.
2384  */
2385 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2386 {
2387 	struct sk_buff *segs;
2388 
2389 	segs = skb_gso_segment(skb, features);
2390 
2391 	/* Verifying header integrity only. */
2392 	if (!segs)
2393 		return 0;
2394 
2395 	if (IS_ERR(segs))
2396 		return PTR_ERR(segs);
2397 
2398 	skb->next = segs;
2399 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2400 	skb->destructor = dev_gso_skb_destructor;
2401 
2402 	return 0;
2403 }
2404 
2405 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2406 {
2407 	return ((features & NETIF_F_GEN_CSUM) ||
2408 		((features & NETIF_F_V4_CSUM) &&
2409 		 protocol == htons(ETH_P_IP)) ||
2410 		((features & NETIF_F_V6_CSUM) &&
2411 		 protocol == htons(ETH_P_IPV6)) ||
2412 		((features & NETIF_F_FCOE_CRC) &&
2413 		 protocol == htons(ETH_P_FCOE)));
2414 }
2415 
2416 static netdev_features_t harmonize_features(struct sk_buff *skb,
2417 	__be16 protocol, netdev_features_t features)
2418 {
2419 	if (skb->ip_summed != CHECKSUM_NONE &&
2420 	    !can_checksum_protocol(features, protocol)) {
2421 		features &= ~NETIF_F_ALL_CSUM;
2422 		features &= ~NETIF_F_SG;
2423 	} else if (illegal_highdma(skb->dev, skb)) {
2424 		features &= ~NETIF_F_SG;
2425 	}
2426 
2427 	return features;
2428 }
2429 
2430 netdev_features_t netif_skb_features(struct sk_buff *skb)
2431 {
2432 	__be16 protocol = skb->protocol;
2433 	netdev_features_t features = skb->dev->features;
2434 
2435 	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2436 		features &= ~NETIF_F_GSO_MASK;
2437 
2438 	if (protocol == htons(ETH_P_8021Q)) {
2439 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2440 		protocol = veh->h_vlan_encapsulated_proto;
2441 	} else if (!vlan_tx_tag_present(skb)) {
2442 		return harmonize_features(skb, protocol, features);
2443 	}
2444 
2445 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2446 
2447 	if (protocol != htons(ETH_P_8021Q)) {
2448 		return harmonize_features(skb, protocol, features);
2449 	} else {
2450 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2451 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2452 		return harmonize_features(skb, protocol, features);
2453 	}
2454 }
2455 EXPORT_SYMBOL(netif_skb_features);
2456 
2457 /*
2458  * Returns true if either:
2459  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2460  *	2. skb is fragmented and the device does not support SG.
2461  */
2462 static inline int skb_needs_linearize(struct sk_buff *skb,
2463 				      int features)
2464 {
2465 	return skb_is_nonlinear(skb) &&
2466 			((skb_has_frag_list(skb) &&
2467 				!(features & NETIF_F_FRAGLIST)) ||
2468 			(skb_shinfo(skb)->nr_frags &&
2469 				!(features & NETIF_F_SG)));
2470 }
2471 
/*
 * dev_hard_start_xmit - hand an skb (or its list of GSO segments) to the driver
 * @skb: buffer to transmit; a non-NULL skb->next is treated as a pending
 *	 list of segments
 * @dev: device whose ndo_start_xmit is called
 * @txq: TX queue whose transmit timestamp / stopped state is used
 *
 * For a single skb the usable features are computed, a VLAN tag is
 * inserted in software if the device cannot do it, and the skb is
 * segmented, linearized or checksummed in software as required before
 * ndo_start_xmit is called.
 *
 * Returns the ndo_start_xmit return code.  When preparation of the skb
 * fails the skb is freed and NETDEV_TX_OK (the initial value of rc) is
 * returned.
 */
2472 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2473 			struct netdev_queue *txq)
2474 {
2475 	const struct net_device_ops *ops = dev->netdev_ops;
2476 	int rc = NETDEV_TX_OK;
2477 	unsigned int skb_len;
2478 
2479 	if (likely(!skb->next)) {
2480 		netdev_features_t features;
2481 
2482 		/*
2483 		 * If device doesn't need skb->dst, release it right now while
2484 		 * its hot in this cpu cache
2485 		 */
2486 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2487 			skb_dst_drop(skb);
2488 
2489 		features = netif_skb_features(skb);
2490 
		/* Device cannot insert the VLAN tag: put it into the data. */
2491 		if (vlan_tx_tag_present(skb) &&
2492 		    !(features & NETIF_F_HW_VLAN_TX)) {
2493 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2494 			if (unlikely(!skb))
2495 				goto out;
2496 
2497 			skb->vlan_tci = 0;
2498 		}
2499 
2500 		/* If encapsulation offload request, verify we are testing
2501 		 * hardware encapsulation features instead of standard
2502 		 * features for the netdev
2503 		 */
2504 		if (skb->encapsulation)
2505 			features &= dev->hw_enc_features;
2506 
2507 		if (netif_needs_gso(skb, features)) {
2508 			if (unlikely(dev_gso_segment(skb, features)))
2509 				goto out_kfree_skb;
			/* Segments were produced: send them one by one. */
2510 			if (skb->next)
2511 				goto gso;
2512 		} else {
2513 			if (skb_needs_linearize(skb, features) &&
2514 			    __skb_linearize(skb))
2515 				goto out_kfree_skb;
2516 
2517 			/* If packet is not checksummed and device does not
2518 			 * support checksumming for this protocol, complete
2519 			 * checksumming here.
2520 			 */
2521 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2522 				if (skb->encapsulation)
2523 					skb_set_inner_transport_header(skb,
2524 						skb_checksum_start_offset(skb));
2525 				else
2526 					skb_set_transport_header(skb,
2527 						skb_checksum_start_offset(skb));
2528 				if (!(features & NETIF_F_ALL_CSUM) &&
2529 				     skb_checksum_help(skb))
2530 					goto out_kfree_skb;
2531 			}
2532 		}
2533 
		/* Give packet taps (ptype_all) a copy before the driver sees it. */
2534 		if (!list_empty(&ptype_all))
2535 			dev_queue_xmit_nit(skb, dev);
2536 
		/* Length is sampled first: skb may be gone after ndo_start_xmit. */
2537 		skb_len = skb->len;
2538 		rc = ops->ndo_start_xmit(skb, dev);
2539 		trace_net_dev_xmit(skb, rc, dev, skb_len);
2540 		if (rc == NETDEV_TX_OK)
2541 			txq_trans_update(txq);
2542 		return rc;
2543 	}
2544 
2545 gso:
2546 	do {
		/* Detach the first pending segment from the list. */
2547 		struct sk_buff *nskb = skb->next;
2548 
2549 		skb->next = nskb->next;
2550 		nskb->next = NULL;
2551 
2552 		/*
2553 		 * If device doesn't need nskb->dst, release it right now while
2554 		 * its hot in this cpu cache
2555 		 */
2556 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2557 			skb_dst_drop(nskb);
2558 
2559 		if (!list_empty(&ptype_all))
2560 			dev_queue_xmit_nit(nskb, dev);
2561 
2562 		skb_len = nskb->len;
2563 		rc = ops->ndo_start_xmit(nskb, dev);
2564 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2565 		if (unlikely(rc != NETDEV_TX_OK)) {
			/* Codes outside NETDEV_TX_MASK: give up on the rest. */
2566 			if (rc & ~NETDEV_TX_MASK)
2567 				goto out_kfree_gso_skb;
			/* Otherwise put the segment back for a later retry. */
2568 			nskb->next = skb->next;
2569 			skb->next = nskb;
2570 			return rc;
2571 		}
2572 		txq_trans_update(txq);
2573 		if (unlikely(netif_xmit_stopped(txq) && skb->next))
2574 			return NETDEV_TX_BUSY;
2575 	} while (skb->next);
2576 
2577 out_kfree_gso_skb:
	/* No segments left: restore the destructor saved by dev_gso_segment
	 * so that dev_gso_skb_destructor is not run on an empty list.
	 */
2578 	if (likely(skb->next == NULL))
2579 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2580 out_kfree_skb:
2581 	kfree_skb(skb);
2582 out:
2583 	return rc;
2584 }
2585 
2586 static void qdisc_pkt_len_init(struct sk_buff *skb)
2587 {
2588 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2589 
2590 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2591 
2592 	/* To get more precise estimation of bytes sent on wire,
2593 	 * we add to pkt_len the headers size of all segments
2594 	 */
2595 	if (shinfo->gso_size)  {
2596 		unsigned int hdr_len;
2597 
2598 		/* mac layer + network layer */
2599 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2600 
2601 		/* + transport layer */
2602 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2603 			hdr_len += tcp_hdrlen(skb);
2604 		else
2605 			hdr_len += sizeof(struct udphdr);
2606 		qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
2607 	}
2608 }
2609 
/*
 * __dev_xmit_skb - send @skb through qdisc @q
 * @skb: buffer to transmit
 * @q: queueing discipline attached to @txq
 * @dev: device the skb is sent on
 * @txq: TX queue of @dev
 *
 * Under the qdisc root lock the skb is either dropped (qdisc
 * deactivated), transmitted directly (empty TCQ_F_CAN_BYPASS qdisc that
 * is not running) or enqueued, after which the qdisc is run if nobody
 * else is running it.
 *
 * Returns NET_XMIT_DROP, NET_XMIT_SUCCESS, or the enqueue result masked
 * with NET_XMIT_MASK.
 */
2610 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2611 				 struct net_device *dev,
2612 				 struct netdev_queue *txq)
2613 {
2614 	spinlock_t *root_lock = qdisc_lock(q);
2615 	bool contended;
2616 	int rc;
2617 
2618 	qdisc_pkt_len_init(skb);
2619 	qdisc_calculate_pkt_len(skb, q);
2620 	/*
2621 	 * Heuristic to force contended enqueues to serialize on a
2622 	 * separate lock before trying to get qdisc main lock.
2623 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2624 	 * and dequeue packets faster.
2625 	 */
2626 	contended = qdisc_is_running(q);
2627 	if (unlikely(contended))
2628 		spin_lock(&q->busylock);
2629 
2630 	spin_lock(root_lock);
2631 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2632 		kfree_skb(skb);
2633 		rc = NET_XMIT_DROP;
2634 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2635 		   qdisc_run_begin(q)) {
2636 		/*
2637 		 * This is a work-conserving queue; there are no old skbs
2638 		 * waiting to be sent out; and the qdisc is not running -
2639 		 * xmit the skb directly.
2640 		 */
2641 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2642 			skb_dst_force(skb);
2643 
2644 		qdisc_bstats_update(q, skb);
2645 
		/* Non-zero result from sch_direct_xmit: keep running the qdisc. */
2646 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
			/* Release busylock first so other enqueuers can proceed. */
2647 			if (unlikely(contended)) {
2648 				spin_unlock(&q->busylock);
2649 				contended = false;
2650 			}
2651 			__qdisc_run(q);
2652 		} else
2653 			qdisc_run_end(q);
2654 
2655 		rc = NET_XMIT_SUCCESS;
2656 	} else {
2657 		skb_dst_force(skb);
2658 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2659 		if (qdisc_run_begin(q)) {
2660 			if (unlikely(contended)) {
2661 				spin_unlock(&q->busylock);
2662 				contended = false;
2663 			}
2664 			__qdisc_run(q);
2665 		}
2666 	}
2667 	spin_unlock(root_lock);
	/* busylock is still held if it was not dropped on a run path above */
2668 	if (unlikely(contended))
2669 		spin_unlock(&q->busylock);
2670 	return rc;
2671 }
2672 
2673 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2674 static void skb_update_prio(struct sk_buff *skb)
2675 {
2676 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2677 
2678 	if (!skb->priority && skb->sk && map) {
2679 		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2680 
2681 		if (prioidx < map->priomap_len)
2682 			skb->priority = map->priomap[prioidx];
2683 	}
2684 }
2685 #else
2686 #define skb_update_prio(skb)
2687 #endif
2688 
/* Per-CPU counter incremented around dev_hard_start_xmit() on the
 * queue-less path of dev_queue_xmit(); once it exceeds RECURSION_LIMIT
 * the packet is dropped instead of transmitted.
 */
2689 static DEFINE_PER_CPU(int, xmit_recursion);
2690 #define RECURSION_LIMIT 10
2691 
2692 /**
2693  *	dev_loopback_xmit - loop back @skb
2694  *	@skb: buffer to transmit
2695  */
2696 int dev_loopback_xmit(struct sk_buff *skb)
2697 {
2698 	skb_reset_mac_header(skb);
2699 	__skb_pull(skb, skb_network_offset(skb));
2700 	skb->pkt_type = PACKET_LOOPBACK;
2701 	skb->ip_summed = CHECKSUM_UNNECESSARY;
2702 	WARN_ON(!skb_dst(skb));
2703 	skb_dst_force(skb);
2704 	netif_rx_ni(skb);
2705 	return 0;
2706 }
2707 EXPORT_SYMBOL(dev_loopback_xmit);
2708 
2709 /**
2710  *	dev_queue_xmit - transmit a buffer
2711  *	@skb: buffer to transmit
2712  *
2713  *	Queue a buffer for transmission to a network device. The caller must
2714  *	have set the device and priority and built the buffer before calling
2715  *	this function. The function can be called from an interrupt.
2716  *
2717  *	A negative errno code is returned on a failure. A success does not
2718  *	guarantee the frame will be transmitted as it may be dropped due
2719  *	to congestion or traffic shaping.
2720  *
2721  * -----------------------------------------------------------------------------------
2722  *      Note that this function can also return errors from the queue
2723  *      disciplines, including NET_XMIT_DROP, which is a positive value.  So,
2724  *      errors can also be positive.
2725  *
2726  *      Regardless of the return value, the skb is consumed, so it is currently
2727  *      difficult to retry a send to this method.  (You can bump the ref count
2728  *      before sending to hold a reference for retry if you are careful.)
2729  *
2730  *      When calling this method, interrupts MUST be enabled.  This is because
2731  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2732  *          --BLG
2733  */
2734 int dev_queue_xmit(struct sk_buff *skb)
2735 {
2736 	struct net_device *dev = skb->dev;
2737 	struct netdev_queue *txq;
2738 	struct Qdisc *q;
2739 	int rc = -ENOMEM;
2740 
2741 	skb_reset_mac_header(skb);
2742 
2743 	/* Disable soft irqs for various locks below. Also
2744 	 * stops preemption for RCU.
2745 	 */
2746 	rcu_read_lock_bh();
2747 
2748 	skb_update_prio(skb);
2749 
2750 	txq = netdev_pick_tx(dev, skb);
2751 	q = rcu_dereference_bh(txq->qdisc);
2752 
2753 #ifdef CONFIG_NET_CLS_ACT
2754 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2755 #endif
2756 	trace_net_dev_queue(skb);
	/* A qdisc with an enqueue operation handles the packet from here. */
2757 	if (q->enqueue) {
2758 		rc = __dev_xmit_skb(skb, q, dev, txq);
2759 		goto out;
2760 	}
2761 
2762 	/* The device has no queue. Common case for software devices:
2763 	   loopback, all sorts of tunnels...
2764 
2765 	   It is unlikely that netif_tx_lock protection is really necessary
2766 	   here (e.g. loopback and IP tunnels are clean apart from their
2767 	   statistics counters).
2768 	   However, it is possible that they rely on the protection
2769 	   provided by us here.
2770 
2771 	   Check this and shot the lock. It is not prone from deadlocks.
2772 	   Either shot noqueue qdisc, it is even simpler 8)
2773 	 */
2774 	if (dev->flags & IFF_UP) {
2775 		int cpu = smp_processor_id(); /* ok because BHs are off */
2776 
		/* This CPU already owning the xmit lock means we recursed. */
2777 		if (txq->xmit_lock_owner != cpu) {
2778 
2779 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2780 				goto recursion_alert;
2781 
2782 			HARD_TX_LOCK(dev, txq, cpu);
2783 
2784 			if (!netif_xmit_stopped(txq)) {
2785 				__this_cpu_inc(xmit_recursion);
2786 				rc = dev_hard_start_xmit(skb, dev, txq);
2787 				__this_cpu_dec(xmit_recursion);
2788 				if (dev_xmit_complete(rc)) {
2789 					HARD_TX_UNLOCK(dev, txq);
2790 					goto out;
2791 				}
2792 			}
2793 			HARD_TX_UNLOCK(dev, txq);
2794 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2795 					     dev->name);
2796 		} else {
2797 			/* Recursion is detected! It is possible,
2798 			 * unfortunately
2799 			 */
2800 recursion_alert:
2801 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2802 					     dev->name);
2803 		}
2804 	}
2805 
	/* Reached when the device is down or the queue-less transmit above
	 * did not complete: drop the packet.
	 */
2806 	rc = -ENETDOWN;
2807 	rcu_read_unlock_bh();
2808 
2809 	kfree_skb(skb);
2810 	return rc;
2811 out:
2812 	rcu_read_unlock_bh();
2813 	return rc;
2814 }
2815 EXPORT_SYMBOL(dev_queue_xmit);
2816 
2817 
2818 /*=======================================================================
2819 			Receiver routines
2820   =======================================================================*/
2821 
/* Limit on the per-CPU input_pkt_queue length checked by
 * enqueue_to_backlog(); packets arriving beyond it are dropped.
 */
2822 int netdev_max_backlog __read_mostly = 1000;
2823 EXPORT_SYMBOL(netdev_max_backlog);
2824 
/* Passed to net_timestamp_check() by netif_rx(). */
2825 int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): presumably the packet budget of one RX softirq run; its
 * users are outside this section - confirm.
 */
2826 int netdev_budget __read_mostly = 300;
2827 int weight_p __read_mostly = 64;            /* old backlog weight */
2828 
2829 /* Called with irq disabled */
2830 static inline void ____napi_schedule(struct softnet_data *sd,
2831 				     struct napi_struct *napi)
2832 {
2833 	list_add_tail(&napi->poll_list, &sd->poll_list);
2834 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2835 }
2836 
2837 #ifdef CONFIG_RPS
2838 
2839 /* One global table that all flow-based protocols share. */
2840 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2841 EXPORT_SYMBOL(rps_sock_flow_table);
2842 
/* Static key tested by netif_rx() before it consults get_rps_cpu(). */
2843 struct static_key rps_needed __read_mostly;
2844 
/*
 * set_rps_cpu - record @next_cpu as the CPU handling a flow
 * @dev: receiving device
 * @skb: packet belonging to the flow
 * @rflow: the flow's current table entry
 * @next_cpu: CPU to steer the flow to, or RPS_NO_CPU
 *
 * When @next_cpu is a real CPU, the entry's last_qtail is set to that
 * CPU's current input_queue_head.  With CONFIG_RFS_ACCEL the driver may
 * additionally be asked (ndo_rx_flow_steer) to steer the flow to the RX
 * queue mapped to @next_cpu; on success the entry in that queue's flow
 * table is used from then on.
 *
 * Returns the flow entry that now describes the flow.
 */
2845 static struct rps_dev_flow *
2846 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2847 	    struct rps_dev_flow *rflow, u16 next_cpu)
2848 {
2849 	if (next_cpu != RPS_NO_CPU) {
2850 #ifdef CONFIG_RFS_ACCEL
2851 		struct netdev_rx_queue *rxqueue;
2852 		struct rps_dev_flow_table *flow_table;
2853 		struct rps_dev_flow *old_rflow;
2854 		u32 flow_id;
2855 		u16 rxq_index;
2856 		int rc;
2857 
2858 		/* Should we steer this flow to a different hardware queue? */
2859 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2860 		    !(dev->features & NETIF_F_NTUPLE))
2861 			goto out;
2862 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* Already arriving on the queue mapped to next_cpu. */
2863 		if (rxq_index == skb_get_rx_queue(skb))
2864 			goto out;
2865 
2866 		rxqueue = dev->_rx + rxq_index;
2867 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2868 		if (!flow_table)
2869 			goto out;
2870 		flow_id = skb->rxhash & flow_table->mask;
2871 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2872 							rxq_index, flow_id);
2873 		if (rc < 0)
2874 			goto out;
		/* Non-negative rc is stored as the filter id of the new entry. */
2875 		old_rflow = rflow;
2876 		rflow = &flow_table->flows[flow_id];
2877 		rflow->filter = rc;
2878 		if (old_rflow->filter == rflow->filter)
2879 			old_rflow->filter = RPS_NO_FILTER;
2880 	out:
2881 #endif
2882 		rflow->last_qtail =
2883 			per_cpu(softnet_data, next_cpu).input_queue_head;
2884 	}
2885 
2886 	rflow->cpu = next_cpu;
2887 	return rflow;
2888 }
2889 
2890 /*
2891  * get_rps_cpu is called from netif_receive_skb and returns the target
2892  * CPU from the RPS map of the receiving queue for a given skb.
2893  * rcu_read_lock must be held on entry.
 *
 * Returns the chosen CPU, or -1 when no online target CPU was found.
 * *rflowp is written only when the CPU was chosen through the flow
 * tables; it then points to the flow entry that was used.
2894  */
2895 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2896 		       struct rps_dev_flow **rflowp)
2897 {
2898 	struct netdev_rx_queue *rxqueue;
2899 	struct rps_map *map;
2900 	struct rps_dev_flow_table *flow_table;
2901 	struct rps_sock_flow_table *sock_flow_table;
2902 	int cpu = -1;
2903 	u16 tcpu;
2904 
	/* Pick the RX queue the packet was recorded on, else queue 0. */
2905 	if (skb_rx_queue_recorded(skb)) {
2906 		u16 index = skb_get_rx_queue(skb);
2907 		if (unlikely(index >= dev->real_num_rx_queues)) {
2908 			WARN_ONCE(dev->real_num_rx_queues > 1,
2909 				  "%s received packet on queue %u, but number "
2910 				  "of RX queues is %u\n",
2911 				  dev->name, index, dev->real_num_rx_queues);
2912 			goto done;
2913 		}
2914 		rxqueue = dev->_rx + index;
2915 	} else
2916 		rxqueue = dev->_rx;
2917 
2918 	map = rcu_dereference(rxqueue->rps_map);
2919 	if (map) {
		/* Single-CPU map and no flow table: no hash is needed. */
2920 		if (map->len == 1 &&
2921 		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2922 			tcpu = map->cpus[0];
2923 			if (cpu_online(tcpu))
2924 				cpu = tcpu;
2925 			goto done;
2926 		}
2927 	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
		/* Neither a map nor a flow table is configured on this queue. */
2928 		goto done;
2929 	}
2930 
2931 	skb_reset_network_header(skb);
2932 	if (!skb_get_rxhash(skb))
2933 		goto done;
2934 
2935 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2936 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2937 	if (flow_table && sock_flow_table) {
2938 		u16 next_cpu;
2939 		struct rps_dev_flow *rflow;
2940 
2941 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2942 		tcpu = rflow->cpu;
2943 
2944 		next_cpu = sock_flow_table->ents[skb->rxhash &
2945 		    sock_flow_table->mask];
2946 
2947 		/*
2948 		 * If the desired CPU (where last recvmsg was done) is
2949 		 * different from current CPU (one in the rx-queue flow
2950 		 * table entry), switch if one of the following holds:
2951 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2952 		 *   - Current CPU is offline.
2953 		 *   - The current CPU's queue tail has advanced beyond the
2954 		 *     last packet that was enqueued using this table entry.
2955 		 *     This guarantees that all previous packets for the flow
2956 		 *     have been dequeued, thus preserving in order delivery.
2957 		 */
2958 		if (unlikely(tcpu != next_cpu) &&
2959 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2960 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2961 		      rflow->last_qtail)) >= 0)) {
2962 			tcpu = next_cpu;
2963 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2964 		}
2965 
2966 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2967 			*rflowp = rflow;
2968 			cpu = tcpu;
2969 			goto done;
2970 		}
2971 	}
2972 
	/* Fall back to the RPS map: scale the 32-bit hash onto map->len. */
2973 	if (map) {
2974 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2975 
2976 		if (cpu_online(tcpu)) {
2977 			cpu = tcpu;
2978 			goto done;
2979 		}
2980 	}
2981 
2982 done:
2983 	return cpu;
2984 }
2985 
#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 *
 * A filter is kept (%false) only while the flow entry still carries
 * @filter_id, has a CPU assigned, and that CPU's input queue head has
 * advanced less than ten times the flow table mask past the entry's
 * last_qtail.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
			 u32 flow_id, u16 filter_id)
{
	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
	struct rps_dev_flow_table *flow_table;
	struct rps_dev_flow *rflow;
	bool expire = true;
	int cpu, advanced;

	rcu_read_lock();

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	if (!flow_table || flow_id > flow_table->mask)
		goto unlock;

	rflow = &flow_table->flows[flow_id];
	cpu = ACCESS_ONCE(rflow->cpu);
	if (rflow->filter != filter_id || cpu == RPS_NO_CPU)
		goto unlock;

	advanced = (int)(per_cpu(softnet_data, cpu).input_queue_head -
			 rflow->last_qtail);
	if (advanced < (int)(10 * flow_table->mask))
		expire = false;

unlock:
	rcu_read_unlock();
	return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */
3025 
3026 /* Called from hardirq (IPI) context */
3027 static void rps_trigger_softirq(void *data)
3028 {
3029 	struct softnet_data *sd = data;
3030 
3031 	____napi_schedule(sd, &sd->backlog);
3032 	sd->received_rps++;
3033 }
3034 
3035 #endif /* CONFIG_RPS */
3036 
/*
 * Check whether @sd belongs to another CPU.
 * If so, link it into this CPU's IPI list, raise NET_RX_SOFTIRQ and
 * return 1; otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	/* push sd on the head of our pending-IPI list */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3057 
3058 /*
3059  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3060  * queue (may be a remote CPU queue).
3061  */
3062 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3063 			      unsigned int *qtail)
3064 {
3065 	struct softnet_data *sd;
3066 	unsigned long flags;
3067 
3068 	sd = &per_cpu(softnet_data, cpu);
3069 
3070 	local_irq_save(flags);
3071 
3072 	rps_lock(sd);
3073 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3074 		if (skb_queue_len(&sd->input_pkt_queue)) {
3075 enqueue:
3076 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3077 			input_queue_tail_incr_save(sd, qtail);
3078 			rps_unlock(sd);
3079 			local_irq_restore(flags);
3080 			return NET_RX_SUCCESS;
3081 		}
3082 
3083 		/* Schedule NAPI for backlog device
3084 		 * We can use non atomic operation since we own the queue lock
3085 		 */
3086 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3087 			if (!rps_ipi_queued(sd))
3088 				____napi_schedule(sd, &sd->backlog);
3089 		}
3090 		goto enqueue;
3091 	}
3092 
3093 	sd->dropped++;
3094 	rps_unlock(sd);
3095 
3096 	local_irq_restore(flags);
3097 
3098 	atomic_long_inc(&skb->dev->rx_dropped);
3099 	kfree_skb(skb);
3100 	return NET_RX_DROP;
3101 }
3102 
3103 /**
3104  *	netif_rx	-	post buffer to the network code
3105  *	@skb: buffer to post
3106  *
3107  *	This function receives a packet from a device driver and queues it for
3108  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3109  *	may be dropped during processing for congestion control or by the
3110  *	protocol layers.
3111  *
3112  *	return values:
3113  *	NET_RX_SUCCESS	(no congestion)
3114  *	NET_RX_DROP     (packet was dropped)
3115  *
3116  */
3117 
3118 int netif_rx(struct sk_buff *skb)
3119 {
3120 	int ret;
3121 
3122 	/* if netpoll wants it, pretend we never saw it */
3123 	if (netpoll_rx(skb))
3124 		return NET_RX_DROP;
3125 
3126 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3127 
3128 	trace_netif_rx(skb);
3129 #ifdef CONFIG_RPS
3130 	if (static_key_false(&rps_needed)) {
3131 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3132 		int cpu;
3133 
3134 		preempt_disable();
3135 		rcu_read_lock();
3136 
3137 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3138 		if (cpu < 0)
3139 			cpu = smp_processor_id();
3140 
3141 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3142 
3143 		rcu_read_unlock();
3144 		preempt_enable();
3145 	} else
3146 #endif
3147 	{
3148 		unsigned int qtail;
3149 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3150 		put_cpu();
3151 	}
3152 	return ret;
3153 }
3154 EXPORT_SYMBOL(netif_rx);
3155 
/*
 * Variant of netif_rx() that additionally runs any softirq left pending
 * after the packet has been queued.  Preemption is disabled around both
 * steps.  Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3169 
3170 static void net_tx_action(struct softirq_action *h)
3171 {
3172 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3173 
3174 	if (sd->completion_queue) {
3175 		struct sk_buff *clist;
3176 
3177 		local_irq_disable();
3178 		clist = sd->completion_queue;
3179 		sd->completion_queue = NULL;
3180 		local_irq_enable();
3181 
3182 		while (clist) {
3183 			struct sk_buff *skb = clist;
3184 			clist = clist->next;
3185 
3186 			WARN_ON(atomic_read(&skb->users));
3187 			trace_kfree_skb(skb, net_tx_action);
3188 			__kfree_skb(skb);
3189 		}
3190 	}
3191 
3192 	if (sd->output_queue) {
3193 		struct Qdisc *head;
3194 
3195 		local_irq_disable();
3196 		head = sd->output_queue;
3197 		sd->output_queue = NULL;
3198 		sd->output_queue_tailp = &sd->output_queue;
3199 		local_irq_enable();
3200 
3201 		while (head) {
3202 			struct Qdisc *q = head;
3203 			spinlock_t *root_lock;
3204 
3205 			head = head->next_sched;
3206 
3207 			root_lock = qdisc_lock(q);
3208 			if (spin_trylock(root_lock)) {
3209 				smp_mb__before_clear_bit();
3210 				clear_bit(__QDISC_STATE_SCHED,
3211 					  &q->state);
3212 				qdisc_run(q);
3213 				spin_unlock(root_lock);
3214 			} else {
3215 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3216 					      &q->state)) {
3217 					__netif_reschedule(q);
3218 				} else {
3219 					smp_mb__before_clear_bit();
3220 					clear_bit(__QDISC_STATE_SCHED,
3221 						  &q->state);
3222 				}
3223 			}
3224 		}
3225 	}
3226 }
3227 
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * This hook is defined here for ATM LANE.
 *
 * It only exists when both bridging and ATM LANE are configured (built-in
 * or modular).  The pointer is merely defined and exported here; nothing in
 * this part of the file assigns or calls it.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3235 
3236 #ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is set? Otherwise we pay for some useless
 * instructions (a compare and 2 extra stores right now) if we don't
 * have it enabled but do have CONFIG_NET_CLS_ACT.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 */
3245 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3246 {
3247 	struct net_device *dev = skb->dev;
3248 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3249 	int result = TC_ACT_OK;
3250 	struct Qdisc *q;
3251 
3252 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3253 		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3254 				     skb->skb_iif, dev->ifindex);
3255 		return TC_ACT_SHOT;
3256 	}
3257 
3258 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3259 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3260 
3261 	q = rxq->qdisc;
3262 	if (q != &noop_qdisc) {
3263 		spin_lock(qdisc_lock(q));
3264 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3265 			result = qdisc_enqueue_root(skb, q);
3266 		spin_unlock(qdisc_lock(q));
3267 	}
3268 
3269 	return result;
3270 }
3271 
3272 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3273 					 struct packet_type **pt_prev,
3274 					 int *ret, struct net_device *orig_dev)
3275 {
3276 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3277 
3278 	if (!rxq || rxq->qdisc == &noop_qdisc)
3279 		goto out;
3280 
3281 	if (*pt_prev) {
3282 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3283 		*pt_prev = NULL;
3284 	}
3285 
3286 	switch (ing_filter(skb, rxq)) {
3287 	case TC_ACT_SHOT:
3288 	case TC_ACT_STOLEN:
3289 		kfree_skb(skb);
3290 		return NULL;
3291 	}
3292 
3293 out:
3294 	skb->tc_verd = 0;
3295 	return skb;
3296 }
3297 #endif
3298 
3299 /**
3300  *	netdev_rx_handler_register - register receive handler
3301  *	@dev: device to register a handler for
3302  *	@rx_handler: receive handler to register
3303  *	@rx_handler_data: data pointer that is used by rx handler
3304  *
3305  *	Register a receive hander for a device. This handler will then be
3306  *	called from __netif_receive_skb. A negative errno code is returned
3307  *	on a failure.
3308  *
3309  *	The caller must hold the rtnl_mutex.
3310  *
3311  *	For a general description of rx_handler, see enum rx_handler_result.
3312  */
3313 int netdev_rx_handler_register(struct net_device *dev,
3314 			       rx_handler_func_t *rx_handler,
3315 			       void *rx_handler_data)
3316 {
3317 	ASSERT_RTNL();
3318 
3319 	if (dev->rx_handler)
3320 		return -EBUSY;
3321 
3322 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3323 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3324 
3325 	return 0;
3326 }
3327 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3328 
3329 /**
3330  *	netdev_rx_handler_unregister - unregister receive handler
3331  *	@dev: device to unregister a handler from
3332  *
3333  *	Unregister a receive hander from a device.
3334  *
3335  *	The caller must hold the rtnl_mutex.
3336  */
3337 void netdev_rx_handler_unregister(struct net_device *dev)
3338 {
3339 
3340 	ASSERT_RTNL();
3341 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3342 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3343 }
3344 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3345 
3346 /*
3347  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3348  * the special handling of PFMEMALLOC skbs.
3349  */
3350 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3351 {
3352 	switch (skb->protocol) {
3353 	case __constant_htons(ETH_P_ARP):
3354 	case __constant_htons(ETH_P_IP):
3355 	case __constant_htons(ETH_P_IPV6):
3356 	case __constant_htons(ETH_P_8021Q):
3357 		return true;
3358 	default:
3359 		return false;
3360 	}
3361 }
3362 
/*
 * Core of the receive path: hand @skb to every interested consumer.
 *
 * In order, the skb is offered to the taps on ptype_all, the ingress
 * classifier (CONFIG_NET_CLS_ACT), VLAN receive processing, the rx_handler
 * of skb->dev and finally the protocol handlers hashed in ptype_base.
 *
 * Delivery is deferred by one step through @pt_prev: a matching handler is
 * only called via deliver_skb() once the next match (or the next stage) is
 * reached, and the very last match receives the skb directly through its
 * ->func.  If nobody matched, the skb is dropped.
 *
 * @pfmemalloc: when true the taps are skipped and only protocols accepted
 *		by skb_pfmemalloc_protocol() are delivered; anything else
 *		is dropped.
 *
 * Returns the result of the last delivery, or NET_RX_DROP.
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		goto out;

	/* remember the device the packet arrived on; skb->dev may change */
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

	/* re-entered whenever VLAN processing or an rx_handler asks for
	 * the skb to be processed again
	 */
another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
		skb = vlan_untag(skb);
		if (unlikely(!skb))
			goto unlock;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and bypass both taps and classifier */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	/* taps: bound to no device, or to the current skb->dev */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

skip_taps:
#ifdef CONFIG_NET_CLS_ACT
	/* a NULL return means the classifier consumed/freed the skb */
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto unlock;
ncls:
#endif

	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (vlan_tx_tag_present(skb)) {
		/* flush the pending handler before the skb is modified */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto unlock;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			goto unlock;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (vlan_tx_nonzero_tag_present(skb))
		skb->pkt_type = PACKET_OTHERHOST;

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	/* protocol handlers: type must match, and the handler must be
	 * bound to null_or_dev, the current device or the original one
	 */
	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* last consumer: it gets the skb itself */
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

unlock:
	rcu_read_unlock();
out:
	return ret;
}
3501 
3502 static int __netif_receive_skb(struct sk_buff *skb)
3503 {
3504 	int ret;
3505 
3506 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3507 		unsigned long pflags = current->flags;
3508 
3509 		/*
3510 		 * PFMEMALLOC skbs are special, they should
3511 		 * - be delivered to SOCK_MEMALLOC sockets only
3512 		 * - stay away from userspace
3513 		 * - have bounded memory usage
3514 		 *
3515 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3516 		 * context down to all allocation sites.
3517 		 */
3518 		current->flags |= PF_MEMALLOC;
3519 		ret = __netif_receive_skb_core(skb, true);
3520 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3521 	} else
3522 		ret = __netif_receive_skb_core(skb, false);
3523 
3524 	return ret;
3525 }
3526 
3527 /**
3528  *	netif_receive_skb - process receive buffer from network
3529  *	@skb: buffer to process
3530  *
3531  *	netif_receive_skb() is the main receive data processing function.
3532  *	It always succeeds. The buffer may be dropped during processing
3533  *	for congestion control or by the protocol layers.
3534  *
3535  *	This function may only be called from softirq context and interrupts
3536  *	should be enabled.
3537  *
3538  *	Return values (usually ignored):
3539  *	NET_RX_SUCCESS: no congestion
3540  *	NET_RX_DROP: packet was dropped
3541  */
3542 int netif_receive_skb(struct sk_buff *skb)
3543 {
3544 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3545 
3546 	if (skb_defer_rx_timestamp(skb))
3547 		return NET_RX_SUCCESS;
3548 
3549 #ifdef CONFIG_RPS
3550 	if (static_key_false(&rps_needed)) {
3551 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3552 		int cpu, ret;
3553 
3554 		rcu_read_lock();
3555 
3556 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3557 
3558 		if (cpu >= 0) {
3559 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3560 			rcu_read_unlock();
3561 			return ret;
3562 		}
3563 		rcu_read_unlock();
3564 	}
3565 #endif
3566 	return __netif_receive_skb(skb);
3567 }
3568 EXPORT_SYMBOL(netif_receive_skb);
3569 
3570 /* Network device is going away, flush any packets still pending
3571  * Called with irqs disabled.
3572  */
3573 static void flush_backlog(void *arg)
3574 {
3575 	struct net_device *dev = arg;
3576 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3577 	struct sk_buff *skb, *tmp;
3578 
3579 	rps_lock(sd);
3580 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3581 		if (skb->dev == dev) {
3582 			__skb_unlink(skb, &sd->input_pkt_queue);
3583 			kfree_skb(skb);
3584 			input_queue_head_incr(sd);
3585 		}
3586 	}
3587 	rps_unlock(sd);
3588 
3589 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3590 		if (skb->dev == dev) {
3591 			__skb_unlink(skb, &sd->process_queue);
3592 			kfree_skb(skb);
3593 			input_queue_head_incr(sd);
3594 		}
3595 	}
3596 }
3597 
3598 static int napi_gro_complete(struct sk_buff *skb)
3599 {
3600 	struct packet_offload *ptype;
3601 	__be16 type = skb->protocol;
3602 	struct list_head *head = &offload_base;
3603 	int err = -ENOENT;
3604 
3605 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3606 
3607 	if (NAPI_GRO_CB(skb)->count == 1) {
3608 		skb_shinfo(skb)->gso_size = 0;
3609 		goto out;
3610 	}
3611 
3612 	rcu_read_lock();
3613 	list_for_each_entry_rcu(ptype, head, list) {
3614 		if (ptype->type != type || !ptype->callbacks.gro_complete)
3615 			continue;
3616 
3617 		err = ptype->callbacks.gro_complete(skb);
3618 		break;
3619 	}
3620 	rcu_read_unlock();
3621 
3622 	if (err) {
3623 		WARN_ON(&ptype->list == head);
3624 		kfree_skb(skb);
3625 		return NET_RX_SUCCESS;
3626 	}
3627 
3628 out:
3629 	return netif_receive_skb(skb);
3630 }
3631 
3632 /* napi->gro_list contains packets ordered by age.
3633  * youngest packets at the head of it.
3634  * Complete skbs in reverse order to reduce latencies.
3635  */
3636 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3637 {
3638 	struct sk_buff *skb, *prev = NULL;
3639 
3640 	/* scan list and build reverse chain */
3641 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3642 		skb->prev = prev;
3643 		prev = skb;
3644 	}
3645 
3646 	for (skb = prev; skb; skb = prev) {
3647 		skb->next = NULL;
3648 
3649 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3650 			return;
3651 
3652 		prev = skb->prev;
3653 		napi_gro_complete(skb);
3654 		napi->gro_count--;
3655 	}
3656 
3657 	napi->gro_list = NULL;
3658 }
3659 EXPORT_SYMBOL(napi_gro_flush);
3660 
3661 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3662 {
3663 	struct sk_buff *p;
3664 	unsigned int maclen = skb->dev->hard_header_len;
3665 
3666 	for (p = napi->gro_list; p; p = p->next) {
3667 		unsigned long diffs;
3668 
3669 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3670 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3671 		if (maclen == ETH_HLEN)
3672 			diffs |= compare_ether_header(skb_mac_header(p),
3673 						      skb_gro_mac_header(skb));
3674 		else if (!diffs)
3675 			diffs = memcmp(skb_mac_header(p),
3676 				       skb_gro_mac_header(skb),
3677 				       maclen);
3678 		NAPI_GRO_CB(p)->same_flow = !diffs;
3679 		NAPI_GRO_CB(p)->flush = 0;
3680 	}
3681 }
3682 
/*
 * Try to aggregate @skb with the packets held on @napi->gro_list.
 *
 * Returns:
 *   GRO_NORMAL      - no aggregation; the caller must process the skb
 *   GRO_HELD        - the skb was put at the head of napi->gro_list
 *   GRO_MERGED      - a gro_receive callback found a matching flow
 *   GRO_MERGED_FREE - as GRO_MERGED, with the callback having set
 *                     NAPI_GRO_CB(skb)->free
 */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	enum gro_result ret;

	/* GRO disabled on the device, or netpoll active on it */
	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
		goto normal;

	if (skb_is_gso(skb) || skb_has_frag_list(skb))
		goto normal;

	gro_list_prepare(napi, skb);

	/* find the first offload handler for this protocol that provides
	 * a gro_receive callback, and run it
	 */
	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* the loop ran to completion: no handler for this protocol */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/* the callback handed back a held packet: unlink it from the
	 * list and complete it
	 */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
		goto normal;

	/* start holding this skb: push it at the head of the list */
	napi->gro_count++;
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	/* the GRO offset lies beyond the linear data: copy the missing
	 * bytes from frag0 into the linear area and shrink the first
	 * fragment accordingly
	 */
	if (skb_headlen(skb) < skb_gro_offset(skb)) {
		int grow = skb_gro_offset(skb) - skb_headlen(skb);

		BUG_ON(skb->end - skb->tail < grow);

		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

		skb->tail += grow;
		skb->data_len -= grow;

		skb_shinfo(skb)->frags[0].page_offset += grow;
		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);

		/* first fragment is now empty: release it and shift the
		 * remaining fragments down by one
		 */
		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
			skb_frag_unref(skb, 0);
			memmove(skb_shinfo(skb)->frags,
				skb_shinfo(skb)->frags + 1,
				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
		}
	}

ok:
	return ret;

normal:
	/* not aggregated; still run the pull step above before returning */
	ret = GRO_NORMAL;
	goto pull;
}
3774 
3775 
3776 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3777 {
3778 	switch (ret) {
3779 	case GRO_NORMAL:
3780 		if (netif_receive_skb(skb))
3781 			ret = GRO_DROP;
3782 		break;
3783 
3784 	case GRO_DROP:
3785 		kfree_skb(skb);
3786 		break;
3787 
3788 	case GRO_MERGED_FREE:
3789 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3790 			kmem_cache_free(skbuff_head_cache, skb);
3791 		else
3792 			__kfree_skb(skb);
3793 		break;
3794 
3795 	case GRO_HELD:
3796 	case GRO_MERGED:
3797 		break;
3798 	}
3799 
3800 	return ret;
3801 }
3802 
3803 static void skb_gro_reset_offset(struct sk_buff *skb)
3804 {
3805 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3806 	const skb_frag_t *frag0 = &pinfo->frags[0];
3807 
3808 	NAPI_GRO_CB(skb)->data_offset = 0;
3809 	NAPI_GRO_CB(skb)->frag0 = NULL;
3810 	NAPI_GRO_CB(skb)->frag0_len = 0;
3811 
3812 	if (skb->mac_header == skb->tail &&
3813 	    pinfo->nr_frags &&
3814 	    !PageHighMem(skb_frag_page(frag0))) {
3815 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3816 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3817 	}
3818 }
3819 
3820 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3821 {
3822 	skb_gro_reset_offset(skb);
3823 
3824 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3825 }
3826 EXPORT_SYMBOL(napi_gro_receive);
3827 
3828 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3829 {
3830 	__skb_pull(skb, skb_headlen(skb));
3831 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
3832 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3833 	skb->vlan_tci = 0;
3834 	skb->dev = napi->dev;
3835 	skb->skb_iif = 0;
3836 
3837 	napi->skb = skb;
3838 }
3839 
3840 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3841 {
3842 	struct sk_buff *skb = napi->skb;
3843 
3844 	if (!skb) {
3845 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3846 		if (skb)
3847 			napi->skb = skb;
3848 	}
3849 	return skb;
3850 }
3851 EXPORT_SYMBOL(napi_get_frags);
3852 
3853 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3854 			       gro_result_t ret)
3855 {
3856 	switch (ret) {
3857 	case GRO_NORMAL:
3858 	case GRO_HELD:
3859 		skb->protocol = eth_type_trans(skb, skb->dev);
3860 
3861 		if (ret == GRO_HELD)
3862 			skb_gro_pull(skb, -ETH_HLEN);
3863 		else if (netif_receive_skb(skb))
3864 			ret = GRO_DROP;
3865 		break;
3866 
3867 	case GRO_DROP:
3868 	case GRO_MERGED_FREE:
3869 		napi_reuse_skb(napi, skb);
3870 		break;
3871 
3872 	case GRO_MERGED:
3873 		break;
3874 	}
3875 
3876 	return ret;
3877 }
3878 
3879 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3880 {
3881 	struct sk_buff *skb = napi->skb;
3882 	struct ethhdr *eth;
3883 	unsigned int hlen;
3884 	unsigned int off;
3885 
3886 	napi->skb = NULL;
3887 
3888 	skb_reset_mac_header(skb);
3889 	skb_gro_reset_offset(skb);
3890 
3891 	off = skb_gro_offset(skb);
3892 	hlen = off + sizeof(*eth);
3893 	eth = skb_gro_header_fast(skb, off);
3894 	if (skb_gro_header_hard(skb, hlen)) {
3895 		eth = skb_gro_header_slow(skb, hlen, off);
3896 		if (unlikely(!eth)) {
3897 			napi_reuse_skb(napi, skb);
3898 			skb = NULL;
3899 			goto out;
3900 		}
3901 	}
3902 
3903 	skb_gro_pull(skb, sizeof(*eth));
3904 
3905 	/*
3906 	 * This works because the only protocols we care about don't require
3907 	 * special handling.  We'll fix it up properly at the end.
3908 	 */
3909 	skb->protocol = eth->h_proto;
3910 
3911 out:
3912 	return skb;
3913 }
3914 
3915 gro_result_t napi_gro_frags(struct napi_struct *napi)
3916 {
3917 	struct sk_buff *skb = napi_frags_skb(napi);
3918 
3919 	if (!skb)
3920 		return GRO_DROP;
3921 
3922 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
3923 }
3924 EXPORT_SYMBOL(napi_gro_frags);
3925 
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		/* take the list while irqs are still off */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		while (remsd) {
			struct softnet_data *cur = remsd;

			/* fetch the successor before the IPI is sent */
			remsd = cur->rps_ipi_next;

			if (cpu_online(cur->cpu))
				__smp_call_function_single(cur->cpu,
							   &cur->csd, 0);
		}
		return;
	}
#endif
	local_irq_enable();
}
3953 
/*
 * Drain the backlog of the softnet_data that embeds @napi.
 *
 * Packets are taken from sd->process_queue and fed to
 * __netif_receive_skb(); whenever that queue runs empty it is refilled
 * from sd->input_pkt_queue.  At most @quota packets are handled.
 * NOTE(review): the (napi, quota) signature suggests this is the ->poll
 * callback of sd->backlog; the registration is not visible here.
 *
 * Returns the number of packets processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd->rps_ipi_list) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}
#endif
	/* pick up the current value of weight_p on every run */
	napi->weight = weight_p;
	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		/* irqs are off while touching process_queue and are only
		 * enabled around the actual packet processing
		 */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		/* process_queue is empty: move everything queued on
		 * input_pkt_queue over in one go
		 */
		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);

		if (qlen < quota - work) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
			 * we can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			list_del(&napi->poll_list);
			napi->state = 0;

			/* shrink the quota so the outer loop ends once the
			 * packets just spliced have been processed
			 */
			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
4010 
4011 /**
4012  * __napi_schedule - schedule for receive
4013  * @n: entry to schedule
4014  *
4015  * The entry's receive function will be scheduled to run
4016  */
4017 void __napi_schedule(struct napi_struct *n)
4018 {
4019 	unsigned long flags;
4020 
4021 	local_irq_save(flags);
4022 	____napi_schedule(&__get_cpu_var(softnet_data), n);
4023 	local_irq_restore(flags);
4024 }
4025 EXPORT_SYMBOL(__napi_schedule);
4026 
4027 void __napi_complete(struct napi_struct *n)
4028 {
4029 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4030 	BUG_ON(n->gro_list);
4031 
4032 	list_del(&n->poll_list);
4033 	smp_mb__before_clear_bit();
4034 	clear_bit(NAPI_STATE_SCHED, &n->state);
4035 }
4036 EXPORT_SYMBOL(__napi_complete);
4037 
4038 void napi_complete(struct napi_struct *n)
4039 {
4040 	unsigned long flags;
4041 
4042 	/*
4043 	 * don't let napi dequeue from the cpu poll list
4044 	 * just in case its running on a different cpu
4045 	 */
4046 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4047 		return;
4048 
4049 	napi_gro_flush(n, false);
4050 	local_irq_save(flags);
4051 	__napi_complete(n);
4052 	local_irq_restore(flags);
4053 }
4054 EXPORT_SYMBOL(napi_complete);
4055 
4056 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4057 		    int (*poll)(struct napi_struct *, int), int weight)
4058 {
4059 	INIT_LIST_HEAD(&napi->poll_list);
4060 	napi->gro_count = 0;
4061 	napi->gro_list = NULL;
4062 	napi->skb = NULL;
4063 	napi->poll = poll;
4064 	napi->weight = weight;
4065 	list_add(&napi->dev_list, &dev->napi_list);
4066 	napi->dev = dev;
4067 #ifdef CONFIG_NETPOLL
4068 	spin_lock_init(&napi->poll_lock);
4069 	napi->poll_owner = -1;
4070 #endif
4071 	set_bit(NAPI_STATE_SCHED, &napi->state);
4072 }
4073 EXPORT_SYMBOL(netif_napi_add);
4074 
4075 void netif_napi_del(struct napi_struct *napi)
4076 {
4077 	struct sk_buff *skb, *next;
4078 
4079 	list_del_init(&napi->dev_list);
4080 	napi_free_frags(napi);
4081 
4082 	for (skb = napi->gro_list; skb; skb = next) {
4083 		next = skb->next;
4084 		skb->next = NULL;
4085 		kfree_skb(skb);
4086 	}
4087 
4088 	napi->gro_list = NULL;
4089 	napi->gro_count = 0;
4090 }
4091 EXPORT_SYMBOL(netif_napi_del);
4092 
/*
 * Receive softirq handler: poll the NAPI instances queued on this CPU's
 * poll_list until the list is empty, the packet budget (netdev_budget) is
 * used up, or more than two jiffies have passed.  In the latter two cases
 * the softirq is raised again and time_squeeze is incremented.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		/* ->poll() must never report more work than it was given */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				/* napi_complete() does its own irq handling */
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else {
				if (n->gro_list) {
					/* flush too old packets
					 * If HZ < 1000, flush all packets.
					 */
					local_irq_enable();
					napi_gro_flush(n, HZ >= 1000);
					local_irq_disable();
				}
				/* more work pending: requeue at the tail */
				list_move_tail(&n->poll_list, &sd->poll_list);
			}
		}

		netpoll_poll_unlock(have);
	}
out:
	/* sends pending RPS IPIs and re-enables local interrupts */
	net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	/* ran out of budget or time: account it and ask to be run again */
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
4187 
/* One link from a device to one of its upper devices. */
struct netdev_upper {
	/* the upper device this link points to */
	struct net_device *dev;
	/* true when the upper device is the master of the lower one */
	bool master;
	/* entry in the lower device's upper_dev_list */
	struct list_head list;
	/* NOTE(review): presumably used for RCU-deferred freeing; the user
	 * of this field is not visible in this part of the file
	 */
	struct rcu_head rcu;
	/* entry in the temporary list built by __netdev_search_upper_dev();
	 * empty (self-linked) whenever no search is in progress
	 */
	struct list_head search_list;
};
4195 
4196 static void __append_search_uppers(struct list_head *search_list,
4197 				   struct net_device *dev)
4198 {
4199 	struct netdev_upper *upper;
4200 
4201 	list_for_each_entry(upper, &dev->upper_dev_list, list) {
4202 		/* check if this upper is not already in search list */
4203 		if (list_empty(&upper->search_list))
4204 			list_add_tail(&upper->search_list, search_list);
4205 	}
4206 }
4207 
4208 static bool __netdev_search_upper_dev(struct net_device *dev,
4209 				      struct net_device *upper_dev)
4210 {
4211 	LIST_HEAD(search_list);
4212 	struct netdev_upper *upper;
4213 	struct netdev_upper *tmp;
4214 	bool ret = false;
4215 
4216 	__append_search_uppers(&search_list, dev);
4217 	list_for_each_entry(upper, &search_list, search_list) {
4218 		if (upper->dev == upper_dev) {
4219 			ret = true;
4220 			break;
4221 		}
4222 		__append_search_uppers(&search_list, upper->dev);
4223 	}
4224 	list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4225 		INIT_LIST_HEAD(&upper->search_list);
4226 	return ret;
4227 }
4228 
4229 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4230 						struct net_device *upper_dev)
4231 {
4232 	struct netdev_upper *upper;
4233 
4234 	list_for_each_entry(upper, &dev->upper_dev_list, list) {
4235 		if (upper->dev == upper_dev)
4236 			return upper;
4237 	}
4238 	return NULL;
4239 }
4240 
4241 /**
4242  * netdev_has_upper_dev - Check if device is linked to an upper device
4243  * @dev: device
4244  * @upper_dev: upper device to check
4245  *
4246  * Find out if a device is linked to specified upper device and return true
4247  * in case it is. Note that this checks only immediate upper device,
4248  * not through a complete stack of devices. The caller must hold the RTNL lock.
4249  */
4250 bool netdev_has_upper_dev(struct net_device *dev,
4251 			  struct net_device *upper_dev)
4252 {
4253 	ASSERT_RTNL();
4254 
4255 	return __netdev_find_upper(dev, upper_dev);
4256 }
4257 EXPORT_SYMBOL(netdev_has_upper_dev);
4258 
4259 /**
4260  * netdev_has_any_upper_dev - Check if device is linked to some device
4261  * @dev: device
4262  *
4263  * Find out if a device is linked to an upper device and return true in case
4264  * it is. The caller must hold the RTNL lock.
4265  */
4266 bool netdev_has_any_upper_dev(struct net_device *dev)
4267 {
4268 	ASSERT_RTNL();
4269 
4270 	return !list_empty(&dev->upper_dev_list);
4271 }
4272 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4273 
4274 /**
4275  * netdev_master_upper_dev_get - Get master upper device
4276  * @dev: device
4277  *
4278  * Find a master upper device and return pointer to it or NULL in case
4279  * it's not there. The caller must hold the RTNL lock.
4280  */
4281 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4282 {
4283 	struct netdev_upper *upper;
4284 
4285 	ASSERT_RTNL();
4286 
4287 	if (list_empty(&dev->upper_dev_list))
4288 		return NULL;
4289 
4290 	upper = list_first_entry(&dev->upper_dev_list,
4291 				 struct netdev_upper, list);
4292 	if (likely(upper->master))
4293 		return upper->dev;
4294 	return NULL;
4295 }
4296 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4297 
4298 /**
4299  * netdev_master_upper_dev_get_rcu - Get master upper device
4300  * @dev: device
4301  *
4302  * Find a master upper device and return pointer to it or NULL in case
4303  * it's not there. The caller must hold the RCU read lock.
4304  */
4305 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4306 {
4307 	struct netdev_upper *upper;
4308 
4309 	upper = list_first_or_null_rcu(&dev->upper_dev_list,
4310 				       struct netdev_upper, list);
4311 	if (upper && likely(upper->master))
4312 		return upper->dev;
4313 	return NULL;
4314 }
4315 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4316 
4317 static int __netdev_upper_dev_link(struct net_device *dev,
4318 				   struct net_device *upper_dev, bool master)
4319 {
4320 	struct netdev_upper *upper;
4321 
4322 	ASSERT_RTNL();
4323 
4324 	if (dev == upper_dev)
4325 		return -EBUSY;
4326 
4327 	/* To prevent loops, check if dev is not upper device to upper_dev. */
4328 	if (__netdev_search_upper_dev(upper_dev, dev))
4329 		return -EBUSY;
4330 
4331 	if (__netdev_find_upper(dev, upper_dev))
4332 		return -EEXIST;
4333 
4334 	if (master && netdev_master_upper_dev_get(dev))
4335 		return -EBUSY;
4336 
4337 	upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4338 	if (!upper)
4339 		return -ENOMEM;
4340 
4341 	upper->dev = upper_dev;
4342 	upper->master = master;
4343 	INIT_LIST_HEAD(&upper->search_list);
4344 
4345 	/* Ensure that master upper link is always the first item in list. */
4346 	if (master)
4347 		list_add_rcu(&upper->list, &dev->upper_dev_list);
4348 	else
4349 		list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4350 	dev_hold(upper_dev);
4351 
4352 	return 0;
4353 }
4354 
4355 /**
4356  * netdev_upper_dev_link - Add a link to the upper device
4357  * @dev: device
4358  * @upper_dev: new upper device
4359  *
4360  * Adds a link to device which is upper to this one. The caller must hold
4361  * the RTNL lock. On a failure a negative errno code is returned.
4362  * On success the reference counts are adjusted and the function
4363  * returns zero.
4364  */
4365 int netdev_upper_dev_link(struct net_device *dev,
4366 			  struct net_device *upper_dev)
4367 {
4368 	return __netdev_upper_dev_link(dev, upper_dev, false);
4369 }
4370 EXPORT_SYMBOL(netdev_upper_dev_link);
4371 
4372 /**
4373  * netdev_master_upper_dev_link - Add a master link to the upper device
4374  * @dev: device
4375  * @upper_dev: new upper device
4376  *
4377  * Adds a link to device which is upper to this one. In this case, only
4378  * one master upper device can be linked, although other non-master devices
4379  * might be linked as well. The caller must hold the RTNL lock.
4380  * On a failure a negative errno code is returned. On success the reference
4381  * counts are adjusted and the function returns zero.
4382  */
4383 int netdev_master_upper_dev_link(struct net_device *dev,
4384 				 struct net_device *upper_dev)
4385 {
4386 	return __netdev_upper_dev_link(dev, upper_dev, true);
4387 }
4388 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4389 
4390 /**
4391  * netdev_upper_dev_unlink - Removes a link to upper device
4392  * @dev: device
4393  * @upper_dev: new upper device
4394  *
4395  * Removes a link to device which is upper to this one. The caller must hold
4396  * the RTNL lock.
4397  */
4398 void netdev_upper_dev_unlink(struct net_device *dev,
4399 			     struct net_device *upper_dev)
4400 {
4401 	struct netdev_upper *upper;
4402 
4403 	ASSERT_RTNL();
4404 
4405 	upper = __netdev_find_upper(dev, upper_dev);
4406 	if (!upper)
4407 		return;
4408 	list_del_rcu(&upper->list);
4409 	dev_put(upper_dev);
4410 	kfree_rcu(upper, rcu);
4411 }
4412 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4413 
4414 static void dev_change_rx_flags(struct net_device *dev, int flags)
4415 {
4416 	const struct net_device_ops *ops = dev->netdev_ops;
4417 
4418 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4419 		ops->ndo_change_rx_flags(dev, flags);
4420 }
4421 
4422 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4423 {
4424 	unsigned int old_flags = dev->flags;
4425 	kuid_t uid;
4426 	kgid_t gid;
4427 
4428 	ASSERT_RTNL();
4429 
4430 	dev->flags |= IFF_PROMISC;
4431 	dev->promiscuity += inc;
4432 	if (dev->promiscuity == 0) {
4433 		/*
4434 		 * Avoid overflow.
4435 		 * If inc causes overflow, untouch promisc and return error.
4436 		 */
4437 		if (inc < 0)
4438 			dev->flags &= ~IFF_PROMISC;
4439 		else {
4440 			dev->promiscuity -= inc;
4441 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4442 				dev->name);
4443 			return -EOVERFLOW;
4444 		}
4445 	}
4446 	if (dev->flags != old_flags) {
4447 		pr_info("device %s %s promiscuous mode\n",
4448 			dev->name,
4449 			dev->flags & IFF_PROMISC ? "entered" : "left");
4450 		if (audit_enabled) {
4451 			current_uid_gid(&uid, &gid);
4452 			audit_log(current->audit_context, GFP_ATOMIC,
4453 				AUDIT_ANOM_PROMISCUOUS,
4454 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4455 				dev->name, (dev->flags & IFF_PROMISC),
4456 				(old_flags & IFF_PROMISC),
4457 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
4458 				from_kuid(&init_user_ns, uid),
4459 				from_kgid(&init_user_ns, gid),
4460 				audit_get_sessionid(current));
4461 		}
4462 
4463 		dev_change_rx_flags(dev, IFF_PROMISC);
4464 	}
4465 	return 0;
4466 }
4467 
4468 /**
4469  *	dev_set_promiscuity	- update promiscuity count on a device
4470  *	@dev: device
4471  *	@inc: modifier
4472  *
4473  *	Add or remove promiscuity from a device. While the count in the device
4474  *	remains above zero the interface remains promiscuous. Once it hits zero
4475  *	the device reverts back to normal filtering operation. A negative inc
4476  *	value is used to drop promiscuity on the device.
4477  *	Return 0 if successful or a negative errno code on error.
4478  */
4479 int dev_set_promiscuity(struct net_device *dev, int inc)
4480 {
4481 	unsigned int old_flags = dev->flags;
4482 	int err;
4483 
4484 	err = __dev_set_promiscuity(dev, inc);
4485 	if (err < 0)
4486 		return err;
4487 	if (dev->flags != old_flags)
4488 		dev_set_rx_mode(dev);
4489 	return err;
4490 }
4491 EXPORT_SYMBOL(dev_set_promiscuity);
4492 
4493 /**
4494  *	dev_set_allmulti	- update allmulti count on a device
4495  *	@dev: device
4496  *	@inc: modifier
4497  *
4498  *	Add or remove reception of all multicast frames to a device. While the
4499  *	count in the device remains above zero the interface remains listening
4500  *	to all interfaces. Once it hits zero the device reverts back to normal
4501  *	filtering operation. A negative @inc value is used to drop the counter
4502  *	when releasing a resource needing all multicasts.
4503  *	Return 0 if successful or a negative errno code on error.
4504  */
4505 
4506 int dev_set_allmulti(struct net_device *dev, int inc)
4507 {
4508 	unsigned int old_flags = dev->flags;
4509 
4510 	ASSERT_RTNL();
4511 
4512 	dev->flags |= IFF_ALLMULTI;
4513 	dev->allmulti += inc;
4514 	if (dev->allmulti == 0) {
4515 		/*
4516 		 * Avoid overflow.
4517 		 * If inc causes overflow, untouch allmulti and return error.
4518 		 */
4519 		if (inc < 0)
4520 			dev->flags &= ~IFF_ALLMULTI;
4521 		else {
4522 			dev->allmulti -= inc;
4523 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4524 				dev->name);
4525 			return -EOVERFLOW;
4526 		}
4527 	}
4528 	if (dev->flags ^ old_flags) {
4529 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4530 		dev_set_rx_mode(dev);
4531 	}
4532 	return 0;
4533 }
4534 EXPORT_SYMBOL(dev_set_allmulti);
4535 
4536 /*
4537  *	Upload unicast and multicast address lists to device and
4538  *	configure RX filtering. When the device doesn't support unicast
4539  *	filtering it is put in promiscuous mode while unicast addresses
4540  *	are present.
4541  */
4542 void __dev_set_rx_mode(struct net_device *dev)
4543 {
4544 	const struct net_device_ops *ops = dev->netdev_ops;
4545 
4546 	/* dev_open will call this function so the list will stay sane. */
4547 	if (!(dev->flags&IFF_UP))
4548 		return;
4549 
4550 	if (!netif_device_present(dev))
4551 		return;
4552 
4553 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4554 		/* Unicast addresses changes may only happen under the rtnl,
4555 		 * therefore calling __dev_set_promiscuity here is safe.
4556 		 */
4557 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4558 			__dev_set_promiscuity(dev, 1);
4559 			dev->uc_promisc = true;
4560 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4561 			__dev_set_promiscuity(dev, -1);
4562 			dev->uc_promisc = false;
4563 		}
4564 	}
4565 
4566 	if (ops->ndo_set_rx_mode)
4567 		ops->ndo_set_rx_mode(dev);
4568 }
4569 
/*
 *	Locked wrapper around __dev_set_rx_mode(): takes the device address
 *	list lock (BH variant) for the duration of the call.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4576 
4577 /**
4578  *	dev_get_flags - get flags reported to userspace
4579  *	@dev: device
4580  *
4581  *	Get the combination of flag bits exported through APIs to userspace.
4582  */
4583 unsigned int dev_get_flags(const struct net_device *dev)
4584 {
4585 	unsigned int flags;
4586 
4587 	flags = (dev->flags & ~(IFF_PROMISC |
4588 				IFF_ALLMULTI |
4589 				IFF_RUNNING |
4590 				IFF_LOWER_UP |
4591 				IFF_DORMANT)) |
4592 		(dev->gflags & (IFF_PROMISC |
4593 				IFF_ALLMULTI));
4594 
4595 	if (netif_running(dev)) {
4596 		if (netif_oper_up(dev))
4597 			flags |= IFF_RUNNING;
4598 		if (netif_carrier_ok(dev))
4599 			flags |= IFF_LOWER_UP;
4600 		if (netif_dormant(dev))
4601 			flags |= IFF_DORMANT;
4602 	}
4603 
4604 	return flags;
4605 }
4606 EXPORT_SYMBOL(dev_get_flags);
4607 
4608 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4609 {
4610 	unsigned int old_flags = dev->flags;
4611 	int ret;
4612 
4613 	ASSERT_RTNL();
4614 
4615 	/*
4616 	 *	Set the flags on our device.
4617 	 */
4618 
4619 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4620 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4621 			       IFF_AUTOMEDIA)) |
4622 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4623 				    IFF_ALLMULTI));
4624 
4625 	/*
4626 	 *	Load in the correct multicast list now the flags have changed.
4627 	 */
4628 
4629 	if ((old_flags ^ flags) & IFF_MULTICAST)
4630 		dev_change_rx_flags(dev, IFF_MULTICAST);
4631 
4632 	dev_set_rx_mode(dev);
4633 
4634 	/*
4635 	 *	Have we downed the interface. We handle IFF_UP ourselves
4636 	 *	according to user attempts to set it, rather than blindly
4637 	 *	setting it.
4638 	 */
4639 
4640 	ret = 0;
4641 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4642 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4643 
4644 		if (!ret)
4645 			dev_set_rx_mode(dev);
4646 	}
4647 
4648 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4649 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4650 
4651 		dev->gflags ^= IFF_PROMISC;
4652 		dev_set_promiscuity(dev, inc);
4653 	}
4654 
4655 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4656 	   is important. Some (broken) drivers set IFF_PROMISC, when
4657 	   IFF_ALLMULTI is requested not asking us and not reporting.
4658 	 */
4659 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4660 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4661 
4662 		dev->gflags ^= IFF_ALLMULTI;
4663 		dev_set_allmulti(dev, inc);
4664 	}
4665 
4666 	return ret;
4667 }
4668 
4669 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4670 {
4671 	unsigned int changes = dev->flags ^ old_flags;
4672 
4673 	if (changes & IFF_UP) {
4674 		if (dev->flags & IFF_UP)
4675 			call_netdevice_notifiers(NETDEV_UP, dev);
4676 		else
4677 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4678 	}
4679 
4680 	if (dev->flags & IFF_UP &&
4681 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4682 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4683 }
4684 
4685 /**
4686  *	dev_change_flags - change device settings
4687  *	@dev: device
4688  *	@flags: device state flags
4689  *
4690  *	Change settings on device based state flags. The flags are
4691  *	in the userspace exported format.
4692  */
4693 int dev_change_flags(struct net_device *dev, unsigned int flags)
4694 {
4695 	int ret;
4696 	unsigned int changes, old_flags = dev->flags;
4697 
4698 	ret = __dev_change_flags(dev, flags);
4699 	if (ret < 0)
4700 		return ret;
4701 
4702 	changes = old_flags ^ dev->flags;
4703 	if (changes)
4704 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4705 
4706 	__dev_notify_flags(dev, old_flags);
4707 	return ret;
4708 }
4709 EXPORT_SYMBOL(dev_change_flags);
4710 
4711 /**
4712  *	dev_set_mtu - Change maximum transfer unit
4713  *	@dev: device
4714  *	@new_mtu: new transfer unit
4715  *
4716  *	Change the maximum transfer size of the network device.
4717  */
4718 int dev_set_mtu(struct net_device *dev, int new_mtu)
4719 {
4720 	const struct net_device_ops *ops = dev->netdev_ops;
4721 	int err;
4722 
4723 	if (new_mtu == dev->mtu)
4724 		return 0;
4725 
4726 	/*	MTU must be positive.	 */
4727 	if (new_mtu < 0)
4728 		return -EINVAL;
4729 
4730 	if (!netif_device_present(dev))
4731 		return -ENODEV;
4732 
4733 	err = 0;
4734 	if (ops->ndo_change_mtu)
4735 		err = ops->ndo_change_mtu(dev, new_mtu);
4736 	else
4737 		dev->mtu = new_mtu;
4738 
4739 	if (!err)
4740 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4741 	return err;
4742 }
4743 EXPORT_SYMBOL(dev_set_mtu);
4744 
/**
 *	dev_set_group - Change group this device belongs to
 *	@dev: device
 *	@new_group: group this device should belong to
 *
 *	Stores @new_group in dev->group. No validation is performed and
 *	no notification is sent.
 */
void dev_set_group(struct net_device *dev, int new_group)
{
	dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);
4755 
4756 /**
4757  *	dev_set_mac_address - Change Media Access Control Address
4758  *	@dev: device
4759  *	@sa: new address
4760  *
4761  *	Change the hardware (MAC) address of the device
4762  */
4763 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4764 {
4765 	const struct net_device_ops *ops = dev->netdev_ops;
4766 	int err;
4767 
4768 	if (!ops->ndo_set_mac_address)
4769 		return -EOPNOTSUPP;
4770 	if (sa->sa_family != dev->type)
4771 		return -EINVAL;
4772 	if (!netif_device_present(dev))
4773 		return -ENODEV;
4774 	err = ops->ndo_set_mac_address(dev, sa);
4775 	if (err)
4776 		return err;
4777 	dev->addr_assign_type = NET_ADDR_SET;
4778 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4779 	add_device_randomness(dev->dev_addr, dev->addr_len);
4780 	return 0;
4781 }
4782 EXPORT_SYMBOL(dev_set_mac_address);
4783 
4784 /**
4785  *	dev_change_carrier - Change device carrier
4786  *	@dev: device
4787  *	@new_carries: new value
4788  *
4789  *	Change device carrier
4790  */
4791 int dev_change_carrier(struct net_device *dev, bool new_carrier)
4792 {
4793 	const struct net_device_ops *ops = dev->netdev_ops;
4794 
4795 	if (!ops->ndo_change_carrier)
4796 		return -EOPNOTSUPP;
4797 	if (!netif_device_present(dev))
4798 		return -ENODEV;
4799 	return ops->ndo_change_carrier(dev, new_carrier);
4800 }
4801 EXPORT_SYMBOL(dev_change_carrier);
4802 
4803 /**
4804  *	dev_new_index	-	allocate an ifindex
4805  *	@net: the applicable net namespace
4806  *
4807  *	Returns a suitable unique value for a new device interface
4808  *	number.  The caller must hold the rtnl semaphore or the
4809  *	dev_base_lock to be sure it remains unique.
4810  */
4811 static int dev_new_index(struct net *net)
4812 {
4813 	int ifindex = net->ifindex;
4814 	for (;;) {
4815 		if (++ifindex <= 0)
4816 			ifindex = 1;
4817 		if (!__dev_get_by_index(net, ifindex))
4818 			return net->ifindex = ifindex;
4819 	}
4820 }
4821 
4822 /* Delayed registration/unregisteration */
4823 static LIST_HEAD(net_todo_list);
4824 
4825 static void net_set_todo(struct net_device *dev)
4826 {
4827 	list_add_tail(&dev->todo_list, &net_todo_list);
4828 }
4829 
/*
 * Tear down every device linked on @head through its unreg_list member.
 *
 * The work is done in passes over the whole list, separated by
 * synchronize_net() calls: validate and mark the devices, close them,
 * unlist them, run the per-device teardown (qdisc shutdown, notifiers,
 * address flush, ndo_uninit, kobject and XPS removal), and finally drop
 * one reference on each device.
 *
 * Devices that were never registered are warned about and removed from
 * @head. Must not be called during boot phase; the caller must hold the
 * RTNL lock.
 */
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* _safe variant: entries may be deleted from the list in this pass. */
	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		/* Anything other than a fully registered device is a bug here. */
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	dev_close_many(head);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		/* Shutdown queueing discipline. */
		dev_shutdown(dev);


		/* Notify protocols, that we are about to destroy
		   this device. They should clean all the things.
		*/
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		/*
		 * Announce the removal to userspace, using the same condition
		 * under which register_netdevice() sends RTM_NEWLINK.
		 */
		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);

		/*
		 *	Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	/* Drop one reference per device now that teardown is complete. */
	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}
4905 
4906 static void rollback_registered(struct net_device *dev)
4907 {
4908 	LIST_HEAD(single);
4909 
4910 	list_add(&dev->unreg_list, &single);
4911 	rollback_registered_many(&single);
4912 	list_del(&single);
4913 }
4914 
4915 static netdev_features_t netdev_fix_features(struct net_device *dev,
4916 	netdev_features_t features)
4917 {
4918 	/* Fix illegal checksum combinations */
4919 	if ((features & NETIF_F_HW_CSUM) &&
4920 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4921 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
4922 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4923 	}
4924 
4925 	/* Fix illegal SG+CSUM combinations. */
4926 	if ((features & NETIF_F_SG) &&
4927 	    !(features & NETIF_F_ALL_CSUM)) {
4928 		netdev_dbg(dev,
4929 			"Dropping NETIF_F_SG since no checksum feature.\n");
4930 		features &= ~NETIF_F_SG;
4931 	}
4932 
4933 	/* TSO requires that SG is present as well. */
4934 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
4935 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
4936 		features &= ~NETIF_F_ALL_TSO;
4937 	}
4938 
4939 	/* TSO ECN requires that TSO is present as well. */
4940 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
4941 		features &= ~NETIF_F_TSO_ECN;
4942 
4943 	/* Software GSO depends on SG. */
4944 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
4945 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
4946 		features &= ~NETIF_F_GSO;
4947 	}
4948 
4949 	/* UFO needs SG and checksumming */
4950 	if (features & NETIF_F_UFO) {
4951 		/* maybe split UFO into V4 and V6? */
4952 		if (!((features & NETIF_F_GEN_CSUM) ||
4953 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
4954 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4955 			netdev_dbg(dev,
4956 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
4957 			features &= ~NETIF_F_UFO;
4958 		}
4959 
4960 		if (!(features & NETIF_F_SG)) {
4961 			netdev_dbg(dev,
4962 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
4963 			features &= ~NETIF_F_UFO;
4964 		}
4965 	}
4966 
4967 	return features;
4968 }
4969 
4970 int __netdev_update_features(struct net_device *dev)
4971 {
4972 	netdev_features_t features;
4973 	int err = 0;
4974 
4975 	ASSERT_RTNL();
4976 
4977 	features = netdev_get_wanted_features(dev);
4978 
4979 	if (dev->netdev_ops->ndo_fix_features)
4980 		features = dev->netdev_ops->ndo_fix_features(dev, features);
4981 
4982 	/* driver might be less strict about feature dependencies */
4983 	features = netdev_fix_features(dev, features);
4984 
4985 	if (dev->features == features)
4986 		return 0;
4987 
4988 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
4989 		&dev->features, &features);
4990 
4991 	if (dev->netdev_ops->ndo_set_features)
4992 		err = dev->netdev_ops->ndo_set_features(dev, features);
4993 
4994 	if (unlikely(err < 0)) {
4995 		netdev_err(dev,
4996 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
4997 			err, &features, &dev->features);
4998 		return -1;
4999 	}
5000 
5001 	if (!err)
5002 		dev->features = features;
5003 
5004 	return 1;
5005 }
5006 
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if
 *	__netdev_update_features() reports anything other than "no
 *	change". Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev) != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5021 
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is ignored on purpose: the notification is unconditional. */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5038 
5039 /**
5040  *	netif_stacked_transfer_operstate -	transfer operstate
5041  *	@rootdev: the root or lower level device to transfer state from
5042  *	@dev: the device to transfer operstate to
5043  *
5044  *	Transfer operational state from root to device. This is normally
5045  *	called when a stacking relationship exists between the root
5046  *	device and the device(a leaf device).
5047  */
5048 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5049 					struct net_device *dev)
5050 {
5051 	if (rootdev->operstate == IF_OPER_DORMANT)
5052 		netif_dormant_on(dev);
5053 	else
5054 		netif_dormant_off(dev);
5055 
5056 	if (netif_carrier_ok(rootdev)) {
5057 		if (!netif_carrier_ok(dev))
5058 			netif_carrier_on(dev);
5059 	} else {
5060 		if (netif_carrier_ok(dev))
5061 			netif_carrier_off(dev);
5062 	}
5063 }
5064 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5065 
#ifdef CONFIG_RPS
/*
 * Allocate the zero-initialized array of dev->num_rx_queues rx queues,
 * store it in dev->_rx and point every queue back at @dev.
 * Returns 0 on success or -ENOMEM. It is a bug to call this with
 * num_rx_queues == 0.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int i;

	BUG_ON(nr < 1);

	queues = kcalloc(nr, sizeof(*queues), GFP_KERNEL);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;
	for (i = 0; i < nr; i++)
		queues[i].dev = dev;

	return 0;
}
#endif
5085 
5086 static void netdev_init_one_queue(struct net_device *dev,
5087 				  struct netdev_queue *queue, void *_unused)
5088 {
5089 	/* Initialize queue lock */
5090 	spin_lock_init(&queue->_xmit_lock);
5091 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5092 	queue->xmit_lock_owner = -1;
5093 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5094 	queue->dev = dev;
5095 #ifdef CONFIG_BQL
5096 	dql_init(&queue->dql, HZ);
5097 #endif
5098 }
5099 
5100 static int netif_alloc_netdev_queues(struct net_device *dev)
5101 {
5102 	unsigned int count = dev->num_tx_queues;
5103 	struct netdev_queue *tx;
5104 
5105 	BUG_ON(count < 1);
5106 
5107 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5108 	if (!tx)
5109 		return -ENOMEM;
5110 
5111 	dev->_tx = tx;
5112 
5113 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5114 	spin_lock_init(&dev->tx_global_lock);
5115 
5116 	return 0;
5117 }
5118 
5119 /**
5120  *	register_netdevice	- register a network device
5121  *	@dev: device to register
5122  *
5123  *	Take a completed network device structure and add it to the kernel
5124  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5125  *	chain. 0 is returned on success. A negative errno code is returned
5126  *	on a failure to set up the device, or if the name is a duplicate.
5127  *
5128  *	Callers must hold the rtnl semaphore. You may want
5129  *	register_netdev() instead of this.
5130  *
5131  *	BUGS:
5132  *	The locking appears insufficient to guarantee two parallel registers
5133  *	will not get the same name.
5134  */
5135 
5136 int register_netdevice(struct net_device *dev)
5137 {
5138 	int ret;
5139 	struct net *net = dev_net(dev);
5140 
5141 	BUG_ON(dev_boot_phase);
5142 	ASSERT_RTNL();
5143 
5144 	might_sleep();
5145 
5146 	/* When net_device's are persistent, this will be fatal. */
5147 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5148 	BUG_ON(!net);
5149 
5150 	spin_lock_init(&dev->addr_list_lock);
5151 	netdev_set_addr_lockdep_class(dev);
5152 
5153 	dev->iflink = -1;
5154 
5155 	ret = dev_get_valid_name(net, dev, dev->name);
5156 	if (ret < 0)
5157 		goto out;
5158 
5159 	/* Init, if this function is available */
5160 	if (dev->netdev_ops->ndo_init) {
5161 		ret = dev->netdev_ops->ndo_init(dev);
5162 		if (ret) {
5163 			if (ret > 0)
5164 				ret = -EIO;
5165 			goto out;
5166 		}
5167 	}
5168 
5169 	if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) &&
5170 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5171 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5172 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5173 		ret = -EINVAL;
5174 		goto err_uninit;
5175 	}
5176 
5177 	ret = -EBUSY;
5178 	if (!dev->ifindex)
5179 		dev->ifindex = dev_new_index(net);
5180 	else if (__dev_get_by_index(net, dev->ifindex))
5181 		goto err_uninit;
5182 
5183 	if (dev->iflink == -1)
5184 		dev->iflink = dev->ifindex;
5185 
5186 	/* Transfer changeable features to wanted_features and enable
5187 	 * software offloads (GSO and GRO).
5188 	 */
5189 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5190 	dev->features |= NETIF_F_SOFT_FEATURES;
5191 	dev->wanted_features = dev->features & dev->hw_features;
5192 
5193 	/* Turn on no cache copy if HW is doing checksum */
5194 	if (!(dev->flags & IFF_LOOPBACK)) {
5195 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5196 		if (dev->features & NETIF_F_ALL_CSUM) {
5197 			dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5198 			dev->features |= NETIF_F_NOCACHE_COPY;
5199 		}
5200 	}
5201 
5202 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5203 	 */
5204 	dev->vlan_features |= NETIF_F_HIGHDMA;
5205 
5206 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5207 	ret = notifier_to_errno(ret);
5208 	if (ret)
5209 		goto err_uninit;
5210 
5211 	ret = netdev_register_kobject(dev);
5212 	if (ret)
5213 		goto err_uninit;
5214 	dev->reg_state = NETREG_REGISTERED;
5215 
5216 	__netdev_update_features(dev);
5217 
5218 	/*
5219 	 *	Default initial state at registry is that the
5220 	 *	device is present.
5221 	 */
5222 
5223 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5224 
5225 	linkwatch_init_dev(dev);
5226 
5227 	dev_init_scheduler(dev);
5228 	dev_hold(dev);
5229 	list_netdevice(dev);
5230 	add_device_randomness(dev->dev_addr, dev->addr_len);
5231 
5232 	/* If the device has permanent device address, driver should
5233 	 * set dev_addr and also addr_assign_type should be set to
5234 	 * NET_ADDR_PERM (default value).
5235 	 */
5236 	if (dev->addr_assign_type == NET_ADDR_PERM)
5237 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5238 
5239 	/* Notify protocols, that a new device appeared. */
5240 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5241 	ret = notifier_to_errno(ret);
5242 	if (ret) {
5243 		rollback_registered(dev);
5244 		dev->reg_state = NETREG_UNREGISTERED;
5245 	}
5246 	/*
5247 	 *	Prevent userspace races by waiting until the network
5248 	 *	device is fully setup before sending notifications.
5249 	 */
5250 	if (!dev->rtnl_link_ops ||
5251 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5252 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5253 
5254 out:
5255 	return ret;
5256 
5257 err_uninit:
5258 	if (dev->netdev_ops->ndo_uninit)
5259 		dev->netdev_ops->ndo_uninit(dev);
5260 	goto out;
5261 }
5262 EXPORT_SYMBOL(register_netdevice);
5263 
5264 /**
5265  *	init_dummy_netdev	- init a dummy network device for NAPI
5266  *	@dev: device to init
5267  *
5268  *	This takes a network device structure and initialize the minimum
5269  *	amount of fields so it can be used to schedule NAPI polls without
5270  *	registering a full blown interface. This is to be used by drivers
5271  *	that need to tie several hardware interfaces to a single NAPI
5272  *	poll scheduler due to HW limitations.
5273  */
5274 int init_dummy_netdev(struct net_device *dev)
5275 {
5276 	/* Clear everything. Note we don't initialize spinlocks
5277 	 * are they aren't supposed to be taken by any of the
5278 	 * NAPI code and this dummy netdev is supposed to be
5279 	 * only ever used for NAPI polls
5280 	 */
5281 	memset(dev, 0, sizeof(struct net_device));
5282 
5283 	/* make sure we BUG if trying to hit standard
5284 	 * register/unregister code path
5285 	 */
5286 	dev->reg_state = NETREG_DUMMY;
5287 
5288 	/* NAPI wants this */
5289 	INIT_LIST_HEAD(&dev->napi_list);
5290 
5291 	/* a dummy interface is started by default */
5292 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5293 	set_bit(__LINK_STATE_START, &dev->state);
5294 
5295 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5296 	 * because users of this 'device' dont need to change
5297 	 * its refcount.
5298 	 */
5299 
5300 	return 0;
5301 }
5302 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5303 
5304 
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	/* register_netdevice() asserts that the rtnl lock is held. */
	rtnl_lock();
	err = register_netdevice(dev);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);
5328 
5329 int netdev_refcnt_read(const struct net_device *dev)
5330 {
5331 	int i, refcnt = 0;
5332 
5333 	for_each_possible_cpu(i)
5334 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5335 	return refcnt;
5336 }
5337 EXPORT_SYMBOL(netdev_refcnt_read);
5338 
5339 /**
5340  * netdev_wait_allrefs - wait until all references are gone.
5341  * @dev: target net_device
5342  *
5343  * This is called when unregistering network devices.
5344  *
5345  * Any protocol or device that holds a reference should register
5346  * for netdevice notification, and cleanup and put back the
5347  * reference if they receive an UNREGISTER event.
5348  * We can get stuck here if buggy protocols don't correctly
5349  * call dev_put.
5350  */
5351 static void netdev_wait_allrefs(struct net_device *dev)
5352 {
5353 	unsigned long rebroadcast_time, warning_time;
5354 	int refcnt;
5355 
5356 	linkwatch_forget_dev(dev);
5357 
5358 	rebroadcast_time = warning_time = jiffies;
5359 	refcnt = netdev_refcnt_read(dev);
5360 
5361 	while (refcnt != 0) {
5362 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5363 			rtnl_lock();
5364 
5365 			/* Rebroadcast unregister notification */
5366 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5367 
5368 			__rtnl_unlock();
5369 			rcu_barrier();
5370 			rtnl_lock();
5371 
5372 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5373 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5374 				     &dev->state)) {
5375 				/* We must not have linkwatch events
5376 				 * pending on unregister. If this
5377 				 * happens, we simply run the queue
5378 				 * unscheduled, resulting in a noop
5379 				 * for this device.
5380 				 */
5381 				linkwatch_run_queue();
5382 			}
5383 
5384 			__rtnl_unlock();
5385 
5386 			rebroadcast_time = jiffies;
5387 		}
5388 
5389 		msleep(250);
5390 
5391 		refcnt = netdev_refcnt_read(dev);
5392 
5393 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5394 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5395 				 dev->name, refcnt);
5396 			warning_time = jiffies;
5397 		}
5398 	}
5399 }
5400 
5401 /* The sequence is:
5402  *
5403  *	rtnl_lock();
5404  *	...
5405  *	register_netdevice(x1);
5406  *	register_netdevice(x2);
5407  *	...
5408  *	unregister_netdevice(y1);
5409  *	unregister_netdevice(y2);
5410  *      ...
5411  *	rtnl_unlock();
5412  *	free_netdev(y1);
5413  *	free_netdev(y2);
5414  *
5415  * We are invoked by rtnl_unlock().
5416  * This allows us to deal with problems:
5417  * 1) We can delete sysfs objects which invoke hotplug
5418  *    without deadlocking with linkwatch via keventd.
5419  * 2) Since we run with the RTNL semaphore not held, we can sleep
5420  *    safely in order to wait for the netdev refcnt to drop to zero.
5421  *
5422  * We must not return until all unregister events added during
5423  * the interval the lock was held have been completed.
5424  */
5425 void netdev_run_todo(void)
5426 {
5427 	struct list_head list;
5428 
5429 	/* Snapshot list, allow later requests */
5430 	list_replace_init(&net_todo_list, &list);
5431 
5432 	__rtnl_unlock();
5433 
5434 
5435 	/* Wait for rcu callbacks to finish before next phase */
5436 	if (!list_empty(&list))
5437 		rcu_barrier();
5438 
5439 	while (!list_empty(&list)) {
5440 		struct net_device *dev
5441 			= list_first_entry(&list, struct net_device, todo_list);
5442 		list_del(&dev->todo_list);
5443 
5444 		rtnl_lock();
5445 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5446 		__rtnl_unlock();
5447 
5448 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5449 			pr_err("network todo '%s' but state %d\n",
5450 			       dev->name, dev->reg_state);
5451 			dump_stack();
5452 			continue;
5453 		}
5454 
5455 		dev->reg_state = NETREG_UNREGISTERED;
5456 
5457 		on_each_cpu(flush_backlog, dev, 1);
5458 
5459 		netdev_wait_allrefs(dev);
5460 
5461 		/* paranoia */
5462 		BUG_ON(netdev_refcnt_read(dev));
5463 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
5464 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5465 		WARN_ON(dev->dn_ptr);
5466 
5467 		if (dev->destructor)
5468 			dev->destructor(dev);
5469 
5470 		/* Free network device */
5471 		kobject_put(&dev->dev.kobj);
5472 	}
5473 }
5474 
5475 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5476  * fields in the same order, with only the type differing.
5477  */
5478 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5479 			     const struct net_device_stats *netdev_stats)
5480 {
5481 #if BITS_PER_LONG == 64
5482 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5483 	memcpy(stats64, netdev_stats, sizeof(*stats64));
5484 #else
5485 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5486 	const unsigned long *src = (const unsigned long *)netdev_stats;
5487 	u64 *dst = (u64 *)stats64;
5488 
5489 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5490 		     sizeof(*stats64) / sizeof(u64));
5491 	for (i = 0; i < n; i++)
5492 		dst[i] = src[i];
5493 #endif
5494 }
5495 EXPORT_SYMBOL(netdev_stats_to_stats64);
5496 
5497 /**
5498  *	dev_get_stats	- get network device statistics
5499  *	@dev: device to get statistics from
5500  *	@storage: place to store stats
5501  *
5502  *	Get network statistics from device. Return @storage.
5503  *	The device driver may provide its own method by setting
5504  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5505  *	otherwise the internal statistics structure is used.
5506  */
5507 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5508 					struct rtnl_link_stats64 *storage)
5509 {
5510 	const struct net_device_ops *ops = dev->netdev_ops;
5511 
5512 	if (ops->ndo_get_stats64) {
5513 		memset(storage, 0, sizeof(*storage));
5514 		ops->ndo_get_stats64(dev, storage);
5515 	} else if (ops->ndo_get_stats) {
5516 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5517 	} else {
5518 		netdev_stats_to_stats64(storage, &dev->stats);
5519 	}
5520 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5521 	return storage;
5522 }
5523 EXPORT_SYMBOL(dev_get_stats);
5524 
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT enabled, a
 * missing queue is allocated, initialized with noop_qdisc and published
 * in dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current value is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (!ingress)
			return NULL;

		netdev_init_one_queue(dev, ingress, NULL);
		ingress->qdisc = &noop_qdisc;
		ingress->qdisc_sleeping = &noop_qdisc;
		rcu_assign_pointer(dev->ingress_queue, ingress);
	}
#endif
	return ingress;
}
5542 
5543 static const struct ethtool_ops default_ethtool_ops;
5544 
5545 void netdev_set_default_ethtool_ops(struct net_device *dev,
5546 				    const struct ethtool_ops *ops)
5547 {
5548 	if (dev->ethtool_ops == &default_ethtool_ops)
5549 		dev->ethtool_ops = ops;
5550 }
5551 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5552 
5553 /**
5554  *	alloc_netdev_mqs - allocate network device
5555  *	@sizeof_priv:	size of private data to allocate space for
5556  *	@name:		device name format string
5557  *	@setup:		callback to initialize device
5558  *	@txqs:		the number of TX subqueues to allocate
5559  *	@rxqs:		the number of RX subqueues to allocate
5560  *
5561  *	Allocates a struct net_device with private data area for driver use
5562  *	and performs basic initialization.  Also allocates subquue structs
5563  *	for each queue on the device.
5564  */
5565 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5566 		void (*setup)(struct net_device *),
5567 		unsigned int txqs, unsigned int rxqs)
5568 {
5569 	struct net_device *dev;
5570 	size_t alloc_size;
5571 	struct net_device *p;
5572 
5573 	BUG_ON(strlen(name) >= sizeof(dev->name));
5574 
5575 	if (txqs < 1) {
5576 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5577 		return NULL;
5578 	}
5579 
5580 #ifdef CONFIG_RPS
5581 	if (rxqs < 1) {
5582 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5583 		return NULL;
5584 	}
5585 #endif
5586 
5587 	alloc_size = sizeof(struct net_device);
5588 	if (sizeof_priv) {
5589 		/* ensure 32-byte alignment of private area */
5590 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5591 		alloc_size += sizeof_priv;
5592 	}
5593 	/* ensure 32-byte alignment of whole construct */
5594 	alloc_size += NETDEV_ALIGN - 1;
5595 
5596 	p = kzalloc(alloc_size, GFP_KERNEL);
5597 	if (!p)
5598 		return NULL;
5599 
5600 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5601 	dev->padded = (char *)dev - (char *)p;
5602 
5603 	dev->pcpu_refcnt = alloc_percpu(int);
5604 	if (!dev->pcpu_refcnt)
5605 		goto free_p;
5606 
5607 	if (dev_addr_init(dev))
5608 		goto free_pcpu;
5609 
5610 	dev_mc_init(dev);
5611 	dev_uc_init(dev);
5612 
5613 	dev_net_set(dev, &init_net);
5614 
5615 	dev->gso_max_size = GSO_MAX_SIZE;
5616 	dev->gso_max_segs = GSO_MAX_SEGS;
5617 
5618 	INIT_LIST_HEAD(&dev->napi_list);
5619 	INIT_LIST_HEAD(&dev->unreg_list);
5620 	INIT_LIST_HEAD(&dev->link_watch_list);
5621 	INIT_LIST_HEAD(&dev->upper_dev_list);
5622 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5623 	setup(dev);
5624 
5625 	dev->num_tx_queues = txqs;
5626 	dev->real_num_tx_queues = txqs;
5627 	if (netif_alloc_netdev_queues(dev))
5628 		goto free_all;
5629 
5630 #ifdef CONFIG_RPS
5631 	dev->num_rx_queues = rxqs;
5632 	dev->real_num_rx_queues = rxqs;
5633 	if (netif_alloc_rx_queues(dev))
5634 		goto free_all;
5635 #endif
5636 
5637 	strcpy(dev->name, name);
5638 	dev->group = INIT_NETDEV_GROUP;
5639 	if (!dev->ethtool_ops)
5640 		dev->ethtool_ops = &default_ethtool_ops;
5641 	return dev;
5642 
5643 free_all:
5644 	free_netdev(dev);
5645 	return NULL;
5646 
5647 free_pcpu:
5648 	free_percpu(dev->pcpu_refcnt);
5649 	kfree(dev->_tx);
5650 #ifdef CONFIG_RPS
5651 	kfree(dev->_rx);
5652 #endif
5653 
5654 free_p:
5655 	kfree(p);
5656 	return NULL;
5657 }
5658 EXPORT_SYMBOL(alloc_netdev_mqs);
5659 
5660 /**
5661  *	free_netdev - free network device
5662  *	@dev: device
5663  *
5664  *	This function does the last stage of destroying an allocated device
5665  * 	interface. The reference to the device object is released.
5666  *	If this is the last reference then it will be freed.
5667  */
5668 void free_netdev(struct net_device *dev)
5669 {
5670 	struct napi_struct *p, *n;
5671 
5672 	release_net(dev_net(dev));
5673 
5674 	kfree(dev->_tx);
5675 #ifdef CONFIG_RPS
5676 	kfree(dev->_rx);
5677 #endif
5678 
5679 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5680 
5681 	/* Flush device addresses */
5682 	dev_addr_flush(dev);
5683 
5684 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5685 		netif_napi_del(p);
5686 
5687 	free_percpu(dev->pcpu_refcnt);
5688 	dev->pcpu_refcnt = NULL;
5689 
5690 	/*  Compatibility with error handling in drivers */
5691 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5692 		kfree((char *)dev - dev->padded);
5693 		return;
5694 	}
5695 
5696 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5697 	dev->reg_state = NETREG_RELEASED;
5698 
5699 	/* will free via device release */
5700 	put_device(&dev->dev);
5701 }
5702 EXPORT_SYMBOL(free_netdev);
5703 
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep.  The expedited RCU variant is used when the rtnl
 *	lock is held, the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
5719 
5720 /**
5721  *	unregister_netdevice_queue - remove device from the kernel
5722  *	@dev: device
5723  *	@head: list
5724  *
5725  *	This function shuts down a device interface and removes it
5726  *	from the kernel tables.
5727  *	If head not NULL, device is queued to be unregistered later.
5728  *
5729  *	Callers must hold the rtnl semaphore.  You may want
5730  *	unregister_netdev() instead of this.
5731  */
5732 
5733 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5734 {
5735 	ASSERT_RTNL();
5736 
5737 	if (head) {
5738 		list_move_tail(&dev->unreg_list, head);
5739 	} else {
5740 		rollback_registered(dev);
5741 		/* Finish processing unregister after unlock */
5742 		net_set_todo(dev);
5743 	}
5744 }
5745 EXPORT_SYMBOL(unregister_netdevice_queue);
5746 
5747 /**
5748  *	unregister_netdevice_many - unregister many devices
5749  *	@head: list of devices
5750  */
5751 void unregister_netdevice_many(struct list_head *head)
5752 {
5753 	struct net_device *dev;
5754 
5755 	if (!list_empty(head)) {
5756 		rollback_registered_many(head);
5757 		list_for_each_entry(dev, head, unreg_list)
5758 			net_set_todo(dev);
5759 	}
5760 }
5761 EXPORT_SYMBOL(unregister_netdevice_many);
5762 
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice() that takes
 *	the rtnl semaphore around the call.  In general you want to use
 *	this and not unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5781 
5782 /**
5783  *	dev_change_net_namespace - move device to different nethost namespace
5784  *	@dev: device
5785  *	@net: network namespace
5786  *	@pat: If not NULL name pattern to try if the current device name
5787  *	      is already taken in the destination network namespace.
5788  *
5789  *	This function shuts down a device interface and moves it
5790  *	to a new network namespace. On success 0 is returned, on
5791  *	a failure a netagive errno code is returned.
5792  *
5793  *	Callers must hold the rtnl semaphore.
5794  */
5795 
5796 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5797 {
5798 	int err;
5799 
5800 	ASSERT_RTNL();
5801 
5802 	/* Don't allow namespace local devices to be moved. */
5803 	err = -EINVAL;
5804 	if (dev->features & NETIF_F_NETNS_LOCAL)
5805 		goto out;
5806 
5807 	/* Ensure the device has been registrered */
5808 	if (dev->reg_state != NETREG_REGISTERED)
5809 		goto out;
5810 
5811 	/* Get out if there is nothing todo */
5812 	err = 0;
5813 	if (net_eq(dev_net(dev), net))
5814 		goto out;
5815 
5816 	/* Pick the destination device name, and ensure
5817 	 * we can use it in the destination network namespace.
5818 	 */
5819 	err = -EEXIST;
5820 	if (__dev_get_by_name(net, dev->name)) {
5821 		/* We get here if we can't use the current device name */
5822 		if (!pat)
5823 			goto out;
5824 		if (dev_get_valid_name(net, dev, pat) < 0)
5825 			goto out;
5826 	}
5827 
5828 	/*
5829 	 * And now a mini version of register_netdevice unregister_netdevice.
5830 	 */
5831 
5832 	/* If device is running close it first. */
5833 	dev_close(dev);
5834 
5835 	/* And unlink it from device chain */
5836 	err = -ENODEV;
5837 	unlist_netdevice(dev);
5838 
5839 	synchronize_net();
5840 
5841 	/* Shutdown queueing discipline. */
5842 	dev_shutdown(dev);
5843 
5844 	/* Notify protocols, that we are about to destroy
5845 	   this device. They should clean all the things.
5846 
5847 	   Note that dev->reg_state stays at NETREG_REGISTERED.
5848 	   This is wanted because this way 8021q and macvlan know
5849 	   the device is just moving and can keep their slaves up.
5850 	*/
5851 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5852 	rcu_barrier();
5853 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5854 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5855 
5856 	/*
5857 	 *	Flush the unicast and multicast chains
5858 	 */
5859 	dev_uc_flush(dev);
5860 	dev_mc_flush(dev);
5861 
5862 	/* Send a netdev-removed uevent to the old namespace */
5863 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
5864 
5865 	/* Actually switch the network namespace */
5866 	dev_net_set(dev, net);
5867 
5868 	/* If there is an ifindex conflict assign a new one */
5869 	if (__dev_get_by_index(net, dev->ifindex)) {
5870 		int iflink = (dev->iflink == dev->ifindex);
5871 		dev->ifindex = dev_new_index(net);
5872 		if (iflink)
5873 			dev->iflink = dev->ifindex;
5874 	}
5875 
5876 	/* Send a netdev-add uevent to the new namespace */
5877 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
5878 
5879 	/* Fixup kobjects */
5880 	err = device_rename(&dev->dev, dev->name);
5881 	WARN_ON(err);
5882 
5883 	/* Add the device back in the hashes */
5884 	list_netdevice(dev);
5885 
5886 	/* Notify protocols, that a new device appeared. */
5887 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5888 
5889 	/*
5890 	 *	Prevent userspace races by waiting until the network
5891 	 *	device is fully setup before sending notifications.
5892 	 */
5893 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5894 
5895 	synchronize_net();
5896 	err = 0;
5897 out:
5898 	return err;
5899 }
5900 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5901 
5902 static int dev_cpu_callback(struct notifier_block *nfb,
5903 			    unsigned long action,
5904 			    void *ocpu)
5905 {
5906 	struct sk_buff **list_skb;
5907 	struct sk_buff *skb;
5908 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5909 	struct softnet_data *sd, *oldsd;
5910 
5911 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5912 		return NOTIFY_OK;
5913 
5914 	local_irq_disable();
5915 	cpu = smp_processor_id();
5916 	sd = &per_cpu(softnet_data, cpu);
5917 	oldsd = &per_cpu(softnet_data, oldcpu);
5918 
5919 	/* Find end of our completion_queue. */
5920 	list_skb = &sd->completion_queue;
5921 	while (*list_skb)
5922 		list_skb = &(*list_skb)->next;
5923 	/* Append completion queue from offline CPU. */
5924 	*list_skb = oldsd->completion_queue;
5925 	oldsd->completion_queue = NULL;
5926 
5927 	/* Append output queue from offline CPU. */
5928 	if (oldsd->output_queue) {
5929 		*sd->output_queue_tailp = oldsd->output_queue;
5930 		sd->output_queue_tailp = oldsd->output_queue_tailp;
5931 		oldsd->output_queue = NULL;
5932 		oldsd->output_queue_tailp = &oldsd->output_queue;
5933 	}
5934 	/* Append NAPI poll list from offline CPU. */
5935 	if (!list_empty(&oldsd->poll_list)) {
5936 		list_splice_init(&oldsd->poll_list, &sd->poll_list);
5937 		raise_softirq_irqoff(NET_RX_SOFTIRQ);
5938 	}
5939 
5940 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5941 	local_irq_enable();
5942 
5943 	/* Process offline CPU's input_pkt_queue */
5944 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5945 		netif_rx(skb);
5946 		input_queue_head_incr(oldsd);
5947 	}
5948 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5949 		netif_rx(skb);
5950 		input_queue_head_incr(oldsd);
5951 	}
5952 
5953 	return NOTIFY_OK;
5954 }
5955 
5956 
5957 /**
5958  *	netdev_increment_features - increment feature set by one
5959  *	@all: current feature set
5960  *	@one: new feature set
5961  *	@mask: mask feature set
5962  *
5963  *	Computes a new feature set after adding a device with feature set
5964  *	@one to the master device with current feature set @all.  Will not
5965  *	enable anything that is off in @mask. Returns the new feature set.
5966  */
5967 netdev_features_t netdev_increment_features(netdev_features_t all,
5968 	netdev_features_t one, netdev_features_t mask)
5969 {
5970 	if (mask & NETIF_F_GEN_CSUM)
5971 		mask |= NETIF_F_ALL_CSUM;
5972 	mask |= NETIF_F_VLAN_CHALLENGED;
5973 
5974 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
5975 	all &= one | ~NETIF_F_ALL_FOR_ALL;
5976 
5977 	/* If one device supports hw checksumming, set for all. */
5978 	if (all & NETIF_F_GEN_CSUM)
5979 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
5980 
5981 	return all;
5982 }
5983 EXPORT_SYMBOL(netdev_increment_features);
5984 
5985 static struct hlist_head *netdev_create_hash(void)
5986 {
5987 	int i;
5988 	struct hlist_head *hash;
5989 
5990 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5991 	if (hash != NULL)
5992 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5993 			INIT_HLIST_HEAD(&hash[i]);
5994 
5995 	return hash;
5996 }
5997 
5998 /* Initialize per network namespace state */
5999 static int __net_init netdev_init(struct net *net)
6000 {
6001 	if (net != &init_net)
6002 		INIT_LIST_HEAD(&net->dev_base_head);
6003 
6004 	net->dev_name_head = netdev_create_hash();
6005 	if (net->dev_name_head == NULL)
6006 		goto err_name;
6007 
6008 	net->dev_index_head = netdev_create_hash();
6009 	if (net->dev_index_head == NULL)
6010 		goto err_idx;
6011 
6012 	return 0;
6013 
6014 err_idx:
6015 	kfree(net->dev_name_head);
6016 err_name:
6017 	return -ENOMEM;
6018 }
6019 
6020 /**
6021  *	netdev_drivername - network driver for the device
6022  *	@dev: network device
6023  *
6024  *	Determine network driver for device.
6025  */
6026 const char *netdev_drivername(const struct net_device *dev)
6027 {
6028 	const struct device_driver *driver;
6029 	const struct device *parent;
6030 	const char *empty = "";
6031 
6032 	parent = dev->dev.parent;
6033 	if (!parent)
6034 		return empty;
6035 
6036 	driver = parent->driver;
6037 	if (driver && driver->name)
6038 		return driver->name;
6039 	return empty;
6040 }
6041 
6042 static int __netdev_printk(const char *level, const struct net_device *dev,
6043 			   struct va_format *vaf)
6044 {
6045 	int r;
6046 
6047 	if (dev && dev->dev.parent) {
6048 		r = dev_printk_emit(level[1] - '0',
6049 				    dev->dev.parent,
6050 				    "%s %s %s: %pV",
6051 				    dev_driver_string(dev->dev.parent),
6052 				    dev_name(dev->dev.parent),
6053 				    netdev_name(dev), vaf);
6054 	} else if (dev) {
6055 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6056 	} else {
6057 		r = printk("%s(NULL net_device): %pV", level, vaf);
6058 	}
6059 
6060 	return r;
6061 }
6062 
6063 int netdev_printk(const char *level, const struct net_device *dev,
6064 		  const char *format, ...)
6065 {
6066 	struct va_format vaf;
6067 	va_list args;
6068 	int r;
6069 
6070 	va_start(args, format);
6071 
6072 	vaf.fmt = format;
6073 	vaf.va = &args;
6074 
6075 	r = __netdev_printk(level, dev, &vaf);
6076 
6077 	va_end(args);
6078 
6079 	return r;
6080 }
6081 EXPORT_SYMBOL(netdev_printk);
6082 
6083 #define define_netdev_printk_level(func, level)			\
6084 int func(const struct net_device *dev, const char *fmt, ...)	\
6085 {								\
6086 	int r;							\
6087 	struct va_format vaf;					\
6088 	va_list args;						\
6089 								\
6090 	va_start(args, fmt);					\
6091 								\
6092 	vaf.fmt = fmt;						\
6093 	vaf.va = &args;						\
6094 								\
6095 	r = __netdev_printk(level, dev, &vaf);			\
6096 								\
6097 	va_end(args);						\
6098 								\
6099 	return r;						\
6100 }								\
6101 EXPORT_SYMBOL(func);
6102 
6103 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6104 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6105 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6106 define_netdev_printk_level(netdev_err, KERN_ERR);
6107 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6108 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6109 define_netdev_printk_level(netdev_info, KERN_INFO);
6110 
6111 static void __net_exit netdev_exit(struct net *net)
6112 {
6113 	kfree(net->dev_name_head);
6114 	kfree(net->dev_index_head);
6115 }
6116 
6117 static struct pernet_operations __net_initdata netdev_net_ops = {
6118 	.init = netdev_init,
6119 	.exit = netdev_exit,
6120 };
6121 
6122 static void __net_exit default_device_exit(struct net *net)
6123 {
6124 	struct net_device *dev, *aux;
6125 	/*
6126 	 * Push all migratable network devices back to the
6127 	 * initial network namespace
6128 	 */
6129 	rtnl_lock();
6130 	for_each_netdev_safe(net, dev, aux) {
6131 		int err;
6132 		char fb_name[IFNAMSIZ];
6133 
6134 		/* Ignore unmoveable devices (i.e. loopback) */
6135 		if (dev->features & NETIF_F_NETNS_LOCAL)
6136 			continue;
6137 
6138 		/* Leave virtual devices for the generic cleanup */
6139 		if (dev->rtnl_link_ops)
6140 			continue;
6141 
6142 		/* Push remaining network devices to init_net */
6143 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6144 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6145 		if (err) {
6146 			pr_emerg("%s: failed to move %s to init_net: %d\n",
6147 				 __func__, dev->name, err);
6148 			BUG();
6149 		}
6150 	}
6151 	rtnl_unlock();
6152 }
6153 
6154 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6155 {
6156 	/* At exit all network devices most be removed from a network
6157 	 * namespace.  Do this in the reverse order of registration.
6158 	 * Do this across as many network namespaces as possible to
6159 	 * improve batching efficiency.
6160 	 */
6161 	struct net_device *dev;
6162 	struct net *net;
6163 	LIST_HEAD(dev_kill_list);
6164 
6165 	rtnl_lock();
6166 	list_for_each_entry(net, net_list, exit_list) {
6167 		for_each_netdev_reverse(net, dev) {
6168 			if (dev->rtnl_link_ops)
6169 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6170 			else
6171 				unregister_netdevice_queue(dev, &dev_kill_list);
6172 		}
6173 	}
6174 	unregister_netdevice_many(&dev_kill_list);
6175 	list_del(&dev_kill_list);
6176 	rtnl_unlock();
6177 }
6178 
6179 static struct pernet_operations __net_initdata default_device_ops = {
6180 	.exit = default_device_exit,
6181 	.exit_batch = default_device_exit_batch,
6182 };
6183 
6184 /*
6185  *	Initialize the DEV module. At boot time this walks the device list and
6186  *	unhooks any devices that fail to initialise (normally hardware not
6187  *	present) and leaves us with a valid list of present and active devices.
6188  *
6189  */
6190 
6191 /*
6192  *       This is called single threaded during boot, so no need
6193  *       to take the rtnl semaphore.
6194  */
6195 static int __init net_dev_init(void)
6196 {
6197 	int i, rc = -ENOMEM;
6198 
6199 	BUG_ON(!dev_boot_phase);
6200 
6201 	if (dev_proc_init())
6202 		goto out;
6203 
6204 	if (netdev_kobject_init())
6205 		goto out;
6206 
6207 	INIT_LIST_HEAD(&ptype_all);
6208 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6209 		INIT_LIST_HEAD(&ptype_base[i]);
6210 
6211 	INIT_LIST_HEAD(&offload_base);
6212 
6213 	if (register_pernet_subsys(&netdev_net_ops))
6214 		goto out;
6215 
6216 	/*
6217 	 *	Initialise the packet receive queues.
6218 	 */
6219 
6220 	for_each_possible_cpu(i) {
6221 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6222 
6223 		memset(sd, 0, sizeof(*sd));
6224 		skb_queue_head_init(&sd->input_pkt_queue);
6225 		skb_queue_head_init(&sd->process_queue);
6226 		sd->completion_queue = NULL;
6227 		INIT_LIST_HEAD(&sd->poll_list);
6228 		sd->output_queue = NULL;
6229 		sd->output_queue_tailp = &sd->output_queue;
6230 #ifdef CONFIG_RPS
6231 		sd->csd.func = rps_trigger_softirq;
6232 		sd->csd.info = sd;
6233 		sd->csd.flags = 0;
6234 		sd->cpu = i;
6235 #endif
6236 
6237 		sd->backlog.poll = process_backlog;
6238 		sd->backlog.weight = weight_p;
6239 		sd->backlog.gro_list = NULL;
6240 		sd->backlog.gro_count = 0;
6241 	}
6242 
6243 	dev_boot_phase = 0;
6244 
6245 	/* The loopback device is special if any other network devices
6246 	 * is present in a network namespace the loopback device must
6247 	 * be present. Since we now dynamically allocate and free the
6248 	 * loopback device ensure this invariant is maintained by
6249 	 * keeping the loopback device as the first device on the
6250 	 * list of network devices.  Ensuring the loopback devices
6251 	 * is the first device that appears and the last network device
6252 	 * that disappears.
6253 	 */
6254 	if (register_pernet_device(&loopback_net_ops))
6255 		goto out;
6256 
6257 	if (register_pernet_device(&default_device_ops))
6258 		goto out;
6259 
6260 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6261 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6262 
6263 	hotcpu_notifier(dev_cpu_callback, 0);
6264 	dst_init();
6265 	rc = 0;
6266 out:
6267 	return rc;
6268 }
6269 
6270 subsys_initcall(net_dev_init);
6271