xref: /linux/net/core/dev.c (revision 0d456bad36d42d16022be045c8a53ddbb59ee478)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/net_tstamp.h>
136 #include <linux/static_key.h>
137 #include <net/flow_keys.h>
138 
139 #include "net-sysfs.h"
140 
141 /* Instead of increasing this, you should create a hash table. */
142 #define MAX_GRO_SKBS 8
143 
144 /* This should be increased if a protocol with a bigger head is added. */
145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146 
147 /*
148  *	The list of packet types we will receive (as opposed to discard)
149  *	and the routines to invoke.
150  *
151  *	Why 16. Because with 16 the only overlap we get on a hash of the
152  *	low nibble of the protocol value is RARP/SNAP/X.25.
153  *
154  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
155  *             sure which should go first, but I bet it won't make much
156  *             difference if we are running VLANs.  The good news is that
157  *             this protocol won't be in the list unless compiled in, so
158  *             the average user (w/out VLANs) will not be adversely affected.
159  *             --BLG
160  *
161  *		0800	IP
162  *		8100    802.1Q VLAN
163  *		0001	802.3
164  *		0002	AX.25
165  *		0004	802.2
166  *		8035	RARP
167  *		0005	SNAP
168  *		0805	X.25
169  *		0806	ARP
170  *		8137	IPX
171  *		0009	Localtalk
172  *		86DD	IPv6
173  */
174 
175 #define PTYPE_HASH_SIZE	(16)
176 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
177 
178 static DEFINE_SPINLOCK(ptype_lock);
179 static DEFINE_SPINLOCK(offload_lock);
180 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
181 static struct list_head ptype_all __read_mostly;	/* Taps */
182 static struct list_head offload_base __read_mostly;
183 
184 /*
185  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
186  * semaphore.
187  *
188  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
189  *
190  * Writers must hold the rtnl semaphore while they loop through the
191  * dev_base_head list, and hold dev_base_lock for writing when they do the
192  * actual updates.  This allows pure readers to access the list even
193  * while a writer is preparing to update it.
194  *
195  * To put it another way, dev_base_lock is held for writing only to
196  * protect against pure readers; the rtnl semaphore provides the
197  * protection against other writers.
198  *
199  * See, for example usages, register_netdevice() and
200  * unregister_netdevice(), which must be called with the rtnl
201  * semaphore held.
202  */
203 DEFINE_RWLOCK(dev_base_lock);
204 EXPORT_SYMBOL(dev_base_lock);
205 
206 DEFINE_SEQLOCK(devnet_rename_seq);
207 
208 static inline void dev_base_seq_inc(struct net *net)
209 {
210 	while (++net->dev_base_seq == 0);
211 }
212 
213 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
214 {
215 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
216 
217 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
218 }
219 
220 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
221 {
222 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
223 }
224 
/*
 * Take the lock of @sd's input packet queue.  Compiles to nothing unless
 * CONFIG_RPS is enabled.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_lock(queue_lock);
#endif
}
231 
/*
 * Release the lock of @sd's input packet queue taken by rps_lock().
 * Compiles to nothing unless CONFIG_RPS is enabled.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_unlock(queue_lock);
#endif
}
238 
239 /* Device list insertion */
240 static int list_netdevice(struct net_device *dev)
241 {
242 	struct net *net = dev_net(dev);
243 
244 	ASSERT_RTNL();
245 
246 	write_lock_bh(&dev_base_lock);
247 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
248 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
249 	hlist_add_head_rcu(&dev->index_hlist,
250 			   dev_index_hash(net, dev->ifindex));
251 	write_unlock_bh(&dev_base_lock);
252 
253 	dev_base_seq_inc(net);
254 
255 	return 0;
256 }
257 
258 /* Device list removal
259  * caller must respect a RCU grace period before freeing/reusing dev
260  */
261 static void unlist_netdevice(struct net_device *dev)
262 {
263 	ASSERT_RTNL();
264 
265 	/* Unlink dev from the device chain */
266 	write_lock_bh(&dev_base_lock);
267 	list_del_rcu(&dev->dev_list);
268 	hlist_del_rcu(&dev->name_hlist);
269 	hlist_del_rcu(&dev->index_hlist);
270 	write_unlock_bh(&dev_base_lock);
271 
272 	dev_base_seq_inc(dev_net(dev));
273 }
274 
275 /*
276  *	Our notifier list
277  */
278 
279 static RAW_NOTIFIER_HEAD(netdev_chain);
280 
281 /*
282  *	Device drivers call our routines to queue packets here. We empty the
283  *	queue in the local softnet handler.
284  */
285 
286 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
287 EXPORT_PER_CPU_SYMBOL(softnet_data);
288 
289 #ifdef CONFIG_LOCKDEP
290 /*
291  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
292  * according to dev->type
293  */
294 static const unsigned short netdev_lock_type[] =
295 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
296 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
297 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
298 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
299 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
300 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
301 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
302 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
303 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
304 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
305 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
306 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
307 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
308 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
309 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
310 
311 static const char *const netdev_lock_name[] =
312 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
313 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
314 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
315 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
316 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
317 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
318 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
319 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
320 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
321 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
322 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
323 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
324 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
325 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
326 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
327 
328 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
329 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
330 
331 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
332 {
333 	int i;
334 
335 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
336 		if (netdev_lock_type[i] == dev_type)
337 			return i;
338 	/* the last key is used by default */
339 	return ARRAY_SIZE(netdev_lock_type) - 1;
340 }
341 
342 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
343 						 unsigned short dev_type)
344 {
345 	int i;
346 
347 	i = netdev_lock_pos(dev_type);
348 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
349 				   netdev_lock_name[i]);
350 }
351 
352 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
353 {
354 	int i;
355 
356 	i = netdev_lock_pos(dev->type);
357 	lockdep_set_class_and_name(&dev->addr_list_lock,
358 				   &netdev_addr_lock_key[i],
359 				   netdev_lock_name[i]);
360 }
361 #else
362 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
363 						 unsigned short dev_type)
364 {
365 }
/* Without CONFIG_LOCKDEP there is no lock class to assign: do nothing. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
369 #endif
370 
371 /*******************************************************************************
372 
373 		Protocol management and registration routines
374 
375 *******************************************************************************/
376 
377 /*
378  *	Add a protocol ID to the list. Now that the input handler is
379  *	smarter we can dispense with all the messy stuff that used to be
380  *	here.
381  *
382  *	BEWARE!!! Protocol handlers, mangling input packets,
383  *	MUST BE last in hash buckets and checking protocol handlers
384  *	MUST start from promiscuous ptype_all chain in net_bh.
385  *	It is true now, do not change it.
386  *	Explanation follows: if protocol handler, mangling packet, will
387  *	be the first on list, it is not able to sense, that packet
388  *	is cloned and should be copied-on-write, so that it will
389  *	change it and subsequent readers will get broken packet.
390  *							--ANK (980803)
391  */
392 
393 static inline struct list_head *ptype_head(const struct packet_type *pt)
394 {
395 	if (pt->type == htons(ETH_P_ALL))
396 		return &ptype_all;
397 	else
398 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
399 }
400 
401 /**
402  *	dev_add_pack - add packet handler
403  *	@pt: packet type declaration
404  *
405  *	Add a protocol handler to the networking stack. The passed &packet_type
406  *	is linked into kernel lists and may not be freed until it has been
407  *	removed from the kernel lists.
408  *
409  *	This call does not sleep therefore it can not
410  *	guarantee all CPU's that are in middle of receiving packets
411  *	will see the new packet type (until the next received packet).
412  */
413 
414 void dev_add_pack(struct packet_type *pt)
415 {
416 	struct list_head *head = ptype_head(pt);
417 
418 	spin_lock(&ptype_lock);
419 	list_add_rcu(&pt->list, head);
420 	spin_unlock(&ptype_lock);
421 }
422 EXPORT_SYMBOL(dev_add_pack);
423 
424 /**
425  *	__dev_remove_pack	 - remove packet handler
426  *	@pt: packet type declaration
427  *
428  *	Remove a protocol handler that was previously added to the kernel
429  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
430  *	from the kernel lists and can be freed or reused once this function
431  *	returns.
432  *
433  *      The packet type might still be in use by receivers
434  *	and must not be freed until after all the CPU's have gone
435  *	through a quiescent state.
436  */
437 void __dev_remove_pack(struct packet_type *pt)
438 {
439 	struct list_head *head = ptype_head(pt);
440 	struct packet_type *pt1;
441 
442 	spin_lock(&ptype_lock);
443 
444 	list_for_each_entry(pt1, head, list) {
445 		if (pt == pt1) {
446 			list_del_rcu(&pt->list);
447 			goto out;
448 		}
449 	}
450 
451 	pr_warn("dev_remove_pack: %p not found\n", pt);
452 out:
453 	spin_unlock(&ptype_lock);
454 }
455 EXPORT_SYMBOL(__dev_remove_pack);
456 
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Unlink a protocol handler that was previously registered with
 *	dev_add_pack() and then wait in synchronize_net(), so that @pt can
 *	be freed or reused once this function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	/* Unlink first, then wait for in-flight receivers to finish. */
	__dev_remove_pack(pt);
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
476 
477 
478 /**
479  *	dev_add_offload - register offload handlers
480  *	@po: protocol offload declaration
481  *
482  *	Add protocol offload handlers to the networking stack. The passed
483  *	&proto_offload is linked into kernel lists and may not be freed until
484  *	it has been removed from the kernel lists.
485  *
486  *	This call does not sleep therefore it can not
487  *	guarantee all CPU's that are in middle of receiving packets
488  *	will see the new offload handlers (until the next received packet).
489  */
490 void dev_add_offload(struct packet_offload *po)
491 {
492 	struct list_head *head = &offload_base;
493 
494 	spin_lock(&offload_lock);
495 	list_add_rcu(&po->list, head);
496 	spin_unlock(&offload_lock);
497 }
498 EXPORT_SYMBOL(dev_add_offload);
499 
500 /**
501  *	__dev_remove_offload	 - remove offload handler
502  *	@po: packet offload declaration
503  *
504  *	Remove a protocol offload handler that was previously added to the
505  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
506  *	is removed from the kernel lists and can be freed or reused once this
507  *	function returns.
508  *
509  *      The packet type might still be in use by receivers
510  *	and must not be freed until after all the CPU's have gone
511  *	through a quiescent state.
512  */
513 void __dev_remove_offload(struct packet_offload *po)
514 {
515 	struct list_head *head = &offload_base;
516 	struct packet_offload *po1;
517 
518 	spin_lock(&offload_lock);
519 
520 	list_for_each_entry(po1, head, list) {
521 		if (po == po1) {
522 			list_del_rcu(&po->list);
523 			goto out;
524 		}
525 	}
526 
527 	pr_warn("dev_remove_offload: %p not found\n", po);
528 out:
529 	spin_unlock(&offload_lock);
530 }
531 EXPORT_SYMBOL(__dev_remove_offload);
532 
/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Unlink a packet offload handler that was previously registered with
 *	dev_add_offload() and then wait in synchronize_net(), so that @po
 *	can be freed or reused once this function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the offload
 *	after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	/* Unlink first, then wait for in-flight receivers to finish. */
	__dev_remove_offload(po);
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
552 
553 /******************************************************************************
554 
555 		      Device Boot-time Settings Routines
556 
557 *******************************************************************************/
558 
559 /* Boot time configuration table */
560 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
561 
562 /**
563  *	netdev_boot_setup_add	- add new setup entry
564  *	@name: name of the device
565  *	@map: configured settings for the device
566  *
567  *	Adds new setup entry to the dev_boot_setup list.  The function
568  *	returns 0 on error and 1 on success.  This is a generic routine to
569  *	all netdevices.
570  */
571 static int netdev_boot_setup_add(char *name, struct ifmap *map)
572 {
573 	struct netdev_boot_setup *s;
574 	int i;
575 
576 	s = dev_boot_setup;
577 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
578 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
579 			memset(s[i].name, 0, sizeof(s[i].name));
580 			strlcpy(s[i].name, name, IFNAMSIZ);
581 			memcpy(&s[i].map, map, sizeof(s[i].map));
582 			break;
583 		}
584 	}
585 
586 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
587 }
588 
589 /**
590  *	netdev_boot_setup_check	- check boot time settings
591  *	@dev: the netdevice
592  *
593  * 	Check boot time settings for the device.
594  *	The found settings are set for the device to be used
595  *	later in the device probing.
596  *	Returns 0 if no settings found, 1 if they are.
597  */
598 int netdev_boot_setup_check(struct net_device *dev)
599 {
600 	struct netdev_boot_setup *s = dev_boot_setup;
601 	int i;
602 
603 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
604 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
605 		    !strcmp(dev->name, s[i].name)) {
606 			dev->irq 	= s[i].map.irq;
607 			dev->base_addr 	= s[i].map.base_addr;
608 			dev->mem_start 	= s[i].map.mem_start;
609 			dev->mem_end 	= s[i].map.mem_end;
610 			return 1;
611 		}
612 	}
613 	return 0;
614 }
615 EXPORT_SYMBOL(netdev_boot_setup_check);
616 
617 
618 /**
619  *	netdev_boot_base	- get address from boot time settings
620  *	@prefix: prefix for network device
621  *	@unit: id for network device
622  *
623  * 	Check boot time settings for the base address of device.
624  *	The found settings are set for the device to be used
625  *	later in the device probing.
626  *	Returns 0 if no settings found.
627  */
628 unsigned long netdev_boot_base(const char *prefix, int unit)
629 {
630 	const struct netdev_boot_setup *s = dev_boot_setup;
631 	char name[IFNAMSIZ];
632 	int i;
633 
634 	sprintf(name, "%s%d", prefix, unit);
635 
636 	/*
637 	 * If device already registered then return base of 1
638 	 * to indicate not to probe for this interface
639 	 */
640 	if (__dev_get_by_name(&init_net, name))
641 		return 1;
642 
643 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
644 		if (!strcmp(name, s[i].name))
645 			return s[i].map.base_addr;
646 	return 0;
647 }
648 
649 /*
650  * Saves at boot time configured settings for any netdevice.
651  */
652 int __init netdev_boot_setup(char *str)
653 {
654 	int ints[5];
655 	struct ifmap map;
656 
657 	str = get_options(str, ARRAY_SIZE(ints), ints);
658 	if (!str || !*str)
659 		return 0;
660 
661 	/* Save settings */
662 	memset(&map, 0, sizeof(map));
663 	if (ints[0] > 0)
664 		map.irq = ints[1];
665 	if (ints[0] > 1)
666 		map.base_addr = ints[2];
667 	if (ints[0] > 2)
668 		map.mem_start = ints[3];
669 	if (ints[0] > 3)
670 		map.mem_end = ints[4];
671 
672 	/* Add new entry to the list */
673 	return netdev_boot_setup_add(str, &map);
674 }
675 
676 __setup("netdev=", netdev_boot_setup);
677 
678 /*******************************************************************************
679 
680 			    Device Interface Subroutines
681 
682 *******************************************************************************/
683 
684 /**
685  *	__dev_get_by_name	- find a device by its name
686  *	@net: the applicable net namespace
687  *	@name: name to find
688  *
689  *	Find an interface by name. Must be called under RTNL semaphore
690  *	or @dev_base_lock. If the name is found a pointer to the device
691  *	is returned. If the name is not found then %NULL is returned. The
692  *	reference counters are not incremented so the caller must be
693  *	careful with locks.
694  */
695 
696 struct net_device *__dev_get_by_name(struct net *net, const char *name)
697 {
698 	struct hlist_node *p;
699 	struct net_device *dev;
700 	struct hlist_head *head = dev_name_hash(net, name);
701 
702 	hlist_for_each_entry(dev, p, head, name_hlist)
703 		if (!strncmp(dev->name, name, IFNAMSIZ))
704 			return dev;
705 
706 	return NULL;
707 }
708 EXPORT_SYMBOL(__dev_get_by_name);
709 
710 /**
711  *	dev_get_by_name_rcu	- find a device by its name
712  *	@net: the applicable net namespace
713  *	@name: name to find
714  *
715  *	Find an interface by name.
716  *	If the name is found a pointer to the device is returned.
717  * 	If the name is not found then %NULL is returned.
718  *	The reference counters are not incremented so the caller must be
719  *	careful with locks. The caller must hold RCU lock.
720  */
721 
722 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
723 {
724 	struct hlist_node *p;
725 	struct net_device *dev;
726 	struct hlist_head *head = dev_name_hash(net, name);
727 
728 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
729 		if (!strncmp(dev->name, name, IFNAMSIZ))
730 			return dev;
731 
732 	return NULL;
733 }
734 EXPORT_SYMBOL(dev_get_by_name_rcu);
735 
736 /**
737  *	dev_get_by_name		- find a device by its name
738  *	@net: the applicable net namespace
739  *	@name: name to find
740  *
741  *	Find an interface by name. This can be called from any
742  *	context and does its own locking. The returned handle has
743  *	the usage count incremented and the caller must use dev_put() to
744  *	release it when it is no longer needed. %NULL is returned if no
745  *	matching device is found.
746  */
747 
748 struct net_device *dev_get_by_name(struct net *net, const char *name)
749 {
750 	struct net_device *dev;
751 
752 	rcu_read_lock();
753 	dev = dev_get_by_name_rcu(net, name);
754 	if (dev)
755 		dev_hold(dev);
756 	rcu_read_unlock();
757 	return dev;
758 }
759 EXPORT_SYMBOL(dev_get_by_name);
760 
761 /**
762  *	__dev_get_by_index - find a device by its ifindex
763  *	@net: the applicable net namespace
764  *	@ifindex: index of device
765  *
766  *	Search for an interface by index. Returns %NULL if the device
767  *	is not found or a pointer to the device. The device has not
768  *	had its reference counter increased so the caller must be careful
769  *	about locking. The caller must hold either the RTNL semaphore
770  *	or @dev_base_lock.
771  */
772 
773 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
774 {
775 	struct hlist_node *p;
776 	struct net_device *dev;
777 	struct hlist_head *head = dev_index_hash(net, ifindex);
778 
779 	hlist_for_each_entry(dev, p, head, index_hlist)
780 		if (dev->ifindex == ifindex)
781 			return dev;
782 
783 	return NULL;
784 }
785 EXPORT_SYMBOL(__dev_get_by_index);
786 
787 /**
788  *	dev_get_by_index_rcu - find a device by its ifindex
789  *	@net: the applicable net namespace
790  *	@ifindex: index of device
791  *
792  *	Search for an interface by index. Returns %NULL if the device
793  *	is not found or a pointer to the device. The device has not
794  *	had its reference counter increased so the caller must be careful
795  *	about locking. The caller must hold RCU lock.
796  */
797 
798 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
799 {
800 	struct hlist_node *p;
801 	struct net_device *dev;
802 	struct hlist_head *head = dev_index_hash(net, ifindex);
803 
804 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
805 		if (dev->ifindex == ifindex)
806 			return dev;
807 
808 	return NULL;
809 }
810 EXPORT_SYMBOL(dev_get_by_index_rcu);
811 
812 
813 /**
814  *	dev_get_by_index - find a device by its ifindex
815  *	@net: the applicable net namespace
816  *	@ifindex: index of device
817  *
818  *	Search for an interface by index. Returns NULL if the device
819  *	is not found or a pointer to the device. The device returned has
820  *	had a reference added and the pointer is safe until the user calls
821  *	dev_put to indicate they have finished with it.
822  */
823 
824 struct net_device *dev_get_by_index(struct net *net, int ifindex)
825 {
826 	struct net_device *dev;
827 
828 	rcu_read_lock();
829 	dev = dev_get_by_index_rcu(net, ifindex);
830 	if (dev)
831 		dev_hold(dev);
832 	rcu_read_unlock();
833 	return dev;
834 }
835 EXPORT_SYMBOL(dev_get_by_index);
836 
837 /**
838  *	dev_getbyhwaddr_rcu - find a device by its hardware address
839  *	@net: the applicable net namespace
840  *	@type: media type of device
841  *	@ha: hardware address
842  *
843  *	Search for an interface by MAC address. Returns NULL if the device
844  *	is not found or a pointer to the device.
845  *	The caller must hold RCU or RTNL.
846  *	The returned device has not had its ref count increased
847  *	and the caller must therefore be careful about locking
848  *
849  */
850 
851 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
852 				       const char *ha)
853 {
854 	struct net_device *dev;
855 
856 	for_each_netdev_rcu(net, dev)
857 		if (dev->type == type &&
858 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
859 			return dev;
860 
861 	return NULL;
862 }
863 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
864 
865 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
866 {
867 	struct net_device *dev;
868 
869 	ASSERT_RTNL();
870 	for_each_netdev(net, dev)
871 		if (dev->type == type)
872 			return dev;
873 
874 	return NULL;
875 }
876 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
877 
878 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
879 {
880 	struct net_device *dev, *ret = NULL;
881 
882 	rcu_read_lock();
883 	for_each_netdev_rcu(net, dev)
884 		if (dev->type == type) {
885 			dev_hold(dev);
886 			ret = dev;
887 			break;
888 		}
889 	rcu_read_unlock();
890 	return ret;
891 }
892 EXPORT_SYMBOL(dev_getfirstbyhwtype);
893 
894 /**
895  *	dev_get_by_flags_rcu - find any device with given flags
896  *	@net: the applicable net namespace
897  *	@if_flags: IFF_* values
898  *	@mask: bitmask of bits in if_flags to check
899  *
900  *	Search for any interface with the given flags. Returns NULL if a device
901  *	is not found or a pointer to the device. Must be called inside
902  *	rcu_read_lock(), and result refcount is unchanged.
903  */
904 
905 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
906 				    unsigned short mask)
907 {
908 	struct net_device *dev, *ret;
909 
910 	ret = NULL;
911 	for_each_netdev_rcu(net, dev) {
912 		if (((dev->flags ^ if_flags) & mask) == 0) {
913 			ret = dev;
914 			break;
915 		}
916 	}
917 	return ret;
918 }
919 EXPORT_SYMBOL(dev_get_by_flags_rcu);
920 
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to allow sysfs
 *	to work.  We also disallow any kind of whitespace.
 *
 *	A name is rejected if it is empty, is "." or "..", does not fit in
 *	IFNAMSIZ bytes including the terminating NUL, or contains a '/' or
 *	a whitespace character.  At most IFNAMSIZ bytes of @name are read,
 *	so a buffer of that size that lacks a terminator is rejected
 *	rather than scanned past its end.
 *
 *	Returns true if the name is acceptable, false otherwise.
 */
bool dev_valid_name(const char *name)
{
	size_t len;

	if (*name == '\0')
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	for (len = 0; name[len] != '\0'; len++) {
		/* a character at index IFNAMSIZ - 1 leaves no room for the NUL */
		if (len >= IFNAMSIZ - 1)
			return false;
		if (name[len] == '/' || isspace((unsigned char)name[len]))
			return false;
	}
	return true;
}
945 EXPORT_SYMBOL(dev_valid_name);
946 
947 /**
948  *	__dev_alloc_name - allocate a name for a device
949  *	@net: network namespace to allocate the device name in
950  *	@name: name format string
951  *	@buf:  scratch buffer and result name string
952  *
953  *	Passed a format string - eg "lt%d" it will try and find a suitable
954  *	id. It scans list of devices to build up a free map, then chooses
955  *	the first empty slot. The caller must hold the dev_base or rtnl lock
956  *	while allocating the name and adding the device in order to avoid
957  *	duplicates.
958  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
959  *	Returns the number of the unit assigned or a negative errno code.
960  */
961 
962 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
963 {
964 	int i = 0;
965 	const char *p;
966 	const int max_netdevices = 8*PAGE_SIZE;
967 	unsigned long *inuse;
968 	struct net_device *d;
969 
970 	p = strnchr(name, IFNAMSIZ-1, '%');
971 	if (p) {
972 		/*
973 		 * Verify the string as this thing may have come from
974 		 * the user.  There must be either one "%d" and no other "%"
975 		 * characters.
976 		 */
977 		if (p[1] != 'd' || strchr(p + 2, '%'))
978 			return -EINVAL;
979 
980 		/* Use one page as a bit array of possible slots */
981 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
982 		if (!inuse)
983 			return -ENOMEM;
984 
985 		for_each_netdev(net, d) {
986 			if (!sscanf(d->name, name, &i))
987 				continue;
988 			if (i < 0 || i >= max_netdevices)
989 				continue;
990 
991 			/*  avoid cases where sscanf is not exact inverse of printf */
992 			snprintf(buf, IFNAMSIZ, name, i);
993 			if (!strncmp(buf, d->name, IFNAMSIZ))
994 				set_bit(i, inuse);
995 		}
996 
997 		i = find_first_zero_bit(inuse, max_netdevices);
998 		free_page((unsigned long) inuse);
999 	}
1000 
1001 	if (buf != name)
1002 		snprintf(buf, IFNAMSIZ, name, i);
1003 	if (!__dev_get_by_name(net, buf))
1004 		return i;
1005 
1006 	/* It is possible to run out of possible slots
1007 	 * when the name is long and there isn't enough space left
1008 	 * for the digits, or if all bits are used.
1009 	 */
1010 	return -ENFILE;
1011 }
1012 
1013 /**
1014  *	dev_alloc_name - allocate a name for a device
1015  *	@dev: device
1016  *	@name: name format string
1017  *
1018  *	Passed a format string - eg "lt%d" it will try and find a suitable
1019  *	id. It scans list of devices to build up a free map, then chooses
1020  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1021  *	while allocating the name and adding the device in order to avoid
1022  *	duplicates.
1023  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1024  *	Returns the number of the unit assigned or a negative errno code.
1025  */
1026 
1027 int dev_alloc_name(struct net_device *dev, const char *name)
1028 {
1029 	char buf[IFNAMSIZ];
1030 	struct net *net;
1031 	int ret;
1032 
1033 	BUG_ON(!dev_net(dev));
1034 	net = dev_net(dev);
1035 	ret = __dev_alloc_name(net, name, buf);
1036 	if (ret >= 0)
1037 		strlcpy(dev->name, buf, IFNAMSIZ);
1038 	return ret;
1039 }
1040 EXPORT_SYMBOL(dev_alloc_name);
1041 
1042 static int dev_alloc_name_ns(struct net *net,
1043 			     struct net_device *dev,
1044 			     const char *name)
1045 {
1046 	char buf[IFNAMSIZ];
1047 	int ret;
1048 
1049 	ret = __dev_alloc_name(net, name, buf);
1050 	if (ret >= 0)
1051 		strlcpy(dev->name, buf, IFNAMSIZ);
1052 	return ret;
1053 }
1054 
1055 static int dev_get_valid_name(struct net *net,
1056 			      struct net_device *dev,
1057 			      const char *name)
1058 {
1059 	BUG_ON(!net);
1060 
1061 	if (!dev_valid_name(name))
1062 		return -EINVAL;
1063 
1064 	if (strchr(name, '%'))
1065 		return dev_alloc_name_ns(net, dev, name);
1066 	else if (__dev_get_by_name(net, name))
1067 		return -EEXIST;
1068 	else if (dev->name != name)
1069 		strlcpy(dev->name, name, IFNAMSIZ);
1070 
1071 	return 0;
1072 }
1073 
1074 /**
1075  *	dev_change_name - change name of a device
1076  *	@dev: device
1077  *	@newname: name (or format string) must be at least IFNAMSIZ
1078  *
1079  *	Change name of a device, can pass format strings "eth%d".
1080  *	for wildcarding.
1081  */
1082 int dev_change_name(struct net_device *dev, const char *newname)
1083 {
1084 	char oldname[IFNAMSIZ];
1085 	int err = 0;
1086 	int ret;
1087 	struct net *net;
1088 
1089 	ASSERT_RTNL();
1090 	BUG_ON(!dev_net(dev));
1091 
1092 	net = dev_net(dev);
1093 	if (dev->flags & IFF_UP)
1094 		return -EBUSY;
1095 
1096 	write_seqlock(&devnet_rename_seq);
1097 
1098 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1099 		write_sequnlock(&devnet_rename_seq);
1100 		return 0;
1101 	}
1102 
1103 	memcpy(oldname, dev->name, IFNAMSIZ);
1104 
1105 	err = dev_get_valid_name(net, dev, newname);
1106 	if (err < 0) {
1107 		write_sequnlock(&devnet_rename_seq);
1108 		return err;
1109 	}
1110 
1111 rollback:
1112 	ret = device_rename(&dev->dev, dev->name);
1113 	if (ret) {
1114 		memcpy(dev->name, oldname, IFNAMSIZ);
1115 		write_sequnlock(&devnet_rename_seq);
1116 		return ret;
1117 	}
1118 
1119 	write_sequnlock(&devnet_rename_seq);
1120 
1121 	write_lock_bh(&dev_base_lock);
1122 	hlist_del_rcu(&dev->name_hlist);
1123 	write_unlock_bh(&dev_base_lock);
1124 
1125 	synchronize_rcu();
1126 
1127 	write_lock_bh(&dev_base_lock);
1128 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1129 	write_unlock_bh(&dev_base_lock);
1130 
1131 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1132 	ret = notifier_to_errno(ret);
1133 
1134 	if (ret) {
1135 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1136 		if (err >= 0) {
1137 			err = ret;
1138 			write_seqlock(&devnet_rename_seq);
1139 			memcpy(dev->name, oldname, IFNAMSIZ);
1140 			goto rollback;
1141 		} else {
1142 			pr_err("%s: name change rollback failed: %d\n",
1143 			       dev->name, ret);
1144 		}
1145 	}
1146 
1147 	return err;
1148 }
1149 
1150 /**
1151  *	dev_set_alias - change ifalias of a device
1152  *	@dev: device
1153  *	@alias: name up to IFALIASZ
1154  *	@len: limit of bytes to copy from info
1155  *
1156  *	Set ifalias for a device,
1157  */
1158 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1159 {
1160 	char *new_ifalias;
1161 
1162 	ASSERT_RTNL();
1163 
1164 	if (len >= IFALIASZ)
1165 		return -EINVAL;
1166 
1167 	if (!len) {
1168 		kfree(dev->ifalias);
1169 		dev->ifalias = NULL;
1170 		return 0;
1171 	}
1172 
1173 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1174 	if (!new_ifalias)
1175 		return -ENOMEM;
1176 	dev->ifalias = new_ifalias;
1177 
1178 	strlcpy(dev->ifalias, alias, len+1);
1179 	return len;
1180 }
1181 
1182 
1183 /**
1184  *	netdev_features_change - device changes features
1185  *	@dev: device to cause notification
1186  *
1187  *	Called to indicate a device has changed features.
1188  */
1189 void netdev_features_change(struct net_device *dev)
1190 {
1191 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1192 }
1193 EXPORT_SYMBOL(netdev_features_change);
1194 
1195 /**
1196  *	netdev_state_change - device changes state
1197  *	@dev: device to cause notification
1198  *
1199  *	Called to indicate a device has changed state. This function calls
1200  *	the notifier chains for netdev_chain and sends a NEWLINK message
1201  *	to the routing socket.
1202  */
1203 void netdev_state_change(struct net_device *dev)
1204 {
1205 	if (dev->flags & IFF_UP) {
1206 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1207 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1208 	}
1209 }
1210 EXPORT_SYMBOL(netdev_state_change);
1211 
1212 /**
1213  * 	netdev_notify_peers - notify network peers about existence of @dev
1214  * 	@dev: network device
1215  *
1216  * Generate traffic such that interested network peers are aware of
1217  * @dev, such as by generating a gratuitous ARP. This may be used when
1218  * a device wants to inform the rest of the network about some sort of
1219  * reconfiguration such as a failover event or virtual machine
1220  * migration.
1221  */
1222 void netdev_notify_peers(struct net_device *dev)
1223 {
1224 	rtnl_lock();
1225 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1226 	rtnl_unlock();
1227 }
1228 EXPORT_SYMBOL(netdev_notify_peers);
1229 
1230 /**
1231  *	dev_load 	- load a network module
1232  *	@net: the applicable net namespace
1233  *	@name: name of interface
1234  *
1235  *	If a network interface is not present and the process has suitable
1236  *	privileges this function loads the module. If module loading is not
1237  *	available in this kernel then it becomes a nop.
1238  */
1239 
1240 void dev_load(struct net *net, const char *name)
1241 {
1242 	struct net_device *dev;
1243 	int no_module;
1244 
1245 	rcu_read_lock();
1246 	dev = dev_get_by_name_rcu(net, name);
1247 	rcu_read_unlock();
1248 
1249 	no_module = !dev;
1250 	if (no_module && capable(CAP_NET_ADMIN))
1251 		no_module = request_module("netdev-%s", name);
1252 	if (no_module && capable(CAP_SYS_MODULE)) {
1253 		if (!request_module("%s", name))
1254 			pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1255 				name);
1256 	}
1257 }
1258 EXPORT_SYMBOL(dev_load);
1259 
1260 static int __dev_open(struct net_device *dev)
1261 {
1262 	const struct net_device_ops *ops = dev->netdev_ops;
1263 	int ret;
1264 
1265 	ASSERT_RTNL();
1266 
1267 	if (!netif_device_present(dev))
1268 		return -ENODEV;
1269 
1270 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1271 	ret = notifier_to_errno(ret);
1272 	if (ret)
1273 		return ret;
1274 
1275 	set_bit(__LINK_STATE_START, &dev->state);
1276 
1277 	if (ops->ndo_validate_addr)
1278 		ret = ops->ndo_validate_addr(dev);
1279 
1280 	if (!ret && ops->ndo_open)
1281 		ret = ops->ndo_open(dev);
1282 
1283 	if (ret)
1284 		clear_bit(__LINK_STATE_START, &dev->state);
1285 	else {
1286 		dev->flags |= IFF_UP;
1287 		net_dmaengine_get();
1288 		dev_set_rx_mode(dev);
1289 		dev_activate(dev);
1290 		add_device_randomness(dev->dev_addr, dev->addr_len);
1291 	}
1292 
1293 	return ret;
1294 }
1295 
1296 /**
1297  *	dev_open	- prepare an interface for use.
1298  *	@dev:	device to open
1299  *
1300  *	Takes a device from down to up state. The device's private open
1301  *	function is invoked and then the multicast lists are loaded. Finally
1302  *	the device is moved into the up state and a %NETDEV_UP message is
1303  *	sent to the netdev notifier chain.
1304  *
1305  *	Calling this function on an active interface is a nop. On a failure
1306  *	a negative errno code is returned.
1307  */
1308 int dev_open(struct net_device *dev)
1309 {
1310 	int ret;
1311 
1312 	if (dev->flags & IFF_UP)
1313 		return 0;
1314 
1315 	ret = __dev_open(dev);
1316 	if (ret < 0)
1317 		return ret;
1318 
1319 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1320 	call_netdevice_notifiers(NETDEV_UP, dev);
1321 
1322 	return ret;
1323 }
1324 EXPORT_SYMBOL(dev_open);
1325 
1326 static int __dev_close_many(struct list_head *head)
1327 {
1328 	struct net_device *dev;
1329 
1330 	ASSERT_RTNL();
1331 	might_sleep();
1332 
1333 	list_for_each_entry(dev, head, unreg_list) {
1334 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1335 
1336 		clear_bit(__LINK_STATE_START, &dev->state);
1337 
1338 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1339 		 * can be even on different cpu. So just clear netif_running().
1340 		 *
1341 		 * dev->stop() will invoke napi_disable() on all of it's
1342 		 * napi_struct instances on this device.
1343 		 */
1344 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1345 	}
1346 
1347 	dev_deactivate_many(head);
1348 
1349 	list_for_each_entry(dev, head, unreg_list) {
1350 		const struct net_device_ops *ops = dev->netdev_ops;
1351 
1352 		/*
1353 		 *	Call the device specific close. This cannot fail.
1354 		 *	Only if device is UP
1355 		 *
1356 		 *	We allow it to be called even after a DETACH hot-plug
1357 		 *	event.
1358 		 */
1359 		if (ops->ndo_stop)
1360 			ops->ndo_stop(dev);
1361 
1362 		dev->flags &= ~IFF_UP;
1363 		net_dmaengine_put();
1364 	}
1365 
1366 	return 0;
1367 }
1368 
1369 static int __dev_close(struct net_device *dev)
1370 {
1371 	int retval;
1372 	LIST_HEAD(single);
1373 
1374 	list_add(&dev->unreg_list, &single);
1375 	retval = __dev_close_many(&single);
1376 	list_del(&single);
1377 	return retval;
1378 }
1379 
1380 static int dev_close_many(struct list_head *head)
1381 {
1382 	struct net_device *dev, *tmp;
1383 	LIST_HEAD(tmp_list);
1384 
1385 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1386 		if (!(dev->flags & IFF_UP))
1387 			list_move(&dev->unreg_list, &tmp_list);
1388 
1389 	__dev_close_many(head);
1390 
1391 	list_for_each_entry(dev, head, unreg_list) {
1392 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1393 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1394 	}
1395 
1396 	/* rollback_registered_many needs the complete original list */
1397 	list_splice(&tmp_list, head);
1398 	return 0;
1399 }
1400 
1401 /**
1402  *	dev_close - shutdown an interface.
1403  *	@dev: device to shutdown
1404  *
1405  *	This function moves an active device into down state. A
1406  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1407  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1408  *	chain.
1409  */
1410 int dev_close(struct net_device *dev)
1411 {
1412 	if (dev->flags & IFF_UP) {
1413 		LIST_HEAD(single);
1414 
1415 		list_add(&dev->unreg_list, &single);
1416 		dev_close_many(&single);
1417 		list_del(&single);
1418 	}
1419 	return 0;
1420 }
1421 EXPORT_SYMBOL(dev_close);
1422 
1423 
1424 /**
1425  *	dev_disable_lro - disable Large Receive Offload on a device
1426  *	@dev: device
1427  *
1428  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1429  *	called under RTNL.  This is needed if received packets may be
1430  *	forwarded to another interface.
1431  */
1432 void dev_disable_lro(struct net_device *dev)
1433 {
1434 	/*
1435 	 * If we're trying to disable lro on a vlan device
1436 	 * use the underlying physical device instead
1437 	 */
1438 	if (is_vlan_dev(dev))
1439 		dev = vlan_dev_real_dev(dev);
1440 
1441 	dev->wanted_features &= ~NETIF_F_LRO;
1442 	netdev_update_features(dev);
1443 
1444 	if (unlikely(dev->features & NETIF_F_LRO))
1445 		netdev_WARN(dev, "failed to disable LRO!\n");
1446 }
1447 EXPORT_SYMBOL(dev_disable_lro);
1448 
1449 
1450 static int dev_boot_phase = 1;
1451 
1452 /**
1453  *	register_netdevice_notifier - register a network notifier block
1454  *	@nb: notifier
1455  *
1456  *	Register a notifier to be called when network device events occur.
1457  *	The notifier passed is linked into the kernel structures and must
1458  *	not be reused until it has been unregistered. A negative errno code
1459  *	is returned on a failure.
1460  *
1461  * 	When registered all registration and up events are replayed
1462  *	to the new notifier to allow device to have a race free
1463  *	view of the network device list.
1464  */
1465 
1466 int register_netdevice_notifier(struct notifier_block *nb)
1467 {
1468 	struct net_device *dev;
1469 	struct net_device *last;
1470 	struct net *net;
1471 	int err;
1472 
1473 	rtnl_lock();
1474 	err = raw_notifier_chain_register(&netdev_chain, nb);
1475 	if (err)
1476 		goto unlock;
1477 	if (dev_boot_phase)
1478 		goto unlock;
1479 	for_each_net(net) {
1480 		for_each_netdev(net, dev) {
1481 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1482 			err = notifier_to_errno(err);
1483 			if (err)
1484 				goto rollback;
1485 
1486 			if (!(dev->flags & IFF_UP))
1487 				continue;
1488 
1489 			nb->notifier_call(nb, NETDEV_UP, dev);
1490 		}
1491 	}
1492 
1493 unlock:
1494 	rtnl_unlock();
1495 	return err;
1496 
1497 rollback:
1498 	last = dev;
1499 	for_each_net(net) {
1500 		for_each_netdev(net, dev) {
1501 			if (dev == last)
1502 				goto outroll;
1503 
1504 			if (dev->flags & IFF_UP) {
1505 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1506 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1507 			}
1508 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1509 		}
1510 	}
1511 
1512 outroll:
1513 	raw_notifier_chain_unregister(&netdev_chain, nb);
1514 	goto unlock;
1515 }
1516 EXPORT_SYMBOL(register_netdevice_notifier);
1517 
1518 /**
1519  *	unregister_netdevice_notifier - unregister a network notifier block
1520  *	@nb: notifier
1521  *
1522  *	Unregister a notifier previously registered by
1523  *	register_netdevice_notifier(). The notifier is unlinked into the
1524  *	kernel structures and may then be reused. A negative errno code
1525  *	is returned on a failure.
1526  *
1527  * 	After unregistering unregister and down device events are synthesized
1528  *	for all devices on the device list to the removed notifier to remove
1529  *	the need for special case cleanup code.
1530  */
1531 
1532 int unregister_netdevice_notifier(struct notifier_block *nb)
1533 {
1534 	struct net_device *dev;
1535 	struct net *net;
1536 	int err;
1537 
1538 	rtnl_lock();
1539 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1540 	if (err)
1541 		goto unlock;
1542 
1543 	for_each_net(net) {
1544 		for_each_netdev(net, dev) {
1545 			if (dev->flags & IFF_UP) {
1546 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1547 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1548 			}
1549 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1550 		}
1551 	}
1552 unlock:
1553 	rtnl_unlock();
1554 	return err;
1555 }
1556 EXPORT_SYMBOL(unregister_netdevice_notifier);
1557 
1558 /**
1559  *	call_netdevice_notifiers - call all network notifier blocks
1560  *      @val: value passed unmodified to notifier function
1561  *      @dev: net_device pointer passed unmodified to notifier function
1562  *
1563  *	Call all network notifier blocks.  Parameters and return value
1564  *	are as for raw_notifier_call_chain().
1565  */
1566 
1567 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1568 {
1569 	ASSERT_RTNL();
1570 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1571 }
1572 EXPORT_SYMBOL(call_netdevice_notifiers);
1573 
1574 static struct static_key netstamp_needed __read_mostly;
1575 #ifdef HAVE_JUMP_LABEL
1576 /* We are not allowed to call static_key_slow_dec() from irq context
1577  * If net_disable_timestamp() is called from irq context, defer the
1578  * static_key_slow_dec() calls.
1579  */
1580 static atomic_t netstamp_needed_deferred;
1581 #endif
1582 
1583 void net_enable_timestamp(void)
1584 {
1585 #ifdef HAVE_JUMP_LABEL
1586 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1587 
1588 	if (deferred) {
1589 		while (--deferred)
1590 			static_key_slow_dec(&netstamp_needed);
1591 		return;
1592 	}
1593 #endif
1594 	WARN_ON(in_interrupt());
1595 	static_key_slow_inc(&netstamp_needed);
1596 }
1597 EXPORT_SYMBOL(net_enable_timestamp);
1598 
1599 void net_disable_timestamp(void)
1600 {
1601 #ifdef HAVE_JUMP_LABEL
1602 	if (in_interrupt()) {
1603 		atomic_inc(&netstamp_needed_deferred);
1604 		return;
1605 	}
1606 #endif
1607 	static_key_slow_dec(&netstamp_needed);
1608 }
1609 EXPORT_SYMBOL(net_disable_timestamp);
1610 
1611 static inline void net_timestamp_set(struct sk_buff *skb)
1612 {
1613 	skb->tstamp.tv64 = 0;
1614 	if (static_key_false(&netstamp_needed))
1615 		__net_timestamp(skb);
1616 }
1617 
/*
 * net_timestamp_check - stamp @SKB if timestamping is enabled, @COND holds
 * and the skb carries no timestamp yet.
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement: it needs its trailing semicolon, cannot capture a following
 * "else", and no longer continues onto the line after the definition.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1623 
1624 static int net_hwtstamp_validate(struct ifreq *ifr)
1625 {
1626 	struct hwtstamp_config cfg;
1627 	enum hwtstamp_tx_types tx_type;
1628 	enum hwtstamp_rx_filters rx_filter;
1629 	int tx_type_valid = 0;
1630 	int rx_filter_valid = 0;
1631 
1632 	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1633 		return -EFAULT;
1634 
1635 	if (cfg.flags) /* reserved for future extensions */
1636 		return -EINVAL;
1637 
1638 	tx_type = cfg.tx_type;
1639 	rx_filter = cfg.rx_filter;
1640 
1641 	switch (tx_type) {
1642 	case HWTSTAMP_TX_OFF:
1643 	case HWTSTAMP_TX_ON:
1644 	case HWTSTAMP_TX_ONESTEP_SYNC:
1645 		tx_type_valid = 1;
1646 		break;
1647 	}
1648 
1649 	switch (rx_filter) {
1650 	case HWTSTAMP_FILTER_NONE:
1651 	case HWTSTAMP_FILTER_ALL:
1652 	case HWTSTAMP_FILTER_SOME:
1653 	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1654 	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1655 	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1656 	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1657 	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1658 	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1659 	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1660 	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1661 	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1662 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
1663 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
1664 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1665 		rx_filter_valid = 1;
1666 		break;
1667 	}
1668 
1669 	if (!tx_type_valid || !rx_filter_valid)
1670 		return -ERANGE;
1671 
1672 	return 0;
1673 }
1674 
1675 static inline bool is_skb_forwardable(struct net_device *dev,
1676 				      struct sk_buff *skb)
1677 {
1678 	unsigned int len;
1679 
1680 	if (!(dev->flags & IFF_UP))
1681 		return false;
1682 
1683 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1684 	if (skb->len <= len)
1685 		return true;
1686 
1687 	/* if TSO is enabled, we don't care about the length as the packet
1688 	 * could be forwarded without being segmented before
1689 	 */
1690 	if (skb_is_gso(skb))
1691 		return true;
1692 
1693 	return false;
1694 }
1695 
1696 /**
1697  * dev_forward_skb - loopback an skb to another netif
1698  *
1699  * @dev: destination network device
1700  * @skb: buffer to forward
1701  *
1702  * return values:
1703  *	NET_RX_SUCCESS	(no congestion)
1704  *	NET_RX_DROP     (packet was dropped, but freed)
1705  *
1706  * dev_forward_skb can be used for injecting an skb from the
1707  * start_xmit function of one device into the receive queue
1708  * of another device.
1709  *
1710  * The receiving device may be in another namespace, so
1711  * we have to clear all information in the skb that could
1712  * impact namespace isolation.
1713  */
1714 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1715 {
1716 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1717 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1718 			atomic_long_inc(&dev->rx_dropped);
1719 			kfree_skb(skb);
1720 			return NET_RX_DROP;
1721 		}
1722 	}
1723 
1724 	skb_orphan(skb);
1725 	nf_reset(skb);
1726 
1727 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1728 		atomic_long_inc(&dev->rx_dropped);
1729 		kfree_skb(skb);
1730 		return NET_RX_DROP;
1731 	}
1732 	skb->skb_iif = 0;
1733 	skb->dev = dev;
1734 	skb_dst_drop(skb);
1735 	skb->tstamp.tv64 = 0;
1736 	skb->pkt_type = PACKET_HOST;
1737 	skb->protocol = eth_type_trans(skb, dev);
1738 	skb->mark = 0;
1739 	secpath_reset(skb);
1740 	nf_reset(skb);
1741 	return netif_rx(skb);
1742 }
1743 EXPORT_SYMBOL_GPL(dev_forward_skb);
1744 
1745 static inline int deliver_skb(struct sk_buff *skb,
1746 			      struct packet_type *pt_prev,
1747 			      struct net_device *orig_dev)
1748 {
1749 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1750 		return -ENOMEM;
1751 	atomic_inc(&skb->users);
1752 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1753 }
1754 
1755 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1756 {
1757 	if (!ptype->af_packet_priv || !skb->sk)
1758 		return false;
1759 
1760 	if (ptype->id_match)
1761 		return ptype->id_match(ptype, skb->sk);
1762 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1763 		return true;
1764 
1765 	return false;
1766 }
1767 
1768 /*
1769  *	Support routine. Sends outgoing frames to any network
1770  *	taps currently in use.
1771  */
1772 
1773 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1774 {
1775 	struct packet_type *ptype;
1776 	struct sk_buff *skb2 = NULL;
1777 	struct packet_type *pt_prev = NULL;
1778 
1779 	rcu_read_lock();
1780 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1781 		/* Never send packets back to the socket
1782 		 * they originated from - MvS (miquels@drinkel.ow.org)
1783 		 */
1784 		if ((ptype->dev == dev || !ptype->dev) &&
1785 		    (!skb_loop_sk(ptype, skb))) {
1786 			if (pt_prev) {
1787 				deliver_skb(skb2, pt_prev, skb->dev);
1788 				pt_prev = ptype;
1789 				continue;
1790 			}
1791 
1792 			skb2 = skb_clone(skb, GFP_ATOMIC);
1793 			if (!skb2)
1794 				break;
1795 
1796 			net_timestamp_set(skb2);
1797 
1798 			/* skb->nh should be correctly
1799 			   set by sender, so that the second statement is
1800 			   just protection against buggy protocols.
1801 			 */
1802 			skb_reset_mac_header(skb2);
1803 
1804 			if (skb_network_header(skb2) < skb2->data ||
1805 			    skb2->network_header > skb2->tail) {
1806 				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1807 						     ntohs(skb2->protocol),
1808 						     dev->name);
1809 				skb_reset_network_header(skb2);
1810 			}
1811 
1812 			skb2->transport_header = skb2->network_header;
1813 			skb2->pkt_type = PACKET_OUTGOING;
1814 			pt_prev = ptype;
1815 		}
1816 	}
1817 	if (pt_prev)
1818 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1819 	rcu_read_unlock();
1820 }
1821 
1822 /**
1823  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1824  * @dev: Network device
1825  * @txq: number of queues available
1826  *
1827  * If real_num_tx_queues is changed the tc mappings may no longer be
1828  * valid. To resolve this verify the tc mapping remains valid and if
1829  * not NULL the mapping. With no priorities mapping to this
1830  * offset/count pair it will no longer be used. In the worst case TC0
1831  * is invalid nothing can be done so disable priority mappings. If is
1832  * expected that drivers will fix this mapping if they can before
1833  * calling netif_set_real_num_tx_queues.
1834  */
1835 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1836 {
1837 	int i;
1838 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1839 
1840 	/* If TC0 is invalidated disable TC mapping */
1841 	if (tc->offset + tc->count > txq) {
1842 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1843 		dev->num_tc = 0;
1844 		return;
1845 	}
1846 
1847 	/* Invalidated prio to tc mappings set to TC0 */
1848 	for (i = 1; i < TC_BITMASK + 1; i++) {
1849 		int q = netdev_get_prio_tc_map(dev, i);
1850 
1851 		tc = &dev->tc_to_txq[q];
1852 		if (tc->offset + tc->count > txq) {
1853 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1854 				i, q);
1855 			netdev_set_prio_tc_map(dev, i, 0);
1856 		}
1857 	}
1858 }
1859 
1860 /*
1861  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1862  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1863  */
1864 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1865 {
1866 	int rc;
1867 
1868 	if (txq < 1 || txq > dev->num_tx_queues)
1869 		return -EINVAL;
1870 
1871 	if (dev->reg_state == NETREG_REGISTERED ||
1872 	    dev->reg_state == NETREG_UNREGISTERING) {
1873 		ASSERT_RTNL();
1874 
1875 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1876 						  txq);
1877 		if (rc)
1878 			return rc;
1879 
1880 		if (dev->num_tc)
1881 			netif_setup_tc(dev, txq);
1882 
1883 		if (txq < dev->real_num_tx_queues)
1884 			qdisc_reset_all_tx_gt(dev, txq);
1885 	}
1886 
1887 	dev->real_num_tx_queues = txq;
1888 	return 0;
1889 }
1890 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1891 
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code: -EINVAL if @rxq is 0 or exceeds num_rx_queues,
 *	or the error from net_rx_queue_update_kobjects().  The kobjects are
 *	only updated for a registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int rc;

		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1924 
1925 /**
1926  * netif_get_num_default_rss_queues - default number of RSS queues
1927  *
1928  * This routine should set an upper limit on the number of RSS queues
1929  * used by default by multiqueue devices.
1930  */
1931 int netif_get_num_default_rss_queues(void)
1932 {
1933 	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
1934 }
1935 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
1936 
1937 static inline void __netif_reschedule(struct Qdisc *q)
1938 {
1939 	struct softnet_data *sd;
1940 	unsigned long flags;
1941 
1942 	local_irq_save(flags);
1943 	sd = &__get_cpu_var(softnet_data);
1944 	q->next_sched = NULL;
1945 	*sd->output_queue_tailp = q;
1946 	sd->output_queue_tailp = &q->next_sched;
1947 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1948 	local_irq_restore(flags);
1949 }
1950 
1951 void __netif_schedule(struct Qdisc *q)
1952 {
1953 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1954 		__netif_reschedule(q);
1955 }
1956 EXPORT_SYMBOL(__netif_schedule);
1957 
1958 void dev_kfree_skb_irq(struct sk_buff *skb)
1959 {
1960 	if (atomic_dec_and_test(&skb->users)) {
1961 		struct softnet_data *sd;
1962 		unsigned long flags;
1963 
1964 		local_irq_save(flags);
1965 		sd = &__get_cpu_var(softnet_data);
1966 		skb->next = sd->completion_queue;
1967 		sd->completion_queue = skb;
1968 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1969 		local_irq_restore(flags);
1970 	}
1971 }
1972 EXPORT_SYMBOL(dev_kfree_skb_irq);
1973 
/*
 * dev_kfree_skb_any - release @skb from any context
 *
 * Uses dev_kfree_skb_irq() when running in hard interrupt context or with
 * interrupts disabled, and dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled())
		dev_kfree_skb(skb);
	else
		dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1982 
1983 
1984 /**
1985  * netif_device_detach - mark device as removed
1986  * @dev: network device
1987  *
1988  * Mark device as removed from system and therefore no longer available.
1989  */
1990 void netif_device_detach(struct net_device *dev)
1991 {
1992 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1993 	    netif_running(dev)) {
1994 		netif_tx_stop_all_queues(dev);
1995 	}
1996 }
1997 EXPORT_SYMBOL(netif_device_detach);
1998 
1999 /**
2000  * netif_device_attach - mark device as attached
2001  * @dev: network device
2002  *
2003  * Mark device as attached from system and restart if needed.
2004  */
2005 void netif_device_attach(struct net_device *dev)
2006 {
2007 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2008 	    netif_running(dev)) {
2009 		netif_tx_wake_all_queues(dev);
2010 		__netdev_watchdog_up(dev);
2011 	}
2012 }
2013 EXPORT_SYMBOL(netif_device_attach);
2014 
2015 static void skb_warn_bad_offload(const struct sk_buff *skb)
2016 {
2017 	static const netdev_features_t null_features = 0;
2018 	struct net_device *dev = skb->dev;
2019 	const char *driver = "";
2020 
2021 	if (dev && dev->dev.parent)
2022 		driver = dev_driver_string(dev->dev.parent);
2023 
2024 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2025 	     "gso_type=%d ip_summed=%d\n",
2026 	     driver, dev ? &dev->features : &null_features,
2027 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2028 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2029 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2030 }
2031 
2032 /*
2033  * Invalidate hardware checksum when packet is to be mangled, and
2034  * complete checksum manually on outgoing path.
2035  */
2036 int skb_checksum_help(struct sk_buff *skb)
2037 {
2038 	__wsum csum;
2039 	int ret = 0, offset;
2040 
2041 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2042 		goto out_set_summed;
2043 
2044 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2045 		skb_warn_bad_offload(skb);
2046 		return -EINVAL;
2047 	}
2048 
2049 	offset = skb_checksum_start_offset(skb);
2050 	BUG_ON(offset >= skb_headlen(skb));
2051 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2052 
2053 	offset += skb->csum_offset;
2054 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2055 
2056 	if (skb_cloned(skb) &&
2057 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2058 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2059 		if (ret)
2060 			goto out;
2061 	}
2062 
2063 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2064 out_set_summed:
2065 	skb->ip_summed = CHECKSUM_NONE;
2066 out:
2067 	return ret;
2068 }
2069 EXPORT_SYMBOL(skb_checksum_help);
2070 
2071 /**
2072  *	skb_gso_segment - Perform segmentation on skb.
2073  *	@skb: buffer to segment
2074  *	@features: features for the output path (see dev->features)
2075  *
2076  *	This function segments the given skb and returns a list of segments.
2077  *
2078  *	It may return NULL if the skb requires no segmentation.  This is
2079  *	only possible when GSO is used for verifying header integrity.
2080  */
2081 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
2082 	netdev_features_t features)
2083 {
2084 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2085 	struct packet_offload *ptype;
2086 	__be16 type = skb->protocol;
2087 	int vlan_depth = ETH_HLEN;
2088 	int err;
2089 
2090 	while (type == htons(ETH_P_8021Q)) {
2091 		struct vlan_hdr *vh;
2092 
2093 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2094 			return ERR_PTR(-EINVAL);
2095 
2096 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2097 		type = vh->h_vlan_encapsulated_proto;
2098 		vlan_depth += VLAN_HLEN;
2099 	}
2100 
2101 	skb_reset_mac_header(skb);
2102 	skb->mac_len = skb->network_header - skb->mac_header;
2103 	__skb_pull(skb, skb->mac_len);
2104 
2105 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2106 		skb_warn_bad_offload(skb);
2107 
2108 		if (skb_header_cloned(skb) &&
2109 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2110 			return ERR_PTR(err);
2111 	}
2112 
2113 	rcu_read_lock();
2114 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2115 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2116 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2117 				err = ptype->callbacks.gso_send_check(skb);
2118 				segs = ERR_PTR(err);
2119 				if (err || skb_gso_ok(skb, features))
2120 					break;
2121 				__skb_push(skb, (skb->data -
2122 						 skb_network_header(skb)));
2123 			}
2124 			segs = ptype->callbacks.gso_segment(skb, features);
2125 			break;
2126 		}
2127 	}
2128 	rcu_read_unlock();
2129 
2130 	__skb_push(skb, skb->data - skb_mac_header(skb));
2131 
2132 	return segs;
2133 }
2134 EXPORT_SYMBOL(skb_gso_segment);
2135 
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *ifname;

	if (!net_ratelimit())
		return;

	ifname = dev ? dev->name : "<unknown>";
	pr_err("%s: hw csum failure\n", ifname);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2147 
/*
 * Report whether @skb has page fragments that @dev cannot DMA from.
 * Returns 1 if so, 0 otherwise (always 0 without CONFIG_HIGHMEM).
 *
 * Actually, we should eliminate this check as soon as we know that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int idx;

	/* Device without HIGHDMA: any highmem fragment is illegal. */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[idx])))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		if (!parent)
			return 0;

		/* Every fragment page must lie within the parent's DMA mask. */
		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			dma_addr_t addr =
				page_to_phys(skb_frag_page(&shinfo->frags[idx]));

			if (!parent->dma_mask)
				return 1;
			if (addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2180 
/*
 * State kept in skb->cb while a list of GSO segments hangs off skb->next:
 * the destructor the skb had before dev_gso_segment() replaced it with
 * dev_gso_skb_destructor().
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the skb's control buffer as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2186 
2187 static void dev_gso_skb_destructor(struct sk_buff *skb)
2188 {
2189 	struct dev_gso_cb *cb;
2190 
2191 	do {
2192 		struct sk_buff *nskb = skb->next;
2193 
2194 		skb->next = nskb->next;
2195 		nskb->next = NULL;
2196 		kfree_skb(nskb);
2197 	} while (skb->next);
2198 
2199 	cb = DEV_GSO_CB(skb);
2200 	if (cb->destructor)
2201 		cb->destructor(skb);
2202 }
2203 
2204 /**
2205  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2206  *	@skb: buffer to segment
2207  *	@features: device features as applicable to this skb
2208  *
2209  *	This function segments the given skb and stores the list of segments
2210  *	in skb->next.
2211  */
2212 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2213 {
2214 	struct sk_buff *segs;
2215 
2216 	segs = skb_gso_segment(skb, features);
2217 
2218 	/* Verifying header integrity only. */
2219 	if (!segs)
2220 		return 0;
2221 
2222 	if (IS_ERR(segs))
2223 		return PTR_ERR(segs);
2224 
2225 	skb->next = segs;
2226 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2227 	skb->destructor = dev_gso_skb_destructor;
2228 
2229 	return 0;
2230 }
2231 
2232 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2233 {
2234 	return ((features & NETIF_F_GEN_CSUM) ||
2235 		((features & NETIF_F_V4_CSUM) &&
2236 		 protocol == htons(ETH_P_IP)) ||
2237 		((features & NETIF_F_V6_CSUM) &&
2238 		 protocol == htons(ETH_P_IPV6)) ||
2239 		((features & NETIF_F_FCOE_CRC) &&
2240 		 protocol == htons(ETH_P_FCOE)));
2241 }
2242 
2243 static netdev_features_t harmonize_features(struct sk_buff *skb,
2244 	__be16 protocol, netdev_features_t features)
2245 {
2246 	if (skb->ip_summed != CHECKSUM_NONE &&
2247 	    !can_checksum_protocol(features, protocol)) {
2248 		features &= ~NETIF_F_ALL_CSUM;
2249 		features &= ~NETIF_F_SG;
2250 	} else if (illegal_highdma(skb->dev, skb)) {
2251 		features &= ~NETIF_F_SG;
2252 	}
2253 
2254 	return features;
2255 }
2256 
2257 netdev_features_t netif_skb_features(struct sk_buff *skb)
2258 {
2259 	__be16 protocol = skb->protocol;
2260 	netdev_features_t features = skb->dev->features;
2261 
2262 	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2263 		features &= ~NETIF_F_GSO_MASK;
2264 
2265 	if (protocol == htons(ETH_P_8021Q)) {
2266 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2267 		protocol = veh->h_vlan_encapsulated_proto;
2268 	} else if (!vlan_tx_tag_present(skb)) {
2269 		return harmonize_features(skb, protocol, features);
2270 	}
2271 
2272 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2273 
2274 	if (protocol != htons(ETH_P_8021Q)) {
2275 		return harmonize_features(skb, protocol, features);
2276 	} else {
2277 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2278 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2279 		return harmonize_features(skb, protocol, features);
2280 	}
2281 }
2282 EXPORT_SYMBOL(netif_skb_features);
2283 
2284 /*
2285  * Returns true if either:
2286  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2287  *	2. skb is fragmented and the device does not support SG.
2288  */
2289 static inline int skb_needs_linearize(struct sk_buff *skb,
2290 				      int features)
2291 {
2292 	return skb_is_nonlinear(skb) &&
2293 			((skb_has_frag_list(skb) &&
2294 				!(features & NETIF_F_FRAGLIST)) ||
2295 			(skb_shinfo(skb)->nr_frags &&
2296 				!(features & NETIF_F_SG)));
2297 }
2298 
2299 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2300 			struct netdev_queue *txq)
2301 {
2302 	const struct net_device_ops *ops = dev->netdev_ops;
2303 	int rc = NETDEV_TX_OK;
2304 	unsigned int skb_len;
2305 
2306 	if (likely(!skb->next)) {
2307 		netdev_features_t features;
2308 
2309 		/*
2310 		 * If device doesn't need skb->dst, release it right now while
2311 		 * its hot in this cpu cache
2312 		 */
2313 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2314 			skb_dst_drop(skb);
2315 
2316 		features = netif_skb_features(skb);
2317 
2318 		if (vlan_tx_tag_present(skb) &&
2319 		    !(features & NETIF_F_HW_VLAN_TX)) {
2320 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2321 			if (unlikely(!skb))
2322 				goto out;
2323 
2324 			skb->vlan_tci = 0;
2325 		}
2326 
2327 		/* If encapsulation offload request, verify we are testing
2328 		 * hardware encapsulation features instead of standard
2329 		 * features for the netdev
2330 		 */
2331 		if (skb->encapsulation)
2332 			features &= dev->hw_enc_features;
2333 
2334 		if (netif_needs_gso(skb, features)) {
2335 			if (unlikely(dev_gso_segment(skb, features)))
2336 				goto out_kfree_skb;
2337 			if (skb->next)
2338 				goto gso;
2339 		} else {
2340 			if (skb_needs_linearize(skb, features) &&
2341 			    __skb_linearize(skb))
2342 				goto out_kfree_skb;
2343 
2344 			/* If packet is not checksummed and device does not
2345 			 * support checksumming for this protocol, complete
2346 			 * checksumming here.
2347 			 */
2348 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2349 				if (skb->encapsulation)
2350 					skb_set_inner_transport_header(skb,
2351 						skb_checksum_start_offset(skb));
2352 				else
2353 					skb_set_transport_header(skb,
2354 						skb_checksum_start_offset(skb));
2355 				if (!(features & NETIF_F_ALL_CSUM) &&
2356 				     skb_checksum_help(skb))
2357 					goto out_kfree_skb;
2358 			}
2359 		}
2360 
2361 		if (!list_empty(&ptype_all))
2362 			dev_queue_xmit_nit(skb, dev);
2363 
2364 		skb_len = skb->len;
2365 		rc = ops->ndo_start_xmit(skb, dev);
2366 		trace_net_dev_xmit(skb, rc, dev, skb_len);
2367 		if (rc == NETDEV_TX_OK)
2368 			txq_trans_update(txq);
2369 		return rc;
2370 	}
2371 
2372 gso:
2373 	do {
2374 		struct sk_buff *nskb = skb->next;
2375 
2376 		skb->next = nskb->next;
2377 		nskb->next = NULL;
2378 
2379 		/*
2380 		 * If device doesn't need nskb->dst, release it right now while
2381 		 * its hot in this cpu cache
2382 		 */
2383 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2384 			skb_dst_drop(nskb);
2385 
2386 		if (!list_empty(&ptype_all))
2387 			dev_queue_xmit_nit(nskb, dev);
2388 
2389 		skb_len = nskb->len;
2390 		rc = ops->ndo_start_xmit(nskb, dev);
2391 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2392 		if (unlikely(rc != NETDEV_TX_OK)) {
2393 			if (rc & ~NETDEV_TX_MASK)
2394 				goto out_kfree_gso_skb;
2395 			nskb->next = skb->next;
2396 			skb->next = nskb;
2397 			return rc;
2398 		}
2399 		txq_trans_update(txq);
2400 		if (unlikely(netif_xmit_stopped(txq) && skb->next))
2401 			return NETDEV_TX_BUSY;
2402 	} while (skb->next);
2403 
2404 out_kfree_gso_skb:
2405 	if (likely(skb->next == NULL))
2406 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2407 out_kfree_skb:
2408 	kfree_skb(skb);
2409 out:
2410 	return rc;
2411 }
2412 
/* Seed mixed into the jhash-based Tx queue and Rx flow hashes below. */
static u32 hashrnd __read_mostly;
2414 
2415 /*
2416  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2417  * to be used as a distribution range.
2418  */
2419 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2420 		  unsigned int num_tx_queues)
2421 {
2422 	u32 hash;
2423 	u16 qoffset = 0;
2424 	u16 qcount = num_tx_queues;
2425 
2426 	if (skb_rx_queue_recorded(skb)) {
2427 		hash = skb_get_rx_queue(skb);
2428 		while (unlikely(hash >= num_tx_queues))
2429 			hash -= num_tx_queues;
2430 		return hash;
2431 	}
2432 
2433 	if (dev->num_tc) {
2434 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2435 		qoffset = dev->tc_to_txq[tc].offset;
2436 		qcount = dev->tc_to_txq[tc].count;
2437 	}
2438 
2439 	if (skb->sk && skb->sk->sk_hash)
2440 		hash = skb->sk->sk_hash;
2441 	else
2442 		hash = (__force u16) skb->protocol;
2443 	hash = jhash_1word(hash, hashrnd);
2444 
2445 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2446 }
2447 EXPORT_SYMBOL(__skb_tx_hash);
2448 
2449 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2450 {
2451 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2452 		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2453 				     dev->name, queue_index,
2454 				     dev->real_num_tx_queues);
2455 		return 0;
2456 	}
2457 	return queue_index;
2458 }
2459 
/*
 * Look up a Tx queue for @skb in the device's XPS map for the current CPU.
 *
 * Returns the queue index, or -1 when XPS is not configured in, the device
 * or this CPU has no map, or the mapped queue is not below
 * real_num_tx_queues.  When the CPU maps to several queues, one is chosen
 * by hashing the socket hash (or protocol ^ rxhash).
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int picked = -1;

	rcu_read_lock();
	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1) {
		picked = cpu_map->queues[0];
	} else {
		u32 key;

		if (skb->sk && skb->sk->sk_hash)
			key = skb->sk->sk_hash;
		else
			key = (__force u16) skb->protocol ^
			    skb->rxhash;
		key = jhash_1word(key, hashrnd);
		picked = cpu_map->queues[((u64)key * cpu_map->len) >> 32];
	}

	if (unlikely(picked >= dev->real_num_tx_queues))
		picked = -1;
unlock:
	rcu_read_unlock();

	return picked;
#else
	return -1;
#endif
}
2497 
2498 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2499 				    struct sk_buff *skb)
2500 {
2501 	int queue_index;
2502 	const struct net_device_ops *ops = dev->netdev_ops;
2503 
2504 	if (dev->real_num_tx_queues == 1)
2505 		queue_index = 0;
2506 	else if (ops->ndo_select_queue) {
2507 		queue_index = ops->ndo_select_queue(dev, skb);
2508 		queue_index = dev_cap_txqueue(dev, queue_index);
2509 	} else {
2510 		struct sock *sk = skb->sk;
2511 		queue_index = sk_tx_queue_get(sk);
2512 
2513 		if (queue_index < 0 || skb->ooo_okay ||
2514 		    queue_index >= dev->real_num_tx_queues) {
2515 			int old_index = queue_index;
2516 
2517 			queue_index = get_xps_queue(dev, skb);
2518 			if (queue_index < 0)
2519 				queue_index = skb_tx_hash(dev, skb);
2520 
2521 			if (queue_index != old_index && sk) {
2522 				struct dst_entry *dst =
2523 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2524 
2525 				if (dst && skb_dst(skb) == dst)
2526 					sk_tx_queue_set(sk, queue_index);
2527 			}
2528 		}
2529 	}
2530 
2531 	skb_set_queue_mapping(skb, queue_index);
2532 	return netdev_get_tx_queue(dev, queue_index);
2533 }
2534 
2535 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2536 				 struct net_device *dev,
2537 				 struct netdev_queue *txq)
2538 {
2539 	spinlock_t *root_lock = qdisc_lock(q);
2540 	bool contended;
2541 	int rc;
2542 
2543 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2544 	qdisc_calculate_pkt_len(skb, q);
2545 	/*
2546 	 * Heuristic to force contended enqueues to serialize on a
2547 	 * separate lock before trying to get qdisc main lock.
2548 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2549 	 * and dequeue packets faster.
2550 	 */
2551 	contended = qdisc_is_running(q);
2552 	if (unlikely(contended))
2553 		spin_lock(&q->busylock);
2554 
2555 	spin_lock(root_lock);
2556 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2557 		kfree_skb(skb);
2558 		rc = NET_XMIT_DROP;
2559 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2560 		   qdisc_run_begin(q)) {
2561 		/*
2562 		 * This is a work-conserving queue; there are no old skbs
2563 		 * waiting to be sent out; and the qdisc is not running -
2564 		 * xmit the skb directly.
2565 		 */
2566 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2567 			skb_dst_force(skb);
2568 
2569 		qdisc_bstats_update(q, skb);
2570 
2571 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2572 			if (unlikely(contended)) {
2573 				spin_unlock(&q->busylock);
2574 				contended = false;
2575 			}
2576 			__qdisc_run(q);
2577 		} else
2578 			qdisc_run_end(q);
2579 
2580 		rc = NET_XMIT_SUCCESS;
2581 	} else {
2582 		skb_dst_force(skb);
2583 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2584 		if (qdisc_run_begin(q)) {
2585 			if (unlikely(contended)) {
2586 				spin_unlock(&q->busylock);
2587 				contended = false;
2588 			}
2589 			__qdisc_run(q);
2590 		}
2591 	}
2592 	spin_unlock(root_lock);
2593 	if (unlikely(contended))
2594 		spin_unlock(&q->busylock);
2595 	return rc;
2596 }
2597 
2598 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2599 static void skb_update_prio(struct sk_buff *skb)
2600 {
2601 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2602 
2603 	if (!skb->priority && skb->sk && map) {
2604 		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2605 
2606 		if (prioidx < map->priomap_len)
2607 			skb->priority = map->priomap[prioidx];
2608 	}
2609 }
2610 #else
2611 #define skb_update_prio(skb)
2612 #endif
2613 
/*
 * Per-CPU depth of nested dev_hard_start_xmit() calls made from
 * dev_queue_xmit() on queueless devices; dev_queue_xmit() refuses to go
 * deeper once the depth exceeds RECURSION_LIMIT.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
2616 
/**
 *	dev_loopback_xmit - loop back @skb
 *	@skb: buffer to transmit
 *
 *	Resets the MAC header, pulls the skb up to its network header, marks
 *	it as PACKET_LOOPBACK with CHECKSUM_UNNECESSARY and hands it to
 *	netif_rx_ni().  Warns if the skb has no dst.  Always returns 0.
 */
int dev_loopback_xmit(struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
2632 EXPORT_SYMBOL(dev_loopback_xmit);
2633 
2634 /**
2635  *	dev_queue_xmit - transmit a buffer
2636  *	@skb: buffer to transmit
2637  *
2638  *	Queue a buffer for transmission to a network device. The caller must
2639  *	have set the device and priority and built the buffer before calling
2640  *	this function. The function can be called from an interrupt.
2641  *
2642  *	A negative errno code is returned on a failure. A success does not
2643  *	guarantee the frame will be transmitted as it may be dropped due
2644  *	to congestion or traffic shaping.
2645  *
2646  * -----------------------------------------------------------------------------------
2647  *      I notice this method can also return errors from the queue disciplines,
2648  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2649  *      be positive.
2650  *
2651  *      Regardless of the return value, the skb is consumed, so it is currently
2652  *      difficult to retry a send to this method.  (You can bump the ref count
2653  *      before sending to hold a reference for retry if you are careful.)
2654  *
2655  *      When calling this method, interrupts MUST be enabled.  This is because
2656  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2657  *          --BLG
2658  */
2659 int dev_queue_xmit(struct sk_buff *skb)
2660 {
2661 	struct net_device *dev = skb->dev;
2662 	struct netdev_queue *txq;
2663 	struct Qdisc *q;
2664 	int rc = -ENOMEM;
2665 
2666 	/* Disable soft irqs for various locks below. Also
2667 	 * stops preemption for RCU.
2668 	 */
2669 	rcu_read_lock_bh();
2670 
2671 	skb_update_prio(skb);
2672 
2673 	txq = netdev_pick_tx(dev, skb);
2674 	q = rcu_dereference_bh(txq->qdisc);
2675 
2676 #ifdef CONFIG_NET_CLS_ACT
2677 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2678 #endif
2679 	trace_net_dev_queue(skb);
2680 	if (q->enqueue) {
2681 		rc = __dev_xmit_skb(skb, q, dev, txq);
2682 		goto out;
2683 	}
2684 
2685 	/* The device has no queue. Common case for software devices:
2686 	   loopback, all the sorts of tunnels...
2687 
2688 	   Really, it is unlikely that netif_tx_lock protection is necessary
2689 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2690 	   counters.)
2691 	   However, it is possible, that they rely on protection
2692 	   made by us here.
2693 
2694 	   Check this and shot the lock. It is not prone from deadlocks.
2695 	   Either shot noqueue qdisc, it is even simpler 8)
2696 	 */
2697 	if (dev->flags & IFF_UP) {
2698 		int cpu = smp_processor_id(); /* ok because BHs are off */
2699 
2700 		if (txq->xmit_lock_owner != cpu) {
2701 
2702 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2703 				goto recursion_alert;
2704 
2705 			HARD_TX_LOCK(dev, txq, cpu);
2706 
2707 			if (!netif_xmit_stopped(txq)) {
2708 				__this_cpu_inc(xmit_recursion);
2709 				rc = dev_hard_start_xmit(skb, dev, txq);
2710 				__this_cpu_dec(xmit_recursion);
2711 				if (dev_xmit_complete(rc)) {
2712 					HARD_TX_UNLOCK(dev, txq);
2713 					goto out;
2714 				}
2715 			}
2716 			HARD_TX_UNLOCK(dev, txq);
2717 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2718 					     dev->name);
2719 		} else {
2720 			/* Recursion is detected! It is possible,
2721 			 * unfortunately
2722 			 */
2723 recursion_alert:
2724 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2725 					     dev->name);
2726 		}
2727 	}
2728 
2729 	rc = -ENETDOWN;
2730 	rcu_read_unlock_bh();
2731 
2732 	kfree_skb(skb);
2733 	return rc;
2734 out:
2735 	rcu_read_unlock_bh();
2736 	return rc;
2737 }
2738 EXPORT_SYMBOL(dev_queue_xmit);
2739 
2740 
2741 /*=======================================================================
2742 			Receiver routines
2743   =======================================================================*/
2744 
/*
 * Limit on the length of a per-CPU input_pkt_queue: enqueue_to_backlog()
 * drops packets once the queue is longer than this.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Passed to net_timestamp_check() by netif_rx() before the skb is queued. */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not used in this part of the file; presumably the packet
 * budget of one NET_RX softirq run - confirm against net_rx_action().
 */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2751 
/*
 * Append @napi to the poll list of @sd and raise NET_RX_SOFTIRQ.
 * Called with irq disabled.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2759 
2760 /*
2761  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2762  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2763  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2764  * if hash is a canonical 4-tuple hash over transport ports.
2765  */
2766 void __skb_get_rxhash(struct sk_buff *skb)
2767 {
2768 	struct flow_keys keys;
2769 	u32 hash;
2770 
2771 	if (!skb_flow_dissect(skb, &keys))
2772 		return;
2773 
2774 	if (keys.ports)
2775 		skb->l4_rxhash = 1;
2776 
2777 	/* get a consistent hash (same value on both flow directions) */
2778 	if (((__force u32)keys.dst < (__force u32)keys.src) ||
2779 	    (((__force u32)keys.dst == (__force u32)keys.src) &&
2780 	     ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
2781 		swap(keys.dst, keys.src);
2782 		swap(keys.port16[0], keys.port16[1]);
2783 	}
2784 
2785 	hash = jhash_3words((__force u32)keys.dst,
2786 			    (__force u32)keys.src,
2787 			    (__force u32)keys.ports, hashrnd);
2788 	if (!hash)
2789 		hash = 1;
2790 
2791 	skb->rxhash = hash;
2792 }
2793 EXPORT_SYMBOL(__skb_get_rxhash);
2794 
2795 #ifdef CONFIG_RPS
2796 
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);

/* Static key tested by netif_rx() before it does any RPS CPU selection. */
struct static_key rps_needed __read_mostly;
2802 
2803 static struct rps_dev_flow *
2804 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2805 	    struct rps_dev_flow *rflow, u16 next_cpu)
2806 {
2807 	if (next_cpu != RPS_NO_CPU) {
2808 #ifdef CONFIG_RFS_ACCEL
2809 		struct netdev_rx_queue *rxqueue;
2810 		struct rps_dev_flow_table *flow_table;
2811 		struct rps_dev_flow *old_rflow;
2812 		u32 flow_id;
2813 		u16 rxq_index;
2814 		int rc;
2815 
2816 		/* Should we steer this flow to a different hardware queue? */
2817 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2818 		    !(dev->features & NETIF_F_NTUPLE))
2819 			goto out;
2820 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2821 		if (rxq_index == skb_get_rx_queue(skb))
2822 			goto out;
2823 
2824 		rxqueue = dev->_rx + rxq_index;
2825 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2826 		if (!flow_table)
2827 			goto out;
2828 		flow_id = skb->rxhash & flow_table->mask;
2829 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2830 							rxq_index, flow_id);
2831 		if (rc < 0)
2832 			goto out;
2833 		old_rflow = rflow;
2834 		rflow = &flow_table->flows[flow_id];
2835 		rflow->filter = rc;
2836 		if (old_rflow->filter == rflow->filter)
2837 			old_rflow->filter = RPS_NO_FILTER;
2838 	out:
2839 #endif
2840 		rflow->last_qtail =
2841 			per_cpu(softnet_data, next_cpu).input_queue_head;
2842 	}
2843 
2844 	rflow->cpu = next_cpu;
2845 	return rflow;
2846 }
2847 
2848 /*
2849  * get_rps_cpu is called from netif_receive_skb and returns the target
2850  * CPU from the RPS map of the receiving queue for a given skb.
2851  * rcu_read_lock must be held on entry.
2852  */
2853 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2854 		       struct rps_dev_flow **rflowp)
2855 {
2856 	struct netdev_rx_queue *rxqueue;
2857 	struct rps_map *map;
2858 	struct rps_dev_flow_table *flow_table;
2859 	struct rps_sock_flow_table *sock_flow_table;
2860 	int cpu = -1;
2861 	u16 tcpu;
2862 
2863 	if (skb_rx_queue_recorded(skb)) {
2864 		u16 index = skb_get_rx_queue(skb);
2865 		if (unlikely(index >= dev->real_num_rx_queues)) {
2866 			WARN_ONCE(dev->real_num_rx_queues > 1,
2867 				  "%s received packet on queue %u, but number "
2868 				  "of RX queues is %u\n",
2869 				  dev->name, index, dev->real_num_rx_queues);
2870 			goto done;
2871 		}
2872 		rxqueue = dev->_rx + index;
2873 	} else
2874 		rxqueue = dev->_rx;
2875 
2876 	map = rcu_dereference(rxqueue->rps_map);
2877 	if (map) {
2878 		if (map->len == 1 &&
2879 		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2880 			tcpu = map->cpus[0];
2881 			if (cpu_online(tcpu))
2882 				cpu = tcpu;
2883 			goto done;
2884 		}
2885 	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2886 		goto done;
2887 	}
2888 
2889 	skb_reset_network_header(skb);
2890 	if (!skb_get_rxhash(skb))
2891 		goto done;
2892 
2893 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2894 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2895 	if (flow_table && sock_flow_table) {
2896 		u16 next_cpu;
2897 		struct rps_dev_flow *rflow;
2898 
2899 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2900 		tcpu = rflow->cpu;
2901 
2902 		next_cpu = sock_flow_table->ents[skb->rxhash &
2903 		    sock_flow_table->mask];
2904 
2905 		/*
2906 		 * If the desired CPU (where last recvmsg was done) is
2907 		 * different from current CPU (one in the rx-queue flow
2908 		 * table entry), switch if one of the following holds:
2909 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2910 		 *   - Current CPU is offline.
2911 		 *   - The current CPU's queue tail has advanced beyond the
2912 		 *     last packet that was enqueued using this table entry.
2913 		 *     This guarantees that all previous packets for the flow
2914 		 *     have been dequeued, thus preserving in order delivery.
2915 		 */
2916 		if (unlikely(tcpu != next_cpu) &&
2917 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2918 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2919 		      rflow->last_qtail)) >= 0)) {
2920 			tcpu = next_cpu;
2921 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2922 		}
2923 
2924 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2925 			*rflowp = rflow;
2926 			cpu = tcpu;
2927 			goto done;
2928 		}
2929 	}
2930 
2931 	if (map) {
2932 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2933 
2934 		if (cpu_online(tcpu)) {
2935 			cpu = tcpu;
2936 			goto done;
2937 		}
2938 	}
2939 
2940 done:
2941 	return cpu;
2942 }
2943 
2944 #ifdef CONFIG_RFS_ACCEL
2945 
2946 /**
2947  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2948  * @dev: Device on which the filter was set
2949  * @rxq_index: RX queue index
2950  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2951  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2952  *
2953  * Drivers that implement ndo_rx_flow_steer() should periodically call
2954  * this function for each installed filter and remove the filters for
2955  * which it returns %true.
2956  */
2957 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2958 			 u32 flow_id, u16 filter_id)
2959 {
2960 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2961 	struct rps_dev_flow_table *flow_table;
2962 	struct rps_dev_flow *rflow;
2963 	bool expire = true;
2964 	int cpu;
2965 
2966 	rcu_read_lock();
2967 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2968 	if (flow_table && flow_id <= flow_table->mask) {
2969 		rflow = &flow_table->flows[flow_id];
2970 		cpu = ACCESS_ONCE(rflow->cpu);
2971 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2972 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2973 			   rflow->last_qtail) <
2974 		     (int)(10 * flow_table->mask)))
2975 			expire = false;
2976 	}
2977 	rcu_read_unlock();
2978 	return expire;
2979 }
2980 EXPORT_SYMBOL(rps_may_expire_flow);
2981 
2982 #endif /* CONFIG_RFS_ACCEL */
2983 
/*
 * Called from hardirq (IPI) context.
 * @data is the softnet_data whose backlog NAPI is to be scheduled; the
 * received_rps counter of that structure is incremented as well.
 */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}
2992 
2993 #endif /* CONFIG_RPS */
2994 
/*
 * Check if this softnet_data structure belongs to another cpu.
 * If yes, queue it to our IPI list, raise NET_RX_SOFTIRQ and return 1.
 * If no (or without CONFIG_RPS), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	/* Push sd on the front of this CPU's list of pending IPI targets. */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif
}
3015 
3016 /*
3017  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3018  * queue (may be a remote CPU queue).
3019  */
3020 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3021 			      unsigned int *qtail)
3022 {
3023 	struct softnet_data *sd;
3024 	unsigned long flags;
3025 
3026 	sd = &per_cpu(softnet_data, cpu);
3027 
3028 	local_irq_save(flags);
3029 
3030 	rps_lock(sd);
3031 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3032 		if (skb_queue_len(&sd->input_pkt_queue)) {
3033 enqueue:
3034 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3035 			input_queue_tail_incr_save(sd, qtail);
3036 			rps_unlock(sd);
3037 			local_irq_restore(flags);
3038 			return NET_RX_SUCCESS;
3039 		}
3040 
3041 		/* Schedule NAPI for backlog device
3042 		 * We can use non atomic operation since we own the queue lock
3043 		 */
3044 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3045 			if (!rps_ipi_queued(sd))
3046 				____napi_schedule(sd, &sd->backlog);
3047 		}
3048 		goto enqueue;
3049 	}
3050 
3051 	sd->dropped++;
3052 	rps_unlock(sd);
3053 
3054 	local_irq_restore(flags);
3055 
3056 	atomic_long_inc(&skb->dev->rx_dropped);
3057 	kfree_skb(skb);
3058 	return NET_RX_DROP;
3059 }
3060 
3061 /**
3062  *	netif_rx	-	post buffer to the network code
3063  *	@skb: buffer to post
3064  *
3065  *	This function receives a packet from a device driver and queues it for
3066  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3067  *	may be dropped during processing for congestion control or by the
3068  *	protocol layers.
3069  *
3070  *	return values:
3071  *	NET_RX_SUCCESS	(no congestion)
3072  *	NET_RX_DROP     (packet was dropped)
3073  *
3074  */
3075 
3076 int netif_rx(struct sk_buff *skb)
3077 {
3078 	int ret;
3079 
3080 	/* if netpoll wants it, pretend we never saw it */
3081 	if (netpoll_rx(skb))
3082 		return NET_RX_DROP;
3083 
3084 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3085 
3086 	trace_netif_rx(skb);
3087 #ifdef CONFIG_RPS
3088 	if (static_key_false(&rps_needed)) {
3089 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3090 		int cpu;
3091 
3092 		preempt_disable();
3093 		rcu_read_lock();
3094 
3095 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3096 		if (cpu < 0)
3097 			cpu = smp_processor_id();
3098 
3099 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3100 
3101 		rcu_read_unlock();
3102 		preempt_enable();
3103 	} else
3104 #endif
3105 	{
3106 		unsigned int qtail;
3107 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3108 		put_cpu();
3109 	}
3110 	return ret;
3111 }
3112 EXPORT_SYMBOL(netif_rx);
3113 
/*
 * Variant of netif_rx() that, with preemption disabled, also runs any
 * softirq left pending after the packet has been queued.
 * Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int rc;

	preempt_disable();

	rc = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
3127 
3128 static void net_tx_action(struct softirq_action *h)
3129 {
3130 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3131 
3132 	if (sd->completion_queue) {
3133 		struct sk_buff *clist;
3134 
3135 		local_irq_disable();
3136 		clist = sd->completion_queue;
3137 		sd->completion_queue = NULL;
3138 		local_irq_enable();
3139 
3140 		while (clist) {
3141 			struct sk_buff *skb = clist;
3142 			clist = clist->next;
3143 
3144 			WARN_ON(atomic_read(&skb->users));
3145 			trace_kfree_skb(skb, net_tx_action);
3146 			__kfree_skb(skb);
3147 		}
3148 	}
3149 
3150 	if (sd->output_queue) {
3151 		struct Qdisc *head;
3152 
3153 		local_irq_disable();
3154 		head = sd->output_queue;
3155 		sd->output_queue = NULL;
3156 		sd->output_queue_tailp = &sd->output_queue;
3157 		local_irq_enable();
3158 
3159 		while (head) {
3160 			struct Qdisc *q = head;
3161 			spinlock_t *root_lock;
3162 
3163 			head = head->next_sched;
3164 
3165 			root_lock = qdisc_lock(q);
3166 			if (spin_trylock(root_lock)) {
3167 				smp_mb__before_clear_bit();
3168 				clear_bit(__QDISC_STATE_SCHED,
3169 					  &q->state);
3170 				qdisc_run(q);
3171 				spin_unlock(root_lock);
3172 			} else {
3173 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3174 					      &q->state)) {
3175 					__netif_reschedule(q);
3176 				} else {
3177 					smp_mb__before_clear_bit();
3178 					clear_bit(__QDISC_STATE_SCHED,
3179 						  &q->state);
3180 				}
3181 			}
3182 		}
3183 	}
3184 }
3185 
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * This hook is defined here for ATM LANE.
 *
 * It is a function pointer taking a device and an address; as a
 * file-scope object it starts out NULL.  Only built when both bridge and
 * ATM LANE support are configured, and exported GPL-only.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3193 
3194 #ifdef CONFIG_NET_CLS_ACT
3195 /* TODO: Maybe we should just force sch_ingress to be compiled in
3196  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3197  * a compare and 2 stores extra right now if we dont have it on
3198  * but have CONFIG_NET_CLS_ACT
3199  * NOTE: This doesn't stop any functionality; if you dont have
3200  * the ingress scheduler, you just can't add policies on ingress.
3201  *
3202  */
3203 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3204 {
3205 	struct net_device *dev = skb->dev;
3206 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3207 	int result = TC_ACT_OK;
3208 	struct Qdisc *q;
3209 
3210 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3211 		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3212 				     skb->skb_iif, dev->ifindex);
3213 		return TC_ACT_SHOT;
3214 	}
3215 
3216 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3217 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3218 
3219 	q = rxq->qdisc;
3220 	if (q != &noop_qdisc) {
3221 		spin_lock(qdisc_lock(q));
3222 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3223 			result = qdisc_enqueue_root(skb, q);
3224 		spin_unlock(qdisc_lock(q));
3225 	}
3226 
3227 	return result;
3228 }
3229 
3230 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3231 					 struct packet_type **pt_prev,
3232 					 int *ret, struct net_device *orig_dev)
3233 {
3234 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3235 
3236 	if (!rxq || rxq->qdisc == &noop_qdisc)
3237 		goto out;
3238 
3239 	if (*pt_prev) {
3240 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3241 		*pt_prev = NULL;
3242 	}
3243 
3244 	switch (ing_filter(skb, rxq)) {
3245 	case TC_ACT_SHOT:
3246 	case TC_ACT_STOLEN:
3247 		kfree_skb(skb);
3248 		return NULL;
3249 	}
3250 
3251 out:
3252 	skb->tc_verd = 0;
3253 	return skb;
3254 }
3255 #endif
3256 
3257 /**
3258  *	netdev_rx_handler_register - register receive handler
3259  *	@dev: device to register a handler for
3260  *	@rx_handler: receive handler to register
3261  *	@rx_handler_data: data pointer that is used by rx handler
3262  *
3263  *	Register a receive hander for a device. This handler will then be
3264  *	called from __netif_receive_skb. A negative errno code is returned
3265  *	on a failure.
3266  *
3267  *	The caller must hold the rtnl_mutex.
3268  *
3269  *	For a general description of rx_handler, see enum rx_handler_result.
3270  */
3271 int netdev_rx_handler_register(struct net_device *dev,
3272 			       rx_handler_func_t *rx_handler,
3273 			       void *rx_handler_data)
3274 {
3275 	ASSERT_RTNL();
3276 
3277 	if (dev->rx_handler)
3278 		return -EBUSY;
3279 
3280 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3281 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3282 
3283 	return 0;
3284 }
3285 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3286 
3287 /**
3288  *	netdev_rx_handler_unregister - unregister receive handler
3289  *	@dev: device to unregister a handler from
3290  *
3291  *	Unregister a receive hander from a device.
3292  *
3293  *	The caller must hold the rtnl_mutex.
3294  */
3295 void netdev_rx_handler_unregister(struct net_device *dev)
3296 {
3297 
3298 	ASSERT_RTNL();
3299 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3300 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3301 }
3302 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3303 
3304 /*
3305  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3306  * the special handling of PFMEMALLOC skbs.
3307  */
3308 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3309 {
3310 	switch (skb->protocol) {
3311 	case __constant_htons(ETH_P_ARP):
3312 	case __constant_htons(ETH_P_IP):
3313 	case __constant_htons(ETH_P_IPV6):
3314 	case __constant_htons(ETH_P_8021Q):
3315 		return true;
3316 	default:
3317 		return false;
3318 	}
3319 }
3320 
/*
 * Core of the receive path for a single packet.
 *
 * In order, the packet is offered to: netpoll, the ptype_all taps, the
 * ingress classifier (CONFIG_NET_CLS_ACT), VLAN receive processing, the
 * device's rx_handler, and finally the protocol handlers hashed in
 * ptype_base.  Delivery is "lazy": pt_prev remembers the last matching
 * handler and is only invoked when the next match (or the end of the
 * walk) is reached, so the final handler receives the skb via a direct
 * ->func() call rather than through deliver_skb().
 *
 * Returns the result of the last delivery, or NET_RX_DROP (the initial
 * value of ret) when nothing took the packet.  PF_MEMALLOC in
 * current->flags is restored to its entry state on exit.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;
	/* snapshot of the task flags so PF_MEMALLOC can be restored at out: */
	unsigned long pflags = current->flags;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	/*
	 * PFMEMALLOC skbs are special, they should
	 * - be delivered to SOCK_MEMALLOC sockets only
	 * - stay away from userspace
	 * - have bounded memory usage
	 *
	 * Use PF_MEMALLOC as this saves us from propagating the allocation
	 * context down to all allocation sites.
	 */
	if (sk_memalloc_socks() && skb_pfmemalloc(skb))
		current->flags |= PF_MEMALLOC;

	/* if we've gotten here through NAPI, check netpoll */
	if (netpoll_receive_skb(skb))
		goto out;

	/* remember the device the packet arrived on; skb->dev may change */
	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

	rcu_read_lock();

	/* re-entered whenever VLAN or rx_handler processing asks for it */
another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
		skb = vlan_untag(skb);
		if (unlikely(!skb))
			goto unlock;
	}

#ifdef CONFIG_NET_CLS_ACT
	/* TC_NCLS set: clear it and skip both the taps and classification */
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	/* pfmemalloc packets are not shown to the ptype_all taps */
	if (sk_memalloc_socks() && skb_pfmemalloc(skb))
		goto skip_taps;

	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

skip_taps:
#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto unlock;
ncls:
#endif

	/* pfmemalloc packets of protocols without special handling are dropped */
	if (sk_memalloc_socks() && skb_pfmemalloc(skb)
				&& !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (vlan_tx_tag_present(skb)) {
		/* flush the pending handler before VLAN processing */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto unlock;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		/* flush the pending handler before the rx_handler runs */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			goto unlock;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fall through */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (vlan_tx_nonzero_tag_present(skb))
		skb->pkt_type = PACKET_OTHERHOST;

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	type = skb->protocol;
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		/* last handler: called directly, not through deliver_skb() */
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

unlock:
	rcu_read_unlock();
out:
	tsk_restore_flags(current, pflags, PF_MEMALLOC);
	return ret;
}
3473 
3474 /**
3475  *	netif_receive_skb - process receive buffer from network
3476  *	@skb: buffer to process
3477  *
3478  *	netif_receive_skb() is the main receive data processing function.
3479  *	It always succeeds. The buffer may be dropped during processing
3480  *	for congestion control or by the protocol layers.
3481  *
3482  *	This function may only be called from softirq context and interrupts
3483  *	should be enabled.
3484  *
3485  *	Return values (usually ignored):
3486  *	NET_RX_SUCCESS: no congestion
3487  *	NET_RX_DROP: packet was dropped
3488  */
3489 int netif_receive_skb(struct sk_buff *skb)
3490 {
3491 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3492 
3493 	if (skb_defer_rx_timestamp(skb))
3494 		return NET_RX_SUCCESS;
3495 
3496 #ifdef CONFIG_RPS
3497 	if (static_key_false(&rps_needed)) {
3498 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3499 		int cpu, ret;
3500 
3501 		rcu_read_lock();
3502 
3503 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3504 
3505 		if (cpu >= 0) {
3506 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3507 			rcu_read_unlock();
3508 			return ret;
3509 		}
3510 		rcu_read_unlock();
3511 	}
3512 #endif
3513 	return __netif_receive_skb(skb);
3514 }
3515 EXPORT_SYMBOL(netif_receive_skb);
3516 
3517 /* Network device is going away, flush any packets still pending
3518  * Called with irqs disabled.
3519  */
3520 static void flush_backlog(void *arg)
3521 {
3522 	struct net_device *dev = arg;
3523 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3524 	struct sk_buff *skb, *tmp;
3525 
3526 	rps_lock(sd);
3527 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3528 		if (skb->dev == dev) {
3529 			__skb_unlink(skb, &sd->input_pkt_queue);
3530 			kfree_skb(skb);
3531 			input_queue_head_incr(sd);
3532 		}
3533 	}
3534 	rps_unlock(sd);
3535 
3536 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3537 		if (skb->dev == dev) {
3538 			__skb_unlink(skb, &sd->process_queue);
3539 			kfree_skb(skb);
3540 			input_queue_head_incr(sd);
3541 		}
3542 	}
3543 }
3544 
3545 static int napi_gro_complete(struct sk_buff *skb)
3546 {
3547 	struct packet_offload *ptype;
3548 	__be16 type = skb->protocol;
3549 	struct list_head *head = &offload_base;
3550 	int err = -ENOENT;
3551 
3552 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3553 
3554 	if (NAPI_GRO_CB(skb)->count == 1) {
3555 		skb_shinfo(skb)->gso_size = 0;
3556 		goto out;
3557 	}
3558 
3559 	rcu_read_lock();
3560 	list_for_each_entry_rcu(ptype, head, list) {
3561 		if (ptype->type != type || !ptype->callbacks.gro_complete)
3562 			continue;
3563 
3564 		err = ptype->callbacks.gro_complete(skb);
3565 		break;
3566 	}
3567 	rcu_read_unlock();
3568 
3569 	if (err) {
3570 		WARN_ON(&ptype->list == head);
3571 		kfree_skb(skb);
3572 		return NET_RX_SUCCESS;
3573 	}
3574 
3575 out:
3576 	return netif_receive_skb(skb);
3577 }
3578 
3579 /* napi->gro_list contains packets ordered by age.
3580  * youngest packets at the head of it.
3581  * Complete skbs in reverse order to reduce latencies.
3582  */
3583 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3584 {
3585 	struct sk_buff *skb, *prev = NULL;
3586 
3587 	/* scan list and build reverse chain */
3588 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3589 		skb->prev = prev;
3590 		prev = skb;
3591 	}
3592 
3593 	for (skb = prev; skb; skb = prev) {
3594 		skb->next = NULL;
3595 
3596 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3597 			return;
3598 
3599 		prev = skb->prev;
3600 		napi_gro_complete(skb);
3601 		napi->gro_count--;
3602 	}
3603 
3604 	napi->gro_list = NULL;
3605 }
3606 EXPORT_SYMBOL(napi_gro_flush);
3607 
3608 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3609 {
3610 	struct sk_buff *p;
3611 	unsigned int maclen = skb->dev->hard_header_len;
3612 
3613 	for (p = napi->gro_list; p; p = p->next) {
3614 		unsigned long diffs;
3615 
3616 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3617 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3618 		if (maclen == ETH_HLEN)
3619 			diffs |= compare_ether_header(skb_mac_header(p),
3620 						      skb_gro_mac_header(skb));
3621 		else if (!diffs)
3622 			diffs = memcmp(skb_mac_header(p),
3623 				       skb_gro_mac_header(skb),
3624 				       maclen);
3625 		NAPI_GRO_CB(p)->same_flow = !diffs;
3626 		NAPI_GRO_CB(p)->flush = 0;
3627 	}
3628 }
3629 
/*
 * Try to aggregate @skb with the packets held on napi->gro_list.
 *
 * Returns a gro_result telling the caller what happened to @skb:
 *  GRO_NORMAL      - not aggregated (GRO disabled on the device, netpoll
 *                    active, already GSO / has a frag list, no offload
 *                    handler, flush requested, or MAX_GRO_SKBS held)
 *  GRO_HELD        - @skb was put at the head of gro_list as a new flow
 *  GRO_MERGED      - @skb matched a held flow
 *  GRO_MERGED_FREE - as GRO_MERGED, with the handler having set ->free
 *
 * Except on the same_flow path, any header bytes that were consumed from
 * the first page fragment are copied into the linear area before
 * returning (see the "pull" label).
 */
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	int mac_len;
	enum gro_result ret;

	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
		goto normal;

	if (skb_is_gso(skb) || skb_has_frag_list(skb))
		goto normal;

	gro_list_prepare(napi, skb);

	rcu_read_lock();
	/* only the first handler matching the protocol is used */
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		mac_len = skb->network_header - skb->mac_header;
		skb->mac_len = mac_len;
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	/* the walk ran off the end of the list: no handler for this protocol */
	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	/* the handler returned a held packet: unlink it and complete it now */
	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
		goto normal;

	/* start a new flow: hold the skb at the head of gro_list */
	napi->gro_count++;
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	/* the GRO offset went past the linear data: copy the gap from frag0 */
	if (skb_headlen(skb) < skb_gro_offset(skb)) {
		int grow = skb_gro_offset(skb) - skb_headlen(skb);

		BUG_ON(skb->end - skb->tail < grow);

		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

		skb->tail += grow;
		skb->data_len -= grow;

		skb_shinfo(skb)->frags[0].page_offset += grow;
		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);

		/* first fragment fully consumed: release it and shift the rest down */
		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
			skb_frag_unref(skb, 0);
			memmove(skb_shinfo(skb)->frags,
				skb_shinfo(skb)->frags + 1,
				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
		}
	}

ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}
3723 
3724 
3725 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3726 {
3727 	switch (ret) {
3728 	case GRO_NORMAL:
3729 		if (netif_receive_skb(skb))
3730 			ret = GRO_DROP;
3731 		break;
3732 
3733 	case GRO_DROP:
3734 		kfree_skb(skb);
3735 		break;
3736 
3737 	case GRO_MERGED_FREE:
3738 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3739 			kmem_cache_free(skbuff_head_cache, skb);
3740 		else
3741 			__kfree_skb(skb);
3742 		break;
3743 
3744 	case GRO_HELD:
3745 	case GRO_MERGED:
3746 		break;
3747 	}
3748 
3749 	return ret;
3750 }
3751 
3752 static void skb_gro_reset_offset(struct sk_buff *skb)
3753 {
3754 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3755 	const skb_frag_t *frag0 = &pinfo->frags[0];
3756 
3757 	NAPI_GRO_CB(skb)->data_offset = 0;
3758 	NAPI_GRO_CB(skb)->frag0 = NULL;
3759 	NAPI_GRO_CB(skb)->frag0_len = 0;
3760 
3761 	if (skb->mac_header == skb->tail &&
3762 	    pinfo->nr_frags &&
3763 	    !PageHighMem(skb_frag_page(frag0))) {
3764 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3765 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3766 	}
3767 }
3768 
3769 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3770 {
3771 	skb_gro_reset_offset(skb);
3772 
3773 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3774 }
3775 EXPORT_SYMBOL(napi_gro_receive);
3776 
3777 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3778 {
3779 	__skb_pull(skb, skb_headlen(skb));
3780 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
3781 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3782 	skb->vlan_tci = 0;
3783 	skb->dev = napi->dev;
3784 	skb->skb_iif = 0;
3785 
3786 	napi->skb = skb;
3787 }
3788 
3789 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3790 {
3791 	struct sk_buff *skb = napi->skb;
3792 
3793 	if (!skb) {
3794 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3795 		if (skb)
3796 			napi->skb = skb;
3797 	}
3798 	return skb;
3799 }
3800 EXPORT_SYMBOL(napi_get_frags);
3801 
3802 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3803 			       gro_result_t ret)
3804 {
3805 	switch (ret) {
3806 	case GRO_NORMAL:
3807 	case GRO_HELD:
3808 		skb->protocol = eth_type_trans(skb, skb->dev);
3809 
3810 		if (ret == GRO_HELD)
3811 			skb_gro_pull(skb, -ETH_HLEN);
3812 		else if (netif_receive_skb(skb))
3813 			ret = GRO_DROP;
3814 		break;
3815 
3816 	case GRO_DROP:
3817 	case GRO_MERGED_FREE:
3818 		napi_reuse_skb(napi, skb);
3819 		break;
3820 
3821 	case GRO_MERGED:
3822 		break;
3823 	}
3824 
3825 	return ret;
3826 }
3827 
3828 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3829 {
3830 	struct sk_buff *skb = napi->skb;
3831 	struct ethhdr *eth;
3832 	unsigned int hlen;
3833 	unsigned int off;
3834 
3835 	napi->skb = NULL;
3836 
3837 	skb_reset_mac_header(skb);
3838 	skb_gro_reset_offset(skb);
3839 
3840 	off = skb_gro_offset(skb);
3841 	hlen = off + sizeof(*eth);
3842 	eth = skb_gro_header_fast(skb, off);
3843 	if (skb_gro_header_hard(skb, hlen)) {
3844 		eth = skb_gro_header_slow(skb, hlen, off);
3845 		if (unlikely(!eth)) {
3846 			napi_reuse_skb(napi, skb);
3847 			skb = NULL;
3848 			goto out;
3849 		}
3850 	}
3851 
3852 	skb_gro_pull(skb, sizeof(*eth));
3853 
3854 	/*
3855 	 * This works because the only protocols we care about don't require
3856 	 * special handling.  We'll fix it up properly at the end.
3857 	 */
3858 	skb->protocol = eth->h_proto;
3859 
3860 out:
3861 	return skb;
3862 }
3863 
3864 gro_result_t napi_gro_frags(struct napi_struct *napi)
3865 {
3866 	struct sk_buff *skb = napi_frags_skb(napi);
3867 
3868 	if (!skb)
3869 		return GRO_DROP;
3870 
3871 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
3872 }
3873 EXPORT_SYMBOL(napi_gro_frags);
3874 
/*
 * Send any IPIs that are pending for RPS.
 * Note: must be entered with local irqs disabled; always returns with
 * local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (!remsd) {
		local_irq_enable();
		return;
	}

	/* detach the list while irqs are still off */
	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Kick RPS processing on each remote cpu that is still online. */
	while (remsd) {
		struct softnet_data *target = remsd;

		remsd = target->rps_ipi_next;

		if (cpu_online(target->cpu))
			__smp_call_function_single(target->cpu,
						   &target->csd, 0);
	}
#else
	local_irq_enable();
#endif
}
3902 
/*
 * NAPI poll routine of the per-CPU backlog device.
 *
 * Packets are taken from sd->process_queue and fed to
 * __netif_receive_skb() with irqs enabled; whenever that queue runs dry
 * it is refilled from sd->input_pkt_queue under the rps lock.  Returns the
 * number of packets processed, at most @quota.  When fewer packets are
 * queued than quota remains, the backlog NAPI is removed from the poll
 * list and its state cleared.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd->rps_ipi_list) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}
#endif
	/* refresh the weight from weight_p on every poll */
	napi->weight = weight_p;
	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		/* irqs are only enabled around the actual packet processing */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		/* process_queue is empty: move everything over from the input queue */
		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);

		if (qlen < quota - work) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
			 * we can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			list_del(&napi->poll_list);
			napi->state = 0;

			/* shrink quota so the loop ends once these packets are done */
			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
3959 
3960 /**
3961  * __napi_schedule - schedule for receive
3962  * @n: entry to schedule
3963  *
3964  * The entry's receive function will be scheduled to run
3965  */
3966 void __napi_schedule(struct napi_struct *n)
3967 {
3968 	unsigned long flags;
3969 
3970 	local_irq_save(flags);
3971 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3972 	local_irq_restore(flags);
3973 }
3974 EXPORT_SYMBOL(__napi_schedule);
3975 
3976 void __napi_complete(struct napi_struct *n)
3977 {
3978 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3979 	BUG_ON(n->gro_list);
3980 
3981 	list_del(&n->poll_list);
3982 	smp_mb__before_clear_bit();
3983 	clear_bit(NAPI_STATE_SCHED, &n->state);
3984 }
3985 EXPORT_SYMBOL(__napi_complete);
3986 
3987 void napi_complete(struct napi_struct *n)
3988 {
3989 	unsigned long flags;
3990 
3991 	/*
3992 	 * don't let napi dequeue from the cpu poll list
3993 	 * just in case its running on a different cpu
3994 	 */
3995 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3996 		return;
3997 
3998 	napi_gro_flush(n, false);
3999 	local_irq_save(flags);
4000 	__napi_complete(n);
4001 	local_irq_restore(flags);
4002 }
4003 EXPORT_SYMBOL(napi_complete);
4004 
4005 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4006 		    int (*poll)(struct napi_struct *, int), int weight)
4007 {
4008 	INIT_LIST_HEAD(&napi->poll_list);
4009 	napi->gro_count = 0;
4010 	napi->gro_list = NULL;
4011 	napi->skb = NULL;
4012 	napi->poll = poll;
4013 	napi->weight = weight;
4014 	list_add(&napi->dev_list, &dev->napi_list);
4015 	napi->dev = dev;
4016 #ifdef CONFIG_NETPOLL
4017 	spin_lock_init(&napi->poll_lock);
4018 	napi->poll_owner = -1;
4019 #endif
4020 	set_bit(NAPI_STATE_SCHED, &napi->state);
4021 }
4022 EXPORT_SYMBOL(netif_napi_add);
4023 
4024 void netif_napi_del(struct napi_struct *napi)
4025 {
4026 	struct sk_buff *skb, *next;
4027 
4028 	list_del_init(&napi->dev_list);
4029 	napi_free_frags(napi);
4030 
4031 	for (skb = napi->gro_list; skb; skb = next) {
4032 		next = skb->next;
4033 		skb->next = NULL;
4034 		kfree_skb(skb);
4035 	}
4036 
4037 	napi->gro_list = NULL;
4038 	napi->gro_count = 0;
4039 }
4040 EXPORT_SYMBOL(netif_napi_del);
4041 
/*
 * RX softirq handler.
 *
 * Polls the NAPI instances queued on this CPU's poll_list until the list
 * is empty, the packet budget (netdev_budget) is used up, or more than two
 * jiffies have elapsed.  In the latter two cases time_squeeze is bumped
 * and the softirq is raised again.  Pending RPS IPIs are sent on the way
 * out, which also re-enables local irqs.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		/* a poll routine must never report more work than its weight */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else {
				if (n->gro_list) {
					/* flush too old packets
					 * If HZ < 1000, flush all packets.
					 */
					local_irq_enable();
					napi_gro_flush(n, HZ >= 1000);
					local_irq_disable();
				}
				/* requeue at the tail so other instances get a turn */
				list_move_tail(&n->poll_list, &sd->poll_list);
			}
		}

		netpoll_poll_unlock(have);
	}
out:
	/* sends pending RPS IPIs and re-enables local irqs */
	net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	/* out of budget or time: account for it and ask to be run again */
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
4136 
4137 static gifconf_func_t *gifconf_list[NPROTO];
4138 
4139 /**
4140  *	register_gifconf	-	register a SIOCGIF handler
4141  *	@family: Address family
4142  *	@gifconf: Function handler
4143  *
4144  *	Register protocol dependent address dumping routines. The handler
4145  *	that is passed must not be freed or reused until it has been replaced
4146  *	by another handler.
4147  */
4148 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4149 {
4150 	if (family >= NPROTO)
4151 		return -EINVAL;
4152 	gifconf_list[family] = gifconf;
4153 	return 0;
4154 }
4155 EXPORT_SYMBOL(register_gifconf);
4156 
4157 
4158 /*
4159  *	Map an interface index to its name (SIOCGIFNAME)
4160  */
4161 
4162 /*
4163  *	We need this ioctl for efficient implementation of the
4164  *	if_indextoname() function required by the IPv6 API.  Without
4165  *	it, we would have to search all the interfaces to find a
4166  *	match.  --pb
4167  */
4168 
4169 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4170 {
4171 	struct net_device *dev;
4172 	struct ifreq ifr;
4173 	unsigned seq;
4174 
4175 	/*
4176 	 *	Fetch the caller's info block.
4177 	 */
4178 
4179 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4180 		return -EFAULT;
4181 
4182 retry:
4183 	seq = read_seqbegin(&devnet_rename_seq);
4184 	rcu_read_lock();
4185 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4186 	if (!dev) {
4187 		rcu_read_unlock();
4188 		return -ENODEV;
4189 	}
4190 
4191 	strcpy(ifr.ifr_name, dev->name);
4192 	rcu_read_unlock();
4193 	if (read_seqretry(&devnet_rename_seq, seq))
4194 		goto retry;
4195 
4196 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4197 		return -EFAULT;
4198 	return 0;
4199 }
4200 
4201 /*
4202  *	Perform a SIOCGIFCONF call. This structure will change
4203  *	size eventually, and there is nothing I can do about it.
4204  *	Thus we will need a 'compatibility mode'.
4205  */
4206 
4207 static int dev_ifconf(struct net *net, char __user *arg)
4208 {
4209 	struct ifconf ifc;
4210 	struct net_device *dev;
4211 	char __user *pos;
4212 	int len;
4213 	int total;
4214 	int i;
4215 
4216 	/*
4217 	 *	Fetch the caller's info block.
4218 	 */
4219 
4220 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4221 		return -EFAULT;
4222 
4223 	pos = ifc.ifc_buf;
4224 	len = ifc.ifc_len;
4225 
4226 	/*
4227 	 *	Loop over the interfaces, and write an info block for each.
4228 	 */
4229 
4230 	total = 0;
4231 	for_each_netdev(net, dev) {
4232 		for (i = 0; i < NPROTO; i++) {
4233 			if (gifconf_list[i]) {
4234 				int done;
4235 				if (!pos)
4236 					done = gifconf_list[i](dev, NULL, 0);
4237 				else
4238 					done = gifconf_list[i](dev, pos + total,
4239 							       len - total);
4240 				if (done < 0)
4241 					return -EFAULT;
4242 				total += done;
4243 			}
4244 		}
4245 	}
4246 
4247 	/*
4248 	 *	All done.  Write the updated control block back to the caller.
4249 	 */
4250 	ifc.ifc_len = total;
4251 
4252 	/*
4253 	 * 	Both BSD and Solaris return 0 here, so we do too.
4254 	 */
4255 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4256 }
4257 
4258 #ifdef CONFIG_PROC_FS
4259 
/*
 * Layout of the seq_file position used by the device iterator below:
 * the low BUCKET_SPACE bits hold an offset inside a name-hash bucket,
 * the bits above them hold the bucket index.
 */
#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)

/* extract the bucket index / the in-bucket offset, or combine the two */
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4265 
4266 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4267 {
4268 	struct net *net = seq_file_net(seq);
4269 	struct net_device *dev;
4270 	struct hlist_node *p;
4271 	struct hlist_head *h;
4272 	unsigned int count = 0, offset = get_offset(*pos);
4273 
4274 	h = &net->dev_name_head[get_bucket(*pos)];
4275 	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4276 		if (++count == offset)
4277 			return dev;
4278 	}
4279 
4280 	return NULL;
4281 }
4282 
4283 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4284 {
4285 	struct net_device *dev;
4286 	unsigned int bucket;
4287 
4288 	do {
4289 		dev = dev_from_same_bucket(seq, pos);
4290 		if (dev)
4291 			return dev;
4292 
4293 		bucket = get_bucket(*pos) + 1;
4294 		*pos = set_bucket_offset(bucket, 1);
4295 	} while (bucket < NETDEV_HASHENTRIES);
4296 
4297 	return NULL;
4298 }
4299 
4300 /*
4301  *	This is invoked by the /proc filesystem handler to display a device
4302  *	in detail.
4303  */
4304 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4305 	__acquires(RCU)
4306 {
4307 	rcu_read_lock();
4308 	if (!*pos)
4309 		return SEQ_START_TOKEN;
4310 
4311 	if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4312 		return NULL;
4313 
4314 	return dev_from_bucket(seq, pos);
4315 }
4316 
4317 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4318 {
4319 	++*pos;
4320 	return dev_from_bucket(seq, pos);
4321 }
4322 
4323 void dev_seq_stop(struct seq_file *seq, void *v)
4324 	__releases(RCU)
4325 {
4326 	rcu_read_unlock();
4327 }
4328 
4329 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4330 {
4331 	struct rtnl_link_stats64 temp;
4332 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4333 
4334 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4335 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4336 		   dev->name, stats->rx_bytes, stats->rx_packets,
4337 		   stats->rx_errors,
4338 		   stats->rx_dropped + stats->rx_missed_errors,
4339 		   stats->rx_fifo_errors,
4340 		   stats->rx_length_errors + stats->rx_over_errors +
4341 		    stats->rx_crc_errors + stats->rx_frame_errors,
4342 		   stats->rx_compressed, stats->multicast,
4343 		   stats->tx_bytes, stats->tx_packets,
4344 		   stats->tx_errors, stats->tx_dropped,
4345 		   stats->tx_fifo_errors, stats->collisions,
4346 		   stats->tx_carrier_errors +
4347 		    stats->tx_aborted_errors +
4348 		    stats->tx_window_errors +
4349 		    stats->tx_heartbeat_errors,
4350 		   stats->tx_compressed);
4351 }
4352 
4353 /*
4354  *	Called from the PROCfs module. This now uses the new arbitrary sized
4355  *	/proc/net interface to create /proc/net/dev
4356  */
4357 static int dev_seq_show(struct seq_file *seq, void *v)
4358 {
4359 	if (v == SEQ_START_TOKEN)
4360 		seq_puts(seq, "Inter-|   Receive                            "
4361 			      "                    |  Transmit\n"
4362 			      " face |bytes    packets errs drop fifo frame "
4363 			      "compressed multicast|bytes    packets errs "
4364 			      "drop fifo colls carrier compressed\n");
4365 	else
4366 		dev_seq_printf_stats(seq, v);
4367 	return 0;
4368 }
4369 
4370 static struct softnet_data *softnet_get_online(loff_t *pos)
4371 {
4372 	struct softnet_data *sd = NULL;
4373 
4374 	while (*pos < nr_cpu_ids)
4375 		if (cpu_online(*pos)) {
4376 			sd = &per_cpu(softnet_data, *pos);
4377 			break;
4378 		} else
4379 			++*pos;
4380 	return sd;
4381 }
4382 
4383 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4384 {
4385 	return softnet_get_online(pos);
4386 }
4387 
4388 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4389 {
4390 	++*pos;
4391 	return softnet_get_online(pos);
4392 }
4393 
/* seq_file stop hook: the start/next hooks acquire nothing, so nothing to undo. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4397 
4398 static int softnet_seq_show(struct seq_file *seq, void *v)
4399 {
4400 	struct softnet_data *sd = v;
4401 
4402 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4403 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4404 		   0, 0, 0, 0, /* was fastroute */
4405 		   sd->cpu_collision, sd->received_rps);
4406 	return 0;
4407 }
4408 
4409 static const struct seq_operations dev_seq_ops = {
4410 	.start = dev_seq_start,
4411 	.next  = dev_seq_next,
4412 	.stop  = dev_seq_stop,
4413 	.show  = dev_seq_show,
4414 };
4415 
4416 static int dev_seq_open(struct inode *inode, struct file *file)
4417 {
4418 	return seq_open_net(inode, file, &dev_seq_ops,
4419 			    sizeof(struct seq_net_private));
4420 }
4421 
4422 static const struct file_operations dev_seq_fops = {
4423 	.owner	 = THIS_MODULE,
4424 	.open    = dev_seq_open,
4425 	.read    = seq_read,
4426 	.llseek  = seq_lseek,
4427 	.release = seq_release_net,
4428 };
4429 
4430 static const struct seq_operations softnet_seq_ops = {
4431 	.start = softnet_seq_start,
4432 	.next  = softnet_seq_next,
4433 	.stop  = softnet_seq_stop,
4434 	.show  = softnet_seq_show,
4435 };
4436 
4437 static int softnet_seq_open(struct inode *inode, struct file *file)
4438 {
4439 	return seq_open(file, &softnet_seq_ops);
4440 }
4441 
4442 static const struct file_operations softnet_seq_fops = {
4443 	.owner	 = THIS_MODULE,
4444 	.open    = softnet_seq_open,
4445 	.read    = seq_read,
4446 	.llseek  = seq_lseek,
4447 	.release = seq_release,
4448 };
4449 
4450 static void *ptype_get_idx(loff_t pos)
4451 {
4452 	struct packet_type *pt = NULL;
4453 	loff_t i = 0;
4454 	int t;
4455 
4456 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4457 		if (i == pos)
4458 			return pt;
4459 		++i;
4460 	}
4461 
4462 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4463 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4464 			if (i == pos)
4465 				return pt;
4466 			++i;
4467 		}
4468 	}
4469 	return NULL;
4470 }
4471 
4472 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4473 	__acquires(RCU)
4474 {
4475 	rcu_read_lock();
4476 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4477 }
4478 
4479 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4480 {
4481 	struct packet_type *pt;
4482 	struct list_head *nxt;
4483 	int hash;
4484 
4485 	++*pos;
4486 	if (v == SEQ_START_TOKEN)
4487 		return ptype_get_idx(0);
4488 
4489 	pt = v;
4490 	nxt = pt->list.next;
4491 	if (pt->type == htons(ETH_P_ALL)) {
4492 		if (nxt != &ptype_all)
4493 			goto found;
4494 		hash = 0;
4495 		nxt = ptype_base[0].next;
4496 	} else
4497 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4498 
4499 	while (nxt == &ptype_base[hash]) {
4500 		if (++hash >= PTYPE_HASH_SIZE)
4501 			return NULL;
4502 		nxt = ptype_base[hash].next;
4503 	}
4504 found:
4505 	return list_entry(nxt, struct packet_type, list);
4506 }
4507 
4508 static void ptype_seq_stop(struct seq_file *seq, void *v)
4509 	__releases(RCU)
4510 {
4511 	rcu_read_unlock();
4512 }
4513 
4514 static int ptype_seq_show(struct seq_file *seq, void *v)
4515 {
4516 	struct packet_type *pt = v;
4517 
4518 	if (v == SEQ_START_TOKEN)
4519 		seq_puts(seq, "Type Device      Function\n");
4520 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4521 		if (pt->type == htons(ETH_P_ALL))
4522 			seq_puts(seq, "ALL ");
4523 		else
4524 			seq_printf(seq, "%04x", ntohs(pt->type));
4525 
4526 		seq_printf(seq, " %-8s %pF\n",
4527 			   pt->dev ? pt->dev->name : "", pt->func);
4528 	}
4529 
4530 	return 0;
4531 }
4532 
4533 static const struct seq_operations ptype_seq_ops = {
4534 	.start = ptype_seq_start,
4535 	.next  = ptype_seq_next,
4536 	.stop  = ptype_seq_stop,
4537 	.show  = ptype_seq_show,
4538 };
4539 
4540 static int ptype_seq_open(struct inode *inode, struct file *file)
4541 {
4542 	return seq_open_net(inode, file, &ptype_seq_ops,
4543 			sizeof(struct seq_net_private));
4544 }
4545 
4546 static const struct file_operations ptype_seq_fops = {
4547 	.owner	 = THIS_MODULE,
4548 	.open    = ptype_seq_open,
4549 	.read    = seq_read,
4550 	.llseek  = seq_lseek,
4551 	.release = seq_release_net,
4552 };
4553 
4554 
4555 static int __net_init dev_proc_net_init(struct net *net)
4556 {
4557 	int rc = -ENOMEM;
4558 
4559 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4560 		goto out;
4561 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4562 		goto out_dev;
4563 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4564 		goto out_softnet;
4565 
4566 	if (wext_proc_init(net))
4567 		goto out_ptype;
4568 	rc = 0;
4569 out:
4570 	return rc;
4571 out_ptype:
4572 	proc_net_remove(net, "ptype");
4573 out_softnet:
4574 	proc_net_remove(net, "softnet_stat");
4575 out_dev:
4576 	proc_net_remove(net, "dev");
4577 	goto out;
4578 }
4579 
4580 static void __net_exit dev_proc_net_exit(struct net *net)
4581 {
4582 	wext_proc_exit(net);
4583 
4584 	proc_net_remove(net, "ptype");
4585 	proc_net_remove(net, "softnet_stat");
4586 	proc_net_remove(net, "dev");
4587 }
4588 
4589 static struct pernet_operations __net_initdata dev_proc_ops = {
4590 	.init = dev_proc_net_init,
4591 	.exit = dev_proc_net_exit,
4592 };
4593 
4594 static int __init dev_proc_init(void)
4595 {
4596 	return register_pernet_subsys(&dev_proc_ops);
4597 }
4598 #else
4599 #define dev_proc_init() 0
4600 #endif	/* CONFIG_PROC_FS */
4601 
4602 
4603 /**
4604  *	netdev_set_master	-	set up master pointer
4605  *	@slave: slave device
4606  *	@master: new master device
4607  *
4608  *	Changes the master device of the slave. Pass %NULL to break the
4609  *	bonding. The caller must hold the RTNL semaphore. On a failure
4610  *	a negative errno code is returned. On success the reference counts
4611  *	are adjusted and the function returns zero.
4612  */
4613 int netdev_set_master(struct net_device *slave, struct net_device *master)
4614 {
4615 	struct net_device *old = slave->master;
4616 
4617 	ASSERT_RTNL();
4618 
4619 	if (master) {
4620 		if (old)
4621 			return -EBUSY;
4622 		dev_hold(master);
4623 	}
4624 
4625 	slave->master = master;
4626 
4627 	if (old)
4628 		dev_put(old);
4629 	return 0;
4630 }
4631 EXPORT_SYMBOL(netdev_set_master);
4632 
4633 /**
4634  *	netdev_set_bond_master	-	set up bonding master/slave pair
4635  *	@slave: slave device
4636  *	@master: new master device
4637  *
4638  *	Changes the master device of the slave. Pass %NULL to break the
4639  *	bonding. The caller must hold the RTNL semaphore. On a failure
4640  *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4641  *	to the routing socket and the function returns zero.
4642  */
4643 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4644 {
4645 	int err;
4646 
4647 	ASSERT_RTNL();
4648 
4649 	err = netdev_set_master(slave, master);
4650 	if (err)
4651 		return err;
4652 	if (master)
4653 		slave->flags |= IFF_SLAVE;
4654 	else
4655 		slave->flags &= ~IFF_SLAVE;
4656 
4657 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4658 	return 0;
4659 }
4660 EXPORT_SYMBOL(netdev_set_bond_master);
4661 
4662 static void dev_change_rx_flags(struct net_device *dev, int flags)
4663 {
4664 	const struct net_device_ops *ops = dev->netdev_ops;
4665 
4666 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4667 		ops->ndo_change_rx_flags(dev, flags);
4668 }
4669 
4670 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4671 {
4672 	unsigned int old_flags = dev->flags;
4673 	kuid_t uid;
4674 	kgid_t gid;
4675 
4676 	ASSERT_RTNL();
4677 
4678 	dev->flags |= IFF_PROMISC;
4679 	dev->promiscuity += inc;
4680 	if (dev->promiscuity == 0) {
4681 		/*
4682 		 * Avoid overflow.
4683 		 * If inc causes overflow, untouch promisc and return error.
4684 		 */
4685 		if (inc < 0)
4686 			dev->flags &= ~IFF_PROMISC;
4687 		else {
4688 			dev->promiscuity -= inc;
4689 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4690 				dev->name);
4691 			return -EOVERFLOW;
4692 		}
4693 	}
4694 	if (dev->flags != old_flags) {
4695 		pr_info("device %s %s promiscuous mode\n",
4696 			dev->name,
4697 			dev->flags & IFF_PROMISC ? "entered" : "left");
4698 		if (audit_enabled) {
4699 			current_uid_gid(&uid, &gid);
4700 			audit_log(current->audit_context, GFP_ATOMIC,
4701 				AUDIT_ANOM_PROMISCUOUS,
4702 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4703 				dev->name, (dev->flags & IFF_PROMISC),
4704 				(old_flags & IFF_PROMISC),
4705 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
4706 				from_kuid(&init_user_ns, uid),
4707 				from_kgid(&init_user_ns, gid),
4708 				audit_get_sessionid(current));
4709 		}
4710 
4711 		dev_change_rx_flags(dev, IFF_PROMISC);
4712 	}
4713 	return 0;
4714 }
4715 
4716 /**
4717  *	dev_set_promiscuity	- update promiscuity count on a device
4718  *	@dev: device
4719  *	@inc: modifier
4720  *
4721  *	Add or remove promiscuity from a device. While the count in the device
4722  *	remains above zero the interface remains promiscuous. Once it hits zero
4723  *	the device reverts back to normal filtering operation. A negative inc
4724  *	value is used to drop promiscuity on the device.
4725  *	Return 0 if successful or a negative errno code on error.
4726  */
4727 int dev_set_promiscuity(struct net_device *dev, int inc)
4728 {
4729 	unsigned int old_flags = dev->flags;
4730 	int err;
4731 
4732 	err = __dev_set_promiscuity(dev, inc);
4733 	if (err < 0)
4734 		return err;
4735 	if (dev->flags != old_flags)
4736 		dev_set_rx_mode(dev);
4737 	return err;
4738 }
4739 EXPORT_SYMBOL(dev_set_promiscuity);
4740 
4741 /**
4742  *	dev_set_allmulti	- update allmulti count on a device
4743  *	@dev: device
4744  *	@inc: modifier
4745  *
 *	Add or remove reception of all multicast frames to a device. While the
 *	count in the device remains above zero the interface keeps receiving
 *	all multicast frames. Once it hits zero the device reverts back to normal
4749  *	filtering operation. A negative @inc value is used to drop the counter
4750  *	when releasing a resource needing all multicasts.
4751  *	Return 0 if successful or a negative errno code on error.
4752  */
4753 
4754 int dev_set_allmulti(struct net_device *dev, int inc)
4755 {
4756 	unsigned int old_flags = dev->flags;
4757 
4758 	ASSERT_RTNL();
4759 
4760 	dev->flags |= IFF_ALLMULTI;
4761 	dev->allmulti += inc;
4762 	if (dev->allmulti == 0) {
4763 		/*
4764 		 * Avoid overflow.
4765 		 * If inc causes overflow, untouch allmulti and return error.
4766 		 */
4767 		if (inc < 0)
4768 			dev->flags &= ~IFF_ALLMULTI;
4769 		else {
4770 			dev->allmulti -= inc;
4771 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4772 				dev->name);
4773 			return -EOVERFLOW;
4774 		}
4775 	}
4776 	if (dev->flags ^ old_flags) {
4777 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4778 		dev_set_rx_mode(dev);
4779 	}
4780 	return 0;
4781 }
4782 EXPORT_SYMBOL(dev_set_allmulti);
4783 
4784 /*
4785  *	Upload unicast and multicast address lists to device and
4786  *	configure RX filtering. When the device doesn't support unicast
4787  *	filtering it is put in promiscuous mode while unicast addresses
4788  *	are present.
4789  */
4790 void __dev_set_rx_mode(struct net_device *dev)
4791 {
4792 	const struct net_device_ops *ops = dev->netdev_ops;
4793 
4794 	/* dev_open will call this function so the list will stay sane. */
4795 	if (!(dev->flags&IFF_UP))
4796 		return;
4797 
4798 	if (!netif_device_present(dev))
4799 		return;
4800 
4801 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4802 		/* Unicast addresses changes may only happen under the rtnl,
4803 		 * therefore calling __dev_set_promiscuity here is safe.
4804 		 */
4805 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4806 			__dev_set_promiscuity(dev, 1);
4807 			dev->uc_promisc = true;
4808 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4809 			__dev_set_promiscuity(dev, -1);
4810 			dev->uc_promisc = false;
4811 		}
4812 	}
4813 
4814 	if (ops->ndo_set_rx_mode)
4815 		ops->ndo_set_rx_mode(dev);
4816 }
4817 
/* Run __dev_set_rx_mode() with the device's address lock held (BH variant). */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
4824 
4825 /**
4826  *	dev_get_flags - get flags reported to userspace
4827  *	@dev: device
4828  *
4829  *	Get the combination of flag bits exported through APIs to userspace.
4830  */
4831 unsigned int dev_get_flags(const struct net_device *dev)
4832 {
4833 	unsigned int flags;
4834 
4835 	flags = (dev->flags & ~(IFF_PROMISC |
4836 				IFF_ALLMULTI |
4837 				IFF_RUNNING |
4838 				IFF_LOWER_UP |
4839 				IFF_DORMANT)) |
4840 		(dev->gflags & (IFF_PROMISC |
4841 				IFF_ALLMULTI));
4842 
4843 	if (netif_running(dev)) {
4844 		if (netif_oper_up(dev))
4845 			flags |= IFF_RUNNING;
4846 		if (netif_carrier_ok(dev))
4847 			flags |= IFF_LOWER_UP;
4848 		if (netif_dormant(dev))
4849 			flags |= IFF_DORMANT;
4850 	}
4851 
4852 	return flags;
4853 }
4854 EXPORT_SYMBOL(dev_get_flags);
4855 
4856 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4857 {
4858 	unsigned int old_flags = dev->flags;
4859 	int ret;
4860 
4861 	ASSERT_RTNL();
4862 
4863 	/*
4864 	 *	Set the flags on our device.
4865 	 */
4866 
4867 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4868 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4869 			       IFF_AUTOMEDIA)) |
4870 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4871 				    IFF_ALLMULTI));
4872 
4873 	/*
4874 	 *	Load in the correct multicast list now the flags have changed.
4875 	 */
4876 
4877 	if ((old_flags ^ flags) & IFF_MULTICAST)
4878 		dev_change_rx_flags(dev, IFF_MULTICAST);
4879 
4880 	dev_set_rx_mode(dev);
4881 
4882 	/*
4883 	 *	Have we downed the interface. We handle IFF_UP ourselves
4884 	 *	according to user attempts to set it, rather than blindly
4885 	 *	setting it.
4886 	 */
4887 
4888 	ret = 0;
4889 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4890 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4891 
4892 		if (!ret)
4893 			dev_set_rx_mode(dev);
4894 	}
4895 
4896 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4897 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4898 
4899 		dev->gflags ^= IFF_PROMISC;
4900 		dev_set_promiscuity(dev, inc);
4901 	}
4902 
4903 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4904 	   is important. Some (broken) drivers set IFF_PROMISC, when
4905 	   IFF_ALLMULTI is requested not asking us and not reporting.
4906 	 */
4907 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4908 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4909 
4910 		dev->gflags ^= IFF_ALLMULTI;
4911 		dev_set_allmulti(dev, inc);
4912 	}
4913 
4914 	return ret;
4915 }
4916 
4917 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4918 {
4919 	unsigned int changes = dev->flags ^ old_flags;
4920 
4921 	if (changes & IFF_UP) {
4922 		if (dev->flags & IFF_UP)
4923 			call_netdevice_notifiers(NETDEV_UP, dev);
4924 		else
4925 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4926 	}
4927 
4928 	if (dev->flags & IFF_UP &&
4929 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4930 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4931 }
4932 
4933 /**
4934  *	dev_change_flags - change device settings
4935  *	@dev: device
4936  *	@flags: device state flags
4937  *
4938  *	Change settings on device based state flags. The flags are
4939  *	in the userspace exported format.
4940  */
4941 int dev_change_flags(struct net_device *dev, unsigned int flags)
4942 {
4943 	int ret;
4944 	unsigned int changes, old_flags = dev->flags;
4945 
4946 	ret = __dev_change_flags(dev, flags);
4947 	if (ret < 0)
4948 		return ret;
4949 
4950 	changes = old_flags ^ dev->flags;
4951 	if (changes)
4952 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4953 
4954 	__dev_notify_flags(dev, old_flags);
4955 	return ret;
4956 }
4957 EXPORT_SYMBOL(dev_change_flags);
4958 
4959 /**
4960  *	dev_set_mtu - Change maximum transfer unit
4961  *	@dev: device
4962  *	@new_mtu: new transfer unit
4963  *
4964  *	Change the maximum transfer size of the network device.
4965  */
4966 int dev_set_mtu(struct net_device *dev, int new_mtu)
4967 {
4968 	const struct net_device_ops *ops = dev->netdev_ops;
4969 	int err;
4970 
4971 	if (new_mtu == dev->mtu)
4972 		return 0;
4973 
4974 	/*	MTU must be positive.	 */
4975 	if (new_mtu < 0)
4976 		return -EINVAL;
4977 
4978 	if (!netif_device_present(dev))
4979 		return -ENODEV;
4980 
4981 	err = 0;
4982 	if (ops->ndo_change_mtu)
4983 		err = ops->ndo_change_mtu(dev, new_mtu);
4984 	else
4985 		dev->mtu = new_mtu;
4986 
4987 	if (!err)
4988 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4989 	return err;
4990 }
4991 EXPORT_SYMBOL(dev_set_mtu);
4992 
4993 /**
4994  *	dev_set_group - Change group this device belongs to
4995  *	@dev: device
4996  *	@new_group: group this device should belong to
4997  */
4998 void dev_set_group(struct net_device *dev, int new_group)
4999 {
5000 	dev->group = new_group;
5001 }
5002 EXPORT_SYMBOL(dev_set_group);
5003 
5004 /**
5005  *	dev_set_mac_address - Change Media Access Control Address
5006  *	@dev: device
5007  *	@sa: new address
5008  *
5009  *	Change the hardware (MAC) address of the device
5010  */
5011 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5012 {
5013 	const struct net_device_ops *ops = dev->netdev_ops;
5014 	int err;
5015 
5016 	if (!ops->ndo_set_mac_address)
5017 		return -EOPNOTSUPP;
5018 	if (sa->sa_family != dev->type)
5019 		return -EINVAL;
5020 	if (!netif_device_present(dev))
5021 		return -ENODEV;
5022 	err = ops->ndo_set_mac_address(dev, sa);
5023 	if (!err)
5024 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5025 	add_device_randomness(dev->dev_addr, dev->addr_len);
5026 	return err;
5027 }
5028 EXPORT_SYMBOL(dev_set_mac_address);
5029 
5030 /*
5031  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
5032  */
5033 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
5034 {
5035 	int err;
5036 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
5037 
5038 	if (!dev)
5039 		return -ENODEV;
5040 
5041 	switch (cmd) {
5042 	case SIOCGIFFLAGS:	/* Get interface flags */
5043 		ifr->ifr_flags = (short) dev_get_flags(dev);
5044 		return 0;
5045 
5046 	case SIOCGIFMETRIC:	/* Get the metric on the interface
5047 				   (currently unused) */
5048 		ifr->ifr_metric = 0;
5049 		return 0;
5050 
5051 	case SIOCGIFMTU:	/* Get the MTU of a device */
5052 		ifr->ifr_mtu = dev->mtu;
5053 		return 0;
5054 
5055 	case SIOCGIFHWADDR:
5056 		if (!dev->addr_len)
5057 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
5058 		else
5059 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
5060 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5061 		ifr->ifr_hwaddr.sa_family = dev->type;
5062 		return 0;
5063 
5064 	case SIOCGIFSLAVE:
5065 		err = -EINVAL;
5066 		break;
5067 
5068 	case SIOCGIFMAP:
5069 		ifr->ifr_map.mem_start = dev->mem_start;
5070 		ifr->ifr_map.mem_end   = dev->mem_end;
5071 		ifr->ifr_map.base_addr = dev->base_addr;
5072 		ifr->ifr_map.irq       = dev->irq;
5073 		ifr->ifr_map.dma       = dev->dma;
5074 		ifr->ifr_map.port      = dev->if_port;
5075 		return 0;
5076 
5077 	case SIOCGIFINDEX:
5078 		ifr->ifr_ifindex = dev->ifindex;
5079 		return 0;
5080 
5081 	case SIOCGIFTXQLEN:
5082 		ifr->ifr_qlen = dev->tx_queue_len;
5083 		return 0;
5084 
5085 	default:
5086 		/* dev_ioctl() should ensure this case
5087 		 * is never reached
5088 		 */
5089 		WARN_ON(1);
5090 		err = -ENOTTY;
5091 		break;
5092 
5093 	}
5094 	return err;
5095 }
5096 
5097 /*
5098  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
5099  */
5100 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5101 {
5102 	int err;
5103 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5104 	const struct net_device_ops *ops;
5105 
5106 	if (!dev)
5107 		return -ENODEV;
5108 
5109 	ops = dev->netdev_ops;
5110 
5111 	switch (cmd) {
5112 	case SIOCSIFFLAGS:	/* Set interface flags */
5113 		return dev_change_flags(dev, ifr->ifr_flags);
5114 
5115 	case SIOCSIFMETRIC:	/* Set the metric on the interface
5116 				   (currently unused) */
5117 		return -EOPNOTSUPP;
5118 
5119 	case SIOCSIFMTU:	/* Set the MTU of a device */
5120 		return dev_set_mtu(dev, ifr->ifr_mtu);
5121 
5122 	case SIOCSIFHWADDR:
5123 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5124 
5125 	case SIOCSIFHWBROADCAST:
5126 		if (ifr->ifr_hwaddr.sa_family != dev->type)
5127 			return -EINVAL;
5128 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5129 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5130 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5131 		return 0;
5132 
5133 	case SIOCSIFMAP:
5134 		if (ops->ndo_set_config) {
5135 			if (!netif_device_present(dev))
5136 				return -ENODEV;
5137 			return ops->ndo_set_config(dev, &ifr->ifr_map);
5138 		}
5139 		return -EOPNOTSUPP;
5140 
5141 	case SIOCADDMULTI:
5142 		if (!ops->ndo_set_rx_mode ||
5143 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5144 			return -EINVAL;
5145 		if (!netif_device_present(dev))
5146 			return -ENODEV;
5147 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5148 
5149 	case SIOCDELMULTI:
5150 		if (!ops->ndo_set_rx_mode ||
5151 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5152 			return -EINVAL;
5153 		if (!netif_device_present(dev))
5154 			return -ENODEV;
5155 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5156 
5157 	case SIOCSIFTXQLEN:
5158 		if (ifr->ifr_qlen < 0)
5159 			return -EINVAL;
5160 		dev->tx_queue_len = ifr->ifr_qlen;
5161 		return 0;
5162 
5163 	case SIOCSIFNAME:
5164 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5165 		return dev_change_name(dev, ifr->ifr_newname);
5166 
5167 	case SIOCSHWTSTAMP:
5168 		err = net_hwtstamp_validate(ifr);
5169 		if (err)
5170 			return err;
5171 		/* fall through */
5172 
5173 	/*
5174 	 *	Unknown or private ioctl
5175 	 */
5176 	default:
5177 		if ((cmd >= SIOCDEVPRIVATE &&
5178 		    cmd <= SIOCDEVPRIVATE + 15) ||
5179 		    cmd == SIOCBONDENSLAVE ||
5180 		    cmd == SIOCBONDRELEASE ||
5181 		    cmd == SIOCBONDSETHWADDR ||
5182 		    cmd == SIOCBONDSLAVEINFOQUERY ||
5183 		    cmd == SIOCBONDINFOQUERY ||
5184 		    cmd == SIOCBONDCHANGEACTIVE ||
5185 		    cmd == SIOCGMIIPHY ||
5186 		    cmd == SIOCGMIIREG ||
5187 		    cmd == SIOCSMIIREG ||
5188 		    cmd == SIOCBRADDIF ||
5189 		    cmd == SIOCBRDELIF ||
5190 		    cmd == SIOCSHWTSTAMP ||
5191 		    cmd == SIOCWANDEV) {
5192 			err = -EOPNOTSUPP;
5193 			if (ops->ndo_do_ioctl) {
5194 				if (netif_device_present(dev))
5195 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
5196 				else
5197 					err = -ENODEV;
5198 			}
5199 		} else
5200 			err = -EINVAL;
5201 
5202 	}
5203 	return err;
5204 }
5205 
5206 /*
5207  *	This function handles all "interface"-type I/O control requests. The actual
5208  *	'doing' part of this is dev_ifsioc above.
5209  */
5210 
5211 /**
5212  *	dev_ioctl	-	network device ioctl
5213  *	@net: the applicable net namespace
5214  *	@cmd: command to issue
5215  *	@arg: pointer to a struct ifreq in user space
5216  *
5217  *	Issue ioctl functions to devices. This is normally called by the
5218  *	user space syscall interfaces but can sometimes be useful for
5219  *	other purposes. The return value is the return from the syscall if
5220  *	positive or a negative errno code on error.
5221  */
5222 
5223 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5224 {
5225 	struct ifreq ifr;
5226 	int ret;
5227 	char *colon;
5228 
5229 	/* One special case: SIOCGIFCONF takes ifconf argument
5230 	   and requires shared lock, because it sleeps writing
5231 	   to user space.
5232 	 */
5233 
5234 	if (cmd == SIOCGIFCONF) {
5235 		rtnl_lock();
5236 		ret = dev_ifconf(net, (char __user *) arg);
5237 		rtnl_unlock();
5238 		return ret;
5239 	}
5240 	if (cmd == SIOCGIFNAME)
5241 		return dev_ifname(net, (struct ifreq __user *)arg);
5242 
5243 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5244 		return -EFAULT;
5245 
5246 	ifr.ifr_name[IFNAMSIZ-1] = 0;
5247 
5248 	colon = strchr(ifr.ifr_name, ':');
5249 	if (colon)
5250 		*colon = 0;
5251 
5252 	/*
5253 	 *	See which interface the caller is talking about.
5254 	 */
5255 
5256 	switch (cmd) {
5257 	/*
5258 	 *	These ioctl calls:
5259 	 *	- can be done by all.
5260 	 *	- atomic and do not require locking.
5261 	 *	- return a value
5262 	 */
5263 	case SIOCGIFFLAGS:
5264 	case SIOCGIFMETRIC:
5265 	case SIOCGIFMTU:
5266 	case SIOCGIFHWADDR:
5267 	case SIOCGIFSLAVE:
5268 	case SIOCGIFMAP:
5269 	case SIOCGIFINDEX:
5270 	case SIOCGIFTXQLEN:
5271 		dev_load(net, ifr.ifr_name);
5272 		rcu_read_lock();
5273 		ret = dev_ifsioc_locked(net, &ifr, cmd);
5274 		rcu_read_unlock();
5275 		if (!ret) {
5276 			if (colon)
5277 				*colon = ':';
5278 			if (copy_to_user(arg, &ifr,
5279 					 sizeof(struct ifreq)))
5280 				ret = -EFAULT;
5281 		}
5282 		return ret;
5283 
5284 	case SIOCETHTOOL:
5285 		dev_load(net, ifr.ifr_name);
5286 		rtnl_lock();
5287 		ret = dev_ethtool(net, &ifr);
5288 		rtnl_unlock();
5289 		if (!ret) {
5290 			if (colon)
5291 				*colon = ':';
5292 			if (copy_to_user(arg, &ifr,
5293 					 sizeof(struct ifreq)))
5294 				ret = -EFAULT;
5295 		}
5296 		return ret;
5297 
5298 	/*
5299 	 *	These ioctl calls:
5300 	 *	- require superuser power.
5301 	 *	- require strict serialization.
5302 	 *	- return a value
5303 	 */
5304 	case SIOCGMIIPHY:
5305 	case SIOCGMIIREG:
5306 	case SIOCSIFNAME:
5307 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5308 			return -EPERM;
5309 		dev_load(net, ifr.ifr_name);
5310 		rtnl_lock();
5311 		ret = dev_ifsioc(net, &ifr, cmd);
5312 		rtnl_unlock();
5313 		if (!ret) {
5314 			if (colon)
5315 				*colon = ':';
5316 			if (copy_to_user(arg, &ifr,
5317 					 sizeof(struct ifreq)))
5318 				ret = -EFAULT;
5319 		}
5320 		return ret;
5321 
5322 	/*
5323 	 *	These ioctl calls:
5324 	 *	- require superuser power.
5325 	 *	- require strict serialization.
5326 	 *	- do not return a value
5327 	 */
5328 	case SIOCSIFMAP:
5329 	case SIOCSIFTXQLEN:
5330 		if (!capable(CAP_NET_ADMIN))
5331 			return -EPERM;
5332 		/* fall through */
5333 	/*
5334 	 *	These ioctl calls:
5335 	 *	- require local superuser power.
5336 	 *	- require strict serialization.
5337 	 *	- do not return a value
5338 	 */
5339 	case SIOCSIFFLAGS:
5340 	case SIOCSIFMETRIC:
5341 	case SIOCSIFMTU:
5342 	case SIOCSIFHWADDR:
5343 	case SIOCSIFSLAVE:
5344 	case SIOCADDMULTI:
5345 	case SIOCDELMULTI:
5346 	case SIOCSIFHWBROADCAST:
5347 	case SIOCSMIIREG:
5348 	case SIOCBONDENSLAVE:
5349 	case SIOCBONDRELEASE:
5350 	case SIOCBONDSETHWADDR:
5351 	case SIOCBONDCHANGEACTIVE:
5352 	case SIOCBRADDIF:
5353 	case SIOCBRDELIF:
5354 	case SIOCSHWTSTAMP:
5355 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
5356 			return -EPERM;
5357 		/* fall through */
5358 	case SIOCBONDSLAVEINFOQUERY:
5359 	case SIOCBONDINFOQUERY:
5360 		dev_load(net, ifr.ifr_name);
5361 		rtnl_lock();
5362 		ret = dev_ifsioc(net, &ifr, cmd);
5363 		rtnl_unlock();
5364 		return ret;
5365 
5366 	case SIOCGIFMEM:
5367 		/* Get the per device memory space. We can add this but
5368 		 * currently do not support it */
5369 	case SIOCSIFMEM:
5370 		/* Set the per device memory buffer space.
5371 		 * Not applicable in our case */
5372 	case SIOCSIFLINK:
5373 		return -ENOTTY;
5374 
5375 	/*
5376 	 *	Unknown or private ioctl.
5377 	 */
5378 	default:
5379 		if (cmd == SIOCWANDEV ||
5380 		    (cmd >= SIOCDEVPRIVATE &&
5381 		     cmd <= SIOCDEVPRIVATE + 15)) {
5382 			dev_load(net, ifr.ifr_name);
5383 			rtnl_lock();
5384 			ret = dev_ifsioc(net, &ifr, cmd);
5385 			rtnl_unlock();
5386 			if (!ret && copy_to_user(arg, &ifr,
5387 						 sizeof(struct ifreq)))
5388 				ret = -EFAULT;
5389 			return ret;
5390 		}
5391 		/* Take care of Wireless Extensions */
5392 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5393 			return wext_handle_ioctl(net, &ifr, cmd, arg);
5394 		return -ENOTTY;
5395 	}
5396 }
5397 
5398 
5399 /**
5400  *	dev_new_index	-	allocate an ifindex
5401  *	@net: the applicable net namespace
5402  *
5403  *	Returns a suitable unique value for a new device interface
5404  *	number.  The caller must hold the rtnl semaphore or the
5405  *	dev_base_lock to be sure it remains unique.
5406  */
5407 static int dev_new_index(struct net *net)
5408 {
5409 	int ifindex = net->ifindex;
5410 	for (;;) {
5411 		if (++ifindex <= 0)
5412 			ifindex = 1;
5413 		if (!__dev_get_by_index(net, ifindex))
5414 			return net->ifindex = ifindex;
5415 	}
5416 }
5417 
5418 /* Delayed registration/unregisteration */
5419 static LIST_HEAD(net_todo_list);
5420 
5421 static void net_set_todo(struct net_device *dev)
5422 {
5423 	list_add_tail(&dev->todo_list, &net_todo_list);
5424 }
5425 
5426 static void rollback_registered_many(struct list_head *head)
5427 {
5428 	struct net_device *dev, *tmp;
5429 
5430 	BUG_ON(dev_boot_phase);
5431 	ASSERT_RTNL();
5432 
5433 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5434 		/* Some devices call without registering
5435 		 * for initialization unwind. Remove those
5436 		 * devices and proceed with the remaining.
5437 		 */
5438 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5439 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5440 				 dev->name, dev);
5441 
5442 			WARN_ON(1);
5443 			list_del(&dev->unreg_list);
5444 			continue;
5445 		}
5446 		dev->dismantle = true;
5447 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5448 	}
5449 
5450 	/* If device is running, close it first. */
5451 	dev_close_many(head);
5452 
5453 	list_for_each_entry(dev, head, unreg_list) {
5454 		/* And unlink it from device chain. */
5455 		unlist_netdevice(dev);
5456 
5457 		dev->reg_state = NETREG_UNREGISTERING;
5458 	}
5459 
5460 	synchronize_net();
5461 
5462 	list_for_each_entry(dev, head, unreg_list) {
5463 		/* Shutdown queueing discipline. */
5464 		dev_shutdown(dev);
5465 
5466 
5467 		/* Notify protocols, that we are about to destroy
5468 		   this device. They should clean all the things.
5469 		*/
5470 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5471 
5472 		if (!dev->rtnl_link_ops ||
5473 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5474 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5475 
5476 		/*
5477 		 *	Flush the unicast and multicast chains
5478 		 */
5479 		dev_uc_flush(dev);
5480 		dev_mc_flush(dev);
5481 
5482 		if (dev->netdev_ops->ndo_uninit)
5483 			dev->netdev_ops->ndo_uninit(dev);
5484 
5485 		/* Notifier chain MUST detach us from master device. */
5486 		WARN_ON(dev->master);
5487 
5488 		/* Remove entries from kobject tree */
5489 		netdev_unregister_kobject(dev);
5490 	}
5491 
5492 	synchronize_net();
5493 
5494 	list_for_each_entry(dev, head, unreg_list)
5495 		dev_put(dev);
5496 }
5497 
5498 static void rollback_registered(struct net_device *dev)
5499 {
5500 	LIST_HEAD(single);
5501 
5502 	list_add(&dev->unreg_list, &single);
5503 	rollback_registered_many(&single);
5504 	list_del(&single);
5505 }
5506 
5507 static netdev_features_t netdev_fix_features(struct net_device *dev,
5508 	netdev_features_t features)
5509 {
5510 	/* Fix illegal checksum combinations */
5511 	if ((features & NETIF_F_HW_CSUM) &&
5512 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5513 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5514 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5515 	}
5516 
5517 	/* Fix illegal SG+CSUM combinations. */
5518 	if ((features & NETIF_F_SG) &&
5519 	    !(features & NETIF_F_ALL_CSUM)) {
5520 		netdev_dbg(dev,
5521 			"Dropping NETIF_F_SG since no checksum feature.\n");
5522 		features &= ~NETIF_F_SG;
5523 	}
5524 
5525 	/* TSO requires that SG is present as well. */
5526 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5527 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5528 		features &= ~NETIF_F_ALL_TSO;
5529 	}
5530 
5531 	/* TSO ECN requires that TSO is present as well. */
5532 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5533 		features &= ~NETIF_F_TSO_ECN;
5534 
5535 	/* Software GSO depends on SG. */
5536 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5537 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5538 		features &= ~NETIF_F_GSO;
5539 	}
5540 
5541 	/* UFO needs SG and checksumming */
5542 	if (features & NETIF_F_UFO) {
5543 		/* maybe split UFO into V4 and V6? */
5544 		if (!((features & NETIF_F_GEN_CSUM) ||
5545 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5546 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5547 			netdev_dbg(dev,
5548 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5549 			features &= ~NETIF_F_UFO;
5550 		}
5551 
5552 		if (!(features & NETIF_F_SG)) {
5553 			netdev_dbg(dev,
5554 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5555 			features &= ~NETIF_F_UFO;
5556 		}
5557 	}
5558 
5559 	return features;
5560 }
5561 
5562 int __netdev_update_features(struct net_device *dev)
5563 {
5564 	netdev_features_t features;
5565 	int err = 0;
5566 
5567 	ASSERT_RTNL();
5568 
5569 	features = netdev_get_wanted_features(dev);
5570 
5571 	if (dev->netdev_ops->ndo_fix_features)
5572 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5573 
5574 	/* driver might be less strict about feature dependencies */
5575 	features = netdev_fix_features(dev, features);
5576 
5577 	if (dev->features == features)
5578 		return 0;
5579 
5580 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5581 		&dev->features, &features);
5582 
5583 	if (dev->netdev_ops->ndo_set_features)
5584 		err = dev->netdev_ops->ndo_set_features(dev, features);
5585 
5586 	if (unlikely(err < 0)) {
5587 		netdev_err(dev,
5588 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5589 			err, &features, &dev->features);
5590 		return -1;
5591 	}
5592 
5593 	if (!err)
5594 		dev->features = features;
5595 
5596 	return 1;
5597 }
5598 
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate the dev->features set and send notifications when
 *	__netdev_update_features() returns a non-zero value.  Should be
 *	called after driver or hardware dependent conditions that
 *	influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5613 
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate the dev->features set and send notifications
 *	unconditionally, even if nothing has changed.  Use this instead
 *	of netdev_update_features() when dev->vlan_features might have
 *	changed as well, so that the changes get propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is irrelevant here: we always notify */
	(void)__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5630 
5631 /**
5632  *	netif_stacked_transfer_operstate -	transfer operstate
5633  *	@rootdev: the root or lower level device to transfer state from
5634  *	@dev: the device to transfer operstate to
5635  *
5636  *	Transfer operational state from root to device. This is normally
5637  *	called when a stacking relationship exists between the root
5638  *	device and the device(a leaf device).
5639  */
5640 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5641 					struct net_device *dev)
5642 {
5643 	if (rootdev->operstate == IF_OPER_DORMANT)
5644 		netif_dormant_on(dev);
5645 	else
5646 		netif_dormant_off(dev);
5647 
5648 	if (netif_carrier_ok(rootdev)) {
5649 		if (!netif_carrier_ok(dev))
5650 			netif_carrier_on(dev);
5651 	} else {
5652 		if (netif_carrier_ok(dev))
5653 			netif_carrier_off(dev);
5654 	}
5655 }
5656 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5657 
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	const unsigned int count = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(count < 1);

	queues = kcalloc(count, sizeof(*queues), GFP_KERNEL);
	if (queues == NULL) {
		pr_err("netdev: Unable to allocate %u rx queues\n", count);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (q = queues; q != queues + count; q++)
		q->dev = dev;

	return 0;
}
#endif
5678 
5679 static void netdev_init_one_queue(struct net_device *dev,
5680 				  struct netdev_queue *queue, void *_unused)
5681 {
5682 	/* Initialize queue lock */
5683 	spin_lock_init(&queue->_xmit_lock);
5684 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5685 	queue->xmit_lock_owner = -1;
5686 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5687 	queue->dev = dev;
5688 #ifdef CONFIG_BQL
5689 	dql_init(&queue->dql, HZ);
5690 #endif
5691 }
5692 
5693 static int netif_alloc_netdev_queues(struct net_device *dev)
5694 {
5695 	unsigned int count = dev->num_tx_queues;
5696 	struct netdev_queue *tx;
5697 
5698 	BUG_ON(count < 1);
5699 
5700 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5701 	if (!tx) {
5702 		pr_err("netdev: Unable to allocate %u tx queues\n", count);
5703 		return -ENOMEM;
5704 	}
5705 	dev->_tx = tx;
5706 
5707 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5708 	spin_lock_init(&dev->tx_global_lock);
5709 
5710 	return 0;
5711 }
5712 
5713 /**
5714  *	register_netdevice	- register a network device
5715  *	@dev: device to register
5716  *
5717  *	Take a completed network device structure and add it to the kernel
5718  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5719  *	chain. 0 is returned on success. A negative errno code is returned
5720  *	on a failure to set up the device, or if the name is a duplicate.
5721  *
5722  *	Callers must hold the rtnl semaphore. You may want
5723  *	register_netdev() instead of this.
5724  *
5725  *	BUGS:
5726  *	The locking appears insufficient to guarantee two parallel registers
5727  *	will not get the same name.
5728  */
5729 
5730 int register_netdevice(struct net_device *dev)
5731 {
5732 	int ret;
5733 	struct net *net = dev_net(dev);
5734 
5735 	BUG_ON(dev_boot_phase);
5736 	ASSERT_RTNL();
5737 
5738 	might_sleep();
5739 
5740 	/* When net_device's are persistent, this will be fatal. */
5741 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5742 	BUG_ON(!net);
5743 
5744 	spin_lock_init(&dev->addr_list_lock);
5745 	netdev_set_addr_lockdep_class(dev);
5746 
5747 	dev->iflink = -1;
5748 
5749 	ret = dev_get_valid_name(net, dev, dev->name);
5750 	if (ret < 0)
5751 		goto out;
5752 
5753 	/* Init, if this function is available */
5754 	if (dev->netdev_ops->ndo_init) {
5755 		ret = dev->netdev_ops->ndo_init(dev);
5756 		if (ret) {
5757 			if (ret > 0)
5758 				ret = -EIO;
5759 			goto out;
5760 		}
5761 	}
5762 
5763 	ret = -EBUSY;
5764 	if (!dev->ifindex)
5765 		dev->ifindex = dev_new_index(net);
5766 	else if (__dev_get_by_index(net, dev->ifindex))
5767 		goto err_uninit;
5768 
5769 	if (dev->iflink == -1)
5770 		dev->iflink = dev->ifindex;
5771 
5772 	/* Transfer changeable features to wanted_features and enable
5773 	 * software offloads (GSO and GRO).
5774 	 */
5775 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5776 	dev->features |= NETIF_F_SOFT_FEATURES;
5777 	dev->wanted_features = dev->features & dev->hw_features;
5778 
5779 	/* Turn on no cache copy if HW is doing checksum */
5780 	if (!(dev->flags & IFF_LOOPBACK)) {
5781 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5782 		if (dev->features & NETIF_F_ALL_CSUM) {
5783 			dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5784 			dev->features |= NETIF_F_NOCACHE_COPY;
5785 		}
5786 	}
5787 
5788 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5789 	 */
5790 	dev->vlan_features |= NETIF_F_HIGHDMA;
5791 
5792 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5793 	ret = notifier_to_errno(ret);
5794 	if (ret)
5795 		goto err_uninit;
5796 
5797 	ret = netdev_register_kobject(dev);
5798 	if (ret)
5799 		goto err_uninit;
5800 	dev->reg_state = NETREG_REGISTERED;
5801 
5802 	__netdev_update_features(dev);
5803 
5804 	/*
5805 	 *	Default initial state at registry is that the
5806 	 *	device is present.
5807 	 */
5808 
5809 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5810 
5811 	linkwatch_init_dev(dev);
5812 
5813 	dev_init_scheduler(dev);
5814 	dev_hold(dev);
5815 	list_netdevice(dev);
5816 	add_device_randomness(dev->dev_addr, dev->addr_len);
5817 
5818 	/* Notify protocols, that a new device appeared. */
5819 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5820 	ret = notifier_to_errno(ret);
5821 	if (ret) {
5822 		rollback_registered(dev);
5823 		dev->reg_state = NETREG_UNREGISTERED;
5824 	}
5825 	/*
5826 	 *	Prevent userspace races by waiting until the network
5827 	 *	device is fully setup before sending notifications.
5828 	 */
5829 	if (!dev->rtnl_link_ops ||
5830 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5831 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5832 
5833 out:
5834 	return ret;
5835 
5836 err_uninit:
5837 	if (dev->netdev_ops->ndo_uninit)
5838 		dev->netdev_ops->ndo_uninit(dev);
5839 	goto out;
5840 }
5841 EXPORT_SYMBOL(register_netdevice);
5842 
5843 /**
5844  *	init_dummy_netdev	- init a dummy network device for NAPI
5845  *	@dev: device to init
5846  *
5847  *	This takes a network device structure and initialize the minimum
5848  *	amount of fields so it can be used to schedule NAPI polls without
5849  *	registering a full blown interface. This is to be used by drivers
5850  *	that need to tie several hardware interfaces to a single NAPI
5851  *	poll scheduler due to HW limitations.
5852  */
5853 int init_dummy_netdev(struct net_device *dev)
5854 {
5855 	/* Clear everything. Note we don't initialize spinlocks
5856 	 * are they aren't supposed to be taken by any of the
5857 	 * NAPI code and this dummy netdev is supposed to be
5858 	 * only ever used for NAPI polls
5859 	 */
5860 	memset(dev, 0, sizeof(struct net_device));
5861 
5862 	/* make sure we BUG if trying to hit standard
5863 	 * register/unregister code path
5864 	 */
5865 	dev->reg_state = NETREG_DUMMY;
5866 
5867 	/* NAPI wants this */
5868 	INIT_LIST_HEAD(&dev->napi_list);
5869 
5870 	/* a dummy interface is started by default */
5871 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5872 	set_bit(__LINK_STATE_START, &dev->state);
5873 
5874 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5875 	 * because users of this 'device' dont need to change
5876 	 * its refcount.
5877 	 */
5878 
5879 	return 0;
5880 }
5881 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5882 
5883 
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is register_netdevice() wrapped in the rtnl semaphore; the
 *	device name is expanded if a format string was passed to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
5907 
5908 int netdev_refcnt_read(const struct net_device *dev)
5909 {
5910 	int i, refcnt = 0;
5911 
5912 	for_each_possible_cpu(i)
5913 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5914 	return refcnt;
5915 }
5916 EXPORT_SYMBOL(netdev_refcnt_read);
5917 
5918 /**
5919  * netdev_wait_allrefs - wait until all references are gone.
5920  * @dev: target net_device
5921  *
5922  * This is called when unregistering network devices.
5923  *
5924  * Any protocol or device that holds a reference should register
5925  * for netdevice notification, and cleanup and put back the
5926  * reference if they receive an UNREGISTER event.
5927  * We can get stuck here if buggy protocols don't correctly
5928  * call dev_put.
5929  */
5930 static void netdev_wait_allrefs(struct net_device *dev)
5931 {
5932 	unsigned long rebroadcast_time, warning_time;
5933 	int refcnt;
5934 
5935 	linkwatch_forget_dev(dev);
5936 
5937 	rebroadcast_time = warning_time = jiffies;
5938 	refcnt = netdev_refcnt_read(dev);
5939 
5940 	while (refcnt != 0) {
5941 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5942 			rtnl_lock();
5943 
5944 			/* Rebroadcast unregister notification */
5945 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5946 
5947 			__rtnl_unlock();
5948 			rcu_barrier();
5949 			rtnl_lock();
5950 
5951 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5952 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5953 				     &dev->state)) {
5954 				/* We must not have linkwatch events
5955 				 * pending on unregister. If this
5956 				 * happens, we simply run the queue
5957 				 * unscheduled, resulting in a noop
5958 				 * for this device.
5959 				 */
5960 				linkwatch_run_queue();
5961 			}
5962 
5963 			__rtnl_unlock();
5964 
5965 			rebroadcast_time = jiffies;
5966 		}
5967 
5968 		msleep(250);
5969 
5970 		refcnt = netdev_refcnt_read(dev);
5971 
5972 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5973 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5974 				 dev->name, refcnt);
5975 			warning_time = jiffies;
5976 		}
5977 	}
5978 }
5979 
5980 /* The sequence is:
5981  *
5982  *	rtnl_lock();
5983  *	...
5984  *	register_netdevice(x1);
5985  *	register_netdevice(x2);
5986  *	...
5987  *	unregister_netdevice(y1);
5988  *	unregister_netdevice(y2);
5989  *      ...
5990  *	rtnl_unlock();
5991  *	free_netdev(y1);
5992  *	free_netdev(y2);
5993  *
5994  * We are invoked by rtnl_unlock().
5995  * This allows us to deal with problems:
5996  * 1) We can delete sysfs objects which invoke hotplug
5997  *    without deadlocking with linkwatch via keventd.
5998  * 2) Since we run with the RTNL semaphore not held, we can sleep
5999  *    safely in order to wait for the netdev refcnt to drop to zero.
6000  *
6001  * We must not return until all unregister events added during
6002  * the interval the lock was held have been completed.
6003  */
6004 void netdev_run_todo(void)
6005 {
6006 	struct list_head list;
6007 
6008 	/* Snapshot list, allow later requests */
6009 	list_replace_init(&net_todo_list, &list);
6010 
6011 	__rtnl_unlock();
6012 
6013 
6014 	/* Wait for rcu callbacks to finish before next phase */
6015 	if (!list_empty(&list))
6016 		rcu_barrier();
6017 
6018 	while (!list_empty(&list)) {
6019 		struct net_device *dev
6020 			= list_first_entry(&list, struct net_device, todo_list);
6021 		list_del(&dev->todo_list);
6022 
6023 		rtnl_lock();
6024 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6025 		__rtnl_unlock();
6026 
6027 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6028 			pr_err("network todo '%s' but state %d\n",
6029 			       dev->name, dev->reg_state);
6030 			dump_stack();
6031 			continue;
6032 		}
6033 
6034 		dev->reg_state = NETREG_UNREGISTERED;
6035 
6036 		on_each_cpu(flush_backlog, dev, 1);
6037 
6038 		netdev_wait_allrefs(dev);
6039 
6040 		/* paranoia */
6041 		BUG_ON(netdev_refcnt_read(dev));
6042 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6043 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6044 		WARN_ON(dev->dn_ptr);
6045 
6046 		if (dev->destructor)
6047 			dev->destructor(dev);
6048 
6049 		/* Free network device */
6050 		kobject_put(&dev->dev.kobj);
6051 	}
6052 }
6053 
6054 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6055  * fields in the same order, with only the type differing.
6056  */
6057 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6058 			     const struct net_device_stats *netdev_stats)
6059 {
6060 #if BITS_PER_LONG == 64
6061 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6062 	memcpy(stats64, netdev_stats, sizeof(*stats64));
6063 #else
6064 	size_t i, n = sizeof(*stats64) / sizeof(u64);
6065 	const unsigned long *src = (const unsigned long *)netdev_stats;
6066 	u64 *dst = (u64 *)stats64;
6067 
6068 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6069 		     sizeof(*stats64) / sizeof(u64));
6070 	for (i = 0; i < n; i++)
6071 		dst[i] = src[i];
6072 #endif
6073 }
6074 EXPORT_SYMBOL(netdev_stats_to_stats64);
6075 
6076 /**
6077  *	dev_get_stats	- get network device statistics
6078  *	@dev: device to get statistics from
6079  *	@storage: place to store stats
6080  *
6081  *	Get network statistics from device. Return @storage.
6082  *	The device driver may provide its own method by setting
6083  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6084  *	otherwise the internal statistics structure is used.
6085  */
6086 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6087 					struct rtnl_link_stats64 *storage)
6088 {
6089 	const struct net_device_ops *ops = dev->netdev_ops;
6090 
6091 	if (ops->ndo_get_stats64) {
6092 		memset(storage, 0, sizeof(*storage));
6093 		ops->ndo_get_stats64(dev, storage);
6094 	} else if (ops->ndo_get_stats) {
6095 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6096 	} else {
6097 		netdev_stats_to_stats64(storage, &dev->stats);
6098 	}
6099 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6100 	return storage;
6101 }
6102 EXPORT_SYMBOL(dev_get_stats);
6103 
/*
 * Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, attached to the noop qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			/* Publish only after the queue is initialized */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
6121 
6122 static const struct ethtool_ops default_ethtool_ops;
6123 
6124 /**
6125  *	alloc_netdev_mqs - allocate network device
6126  *	@sizeof_priv:	size of private data to allocate space for
6127  *	@name:		device name format string
6128  *	@setup:		callback to initialize device
6129  *	@txqs:		the number of TX subqueues to allocate
6130  *	@rxqs:		the number of RX subqueues to allocate
6131  *
6132  *	Allocates a struct net_device with private data area for driver use
6133  *	and performs basic initialization.  Also allocates subquue structs
6134  *	for each queue on the device.
6135  */
6136 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6137 		void (*setup)(struct net_device *),
6138 		unsigned int txqs, unsigned int rxqs)
6139 {
6140 	struct net_device *dev;
6141 	size_t alloc_size;
6142 	struct net_device *p;
6143 
6144 	BUG_ON(strlen(name) >= sizeof(dev->name));
6145 
6146 	if (txqs < 1) {
6147 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6148 		return NULL;
6149 	}
6150 
6151 #ifdef CONFIG_RPS
6152 	if (rxqs < 1) {
6153 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6154 		return NULL;
6155 	}
6156 #endif
6157 
6158 	alloc_size = sizeof(struct net_device);
6159 	if (sizeof_priv) {
6160 		/* ensure 32-byte alignment of private area */
6161 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6162 		alloc_size += sizeof_priv;
6163 	}
6164 	/* ensure 32-byte alignment of whole construct */
6165 	alloc_size += NETDEV_ALIGN - 1;
6166 
6167 	p = kzalloc(alloc_size, GFP_KERNEL);
6168 	if (!p) {
6169 		pr_err("alloc_netdev: Unable to allocate device\n");
6170 		return NULL;
6171 	}
6172 
6173 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6174 	dev->padded = (char *)dev - (char *)p;
6175 
6176 	dev->pcpu_refcnt = alloc_percpu(int);
6177 	if (!dev->pcpu_refcnt)
6178 		goto free_p;
6179 
6180 	if (dev_addr_init(dev))
6181 		goto free_pcpu;
6182 
6183 	dev_mc_init(dev);
6184 	dev_uc_init(dev);
6185 
6186 	dev_net_set(dev, &init_net);
6187 
6188 	dev->gso_max_size = GSO_MAX_SIZE;
6189 	dev->gso_max_segs = GSO_MAX_SEGS;
6190 
6191 	INIT_LIST_HEAD(&dev->napi_list);
6192 	INIT_LIST_HEAD(&dev->unreg_list);
6193 	INIT_LIST_HEAD(&dev->link_watch_list);
6194 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
6195 	setup(dev);
6196 
6197 	dev->num_tx_queues = txqs;
6198 	dev->real_num_tx_queues = txqs;
6199 	if (netif_alloc_netdev_queues(dev))
6200 		goto free_all;
6201 
6202 #ifdef CONFIG_RPS
6203 	dev->num_rx_queues = rxqs;
6204 	dev->real_num_rx_queues = rxqs;
6205 	if (netif_alloc_rx_queues(dev))
6206 		goto free_all;
6207 #endif
6208 
6209 	strcpy(dev->name, name);
6210 	dev->group = INIT_NETDEV_GROUP;
6211 	if (!dev->ethtool_ops)
6212 		dev->ethtool_ops = &default_ethtool_ops;
6213 	return dev;
6214 
6215 free_all:
6216 	free_netdev(dev);
6217 	return NULL;
6218 
6219 free_pcpu:
6220 	free_percpu(dev->pcpu_refcnt);
6221 	kfree(dev->_tx);
6222 #ifdef CONFIG_RPS
6223 	kfree(dev->_rx);
6224 #endif
6225 
6226 free_p:
6227 	kfree(p);
6228 	return NULL;
6229 }
6230 EXPORT_SYMBOL(alloc_netdev_mqs);
6231 
6232 /**
6233  *	free_netdev - free network device
6234  *	@dev: device
6235  *
6236  *	This function does the last stage of destroying an allocated device
6237  * 	interface. The reference to the device object is released.
6238  *	If this is the last reference then it will be freed.
6239  */
6240 void free_netdev(struct net_device *dev)
6241 {
6242 	struct napi_struct *p, *n;
6243 
6244 	release_net(dev_net(dev));
6245 
6246 	kfree(dev->_tx);
6247 #ifdef CONFIG_RPS
6248 	kfree(dev->_rx);
6249 #endif
6250 
6251 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6252 
6253 	/* Flush device addresses */
6254 	dev_addr_flush(dev);
6255 
6256 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6257 		netif_napi_del(p);
6258 
6259 	free_percpu(dev->pcpu_refcnt);
6260 	dev->pcpu_refcnt = NULL;
6261 
6262 	/*  Compatibility with error handling in drivers */
6263 	if (dev->reg_state == NETREG_UNINITIALIZED) {
6264 		kfree((char *)dev - dev->padded);
6265 		return;
6266 	}
6267 
6268 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6269 	dev->reg_state = NETREG_RELEASED;
6270 
6271 	/* will free via device release */
6272 	put_device(&dev->dev);
6273 }
6274 EXPORT_SYMBOL(free_netdev);
6275 
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *	The expedited RCU variant is used while the RTNL is locked.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6291 
6292 /**
6293  *	unregister_netdevice_queue - remove device from the kernel
6294  *	@dev: device
6295  *	@head: list
6296  *
6297  *	This function shuts down a device interface and removes it
6298  *	from the kernel tables.
6299  *	If head not NULL, device is queued to be unregistered later.
6300  *
6301  *	Callers must hold the rtnl semaphore.  You may want
6302  *	unregister_netdev() instead of this.
6303  */
6304 
6305 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6306 {
6307 	ASSERT_RTNL();
6308 
6309 	if (head) {
6310 		list_move_tail(&dev->unreg_list, head);
6311 	} else {
6312 		rollback_registered(dev);
6313 		/* Finish processing unregister after unlock */
6314 		net_set_todo(dev);
6315 	}
6316 }
6317 EXPORT_SYMBOL(unregister_netdevice_queue);
6318 
6319 /**
6320  *	unregister_netdevice_many - unregister many devices
6321  *	@head: list of devices
6322  */
6323 void unregister_netdevice_many(struct list_head *head)
6324 {
6325 	struct net_device *dev;
6326 
6327 	if (!list_empty(head)) {
6328 		rollback_registered_many(head);
6329 		list_for_each_entry(dev, head, unreg_list)
6330 			net_set_todo(dev);
6331 	}
6332 }
6333 EXPORT_SYMBOL(unregister_netdevice_many);
6334 
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	It is unregister_netdevice() wrapped in the rtnl semaphore.
 *	In general you want to use this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6353 
6354 /**
6355  *	dev_change_net_namespace - move device to different nethost namespace
6356  *	@dev: device
6357  *	@net: network namespace
6358  *	@pat: If not NULL name pattern to try if the current device name
6359  *	      is already taken in the destination network namespace.
6360  *
6361  *	This function shuts down a device interface and moves it
6362  *	to a new network namespace. On success 0 is returned, on
6363  *	a failure a netagive errno code is returned.
6364  *
6365  *	Callers must hold the rtnl semaphore.
6366  */
6367 
6368 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6369 {
6370 	int err;
6371 
6372 	ASSERT_RTNL();
6373 
6374 	/* Don't allow namespace local devices to be moved. */
6375 	err = -EINVAL;
6376 	if (dev->features & NETIF_F_NETNS_LOCAL)
6377 		goto out;
6378 
6379 	/* Ensure the device has been registrered */
6380 	if (dev->reg_state != NETREG_REGISTERED)
6381 		goto out;
6382 
6383 	/* Get out if there is nothing todo */
6384 	err = 0;
6385 	if (net_eq(dev_net(dev), net))
6386 		goto out;
6387 
6388 	/* Pick the destination device name, and ensure
6389 	 * we can use it in the destination network namespace.
6390 	 */
6391 	err = -EEXIST;
6392 	if (__dev_get_by_name(net, dev->name)) {
6393 		/* We get here if we can't use the current device name */
6394 		if (!pat)
6395 			goto out;
6396 		if (dev_get_valid_name(net, dev, pat) < 0)
6397 			goto out;
6398 	}
6399 
6400 	/*
6401 	 * And now a mini version of register_netdevice unregister_netdevice.
6402 	 */
6403 
6404 	/* If device is running close it first. */
6405 	dev_close(dev);
6406 
6407 	/* And unlink it from device chain */
6408 	err = -ENODEV;
6409 	unlist_netdevice(dev);
6410 
6411 	synchronize_net();
6412 
6413 	/* Shutdown queueing discipline. */
6414 	dev_shutdown(dev);
6415 
6416 	/* Notify protocols, that we are about to destroy
6417 	   this device. They should clean all the things.
6418 
6419 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6420 	   This is wanted because this way 8021q and macvlan know
6421 	   the device is just moving and can keep their slaves up.
6422 	*/
6423 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6424 	rcu_barrier();
6425 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6426 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6427 
6428 	/*
6429 	 *	Flush the unicast and multicast chains
6430 	 */
6431 	dev_uc_flush(dev);
6432 	dev_mc_flush(dev);
6433 
6434 	/* Send a netdev-removed uevent to the old namespace */
6435 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6436 
6437 	/* Actually switch the network namespace */
6438 	dev_net_set(dev, net);
6439 
6440 	/* If there is an ifindex conflict assign a new one */
6441 	if (__dev_get_by_index(net, dev->ifindex)) {
6442 		int iflink = (dev->iflink == dev->ifindex);
6443 		dev->ifindex = dev_new_index(net);
6444 		if (iflink)
6445 			dev->iflink = dev->ifindex;
6446 	}
6447 
6448 	/* Send a netdev-add uevent to the new namespace */
6449 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6450 
6451 	/* Fixup kobjects */
6452 	err = device_rename(&dev->dev, dev->name);
6453 	WARN_ON(err);
6454 
6455 	/* Add the device back in the hashes */
6456 	list_netdevice(dev);
6457 
6458 	/* Notify protocols, that a new device appeared. */
6459 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6460 
6461 	/*
6462 	 *	Prevent userspace races by waiting until the network
6463 	 *	device is fully setup before sending notifications.
6464 	 */
6465 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6466 
6467 	synchronize_net();
6468 	err = 0;
6469 out:
6470 	return err;
6471 }
6472 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6473 
6474 static int dev_cpu_callback(struct notifier_block *nfb,
6475 			    unsigned long action,
6476 			    void *ocpu)
6477 {
6478 	struct sk_buff **list_skb;
6479 	struct sk_buff *skb;
6480 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6481 	struct softnet_data *sd, *oldsd;
6482 
6483 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6484 		return NOTIFY_OK;
6485 
6486 	local_irq_disable();
6487 	cpu = smp_processor_id();
6488 	sd = &per_cpu(softnet_data, cpu);
6489 	oldsd = &per_cpu(softnet_data, oldcpu);
6490 
6491 	/* Find end of our completion_queue. */
6492 	list_skb = &sd->completion_queue;
6493 	while (*list_skb)
6494 		list_skb = &(*list_skb)->next;
6495 	/* Append completion queue from offline CPU. */
6496 	*list_skb = oldsd->completion_queue;
6497 	oldsd->completion_queue = NULL;
6498 
6499 	/* Append output queue from offline CPU. */
6500 	if (oldsd->output_queue) {
6501 		*sd->output_queue_tailp = oldsd->output_queue;
6502 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6503 		oldsd->output_queue = NULL;
6504 		oldsd->output_queue_tailp = &oldsd->output_queue;
6505 	}
6506 	/* Append NAPI poll list from offline CPU. */
6507 	if (!list_empty(&oldsd->poll_list)) {
6508 		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6509 		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6510 	}
6511 
6512 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6513 	local_irq_enable();
6514 
6515 	/* Process offline CPU's input_pkt_queue */
6516 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6517 		netif_rx(skb);
6518 		input_queue_head_incr(oldsd);
6519 	}
6520 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6521 		netif_rx(skb);
6522 		input_queue_head_incr(oldsd);
6523 	}
6524 
6525 	return NOTIFY_OK;
6526 }
6527 
6528 
6529 /**
6530  *	netdev_increment_features - increment feature set by one
6531  *	@all: current feature set
6532  *	@one: new feature set
6533  *	@mask: mask feature set
6534  *
6535  *	Computes a new feature set after adding a device with feature set
6536  *	@one to the master device with current feature set @all.  Will not
6537  *	enable anything that is off in @mask. Returns the new feature set.
6538  */
6539 netdev_features_t netdev_increment_features(netdev_features_t all,
6540 	netdev_features_t one, netdev_features_t mask)
6541 {
6542 	if (mask & NETIF_F_GEN_CSUM)
6543 		mask |= NETIF_F_ALL_CSUM;
6544 	mask |= NETIF_F_VLAN_CHALLENGED;
6545 
6546 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6547 	all &= one | ~NETIF_F_ALL_FOR_ALL;
6548 
6549 	/* If one device supports hw checksumming, set for all. */
6550 	if (all & NETIF_F_GEN_CSUM)
6551 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6552 
6553 	return all;
6554 }
6555 EXPORT_SYMBOL(netdev_increment_features);
6556 
6557 static struct hlist_head *netdev_create_hash(void)
6558 {
6559 	int i;
6560 	struct hlist_head *hash;
6561 
6562 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6563 	if (hash != NULL)
6564 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6565 			INIT_HLIST_HEAD(&hash[i]);
6566 
6567 	return hash;
6568 }
6569 
6570 /* Initialize per network namespace state */
6571 static int __net_init netdev_init(struct net *net)
6572 {
6573 	if (net != &init_net)
6574 		INIT_LIST_HEAD(&net->dev_base_head);
6575 
6576 	net->dev_name_head = netdev_create_hash();
6577 	if (net->dev_name_head == NULL)
6578 		goto err_name;
6579 
6580 	net->dev_index_head = netdev_create_hash();
6581 	if (net->dev_index_head == NULL)
6582 		goto err_idx;
6583 
6584 	return 0;
6585 
6586 err_idx:
6587 	kfree(net->dev_name_head);
6588 err_name:
6589 	return -ENOMEM;
6590 }
6591 
6592 /**
6593  *	netdev_drivername - network driver for the device
6594  *	@dev: network device
6595  *
6596  *	Determine network driver for device.
6597  */
6598 const char *netdev_drivername(const struct net_device *dev)
6599 {
6600 	const struct device_driver *driver;
6601 	const struct device *parent;
6602 	const char *empty = "";
6603 
6604 	parent = dev->dev.parent;
6605 	if (!parent)
6606 		return empty;
6607 
6608 	driver = parent->driver;
6609 	if (driver && driver->name)
6610 		return driver->name;
6611 	return empty;
6612 }
6613 
6614 static int __netdev_printk(const char *level, const struct net_device *dev,
6615 			   struct va_format *vaf)
6616 {
6617 	int r;
6618 
6619 	if (dev && dev->dev.parent) {
6620 		r = dev_printk_emit(level[1] - '0',
6621 				    dev->dev.parent,
6622 				    "%s %s %s: %pV",
6623 				    dev_driver_string(dev->dev.parent),
6624 				    dev_name(dev->dev.parent),
6625 				    netdev_name(dev), vaf);
6626 	} else if (dev) {
6627 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6628 	} else {
6629 		r = printk("%s(NULL net_device): %pV", level, vaf);
6630 	}
6631 
6632 	return r;
6633 }
6634 
6635 int netdev_printk(const char *level, const struct net_device *dev,
6636 		  const char *format, ...)
6637 {
6638 	struct va_format vaf;
6639 	va_list args;
6640 	int r;
6641 
6642 	va_start(args, format);
6643 
6644 	vaf.fmt = format;
6645 	vaf.va = &args;
6646 
6647 	r = __netdev_printk(level, dev, &vaf);
6648 
6649 	va_end(args);
6650 
6651 	return r;
6652 }
6653 EXPORT_SYMBOL(netdev_printk);
6654 
6655 #define define_netdev_printk_level(func, level)			\
6656 int func(const struct net_device *dev, const char *fmt, ...)	\
6657 {								\
6658 	int r;							\
6659 	struct va_format vaf;					\
6660 	va_list args;						\
6661 								\
6662 	va_start(args, fmt);					\
6663 								\
6664 	vaf.fmt = fmt;						\
6665 	vaf.va = &args;						\
6666 								\
6667 	r = __netdev_printk(level, dev, &vaf);			\
6668 								\
6669 	va_end(args);						\
6670 								\
6671 	return r;						\
6672 }								\
6673 EXPORT_SYMBOL(func);
6674 
6675 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6676 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6677 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6678 define_netdev_printk_level(netdev_err, KERN_ERR);
6679 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6680 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6681 define_netdev_printk_level(netdev_info, KERN_INFO);
6682 
6683 static void __net_exit netdev_exit(struct net *net)
6684 {
6685 	kfree(net->dev_name_head);
6686 	kfree(net->dev_index_head);
6687 }
6688 
6689 static struct pernet_operations __net_initdata netdev_net_ops = {
6690 	.init = netdev_init,
6691 	.exit = netdev_exit,
6692 };
6693 
6694 static void __net_exit default_device_exit(struct net *net)
6695 {
6696 	struct net_device *dev, *aux;
6697 	/*
6698 	 * Push all migratable network devices back to the
6699 	 * initial network namespace
6700 	 */
6701 	rtnl_lock();
6702 	for_each_netdev_safe(net, dev, aux) {
6703 		int err;
6704 		char fb_name[IFNAMSIZ];
6705 
6706 		/* Ignore unmoveable devices (i.e. loopback) */
6707 		if (dev->features & NETIF_F_NETNS_LOCAL)
6708 			continue;
6709 
6710 		/* Leave virtual devices for the generic cleanup */
6711 		if (dev->rtnl_link_ops)
6712 			continue;
6713 
6714 		/* Push remaining network devices to init_net */
6715 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6716 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6717 		if (err) {
6718 			pr_emerg("%s: failed to move %s to init_net: %d\n",
6719 				 __func__, dev->name, err);
6720 			BUG();
6721 		}
6722 	}
6723 	rtnl_unlock();
6724 }
6725 
6726 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6727 {
6728 	/* At exit all network devices most be removed from a network
6729 	 * namespace.  Do this in the reverse order of registration.
6730 	 * Do this across as many network namespaces as possible to
6731 	 * improve batching efficiency.
6732 	 */
6733 	struct net_device *dev;
6734 	struct net *net;
6735 	LIST_HEAD(dev_kill_list);
6736 
6737 	rtnl_lock();
6738 	list_for_each_entry(net, net_list, exit_list) {
6739 		for_each_netdev_reverse(net, dev) {
6740 			if (dev->rtnl_link_ops)
6741 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6742 			else
6743 				unregister_netdevice_queue(dev, &dev_kill_list);
6744 		}
6745 	}
6746 	unregister_netdevice_many(&dev_kill_list);
6747 	list_del(&dev_kill_list);
6748 	rtnl_unlock();
6749 }
6750 
6751 static struct pernet_operations __net_initdata default_device_ops = {
6752 	.exit = default_device_exit,
6753 	.exit_batch = default_device_exit_batch,
6754 };
6755 
6756 /*
6757  *	Initialize the DEV module. At boot time this walks the device list and
6758  *	unhooks any devices that fail to initialise (normally hardware not
6759  *	present) and leaves us with a valid list of present and active devices.
6760  *
6761  */
6762 
6763 /*
6764  *       This is called single threaded during boot, so no need
6765  *       to take the rtnl semaphore.
6766  */
6767 static int __init net_dev_init(void)
6768 {
6769 	int i, rc = -ENOMEM;
6770 
6771 	BUG_ON(!dev_boot_phase);
6772 
6773 	if (dev_proc_init())
6774 		goto out;
6775 
6776 	if (netdev_kobject_init())
6777 		goto out;
6778 
6779 	INIT_LIST_HEAD(&ptype_all);
6780 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6781 		INIT_LIST_HEAD(&ptype_base[i]);
6782 
6783 	INIT_LIST_HEAD(&offload_base);
6784 
6785 	if (register_pernet_subsys(&netdev_net_ops))
6786 		goto out;
6787 
6788 	/*
6789 	 *	Initialise the packet receive queues.
6790 	 */
6791 
6792 	for_each_possible_cpu(i) {
6793 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6794 
6795 		memset(sd, 0, sizeof(*sd));
6796 		skb_queue_head_init(&sd->input_pkt_queue);
6797 		skb_queue_head_init(&sd->process_queue);
6798 		sd->completion_queue = NULL;
6799 		INIT_LIST_HEAD(&sd->poll_list);
6800 		sd->output_queue = NULL;
6801 		sd->output_queue_tailp = &sd->output_queue;
6802 #ifdef CONFIG_RPS
6803 		sd->csd.func = rps_trigger_softirq;
6804 		sd->csd.info = sd;
6805 		sd->csd.flags = 0;
6806 		sd->cpu = i;
6807 #endif
6808 
6809 		sd->backlog.poll = process_backlog;
6810 		sd->backlog.weight = weight_p;
6811 		sd->backlog.gro_list = NULL;
6812 		sd->backlog.gro_count = 0;
6813 	}
6814 
6815 	dev_boot_phase = 0;
6816 
6817 	/* The loopback device is special if any other network devices
6818 	 * is present in a network namespace the loopback device must
6819 	 * be present. Since we now dynamically allocate and free the
6820 	 * loopback device ensure this invariant is maintained by
6821 	 * keeping the loopback device as the first device on the
6822 	 * list of network devices.  Ensuring the loopback devices
6823 	 * is the first device that appears and the last network device
6824 	 * that disappears.
6825 	 */
6826 	if (register_pernet_device(&loopback_net_ops))
6827 		goto out;
6828 
6829 	if (register_pernet_device(&default_device_ops))
6830 		goto out;
6831 
6832 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6833 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6834 
6835 	hotcpu_notifier(dev_cpu_callback, 0);
6836 	dst_init();
6837 	dev_mcast_init();
6838 	rc = 0;
6839 out:
6840 	return rc;
6841 }
6842 
6843 subsys_initcall(net_dev_init);
6844 
6845 static int __init initialize_hashrnd(void)
6846 {
6847 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6848 	return 0;
6849 }
6850 
6851 late_initcall_sync(initialize_hashrnd);
6852 
6853