xref: /linux/net/core/dev.c (revision 02e2af20f4f9f2aa0c84e9a30a35c02f0fbb7daa)
1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   *      NET3    Protocol independent device support routines.
4   *
5   *	Derived from the non IP parts of dev.c 1.0.19
6   *              Authors:	Ross Biro
7   *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
8   *				Mark Evans, <evansmp@uhura.aston.ac.uk>
9   *
10   *	Additional Authors:
11   *		Florian la Roche <rzsfl@rz.uni-sb.de>
12   *		Alan Cox <gw4pts@gw4pts.ampr.org>
13   *		David Hinds <dahinds@users.sourceforge.net>
14   *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
15   *		Adam Sulmicki <adam@cfar.umd.edu>
16   *              Pekka Riikonen <priikone@poesidon.pspt.fi>
17   *
18   *	Changes:
19   *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
20   *                                      to 2 if register_netdev gets called
21   *                                      before net_dev_init & also removed a
22   *                                      few lines of code in the process.
23   *		Alan Cox	:	device private ioctl copies fields back.
24   *		Alan Cox	:	Transmit queue code does relevant
25   *					stunts to keep the queue safe.
26   *		Alan Cox	:	Fixed double lock.
27   *		Alan Cox	:	Fixed promisc NULL pointer trap
28   *		????????	:	Support the full private ioctl range
29   *		Alan Cox	:	Moved ioctl permission check into
30   *					drivers
31   *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
32   *		Alan Cox	:	100 backlog just doesn't cut it when
33   *					you start doing multicast video 8)
34   *		Alan Cox	:	Rewrote net_bh and list manager.
35   *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
36   *		Alan Cox	:	Took out transmit every packet pass
37   *					Saved a few bytes in the ioctl handler
38   *		Alan Cox	:	Network driver sets packet type before
39   *					calling netif_rx. Saves a function
40   *					call a packet.
41   *		Alan Cox	:	Hashed net_bh()
42   *		Richard Kooijman:	Timestamp fixes.
43   *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
44   *		Alan Cox	:	Device lock protection.
45   *              Alan Cox        :       Fixed nasty side effect of device close
46   *					changes.
47   *		Rudi Cilibrasi	:	Pass the right thing to
48   *					set_mac_address()
49   *		Dave Miller	:	32bit quantity for the device lock to
50   *					make it work out on a Sparc.
51   *		Bjorn Ekwall	:	Added KERNELD hack.
52   *		Alan Cox	:	Cleaned up the backlog initialise.
53   *		Craig Metz	:	SIOCGIFCONF fix if space for under
54   *					1 device.
55   *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
56   *					is no device open function.
57   *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
58   *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
59   *		Cyrus Durgin	:	Cleaned for KMOD
60   *		Adam Sulmicki   :	Bug Fix : Network Device Unload
61   *					A network device unload needs to purge
62   *					the backlog queue.
63   *	Paul Rusty Russell	:	SIOCSIFNAME
64   *              Pekka Riikonen  :	Netdev boot-time settings code
65   *              Andrew Morton   :       Make unregister_netdevice wait
66   *                                      indefinitely on dev->refcnt
67   *              J Hadi Salim    :       - Backlog queue sampling
68   *				        - netif_rx() feedback
69   */
70  
71  #include <linux/uaccess.h>
72  #include <linux/bitops.h>
73  #include <linux/capability.h>
74  #include <linux/cpu.h>
75  #include <linux/types.h>
76  #include <linux/kernel.h>
77  #include <linux/hash.h>
78  #include <linux/slab.h>
79  #include <linux/sched.h>
80  #include <linux/sched/mm.h>
81  #include <linux/mutex.h>
82  #include <linux/rwsem.h>
83  #include <linux/string.h>
84  #include <linux/mm.h>
85  #include <linux/socket.h>
86  #include <linux/sockios.h>
87  #include <linux/errno.h>
88  #include <linux/interrupt.h>
89  #include <linux/if_ether.h>
90  #include <linux/netdevice.h>
91  #include <linux/etherdevice.h>
92  #include <linux/ethtool.h>
93  #include <linux/skbuff.h>
94  #include <linux/kthread.h>
95  #include <linux/bpf.h>
96  #include <linux/bpf_trace.h>
97  #include <net/net_namespace.h>
98  #include <net/sock.h>
99  #include <net/busy_poll.h>
100  #include <linux/rtnetlink.h>
101  #include <linux/stat.h>
102  #include <net/dsa.h>
103  #include <net/dst.h>
104  #include <net/dst_metadata.h>
105  #include <net/gro.h>
106  #include <net/pkt_sched.h>
107  #include <net/pkt_cls.h>
108  #include <net/checksum.h>
109  #include <net/xfrm.h>
110  #include <linux/highmem.h>
111  #include <linux/init.h>
112  #include <linux/module.h>
113  #include <linux/netpoll.h>
114  #include <linux/rcupdate.h>
115  #include <linux/delay.h>
116  #include <net/iw_handler.h>
117  #include <asm/current.h>
118  #include <linux/audit.h>
119  #include <linux/dmaengine.h>
120  #include <linux/err.h>
121  #include <linux/ctype.h>
122  #include <linux/if_arp.h>
123  #include <linux/if_vlan.h>
124  #include <linux/ip.h>
125  #include <net/ip.h>
126  #include <net/mpls.h>
127  #include <linux/ipv6.h>
128  #include <linux/in.h>
129  #include <linux/jhash.h>
130  #include <linux/random.h>
131  #include <trace/events/napi.h>
132  #include <trace/events/net.h>
133  #include <trace/events/skb.h>
134  #include <trace/events/qdisc.h>
135  #include <linux/inetdevice.h>
136  #include <linux/cpu_rmap.h>
137  #include <linux/static_key.h>
138  #include <linux/hashtable.h>
139  #include <linux/vmalloc.h>
140  #include <linux/if_macvlan.h>
141  #include <linux/errqueue.h>
142  #include <linux/hrtimer.h>
143  #include <linux/netfilter_netdev.h>
144  #include <linux/crash_dump.h>
145  #include <linux/sctp.h>
146  #include <net/udp_tunnel.h>
147  #include <linux/net_namespace.h>
148  #include <linux/indirect_call_wrapper.h>
149  #include <net/devlink.h>
150  #include <linux/pm_runtime.h>
151  #include <linux/prandom.h>
152  #include <linux/once_lite.h>
153  
154  #include "net-sysfs.h"
155  
156  
157  static DEFINE_SPINLOCK(ptype_lock);
158  struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
159  struct list_head ptype_all __read_mostly;	/* Taps */
160  
161  static int netif_rx_internal(struct sk_buff *skb);
162  static int call_netdevice_notifiers_info(unsigned long val,
163  					 struct netdev_notifier_info *info);
164  static int call_netdevice_notifiers_extack(unsigned long val,
165  					   struct net_device *dev,
166  					   struct netlink_ext_ack *extack);
167  static struct napi_struct *napi_by_id(unsigned int napi_id);
168  
169  /*
170   * The @dev_base_head list is protected by @dev_base_lock and the rtnl
171   * semaphore.
172   *
173   * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
174   *
175   * Writers must hold the rtnl semaphore while they loop through the
176   * dev_base_head list, and hold dev_base_lock for writing when they do the
177   * actual updates.  This allows pure readers to access the list even
178   * while a writer is preparing to update it.
179   *
180   * To put it another way, dev_base_lock is held for writing only to
181   * protect against pure readers; the rtnl semaphore provides the
182   * protection against other writers.
183   *
184   * See, for example usages, register_netdevice() and
185   * unregister_netdevice(), which must be called with the rtnl
186   * semaphore held.
187   */
188  DEFINE_RWLOCK(dev_base_lock);
189  EXPORT_SYMBOL(dev_base_lock);
190  
191  static DEFINE_MUTEX(ifalias_mutex);
192  
193  /* protects napi_hash addition/deletion and napi_gen_id */
194  static DEFINE_SPINLOCK(napi_hash_lock);
195  
196  static unsigned int napi_gen_id = NR_CPUS;
197  static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
198  
199  static DECLARE_RWSEM(devnet_rename_sem);
200  
201  static inline void dev_base_seq_inc(struct net *net)
202  {
203  	while (++net->dev_base_seq == 0)
204  		;
205  }
206  
207  static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
208  {
209  	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
210  
211  	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
212  }
213  
214  static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
215  {
216  	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
217  }
218  
219  static inline void rps_lock_irqsave(struct softnet_data *sd,
220  				    unsigned long *flags)
221  {
222  	if (IS_ENABLED(CONFIG_RPS))
223  		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
224  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
225  		local_irq_save(*flags);
226  }
227  
228  static inline void rps_lock_irq_disable(struct softnet_data *sd)
229  {
230  	if (IS_ENABLED(CONFIG_RPS))
231  		spin_lock_irq(&sd->input_pkt_queue.lock);
232  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
233  		local_irq_disable();
234  }
235  
236  static inline void rps_unlock_irq_restore(struct softnet_data *sd,
237  					  unsigned long *flags)
238  {
239  	if (IS_ENABLED(CONFIG_RPS))
240  		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
241  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
242  		local_irq_restore(*flags);
243  }
244  
245  static inline void rps_unlock_irq_enable(struct softnet_data *sd)
246  {
247  	if (IS_ENABLED(CONFIG_RPS))
248  		spin_unlock_irq(&sd->input_pkt_queue.lock);
249  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
250  		local_irq_enable();
251  }
252  
253  static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
254  						       const char *name)
255  {
256  	struct netdev_name_node *name_node;
257  
258  	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
259  	if (!name_node)
260  		return NULL;
261  	INIT_HLIST_NODE(&name_node->hlist);
262  	name_node->dev = dev;
263  	name_node->name = name;
264  	return name_node;
265  }
266  
267  static struct netdev_name_node *
268  netdev_name_node_head_alloc(struct net_device *dev)
269  {
270  	struct netdev_name_node *name_node;
271  
272  	name_node = netdev_name_node_alloc(dev, dev->name);
273  	if (!name_node)
274  		return NULL;
275  	INIT_LIST_HEAD(&name_node->list);
276  	return name_node;
277  }
278  
279  static void netdev_name_node_free(struct netdev_name_node *name_node)
280  {
281  	kfree(name_node);
282  }
283  
284  static void netdev_name_node_add(struct net *net,
285  				 struct netdev_name_node *name_node)
286  {
287  	hlist_add_head_rcu(&name_node->hlist,
288  			   dev_name_hash(net, name_node->name));
289  }
290  
291  static void netdev_name_node_del(struct netdev_name_node *name_node)
292  {
293  	hlist_del_rcu(&name_node->hlist);
294  }
295  
296  static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
297  							const char *name)
298  {
299  	struct hlist_head *head = dev_name_hash(net, name);
300  	struct netdev_name_node *name_node;
301  
302  	hlist_for_each_entry(name_node, head, hlist)
303  		if (!strcmp(name_node->name, name))
304  			return name_node;
305  	return NULL;
306  }
307  
308  static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
309  							    const char *name)
310  {
311  	struct hlist_head *head = dev_name_hash(net, name);
312  	struct netdev_name_node *name_node;
313  
314  	hlist_for_each_entry_rcu(name_node, head, hlist)
315  		if (!strcmp(name_node->name, name))
316  			return name_node;
317  	return NULL;
318  }
319  
320  bool netdev_name_in_use(struct net *net, const char *name)
321  {
322  	return netdev_name_node_lookup(net, name);
323  }
324  EXPORT_SYMBOL(netdev_name_in_use);
325  
326  int netdev_name_node_alt_create(struct net_device *dev, const char *name)
327  {
328  	struct netdev_name_node *name_node;
329  	struct net *net = dev_net(dev);
330  
331  	name_node = netdev_name_node_lookup(net, name);
332  	if (name_node)
333  		return -EEXIST;
334  	name_node = netdev_name_node_alloc(dev, name);
335  	if (!name_node)
336  		return -ENOMEM;
337  	netdev_name_node_add(net, name_node);
338  	/* The node that holds dev->name acts as a head of per-device list. */
339  	list_add_tail(&name_node->list, &dev->name_node->list);
340  
341  	return 0;
342  }
343  
344  static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
345  {
346  	list_del(&name_node->list);
347  	netdev_name_node_del(name_node);
348  	kfree(name_node->name);
349  	netdev_name_node_free(name_node);
350  }
351  
352  int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
353  {
354  	struct netdev_name_node *name_node;
355  	struct net *net = dev_net(dev);
356  
357  	name_node = netdev_name_node_lookup(net, name);
358  	if (!name_node)
359  		return -ENOENT;
360  	/* lookup might have found our primary name or a name belonging
361  	 * to another device.
362  	 */
363  	if (name_node == dev->name_node || name_node->dev != dev)
364  		return -EINVAL;
365  
366  	__netdev_name_node_alt_destroy(name_node);
367  
368  	return 0;
369  }
370  
371  static void netdev_name_node_alt_flush(struct net_device *dev)
372  {
373  	struct netdev_name_node *name_node, *tmp;
374  
375  	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
376  		__netdev_name_node_alt_destroy(name_node);
377  }
378  
379  /* Device list insertion */
380  static void list_netdevice(struct net_device *dev)
381  {
382  	struct net *net = dev_net(dev);
383  
384  	ASSERT_RTNL();
385  
386  	write_lock(&dev_base_lock);
387  	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
388  	netdev_name_node_add(net, dev->name_node);
389  	hlist_add_head_rcu(&dev->index_hlist,
390  			   dev_index_hash(net, dev->ifindex));
391  	write_unlock(&dev_base_lock);
392  
393  	dev_base_seq_inc(net);
394  }
395  
396  /* Device list removal
397   * caller must respect a RCU grace period before freeing/reusing dev
398   */
399  static void unlist_netdevice(struct net_device *dev)
400  {
401  	ASSERT_RTNL();
402  
403  	/* Unlink dev from the device chain */
404  	write_lock(&dev_base_lock);
405  	list_del_rcu(&dev->dev_list);
406  	netdev_name_node_del(dev->name_node);
407  	hlist_del_rcu(&dev->index_hlist);
408  	write_unlock(&dev_base_lock);
409  
410  	dev_base_seq_inc(dev_net(dev));
411  }
412  
413  /*
414   *	Our notifier list
415   */
416  
417  static RAW_NOTIFIER_HEAD(netdev_chain);
418  
419  /*
420   *	Device drivers call our routines to queue packets here. We empty the
421   *	queue in the local softnet handler.
422   */
423  
424  DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
425  EXPORT_PER_CPU_SYMBOL(softnet_data);
426  
427  #ifdef CONFIG_LOCKDEP
428  /*
429   * register_netdevice() inits txq->_xmit_lock and sets lockdep class
430   * according to dev->type
431   */
432  static const unsigned short netdev_lock_type[] = {
433  	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
434  	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
435  	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
436  	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
437  	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
438  	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
439  	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
440  	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
441  	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
442  	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
443  	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
444  	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
445  	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
446  	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
447  	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
448  
449  static const char *const netdev_lock_name[] = {
450  	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
451  	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
452  	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
453  	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
454  	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
455  	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
456  	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
457  	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
458  	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
459  	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
460  	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
461  	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
462  	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
463  	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
464  	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
465  
466  static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
467  static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
468  
469  static inline unsigned short netdev_lock_pos(unsigned short dev_type)
470  {
471  	int i;
472  
473  	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
474  		if (netdev_lock_type[i] == dev_type)
475  			return i;
476  	/* the last key is used by default */
477  	return ARRAY_SIZE(netdev_lock_type) - 1;
478  }
479  
480  static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
481  						 unsigned short dev_type)
482  {
483  	int i;
484  
485  	i = netdev_lock_pos(dev_type);
486  	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
487  				   netdev_lock_name[i]);
488  }
489  
490  static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
491  {
492  	int i;
493  
494  	i = netdev_lock_pos(dev->type);
495  	lockdep_set_class_and_name(&dev->addr_list_lock,
496  				   &netdev_addr_lock_key[i],
497  				   netdev_lock_name[i]);
498  }
499  #else
500  static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
501  						 unsigned short dev_type)
502  {
503  }
504  
505  static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
506  {
507  }
508  #endif
509  
510  /*******************************************************************************
511   *
512   *		Protocol management and registration routines
513   *
514   *******************************************************************************/
515  
516  
517  /*
518   *	Add a protocol ID to the list. Now that the input handler is
519   *	smarter we can dispense with all the messy stuff that used to be
520   *	here.
521   *
522   *	BEWARE!!! Protocol handlers, mangling input packets,
523   *	MUST BE last in hash buckets and checking protocol handlers
524   *	MUST start from promiscuous ptype_all chain in net_bh.
525   *	It is true now, do not change it.
526   *	Explanation follows: if protocol handler, mangling packet, will
527   *	be the first on list, it is not able to sense, that packet
528   *	is cloned and should be copied-on-write, so that it will
529   *	change it and subsequent readers will get broken packet.
530   *							--ANK (980803)
531   */
532  
533  static inline struct list_head *ptype_head(const struct packet_type *pt)
534  {
535  	if (pt->type == htons(ETH_P_ALL))
536  		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
537  	else
538  		return pt->dev ? &pt->dev->ptype_specific :
539  				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
540  }
541  
542  /**
543   *	dev_add_pack - add packet handler
544   *	@pt: packet type declaration
545   *
546   *	Add a protocol handler to the networking stack. The passed &packet_type
547   *	is linked into kernel lists and may not be freed until it has been
548   *	removed from the kernel lists.
549   *
550   *	This call does not sleep therefore it can not
551   *	guarantee all CPU's that are in middle of receiving packets
552   *	will see the new packet type (until the next received packet).
553   */
554  
555  void dev_add_pack(struct packet_type *pt)
556  {
557  	struct list_head *head = ptype_head(pt);
558  
559  	spin_lock(&ptype_lock);
560  	list_add_rcu(&pt->list, head);
561  	spin_unlock(&ptype_lock);
562  }
563  EXPORT_SYMBOL(dev_add_pack);
564  
565  /**
566   *	__dev_remove_pack	 - remove packet handler
567   *	@pt: packet type declaration
568   *
569   *	Remove a protocol handler that was previously added to the kernel
570   *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
571   *	from the kernel lists and can be freed or reused once this function
572   *	returns.
573   *
574   *      The packet type might still be in use by receivers
575   *	and must not be freed until after all the CPU's have gone
576   *	through a quiescent state.
577   */
578  void __dev_remove_pack(struct packet_type *pt)
579  {
580  	struct list_head *head = ptype_head(pt);
581  	struct packet_type *pt1;
582  
583  	spin_lock(&ptype_lock);
584  
585  	list_for_each_entry(pt1, head, list) {
586  		if (pt == pt1) {
587  			list_del_rcu(&pt->list);
588  			goto out;
589  		}
590  	}
591  
592  	pr_warn("dev_remove_pack: %p not found\n", pt);
593  out:
594  	spin_unlock(&ptype_lock);
595  }
596  EXPORT_SYMBOL(__dev_remove_pack);
597  
598  /**
599   *	dev_remove_pack	 - remove packet handler
600   *	@pt: packet type declaration
601   *
602   *	Remove a protocol handler that was previously added to the kernel
603   *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
604   *	from the kernel lists and can be freed or reused once this function
605   *	returns.
606   *
607   *	This call sleeps to guarantee that no CPU is looking at the packet
608   *	type after return.
609   */
610  void dev_remove_pack(struct packet_type *pt)
611  {
612  	__dev_remove_pack(pt);
613  
614  	synchronize_net();
615  }
616  EXPORT_SYMBOL(dev_remove_pack);
617  
618  
619  /*******************************************************************************
620   *
621   *			    Device Interface Subroutines
622   *
623   *******************************************************************************/
624  
625  /**
626   *	dev_get_iflink	- get 'iflink' value of a interface
627   *	@dev: targeted interface
628   *
629   *	Indicates the ifindex the interface is linked to.
630   *	Physical interfaces have the same 'ifindex' and 'iflink' values.
631   */
632  
633  int dev_get_iflink(const struct net_device *dev)
634  {
635  	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
636  		return dev->netdev_ops->ndo_get_iflink(dev);
637  
638  	return dev->ifindex;
639  }
640  EXPORT_SYMBOL(dev_get_iflink);
641  
642  /**
643   *	dev_fill_metadata_dst - Retrieve tunnel egress information.
644   *	@dev: targeted interface
645   *	@skb: The packet.
646   *
647   *	For better visibility of tunnel traffic OVS needs to retrieve
648   *	egress tunnel information for a packet. Following API allows
649   *	user to get this info.
650   */
651  int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
652  {
653  	struct ip_tunnel_info *info;
654  
655  	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
656  		return -EINVAL;
657  
658  	info = skb_tunnel_info_unclone(skb);
659  	if (!info)
660  		return -ENOMEM;
661  	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
662  		return -EINVAL;
663  
664  	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
665  }
666  EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
667  
668  static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
669  {
670  	int k = stack->num_paths++;
671  
672  	if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
673  		return NULL;
674  
675  	return &stack->path[k];
676  }
677  
678  int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
679  			  struct net_device_path_stack *stack)
680  {
681  	const struct net_device *last_dev;
682  	struct net_device_path_ctx ctx = {
683  		.dev	= dev,
684  		.daddr	= daddr,
685  	};
686  	struct net_device_path *path;
687  	int ret = 0;
688  
689  	stack->num_paths = 0;
690  	while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
691  		last_dev = ctx.dev;
692  		path = dev_fwd_path(stack);
693  		if (!path)
694  			return -1;
695  
696  		memset(path, 0, sizeof(struct net_device_path));
697  		ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
698  		if (ret < 0)
699  			return -1;
700  
701  		if (WARN_ON_ONCE(last_dev == ctx.dev))
702  			return -1;
703  	}
704  	path = dev_fwd_path(stack);
705  	if (!path)
706  		return -1;
707  	path->type = DEV_PATH_ETHERNET;
708  	path->dev = ctx.dev;
709  
710  	return ret;
711  }
712  EXPORT_SYMBOL_GPL(dev_fill_forward_path);
713  
714  /**
715   *	__dev_get_by_name	- find a device by its name
716   *	@net: the applicable net namespace
717   *	@name: name to find
718   *
719   *	Find an interface by name. Must be called under RTNL semaphore
720   *	or @dev_base_lock. If the name is found a pointer to the device
721   *	is returned. If the name is not found then %NULL is returned. The
722   *	reference counters are not incremented so the caller must be
723   *	careful with locks.
724   */
725  
726  struct net_device *__dev_get_by_name(struct net *net, const char *name)
727  {
728  	struct netdev_name_node *node_name;
729  
730  	node_name = netdev_name_node_lookup(net, name);
731  	return node_name ? node_name->dev : NULL;
732  }
733  EXPORT_SYMBOL(__dev_get_by_name);
734  
735  /**
736   * dev_get_by_name_rcu	- find a device by its name
737   * @net: the applicable net namespace
738   * @name: name to find
739   *
740   * Find an interface by name.
741   * If the name is found a pointer to the device is returned.
742   * If the name is not found then %NULL is returned.
743   * The reference counters are not incremented so the caller must be
744   * careful with locks. The caller must hold RCU lock.
745   */
746  
747  struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
748  {
749  	struct netdev_name_node *node_name;
750  
751  	node_name = netdev_name_node_lookup_rcu(net, name);
752  	return node_name ? node_name->dev : NULL;
753  }
754  EXPORT_SYMBOL(dev_get_by_name_rcu);
755  
756  /**
757   *	dev_get_by_name		- find a device by its name
758   *	@net: the applicable net namespace
759   *	@name: name to find
760   *
761   *	Find an interface by name. This can be called from any
762   *	context and does its own locking. The returned handle has
763   *	the usage count incremented and the caller must use dev_put() to
764   *	release it when it is no longer needed. %NULL is returned if no
765   *	matching device is found.
766   */
767  
768  struct net_device *dev_get_by_name(struct net *net, const char *name)
769  {
770  	struct net_device *dev;
771  
772  	rcu_read_lock();
773  	dev = dev_get_by_name_rcu(net, name);
774  	dev_hold(dev);
775  	rcu_read_unlock();
776  	return dev;
777  }
778  EXPORT_SYMBOL(dev_get_by_name);
779  
780  /**
781   *	__dev_get_by_index - find a device by its ifindex
782   *	@net: the applicable net namespace
783   *	@ifindex: index of device
784   *
785   *	Search for an interface by index. Returns %NULL if the device
786   *	is not found or a pointer to the device. The device has not
787   *	had its reference counter increased so the caller must be careful
788   *	about locking. The caller must hold either the RTNL semaphore
789   *	or @dev_base_lock.
790   */
791  
792  struct net_device *__dev_get_by_index(struct net *net, int ifindex)
793  {
794  	struct net_device *dev;
795  	struct hlist_head *head = dev_index_hash(net, ifindex);
796  
797  	hlist_for_each_entry(dev, head, index_hlist)
798  		if (dev->ifindex == ifindex)
799  			return dev;
800  
801  	return NULL;
802  }
803  EXPORT_SYMBOL(__dev_get_by_index);
804  
805  /**
806   *	dev_get_by_index_rcu - find a device by its ifindex
807   *	@net: the applicable net namespace
808   *	@ifindex: index of device
809   *
810   *	Search for an interface by index. Returns %NULL if the device
811   *	is not found or a pointer to the device. The device has not
812   *	had its reference counter increased so the caller must be careful
813   *	about locking. The caller must hold RCU lock.
814   */
815  
816  struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
817  {
818  	struct net_device *dev;
819  	struct hlist_head *head = dev_index_hash(net, ifindex);
820  
821  	hlist_for_each_entry_rcu(dev, head, index_hlist)
822  		if (dev->ifindex == ifindex)
823  			return dev;
824  
825  	return NULL;
826  }
827  EXPORT_SYMBOL(dev_get_by_index_rcu);
828  
829  
830  /**
831   *	dev_get_by_index - find a device by its ifindex
832   *	@net: the applicable net namespace
833   *	@ifindex: index of device
834   *
835   *	Search for an interface by index. Returns NULL if the device
836   *	is not found or a pointer to the device. The device returned has
837   *	had a reference added and the pointer is safe until the user calls
838   *	dev_put to indicate they have finished with it.
839   */
840  
841  struct net_device *dev_get_by_index(struct net *net, int ifindex)
842  {
843  	struct net_device *dev;
844  
845  	rcu_read_lock();
846  	dev = dev_get_by_index_rcu(net, ifindex);
847  	dev_hold(dev);
848  	rcu_read_unlock();
849  	return dev;
850  }
851  EXPORT_SYMBOL(dev_get_by_index);
852  
853  /**
854   *	dev_get_by_napi_id - find a device by napi_id
855   *	@napi_id: ID of the NAPI struct
856   *
857   *	Search for an interface by NAPI ID. Returns %NULL if the device
858   *	is not found or a pointer to the device. The device has not had
859   *	its reference counter increased so the caller must be careful
860   *	about locking. The caller must hold RCU lock.
861   */
862  
863  struct net_device *dev_get_by_napi_id(unsigned int napi_id)
864  {
865  	struct napi_struct *napi;
866  
867  	WARN_ON_ONCE(!rcu_read_lock_held());
868  
869  	if (napi_id < MIN_NAPI_ID)
870  		return NULL;
871  
872  	napi = napi_by_id(napi_id);
873  
874  	return napi ? napi->dev : NULL;
875  }
876  EXPORT_SYMBOL(dev_get_by_napi_id);
877  
878  /**
879   *	netdev_get_name - get a netdevice name, knowing its ifindex.
880   *	@net: network namespace
881   *	@name: a pointer to the buffer where the name will be stored.
882   *	@ifindex: the ifindex of the interface to get the name from.
883   */
884  int netdev_get_name(struct net *net, char *name, int ifindex)
885  {
886  	struct net_device *dev;
887  	int ret;
888  
889  	down_read(&devnet_rename_sem);
890  	rcu_read_lock();
891  
892  	dev = dev_get_by_index_rcu(net, ifindex);
893  	if (!dev) {
894  		ret = -ENODEV;
895  		goto out;
896  	}
897  
898  	strcpy(name, dev->name);
899  
900  	ret = 0;
901  out:
902  	rcu_read_unlock();
903  	up_read(&devnet_rename_sem);
904  	return ret;
905  }
906  
907  /**
908   *	dev_getbyhwaddr_rcu - find a device by its hardware address
909   *	@net: the applicable net namespace
910   *	@type: media type of device
911   *	@ha: hardware address
912   *
913   *	Search for an interface by MAC address. Returns NULL if the device
914   *	is not found or a pointer to the device.
915   *	The caller must hold RCU or RTNL.
916   *	The returned device has not had its ref count increased
917   *	and the caller must therefore be careful about locking
918   *
919   */
920  
921  struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
922  				       const char *ha)
923  {
924  	struct net_device *dev;
925  
926  	for_each_netdev_rcu(net, dev)
927  		if (dev->type == type &&
928  		    !memcmp(dev->dev_addr, ha, dev->addr_len))
929  			return dev;
930  
931  	return NULL;
932  }
933  EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
934  
935  struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
936  {
937  	struct net_device *dev, *ret = NULL;
938  
939  	rcu_read_lock();
940  	for_each_netdev_rcu(net, dev)
941  		if (dev->type == type) {
942  			dev_hold(dev);
943  			ret = dev;
944  			break;
945  		}
946  	rcu_read_unlock();
947  	return ret;
948  }
949  EXPORT_SYMBOL(dev_getfirstbyhwtype);
950  
951  /**
952   *	__dev_get_by_flags - find any device with given flags
953   *	@net: the applicable net namespace
954   *	@if_flags: IFF_* values
955   *	@mask: bitmask of bits in if_flags to check
956   *
957   *	Search for any interface with the given flags. Returns NULL if a device
958   *	is not found or a pointer to the device. Must be called inside
959   *	rtnl_lock(), and result refcount is unchanged.
960   */
961  
962  struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
963  				      unsigned short mask)
964  {
965  	struct net_device *dev, *ret;
966  
967  	ASSERT_RTNL();
968  
969  	ret = NULL;
970  	for_each_netdev(net, dev) {
971  		if (((dev->flags ^ if_flags) & mask) == 0) {
972  			ret = dev;
973  			break;
974  		}
975  	}
976  	return ret;
977  }
978  EXPORT_SYMBOL(__dev_get_by_flags);
979  
980  /**
981   *	dev_valid_name - check if name is okay for network device
982   *	@name: name string
983   *
984   *	Network device names need to be valid file names to
985   *	allow sysfs to work.  We also disallow any kind of
986   *	whitespace.
987   */
988  bool dev_valid_name(const char *name)
989  {
990  	if (*name == '\0')
991  		return false;
992  	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
993  		return false;
994  	if (!strcmp(name, ".") || !strcmp(name, ".."))
995  		return false;
996  
997  	while (*name) {
998  		if (*name == '/' || *name == ':' || isspace(*name))
999  			return false;
1000  		name++;
1001  	}
1002  	return true;
1003  }
1004  EXPORT_SYMBOL(dev_valid_name);
1005  
1006  /**
1007   *	__dev_alloc_name - allocate a name for a device
1008   *	@net: network namespace to allocate the device name in
1009   *	@name: name format string
1010   *	@buf:  scratch buffer and result name string
1011   *
1012   *	Passed a format string - eg "lt%d" it will try and find a suitable
1013   *	id. It scans list of devices to build up a free map, then chooses
1014   *	the first empty slot. The caller must hold the dev_base or rtnl lock
1015   *	while allocating the name and adding the device in order to avoid
1016   *	duplicates.
1017   *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1018   *	Returns the number of the unit assigned or a negative errno code.
1019   */
1020  
1021  static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1022  {
1023  	int i = 0;
1024  	const char *p;
1025  	const int max_netdevices = 8*PAGE_SIZE;
1026  	unsigned long *inuse;
1027  	struct net_device *d;
1028  
1029  	if (!dev_valid_name(name))
1030  		return -EINVAL;
1031  
1032  	p = strchr(name, '%');
1033  	if (p) {
1034  		/*
1035  		 * Verify the string as this thing may have come from
1036  		 * the user.  There must be either one "%d" and no other "%"
1037  		 * characters.
1038  		 */
1039  		if (p[1] != 'd' || strchr(p + 2, '%'))
1040  			return -EINVAL;
1041  
1042  		/* Use one page as a bit array of possible slots */
1043  		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044  		if (!inuse)
1045  			return -ENOMEM;
1046  
1047  		for_each_netdev(net, d) {
1048  			struct netdev_name_node *name_node;
1049  			list_for_each_entry(name_node, &d->name_node->list, list) {
1050  				if (!sscanf(name_node->name, name, &i))
1051  					continue;
1052  				if (i < 0 || i >= max_netdevices)
1053  					continue;
1054  
1055  				/*  avoid cases where sscanf is not exact inverse of printf */
1056  				snprintf(buf, IFNAMSIZ, name, i);
1057  				if (!strncmp(buf, name_node->name, IFNAMSIZ))
1058  					__set_bit(i, inuse);
1059  			}
1060  			if (!sscanf(d->name, name, &i))
1061  				continue;
1062  			if (i < 0 || i >= max_netdevices)
1063  				continue;
1064  
1065  			/*  avoid cases where sscanf is not exact inverse of printf */
1066  			snprintf(buf, IFNAMSIZ, name, i);
1067  			if (!strncmp(buf, d->name, IFNAMSIZ))
1068  				__set_bit(i, inuse);
1069  		}
1070  
1071  		i = find_first_zero_bit(inuse, max_netdevices);
1072  		free_page((unsigned long) inuse);
1073  	}
1074  
1075  	snprintf(buf, IFNAMSIZ, name, i);
1076  	if (!netdev_name_in_use(net, buf))
1077  		return i;
1078  
1079  	/* It is possible to run out of possible slots
1080  	 * when the name is long and there isn't enough space left
1081  	 * for the digits, or if all bits are used.
1082  	 */
1083  	return -ENFILE;
1084  }
1085  
1086  static int dev_alloc_name_ns(struct net *net,
1087  			     struct net_device *dev,
1088  			     const char *name)
1089  {
1090  	char buf[IFNAMSIZ];
1091  	int ret;
1092  
1093  	BUG_ON(!net);
1094  	ret = __dev_alloc_name(net, name, buf);
1095  	if (ret >= 0)
1096  		strlcpy(dev->name, buf, IFNAMSIZ);
1097  	return ret;
1098  }
1099  
1100  /**
1101   *	dev_alloc_name - allocate a name for a device
1102   *	@dev: device
1103   *	@name: name format string
1104   *
1105   *	Passed a format string - eg "lt%d" it will try and find a suitable
1106   *	id. It scans list of devices to build up a free map, then chooses
1107   *	the first empty slot. The caller must hold the dev_base or rtnl lock
1108   *	while allocating the name and adding the device in order to avoid
1109   *	duplicates.
1110   *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1111   *	Returns the number of the unit assigned or a negative errno code.
1112   */
1113  
1114  int dev_alloc_name(struct net_device *dev, const char *name)
1115  {
1116  	return dev_alloc_name_ns(dev_net(dev), dev, name);
1117  }
1118  EXPORT_SYMBOL(dev_alloc_name);
1119  
1120  static int dev_get_valid_name(struct net *net, struct net_device *dev,
1121  			      const char *name)
1122  {
1123  	BUG_ON(!net);
1124  
1125  	if (!dev_valid_name(name))
1126  		return -EINVAL;
1127  
1128  	if (strchr(name, '%'))
1129  		return dev_alloc_name_ns(net, dev, name);
1130  	else if (netdev_name_in_use(net, name))
1131  		return -EEXIST;
1132  	else if (dev->name != name)
1133  		strlcpy(dev->name, name, IFNAMSIZ);
1134  
1135  	return 0;
1136  }
1137  
1138  /**
1139   *	dev_change_name - change name of a device
1140   *	@dev: device
1141   *	@newname: name (or format string) must be at least IFNAMSIZ
1142   *
1143   *	Change name of a device, can pass format strings "eth%d".
1144   *	for wildcarding.
1145   */
1146  int dev_change_name(struct net_device *dev, const char *newname)
1147  {
1148  	unsigned char old_assign_type;
1149  	char oldname[IFNAMSIZ];
1150  	int err = 0;
1151  	int ret;
1152  	struct net *net;
1153  
1154  	ASSERT_RTNL();
1155  	BUG_ON(!dev_net(dev));
1156  
1157  	net = dev_net(dev);
1158  
1159  	/* Some auto-enslaved devices e.g. failover slaves are
1160  	 * special, as userspace might rename the device after
1161  	 * the interface had been brought up and running since
1162  	 * the point kernel initiated auto-enslavement. Allow
1163  	 * live name change even when these slave devices are
1164  	 * up and running.
1165  	 *
1166  	 * Typically, users of these auto-enslaving devices
1167  	 * don't actually care about slave name change, as
1168  	 * they are supposed to operate on master interface
1169  	 * directly.
1170  	 */
1171  	if (dev->flags & IFF_UP &&
1172  	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
1173  		return -EBUSY;
1174  
1175  	down_write(&devnet_rename_sem);
1176  
1177  	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1178  		up_write(&devnet_rename_sem);
1179  		return 0;
1180  	}
1181  
1182  	memcpy(oldname, dev->name, IFNAMSIZ);
1183  
1184  	err = dev_get_valid_name(net, dev, newname);
1185  	if (err < 0) {
1186  		up_write(&devnet_rename_sem);
1187  		return err;
1188  	}
1189  
1190  	if (oldname[0] && !strchr(oldname, '%'))
1191  		netdev_info(dev, "renamed from %s\n", oldname);
1192  
1193  	old_assign_type = dev->name_assign_type;
1194  	dev->name_assign_type = NET_NAME_RENAMED;
1195  
1196  rollback:
1197  	ret = device_rename(&dev->dev, dev->name);
1198  	if (ret) {
1199  		memcpy(dev->name, oldname, IFNAMSIZ);
1200  		dev->name_assign_type = old_assign_type;
1201  		up_write(&devnet_rename_sem);
1202  		return ret;
1203  	}
1204  
1205  	up_write(&devnet_rename_sem);
1206  
1207  	netdev_adjacent_rename_links(dev, oldname);
1208  
1209  	write_lock(&dev_base_lock);
1210  	netdev_name_node_del(dev->name_node);
1211  	write_unlock(&dev_base_lock);
1212  
1213  	synchronize_rcu();
1214  
1215  	write_lock(&dev_base_lock);
1216  	netdev_name_node_add(net, dev->name_node);
1217  	write_unlock(&dev_base_lock);
1218  
1219  	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1220  	ret = notifier_to_errno(ret);
1221  
1222  	if (ret) {
1223  		/* err >= 0 after dev_alloc_name() or stores the first errno */
1224  		if (err >= 0) {
1225  			err = ret;
1226  			down_write(&devnet_rename_sem);
1227  			memcpy(dev->name, oldname, IFNAMSIZ);
1228  			memcpy(oldname, newname, IFNAMSIZ);
1229  			dev->name_assign_type = old_assign_type;
1230  			old_assign_type = NET_NAME_RENAMED;
1231  			goto rollback;
1232  		} else {
1233  			netdev_err(dev, "name change rollback failed: %d\n",
1234  				   ret);
1235  		}
1236  	}
1237  
1238  	return err;
1239  }
1240  
1241  /**
1242   *	dev_set_alias - change ifalias of a device
1243   *	@dev: device
1244   *	@alias: name up to IFALIASZ
1245   *	@len: limit of bytes to copy from info
1246   *
1247   *	Set ifalias for a device,
1248   */
1249  int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1250  {
1251  	struct dev_ifalias *new_alias = NULL;
1252  
1253  	if (len >= IFALIASZ)
1254  		return -EINVAL;
1255  
1256  	if (len) {
1257  		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1258  		if (!new_alias)
1259  			return -ENOMEM;
1260  
1261  		memcpy(new_alias->ifalias, alias, len);
1262  		new_alias->ifalias[len] = 0;
1263  	}
1264  
1265  	mutex_lock(&ifalias_mutex);
1266  	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1267  					mutex_is_locked(&ifalias_mutex));
1268  	mutex_unlock(&ifalias_mutex);
1269  
1270  	if (new_alias)
1271  		kfree_rcu(new_alias, rcuhead);
1272  
1273  	return len;
1274  }
1275  EXPORT_SYMBOL(dev_set_alias);
1276  
1277  /**
1278   *	dev_get_alias - get ifalias of a device
1279   *	@dev: device
1280   *	@name: buffer to store name of ifalias
1281   *	@len: size of buffer
1282   *
1283   *	get ifalias for a device.  Caller must make sure dev cannot go
1284   *	away,  e.g. rcu read lock or own a reference count to device.
1285   */
1286  int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1287  {
1288  	const struct dev_ifalias *alias;
1289  	int ret = 0;
1290  
1291  	rcu_read_lock();
1292  	alias = rcu_dereference(dev->ifalias);
1293  	if (alias)
1294  		ret = snprintf(name, len, "%s", alias->ifalias);
1295  	rcu_read_unlock();
1296  
1297  	return ret;
1298  }
1299  
1300  /**
1301   *	netdev_features_change - device changes features
1302   *	@dev: device to cause notification
1303   *
1304   *	Called to indicate a device has changed features.
1305   */
1306  void netdev_features_change(struct net_device *dev)
1307  {
1308  	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1309  }
1310  EXPORT_SYMBOL(netdev_features_change);
1311  
1312  /**
1313   *	netdev_state_change - device changes state
1314   *	@dev: device to cause notification
1315   *
1316   *	Called to indicate a device has changed state. This function calls
1317   *	the notifier chains for netdev_chain and sends a NEWLINK message
1318   *	to the routing socket.
1319   */
1320  void netdev_state_change(struct net_device *dev)
1321  {
1322  	if (dev->flags & IFF_UP) {
1323  		struct netdev_notifier_change_info change_info = {
1324  			.info.dev = dev,
1325  		};
1326  
1327  		call_netdevice_notifiers_info(NETDEV_CHANGE,
1328  					      &change_info.info);
1329  		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1330  	}
1331  }
1332  EXPORT_SYMBOL(netdev_state_change);
1333  
1334  /**
1335   * __netdev_notify_peers - notify network peers about existence of @dev,
1336   * to be called when rtnl lock is already held.
1337   * @dev: network device
1338   *
1339   * Generate traffic such that interested network peers are aware of
1340   * @dev, such as by generating a gratuitous ARP. This may be used when
1341   * a device wants to inform the rest of the network about some sort of
1342   * reconfiguration such as a failover event or virtual machine
1343   * migration.
1344   */
1345  void __netdev_notify_peers(struct net_device *dev)
1346  {
1347  	ASSERT_RTNL();
1348  	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1349  	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1350  }
1351  EXPORT_SYMBOL(__netdev_notify_peers);
1352  
1353  /**
1354   * netdev_notify_peers - notify network peers about existence of @dev
1355   * @dev: network device
1356   *
1357   * Generate traffic such that interested network peers are aware of
1358   * @dev, such as by generating a gratuitous ARP. This may be used when
1359   * a device wants to inform the rest of the network about some sort of
1360   * reconfiguration such as a failover event or virtual machine
1361   * migration.
1362   */
1363  void netdev_notify_peers(struct net_device *dev)
1364  {
1365  	rtnl_lock();
1366  	__netdev_notify_peers(dev);
1367  	rtnl_unlock();
1368  }
1369  EXPORT_SYMBOL(netdev_notify_peers);
1370  
1371  static int napi_threaded_poll(void *data);
1372  
1373  static int napi_kthread_create(struct napi_struct *n)
1374  {
1375  	int err = 0;
1376  
1377  	/* Create and wake up the kthread once to put it in
1378  	 * TASK_INTERRUPTIBLE mode to avoid the blocked task
1379  	 * warning and work with loadavg.
1380  	 */
1381  	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
1382  				n->dev->name, n->napi_id);
1383  	if (IS_ERR(n->thread)) {
1384  		err = PTR_ERR(n->thread);
1385  		pr_err("kthread_run failed with err %d\n", err);
1386  		n->thread = NULL;
1387  	}
1388  
1389  	return err;
1390  }
1391  
1392  static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1393  {
1394  	const struct net_device_ops *ops = dev->netdev_ops;
1395  	int ret;
1396  
1397  	ASSERT_RTNL();
1398  	dev_addr_check(dev);
1399  
1400  	if (!netif_device_present(dev)) {
1401  		/* may be detached because parent is runtime-suspended */
1402  		if (dev->dev.parent)
1403  			pm_runtime_resume(dev->dev.parent);
1404  		if (!netif_device_present(dev))
1405  			return -ENODEV;
1406  	}
1407  
1408  	/* Block netpoll from trying to do any rx path servicing.
1409  	 * If we don't do this there is a chance ndo_poll_controller
1410  	 * or ndo_poll may be running while we open the device
1411  	 */
1412  	netpoll_poll_disable(dev);
1413  
1414  	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1415  	ret = notifier_to_errno(ret);
1416  	if (ret)
1417  		return ret;
1418  
1419  	set_bit(__LINK_STATE_START, &dev->state);
1420  
1421  	if (ops->ndo_validate_addr)
1422  		ret = ops->ndo_validate_addr(dev);
1423  
1424  	if (!ret && ops->ndo_open)
1425  		ret = ops->ndo_open(dev);
1426  
1427  	netpoll_poll_enable(dev);
1428  
1429  	if (ret)
1430  		clear_bit(__LINK_STATE_START, &dev->state);
1431  	else {
1432  		dev->flags |= IFF_UP;
1433  		dev_set_rx_mode(dev);
1434  		dev_activate(dev);
1435  		add_device_randomness(dev->dev_addr, dev->addr_len);
1436  	}
1437  
1438  	return ret;
1439  }
1440  
1441  /**
1442   *	dev_open	- prepare an interface for use.
1443   *	@dev: device to open
1444   *	@extack: netlink extended ack
1445   *
1446   *	Takes a device from down to up state. The device's private open
1447   *	function is invoked and then the multicast lists are loaded. Finally
1448   *	the device is moved into the up state and a %NETDEV_UP message is
1449   *	sent to the netdev notifier chain.
1450   *
1451   *	Calling this function on an active interface is a nop. On a failure
1452   *	a negative errno code is returned.
1453   */
1454  int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1455  {
1456  	int ret;
1457  
1458  	if (dev->flags & IFF_UP)
1459  		return 0;
1460  
1461  	ret = __dev_open(dev, extack);
1462  	if (ret < 0)
1463  		return ret;
1464  
1465  	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1466  	call_netdevice_notifiers(NETDEV_UP, dev);
1467  
1468  	return ret;
1469  }
1470  EXPORT_SYMBOL(dev_open);
1471  
1472  static void __dev_close_many(struct list_head *head)
1473  {
1474  	struct net_device *dev;
1475  
1476  	ASSERT_RTNL();
1477  	might_sleep();
1478  
1479  	list_for_each_entry(dev, head, close_list) {
1480  		/* Temporarily disable netpoll until the interface is down */
1481  		netpoll_poll_disable(dev);
1482  
1483  		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1484  
1485  		clear_bit(__LINK_STATE_START, &dev->state);
1486  
1487  		/* Synchronize to scheduled poll. We cannot touch poll list, it
1488  		 * can be even on different cpu. So just clear netif_running().
1489  		 *
1490  		 * dev->stop() will invoke napi_disable() on all of it's
1491  		 * napi_struct instances on this device.
1492  		 */
1493  		smp_mb__after_atomic(); /* Commit netif_running(). */
1494  	}
1495  
1496  	dev_deactivate_many(head);
1497  
1498  	list_for_each_entry(dev, head, close_list) {
1499  		const struct net_device_ops *ops = dev->netdev_ops;
1500  
1501  		/*
1502  		 *	Call the device specific close. This cannot fail.
1503  		 *	Only if device is UP
1504  		 *
1505  		 *	We allow it to be called even after a DETACH hot-plug
1506  		 *	event.
1507  		 */
1508  		if (ops->ndo_stop)
1509  			ops->ndo_stop(dev);
1510  
1511  		dev->flags &= ~IFF_UP;
1512  		netpoll_poll_enable(dev);
1513  	}
1514  }
1515  
1516  static void __dev_close(struct net_device *dev)
1517  {
1518  	LIST_HEAD(single);
1519  
1520  	list_add(&dev->close_list, &single);
1521  	__dev_close_many(&single);
1522  	list_del(&single);
1523  }
1524  
1525  void dev_close_many(struct list_head *head, bool unlink)
1526  {
1527  	struct net_device *dev, *tmp;
1528  
1529  	/* Remove the devices that don't need to be closed */
1530  	list_for_each_entry_safe(dev, tmp, head, close_list)
1531  		if (!(dev->flags & IFF_UP))
1532  			list_del_init(&dev->close_list);
1533  
1534  	__dev_close_many(head);
1535  
1536  	list_for_each_entry_safe(dev, tmp, head, close_list) {
1537  		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1538  		call_netdevice_notifiers(NETDEV_DOWN, dev);
1539  		if (unlink)
1540  			list_del_init(&dev->close_list);
1541  	}
1542  }
1543  EXPORT_SYMBOL(dev_close_many);
1544  
1545  /**
1546   *	dev_close - shutdown an interface.
1547   *	@dev: device to shutdown
1548   *
1549   *	This function moves an active device into down state. A
1550   *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1551   *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1552   *	chain.
1553   */
1554  void dev_close(struct net_device *dev)
1555  {
1556  	if (dev->flags & IFF_UP) {
1557  		LIST_HEAD(single);
1558  
1559  		list_add(&dev->close_list, &single);
1560  		dev_close_many(&single, true);
1561  		list_del(&single);
1562  	}
1563  }
1564  EXPORT_SYMBOL(dev_close);
1565  
1566  
1567  /**
1568   *	dev_disable_lro - disable Large Receive Offload on a device
1569   *	@dev: device
1570   *
1571   *	Disable Large Receive Offload (LRO) on a net device.  Must be
1572   *	called under RTNL.  This is needed if received packets may be
1573   *	forwarded to another interface.
1574   */
1575  void dev_disable_lro(struct net_device *dev)
1576  {
1577  	struct net_device *lower_dev;
1578  	struct list_head *iter;
1579  
1580  	dev->wanted_features &= ~NETIF_F_LRO;
1581  	netdev_update_features(dev);
1582  
1583  	if (unlikely(dev->features & NETIF_F_LRO))
1584  		netdev_WARN(dev, "failed to disable LRO!\n");
1585  
1586  	netdev_for_each_lower_dev(dev, lower_dev, iter)
1587  		dev_disable_lro(lower_dev);
1588  }
1589  EXPORT_SYMBOL(dev_disable_lro);
1590  
1591  /**
1592   *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1593   *	@dev: device
1594   *
1595   *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1596   *	called under RTNL.  This is needed if Generic XDP is installed on
1597   *	the device.
1598   */
1599  static void dev_disable_gro_hw(struct net_device *dev)
1600  {
1601  	dev->wanted_features &= ~NETIF_F_GRO_HW;
1602  	netdev_update_features(dev);
1603  
1604  	if (unlikely(dev->features & NETIF_F_GRO_HW))
1605  		netdev_WARN(dev, "failed to disable GRO_HW!\n");
1606  }
1607  
1608  const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1609  {
1610  #define N(val) 						\
1611  	case NETDEV_##val:				\
1612  		return "NETDEV_" __stringify(val);
1613  	switch (cmd) {
1614  	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1615  	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1616  	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1617  	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1618  	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1619  	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1620  	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1621  	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1622  	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1623  	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
1624  	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
1625  	}
1626  #undef N
1627  	return "UNKNOWN_NETDEV_EVENT";
1628  }
1629  EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1630  
1631  static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1632  				   struct net_device *dev)
1633  {
1634  	struct netdev_notifier_info info = {
1635  		.dev = dev,
1636  	};
1637  
1638  	return nb->notifier_call(nb, val, &info);
1639  }
1640  
1641  static int call_netdevice_register_notifiers(struct notifier_block *nb,
1642  					     struct net_device *dev)
1643  {
1644  	int err;
1645  
1646  	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1647  	err = notifier_to_errno(err);
1648  	if (err)
1649  		return err;
1650  
1651  	if (!(dev->flags & IFF_UP))
1652  		return 0;
1653  
1654  	call_netdevice_notifier(nb, NETDEV_UP, dev);
1655  	return 0;
1656  }
1657  
1658  static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1659  						struct net_device *dev)
1660  {
1661  	if (dev->flags & IFF_UP) {
1662  		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1663  					dev);
1664  		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1665  	}
1666  	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1667  }
1668  
1669  static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1670  						 struct net *net)
1671  {
1672  	struct net_device *dev;
1673  	int err;
1674  
1675  	for_each_netdev(net, dev) {
1676  		err = call_netdevice_register_notifiers(nb, dev);
1677  		if (err)
1678  			goto rollback;
1679  	}
1680  	return 0;
1681  
1682  rollback:
1683  	for_each_netdev_continue_reverse(net, dev)
1684  		call_netdevice_unregister_notifiers(nb, dev);
1685  	return err;
1686  }
1687  
1688  static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1689  						    struct net *net)
1690  {
1691  	struct net_device *dev;
1692  
1693  	for_each_netdev(net, dev)
1694  		call_netdevice_unregister_notifiers(nb, dev);
1695  }
1696  
1697  static int dev_boot_phase = 1;
1698  
1699  /**
1700   * register_netdevice_notifier - register a network notifier block
1701   * @nb: notifier
1702   *
1703   * Register a notifier to be called when network device events occur.
1704   * The notifier passed is linked into the kernel structures and must
1705   * not be reused until it has been unregistered. A negative errno code
1706   * is returned on a failure.
1707   *
1708   * When registered all registration and up events are replayed
1709   * to the new notifier to allow device to have a race free
1710   * view of the network device list.
1711   */
1712  
1713  int register_netdevice_notifier(struct notifier_block *nb)
1714  {
1715  	struct net *net;
1716  	int err;
1717  
1718  	/* Close race with setup_net() and cleanup_net() */
1719  	down_write(&pernet_ops_rwsem);
1720  	rtnl_lock();
1721  	err = raw_notifier_chain_register(&netdev_chain, nb);
1722  	if (err)
1723  		goto unlock;
1724  	if (dev_boot_phase)
1725  		goto unlock;
1726  	for_each_net(net) {
1727  		err = call_netdevice_register_net_notifiers(nb, net);
1728  		if (err)
1729  			goto rollback;
1730  	}
1731  
1732  unlock:
1733  	rtnl_unlock();
1734  	up_write(&pernet_ops_rwsem);
1735  	return err;
1736  
1737  rollback:
1738  	for_each_net_continue_reverse(net)
1739  		call_netdevice_unregister_net_notifiers(nb, net);
1740  
1741  	raw_notifier_chain_unregister(&netdev_chain, nb);
1742  	goto unlock;
1743  }
1744  EXPORT_SYMBOL(register_netdevice_notifier);
1745  
1746  /**
1747   * unregister_netdevice_notifier - unregister a network notifier block
1748   * @nb: notifier
1749   *
1750   * Unregister a notifier previously registered by
1751   * register_netdevice_notifier(). The notifier is unlinked into the
1752   * kernel structures and may then be reused. A negative errno code
1753   * is returned on a failure.
1754   *
1755   * After unregistering unregister and down device events are synthesized
1756   * for all devices on the device list to the removed notifier to remove
1757   * the need for special case cleanup code.
1758   */
1759  
1760  int unregister_netdevice_notifier(struct notifier_block *nb)
1761  {
1762  	struct net *net;
1763  	int err;
1764  
1765  	/* Close race with setup_net() and cleanup_net() */
1766  	down_write(&pernet_ops_rwsem);
1767  	rtnl_lock();
1768  	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1769  	if (err)
1770  		goto unlock;
1771  
1772  	for_each_net(net)
1773  		call_netdevice_unregister_net_notifiers(nb, net);
1774  
1775  unlock:
1776  	rtnl_unlock();
1777  	up_write(&pernet_ops_rwsem);
1778  	return err;
1779  }
1780  EXPORT_SYMBOL(unregister_netdevice_notifier);
1781  
1782  static int __register_netdevice_notifier_net(struct net *net,
1783  					     struct notifier_block *nb,
1784  					     bool ignore_call_fail)
1785  {
1786  	int err;
1787  
1788  	err = raw_notifier_chain_register(&net->netdev_chain, nb);
1789  	if (err)
1790  		return err;
1791  	if (dev_boot_phase)
1792  		return 0;
1793  
1794  	err = call_netdevice_register_net_notifiers(nb, net);
1795  	if (err && !ignore_call_fail)
1796  		goto chain_unregister;
1797  
1798  	return 0;
1799  
1800  chain_unregister:
1801  	raw_notifier_chain_unregister(&net->netdev_chain, nb);
1802  	return err;
1803  }
1804  
1805  static int __unregister_netdevice_notifier_net(struct net *net,
1806  					       struct notifier_block *nb)
1807  {
1808  	int err;
1809  
1810  	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1811  	if (err)
1812  		return err;
1813  
1814  	call_netdevice_unregister_net_notifiers(nb, net);
1815  	return 0;
1816  }
1817  
1818  /**
1819   * register_netdevice_notifier_net - register a per-netns network notifier block
1820   * @net: network namespace
1821   * @nb: notifier
1822   *
1823   * Register a notifier to be called when network device events occur.
1824   * The notifier passed is linked into the kernel structures and must
1825   * not be reused until it has been unregistered. A negative errno code
1826   * is returned on a failure.
1827   *
1828   * When registered all registration and up events are replayed
1829   * to the new notifier to allow device to have a race free
1830   * view of the network device list.
1831   */
1832  
1833  int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1834  {
1835  	int err;
1836  
1837  	rtnl_lock();
1838  	err = __register_netdevice_notifier_net(net, nb, false);
1839  	rtnl_unlock();
1840  	return err;
1841  }
1842  EXPORT_SYMBOL(register_netdevice_notifier_net);
1843  
1844  /**
1845   * unregister_netdevice_notifier_net - unregister a per-netns
1846   *                                     network notifier block
1847   * @net: network namespace
1848   * @nb: notifier
1849   *
1850   * Unregister a notifier previously registered by
1851   * register_netdevice_notifier(). The notifier is unlinked into the
1852   * kernel structures and may then be reused. A negative errno code
1853   * is returned on a failure.
1854   *
1855   * After unregistering unregister and down device events are synthesized
1856   * for all devices on the device list to the removed notifier to remove
1857   * the need for special case cleanup code.
1858   */
1859  
1860  int unregister_netdevice_notifier_net(struct net *net,
1861  				      struct notifier_block *nb)
1862  {
1863  	int err;
1864  
1865  	rtnl_lock();
1866  	err = __unregister_netdevice_notifier_net(net, nb);
1867  	rtnl_unlock();
1868  	return err;
1869  }
1870  EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1871  
1872  int register_netdevice_notifier_dev_net(struct net_device *dev,
1873  					struct notifier_block *nb,
1874  					struct netdev_net_notifier *nn)
1875  {
1876  	int err;
1877  
1878  	rtnl_lock();
1879  	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1880  	if (!err) {
1881  		nn->nb = nb;
1882  		list_add(&nn->list, &dev->net_notifier_list);
1883  	}
1884  	rtnl_unlock();
1885  	return err;
1886  }
1887  EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
1888  
1889  int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1890  					  struct notifier_block *nb,
1891  					  struct netdev_net_notifier *nn)
1892  {
1893  	int err;
1894  
1895  	rtnl_lock();
1896  	list_del(&nn->list);
1897  	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
1898  	rtnl_unlock();
1899  	return err;
1900  }
1901  EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
1902  
1903  static void move_netdevice_notifiers_dev_net(struct net_device *dev,
1904  					     struct net *net)
1905  {
1906  	struct netdev_net_notifier *nn;
1907  
1908  	list_for_each_entry(nn, &dev->net_notifier_list, list) {
1909  		__unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
1910  		__register_netdevice_notifier_net(net, nn->nb, true);
1911  	}
1912  }
1913  
1914  /**
1915   *	call_netdevice_notifiers_info - call all network notifier blocks
1916   *	@val: value passed unmodified to notifier function
1917   *	@info: notifier information data
1918   *
1919   *	Call all network notifier blocks.  Parameters and return value
1920   *	are as for raw_notifier_call_chain().
1921   */
1922  
1923  static int call_netdevice_notifiers_info(unsigned long val,
1924  					 struct netdev_notifier_info *info)
1925  {
1926  	struct net *net = dev_net(info->dev);
1927  	int ret;
1928  
1929  	ASSERT_RTNL();
1930  
1931  	/* Run per-netns notifier block chain first, then run the global one.
1932  	 * Hopefully, one day, the global one is going to be removed after
1933  	 * all notifier block registrators get converted to be per-netns.
1934  	 */
1935  	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
1936  	if (ret & NOTIFY_STOP_MASK)
1937  		return ret;
1938  	return raw_notifier_call_chain(&netdev_chain, val, info);
1939  }
1940  
1941  /**
1942   *	call_netdevice_notifiers_info_robust - call per-netns notifier blocks
1943   *	                                       for and rollback on error
1944   *	@val_up: value passed unmodified to notifier function
1945   *	@val_down: value passed unmodified to the notifier function when
1946   *	           recovering from an error on @val_up
1947   *	@info: notifier information data
1948   *
1949   *	Call all per-netns network notifier blocks, but not notifier blocks on
1950   *	the global notifier chain. Parameters and return value are as for
1951   *	raw_notifier_call_chain_robust().
1952   */
1953  
1954  static int
1955  call_netdevice_notifiers_info_robust(unsigned long val_up,
1956  				     unsigned long val_down,
1957  				     struct netdev_notifier_info *info)
1958  {
1959  	struct net *net = dev_net(info->dev);
1960  
1961  	ASSERT_RTNL();
1962  
1963  	return raw_notifier_call_chain_robust(&net->netdev_chain,
1964  					      val_up, val_down, info);
1965  }
1966  
1967  static int call_netdevice_notifiers_extack(unsigned long val,
1968  					   struct net_device *dev,
1969  					   struct netlink_ext_ack *extack)
1970  {
1971  	struct netdev_notifier_info info = {
1972  		.dev = dev,
1973  		.extack = extack,
1974  	};
1975  
1976  	return call_netdevice_notifiers_info(val, &info);
1977  }
1978  
1979  /**
1980   *	call_netdevice_notifiers - call all network notifier blocks
1981   *      @val: value passed unmodified to notifier function
1982   *      @dev: net_device pointer passed unmodified to notifier function
1983   *
1984   *	Call all network notifier blocks.  Parameters and return value
1985   *	are as for raw_notifier_call_chain().
1986   */
1987  
1988  int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1989  {
1990  	return call_netdevice_notifiers_extack(val, dev, NULL);
1991  }
1992  EXPORT_SYMBOL(call_netdevice_notifiers);
1993  
1994  /**
1995   *	call_netdevice_notifiers_mtu - call all network notifier blocks
1996   *	@val: value passed unmodified to notifier function
1997   *	@dev: net_device pointer passed unmodified to notifier function
1998   *	@arg: additional u32 argument passed to the notifier function
1999   *
2000   *	Call all network notifier blocks.  Parameters and return value
2001   *	are as for raw_notifier_call_chain().
2002   */
2003  static int call_netdevice_notifiers_mtu(unsigned long val,
2004  					struct net_device *dev, u32 arg)
2005  {
2006  	struct netdev_notifier_info_ext info = {
2007  		.info.dev = dev,
2008  		.ext.mtu = arg,
2009  	};
2010  
2011  	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2012  
2013  	return call_netdevice_notifiers_info(val, &info.info);
2014  }
2015  
2016  #ifdef CONFIG_NET_INGRESS
2017  static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2018  
2019  void net_inc_ingress_queue(void)
2020  {
2021  	static_branch_inc(&ingress_needed_key);
2022  }
2023  EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2024  
2025  void net_dec_ingress_queue(void)
2026  {
2027  	static_branch_dec(&ingress_needed_key);
2028  }
2029  EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2030  #endif
2031  
2032  #ifdef CONFIG_NET_EGRESS
2033  static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2034  
2035  void net_inc_egress_queue(void)
2036  {
2037  	static_branch_inc(&egress_needed_key);
2038  }
2039  EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2040  
2041  void net_dec_egress_queue(void)
2042  {
2043  	static_branch_dec(&egress_needed_key);
2044  }
2045  EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2046  #endif
2047  
2048  DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2049  EXPORT_SYMBOL(netstamp_needed_key);
2050  #ifdef CONFIG_JUMP_LABEL
2051  static atomic_t netstamp_needed_deferred;
2052  static atomic_t netstamp_wanted;
2053  static void netstamp_clear(struct work_struct *work)
2054  {
2055  	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2056  	int wanted;
2057  
2058  	wanted = atomic_add_return(deferred, &netstamp_wanted);
2059  	if (wanted > 0)
2060  		static_branch_enable(&netstamp_needed_key);
2061  	else
2062  		static_branch_disable(&netstamp_needed_key);
2063  }
2064  static DECLARE_WORK(netstamp_work, netstamp_clear);
2065  #endif
2066  
2067  void net_enable_timestamp(void)
2068  {
2069  #ifdef CONFIG_JUMP_LABEL
2070  	int wanted;
2071  
2072  	while (1) {
2073  		wanted = atomic_read(&netstamp_wanted);
2074  		if (wanted <= 0)
2075  			break;
2076  		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
2077  			return;
2078  	}
2079  	atomic_inc(&netstamp_needed_deferred);
2080  	schedule_work(&netstamp_work);
2081  #else
2082  	static_branch_inc(&netstamp_needed_key);
2083  #endif
2084  }
2085  EXPORT_SYMBOL(net_enable_timestamp);
2086  
2087  void net_disable_timestamp(void)
2088  {
2089  #ifdef CONFIG_JUMP_LABEL
2090  	int wanted;
2091  
2092  	while (1) {
2093  		wanted = atomic_read(&netstamp_wanted);
2094  		if (wanted <= 1)
2095  			break;
2096  		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
2097  			return;
2098  	}
2099  	atomic_dec(&netstamp_needed_deferred);
2100  	schedule_work(&netstamp_work);
2101  #else
2102  	static_branch_dec(&netstamp_needed_key);
2103  #endif
2104  }
2105  EXPORT_SYMBOL(net_disable_timestamp);
2106  
2107  static inline void net_timestamp_set(struct sk_buff *skb)
2108  {
2109  	skb->tstamp = 0;
2110  	skb->mono_delivery_time = 0;
2111  	if (static_branch_unlikely(&netstamp_needed_key))
2112  		skb->tstamp = ktime_get_real();
2113  }
2114  
2115  #define net_timestamp_check(COND, SKB)				\
2116  	if (static_branch_unlikely(&netstamp_needed_key)) {	\
2117  		if ((COND) && !(SKB)->tstamp)			\
2118  			(SKB)->tstamp = ktime_get_real();	\
2119  	}							\
2120  
2121  bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2122  {
2123  	return __is_skb_forwardable(dev, skb, true);
2124  }
2125  EXPORT_SYMBOL_GPL(is_skb_forwardable);
2126  
2127  static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
2128  			      bool check_mtu)
2129  {
2130  	int ret = ____dev_forward_skb(dev, skb, check_mtu);
2131  
2132  	if (likely(!ret)) {
2133  		skb->protocol = eth_type_trans(skb, dev);
2134  		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2135  	}
2136  
2137  	return ret;
2138  }
2139  
2140  int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2141  {
2142  	return __dev_forward_skb2(dev, skb, true);
2143  }
2144  EXPORT_SYMBOL_GPL(__dev_forward_skb);
2145  
2146  /**
2147   * dev_forward_skb - loopback an skb to another netif
2148   *
2149   * @dev: destination network device
2150   * @skb: buffer to forward
2151   *
2152   * return values:
2153   *	NET_RX_SUCCESS	(no congestion)
2154   *	NET_RX_DROP     (packet was dropped, but freed)
2155   *
2156   * dev_forward_skb can be used for injecting an skb from the
2157   * start_xmit function of one device into the receive queue
2158   * of another device.
2159   *
2160   * The receiving device may be in another namespace, so
2161   * we have to clear all information in the skb that could
2162   * impact namespace isolation.
2163   */
2164  int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2165  {
2166  	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2167  }
2168  EXPORT_SYMBOL_GPL(dev_forward_skb);
2169  
2170  int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
2171  {
2172  	return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
2173  }
2174  
2175  static inline int deliver_skb(struct sk_buff *skb,
2176  			      struct packet_type *pt_prev,
2177  			      struct net_device *orig_dev)
2178  {
2179  	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2180  		return -ENOMEM;
2181  	refcount_inc(&skb->users);
2182  	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2183  }
2184  
2185  static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2186  					  struct packet_type **pt,
2187  					  struct net_device *orig_dev,
2188  					  __be16 type,
2189  					  struct list_head *ptype_list)
2190  {
2191  	struct packet_type *ptype, *pt_prev = *pt;
2192  
2193  	list_for_each_entry_rcu(ptype, ptype_list, list) {
2194  		if (ptype->type != type)
2195  			continue;
2196  		if (pt_prev)
2197  			deliver_skb(skb, pt_prev, orig_dev);
2198  		pt_prev = ptype;
2199  	}
2200  	*pt = pt_prev;
2201  }
2202  
2203  static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2204  {
2205  	if (!ptype->af_packet_priv || !skb->sk)
2206  		return false;
2207  
2208  	if (ptype->id_match)
2209  		return ptype->id_match(ptype, skb->sk);
2210  	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2211  		return true;
2212  
2213  	return false;
2214  }
2215  
2216  /**
2217   * dev_nit_active - return true if any network interface taps are in use
2218   *
2219   * @dev: network device to check for the presence of taps
2220   */
2221  bool dev_nit_active(struct net_device *dev)
2222  {
2223  	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2224  }
2225  EXPORT_SYMBOL_GPL(dev_nit_active);
2226  
2227  /*
2228   *	Support routine. Sends outgoing frames to any network
2229   *	taps currently in use.
2230   */
2231  
2232  void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2233  {
2234  	struct packet_type *ptype;
2235  	struct sk_buff *skb2 = NULL;
2236  	struct packet_type *pt_prev = NULL;
2237  	struct list_head *ptype_list = &ptype_all;
2238  
2239  	rcu_read_lock();
2240  again:
2241  	list_for_each_entry_rcu(ptype, ptype_list, list) {
2242  		if (ptype->ignore_outgoing)
2243  			continue;
2244  
2245  		/* Never send packets back to the socket
2246  		 * they originated from - MvS (miquels@drinkel.ow.org)
2247  		 */
2248  		if (skb_loop_sk(ptype, skb))
2249  			continue;
2250  
2251  		if (pt_prev) {
2252  			deliver_skb(skb2, pt_prev, skb->dev);
2253  			pt_prev = ptype;
2254  			continue;
2255  		}
2256  
2257  		/* need to clone skb, done only once */
2258  		skb2 = skb_clone(skb, GFP_ATOMIC);
2259  		if (!skb2)
2260  			goto out_unlock;
2261  
2262  		net_timestamp_set(skb2);
2263  
2264  		/* skb->nh should be correctly
2265  		 * set by sender, so that the second statement is
2266  		 * just protection against buggy protocols.
2267  		 */
2268  		skb_reset_mac_header(skb2);
2269  
2270  		if (skb_network_header(skb2) < skb2->data ||
2271  		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2272  			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2273  					     ntohs(skb2->protocol),
2274  					     dev->name);
2275  			skb_reset_network_header(skb2);
2276  		}
2277  
2278  		skb2->transport_header = skb2->network_header;
2279  		skb2->pkt_type = PACKET_OUTGOING;
2280  		pt_prev = ptype;
2281  	}
2282  
2283  	if (ptype_list == &ptype_all) {
2284  		ptype_list = &dev->ptype_all;
2285  		goto again;
2286  	}
2287  out_unlock:
2288  	if (pt_prev) {
2289  		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2290  			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2291  		else
2292  			kfree_skb(skb2);
2293  	}
2294  	rcu_read_unlock();
2295  }
2296  EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2297  
2298  /**
2299   * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2300   * @dev: Network device
2301   * @txq: number of queues available
2302   *
2303   * If real_num_tx_queues is changed the tc mappings may no longer be
2304   * valid. To resolve this verify the tc mapping remains valid and if
2305   * not NULL the mapping. With no priorities mapping to this
2306   * offset/count pair it will no longer be used. In the worst case TC0
2307   * is invalid nothing can be done so disable priority mappings. If is
2308   * expected that drivers will fix this mapping if they can before
2309   * calling netif_set_real_num_tx_queues.
2310   */
2311  static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2312  {
2313  	int i;
2314  	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2315  
2316  	/* If TC0 is invalidated disable TC mapping */
2317  	if (tc->offset + tc->count > txq) {
2318  		netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2319  		dev->num_tc = 0;
2320  		return;
2321  	}
2322  
2323  	/* Invalidated prio to tc mappings set to TC0 */
2324  	for (i = 1; i < TC_BITMASK + 1; i++) {
2325  		int q = netdev_get_prio_tc_map(dev, i);
2326  
2327  		tc = &dev->tc_to_txq[q];
2328  		if (tc->offset + tc->count > txq) {
2329  			netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2330  				    i, q);
2331  			netdev_set_prio_tc_map(dev, i, 0);
2332  		}
2333  	}
2334  }
2335  
2336  int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2337  {
2338  	if (dev->num_tc) {
2339  		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2340  		int i;
2341  
2342  		/* walk through the TCs and see if it falls into any of them */
2343  		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2344  			if ((txq - tc->offset) < tc->count)
2345  				return i;
2346  		}
2347  
2348  		/* didn't find it, just return -1 to indicate no match */
2349  		return -1;
2350  	}
2351  
2352  	return 0;
2353  }
2354  EXPORT_SYMBOL(netdev_txq_to_tc);
2355  
2356  #ifdef CONFIG_XPS
2357  static struct static_key xps_needed __read_mostly;
2358  static struct static_key xps_rxqs_needed __read_mostly;
2359  static DEFINE_MUTEX(xps_map_mutex);
2360  #define xmap_dereference(P)		\
2361  	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2362  
2363  static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2364  			     struct xps_dev_maps *old_maps, int tci, u16 index)
2365  {
2366  	struct xps_map *map = NULL;
2367  	int pos;
2368  
2369  	if (dev_maps)
2370  		map = xmap_dereference(dev_maps->attr_map[tci]);
2371  	if (!map)
2372  		return false;
2373  
2374  	for (pos = map->len; pos--;) {
2375  		if (map->queues[pos] != index)
2376  			continue;
2377  
2378  		if (map->len > 1) {
2379  			map->queues[pos] = map->queues[--map->len];
2380  			break;
2381  		}
2382  
2383  		if (old_maps)
2384  			RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
2385  		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2386  		kfree_rcu(map, rcu);
2387  		return false;
2388  	}
2389  
2390  	return true;
2391  }
2392  
2393  static bool remove_xps_queue_cpu(struct net_device *dev,
2394  				 struct xps_dev_maps *dev_maps,
2395  				 int cpu, u16 offset, u16 count)
2396  {
2397  	int num_tc = dev_maps->num_tc;
2398  	bool active = false;
2399  	int tci;
2400  
2401  	for (tci = cpu * num_tc; num_tc--; tci++) {
2402  		int i, j;
2403  
2404  		for (i = count, j = offset; i--; j++) {
2405  			if (!remove_xps_queue(dev_maps, NULL, tci, j))
2406  				break;
2407  		}
2408  
2409  		active |= i < 0;
2410  	}
2411  
2412  	return active;
2413  }
2414  
2415  static void reset_xps_maps(struct net_device *dev,
2416  			   struct xps_dev_maps *dev_maps,
2417  			   enum xps_map_type type)
2418  {
2419  	static_key_slow_dec_cpuslocked(&xps_needed);
2420  	if (type == XPS_RXQS)
2421  		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2422  
2423  	RCU_INIT_POINTER(dev->xps_maps[type], NULL);
2424  
2425  	kfree_rcu(dev_maps, rcu);
2426  }
2427  
2428  static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
2429  			   u16 offset, u16 count)
2430  {
2431  	struct xps_dev_maps *dev_maps;
2432  	bool active = false;
2433  	int i, j;
2434  
2435  	dev_maps = xmap_dereference(dev->xps_maps[type]);
2436  	if (!dev_maps)
2437  		return;
2438  
2439  	for (j = 0; j < dev_maps->nr_ids; j++)
2440  		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
2441  	if (!active)
2442  		reset_xps_maps(dev, dev_maps, type);
2443  
2444  	if (type == XPS_CPUS) {
2445  		for (i = offset + (count - 1); count--; i--)
2446  			netdev_queue_numa_node_write(
2447  				netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
2448  	}
2449  }
2450  
2451  static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2452  				   u16 count)
2453  {
2454  	if (!static_key_false(&xps_needed))
2455  		return;
2456  
2457  	cpus_read_lock();
2458  	mutex_lock(&xps_map_mutex);
2459  
2460  	if (static_key_false(&xps_rxqs_needed))
2461  		clean_xps_maps(dev, XPS_RXQS, offset, count);
2462  
2463  	clean_xps_maps(dev, XPS_CPUS, offset, count);
2464  
2465  	mutex_unlock(&xps_map_mutex);
2466  	cpus_read_unlock();
2467  }
2468  
2469  static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2470  {
2471  	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2472  }
2473  
2474  static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2475  				      u16 index, bool is_rxqs_map)
2476  {
2477  	struct xps_map *new_map;
2478  	int alloc_len = XPS_MIN_MAP_ALLOC;
2479  	int i, pos;
2480  
2481  	for (pos = 0; map && pos < map->len; pos++) {
2482  		if (map->queues[pos] != index)
2483  			continue;
2484  		return map;
2485  	}
2486  
2487  	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
2488  	if (map) {
2489  		if (pos < map->alloc_len)
2490  			return map;
2491  
2492  		alloc_len = map->alloc_len * 2;
2493  	}
2494  
2495  	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2496  	 *  map
2497  	 */
2498  	if (is_rxqs_map)
2499  		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2500  	else
2501  		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2502  				       cpu_to_node(attr_index));
2503  	if (!new_map)
2504  		return NULL;
2505  
2506  	for (i = 0; i < pos; i++)
2507  		new_map->queues[i] = map->queues[i];
2508  	new_map->alloc_len = alloc_len;
2509  	new_map->len = pos;
2510  
2511  	return new_map;
2512  }
2513  
2514  /* Copy xps maps at a given index */
2515  static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
2516  			      struct xps_dev_maps *new_dev_maps, int index,
2517  			      int tc, bool skip_tc)
2518  {
2519  	int i, tci = index * dev_maps->num_tc;
2520  	struct xps_map *map;
2521  
2522  	/* copy maps belonging to foreign traffic classes */
2523  	for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2524  		if (i == tc && skip_tc)
2525  			continue;
2526  
2527  		/* fill in the new device map from the old device map */
2528  		map = xmap_dereference(dev_maps->attr_map[tci]);
2529  		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2530  	}
2531  }
2532  
2533  /* Must be called under cpus_read_lock */
2534  int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2535  			  u16 index, enum xps_map_type type)
2536  {
2537  	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
2538  	const unsigned long *online_mask = NULL;
2539  	bool active = false, copy = false;
2540  	int i, j, tci, numa_node_id = -2;
2541  	int maps_sz, num_tc = 1, tc = 0;
2542  	struct xps_map *map, *new_map;
2543  	unsigned int nr_ids;
2544  
2545  	if (dev->num_tc) {
2546  		/* Do not allow XPS on subordinate device directly */
2547  		num_tc = dev->num_tc;
2548  		if (num_tc < 0)
2549  			return -EINVAL;
2550  
2551  		/* If queue belongs to subordinate dev use its map */
2552  		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2553  
2554  		tc = netdev_txq_to_tc(dev, index);
2555  		if (tc < 0)
2556  			return -EINVAL;
2557  	}
2558  
2559  	mutex_lock(&xps_map_mutex);
2560  
2561  	dev_maps = xmap_dereference(dev->xps_maps[type]);
2562  	if (type == XPS_RXQS) {
2563  		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2564  		nr_ids = dev->num_rx_queues;
2565  	} else {
2566  		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2567  		if (num_possible_cpus() > 1)
2568  			online_mask = cpumask_bits(cpu_online_mask);
2569  		nr_ids = nr_cpu_ids;
2570  	}
2571  
2572  	if (maps_sz < L1_CACHE_BYTES)
2573  		maps_sz = L1_CACHE_BYTES;
2574  
2575  	/* The old dev_maps could be larger or smaller than the one we're
2576  	 * setting up now, as dev->num_tc or nr_ids could have been updated in
2577  	 * between. We could try to be smart, but let's be safe instead and only
2578  	 * copy foreign traffic classes if the two map sizes match.
2579  	 */
2580  	if (dev_maps &&
2581  	    dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
2582  		copy = true;
2583  
2584  	/* allocate memory for queue storage */
2585  	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2586  	     j < nr_ids;) {
2587  		if (!new_dev_maps) {
2588  			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2589  			if (!new_dev_maps) {
2590  				mutex_unlock(&xps_map_mutex);
2591  				return -ENOMEM;
2592  			}
2593  
2594  			new_dev_maps->nr_ids = nr_ids;
2595  			new_dev_maps->num_tc = num_tc;
2596  		}
2597  
2598  		tci = j * num_tc + tc;
2599  		map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
2600  
2601  		map = expand_xps_map(map, j, index, type == XPS_RXQS);
2602  		if (!map)
2603  			goto error;
2604  
2605  		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2606  	}
2607  
2608  	if (!new_dev_maps)
2609  		goto out_no_new_maps;
2610  
2611  	if (!dev_maps) {
2612  		/* Increment static keys at most once per type */
2613  		static_key_slow_inc_cpuslocked(&xps_needed);
2614  		if (type == XPS_RXQS)
2615  			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2616  	}
2617  
2618  	for (j = 0; j < nr_ids; j++) {
2619  		bool skip_tc = false;
2620  
2621  		tci = j * num_tc + tc;
2622  		if (netif_attr_test_mask(j, mask, nr_ids) &&
2623  		    netif_attr_test_online(j, online_mask, nr_ids)) {
2624  			/* add tx-queue to CPU/rx-queue maps */
2625  			int pos = 0;
2626  
2627  			skip_tc = true;
2628  
2629  			map = xmap_dereference(new_dev_maps->attr_map[tci]);
2630  			while ((pos < map->len) && (map->queues[pos] != index))
2631  				pos++;
2632  
2633  			if (pos == map->len)
2634  				map->queues[map->len++] = index;
2635  #ifdef CONFIG_NUMA
2636  			if (type == XPS_CPUS) {
2637  				if (numa_node_id == -2)
2638  					numa_node_id = cpu_to_node(j);
2639  				else if (numa_node_id != cpu_to_node(j))
2640  					numa_node_id = -1;
2641  			}
2642  #endif
2643  		}
2644  
2645  		if (copy)
2646  			xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
2647  					  skip_tc);
2648  	}
2649  
2650  	rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
2651  
2652  	/* Cleanup old maps */
2653  	if (!dev_maps)
2654  		goto out_no_old_maps;
2655  
2656  	for (j = 0; j < dev_maps->nr_ids; j++) {
2657  		for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
2658  			map = xmap_dereference(dev_maps->attr_map[tci]);
2659  			if (!map)
2660  				continue;
2661  
2662  			if (copy) {
2663  				new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2664  				if (map == new_map)
2665  					continue;
2666  			}
2667  
2668  			RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2669  			kfree_rcu(map, rcu);
2670  		}
2671  	}
2672  
2673  	old_dev_maps = dev_maps;
2674  
2675  out_no_old_maps:
2676  	dev_maps = new_dev_maps;
2677  	active = true;
2678  
2679  out_no_new_maps:
2680  	if (type == XPS_CPUS)
2681  		/* update Tx queue numa node */
2682  		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2683  					     (numa_node_id >= 0) ?
2684  					     numa_node_id : NUMA_NO_NODE);
2685  
2686  	if (!dev_maps)
2687  		goto out_no_maps;
2688  
2689  	/* removes tx-queue from unused CPUs/rx-queues */
2690  	for (j = 0; j < dev_maps->nr_ids; j++) {
2691  		tci = j * dev_maps->num_tc;
2692  
2693  		for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2694  			if (i == tc &&
2695  			    netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
2696  			    netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
2697  				continue;
2698  
2699  			active |= remove_xps_queue(dev_maps,
2700  						   copy ? old_dev_maps : NULL,
2701  						   tci, index);
2702  		}
2703  	}
2704  
2705  	if (old_dev_maps)
2706  		kfree_rcu(old_dev_maps, rcu);
2707  
2708  	/* free map if not active */
2709  	if (!active)
2710  		reset_xps_maps(dev, dev_maps, type);
2711  
2712  out_no_maps:
2713  	mutex_unlock(&xps_map_mutex);
2714  
2715  	return 0;
2716  error:
2717  	/* remove any maps that we added */
2718  	for (j = 0; j < nr_ids; j++) {
2719  		for (i = num_tc, tci = j * num_tc; i--; tci++) {
2720  			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2721  			map = copy ?
2722  			      xmap_dereference(dev_maps->attr_map[tci]) :
2723  			      NULL;
2724  			if (new_map && new_map != map)
2725  				kfree(new_map);
2726  		}
2727  	}
2728  
2729  	mutex_unlock(&xps_map_mutex);
2730  
2731  	kfree(new_dev_maps);
2732  	return -ENOMEM;
2733  }
2734  EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2735  
2736  int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2737  			u16 index)
2738  {
2739  	int ret;
2740  
2741  	cpus_read_lock();
2742  	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
2743  	cpus_read_unlock();
2744  
2745  	return ret;
2746  }
2747  EXPORT_SYMBOL(netif_set_xps_queue);
2748  
2749  #endif
2750  static void netdev_unbind_all_sb_channels(struct net_device *dev)
2751  {
2752  	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2753  
2754  	/* Unbind any subordinate channels */
2755  	while (txq-- != &dev->_tx[0]) {
2756  		if (txq->sb_dev)
2757  			netdev_unbind_sb_channel(dev, txq->sb_dev);
2758  	}
2759  }
2760  
2761  void netdev_reset_tc(struct net_device *dev)
2762  {
2763  #ifdef CONFIG_XPS
2764  	netif_reset_xps_queues_gt(dev, 0);
2765  #endif
2766  	netdev_unbind_all_sb_channels(dev);
2767  
2768  	/* Reset TC configuration of device */
2769  	dev->num_tc = 0;
2770  	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2771  	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2772  }
2773  EXPORT_SYMBOL(netdev_reset_tc);
2774  
2775  int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2776  {
2777  	if (tc >= dev->num_tc)
2778  		return -EINVAL;
2779  
2780  #ifdef CONFIG_XPS
2781  	netif_reset_xps_queues(dev, offset, count);
2782  #endif
2783  	dev->tc_to_txq[tc].count = count;
2784  	dev->tc_to_txq[tc].offset = offset;
2785  	return 0;
2786  }
2787  EXPORT_SYMBOL(netdev_set_tc_queue);
2788  
2789  int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2790  {
2791  	if (num_tc > TC_MAX_QUEUE)
2792  		return -EINVAL;
2793  
2794  #ifdef CONFIG_XPS
2795  	netif_reset_xps_queues_gt(dev, 0);
2796  #endif
2797  	netdev_unbind_all_sb_channels(dev);
2798  
2799  	dev->num_tc = num_tc;
2800  	return 0;
2801  }
2802  EXPORT_SYMBOL(netdev_set_num_tc);
2803  
2804  void netdev_unbind_sb_channel(struct net_device *dev,
2805  			      struct net_device *sb_dev)
2806  {
2807  	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2808  
2809  #ifdef CONFIG_XPS
2810  	netif_reset_xps_queues_gt(sb_dev, 0);
2811  #endif
2812  	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2813  	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2814  
2815  	while (txq-- != &dev->_tx[0]) {
2816  		if (txq->sb_dev == sb_dev)
2817  			txq->sb_dev = NULL;
2818  	}
2819  }
2820  EXPORT_SYMBOL(netdev_unbind_sb_channel);
2821  
2822  int netdev_bind_sb_channel_queue(struct net_device *dev,
2823  				 struct net_device *sb_dev,
2824  				 u8 tc, u16 count, u16 offset)
2825  {
2826  	/* Make certain the sb_dev and dev are already configured */
2827  	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2828  		return -EINVAL;
2829  
2830  	/* We cannot hand out queues we don't have */
2831  	if ((offset + count) > dev->real_num_tx_queues)
2832  		return -EINVAL;
2833  
2834  	/* Record the mapping */
2835  	sb_dev->tc_to_txq[tc].count = count;
2836  	sb_dev->tc_to_txq[tc].offset = offset;
2837  
2838  	/* Provide a way for Tx queue to find the tc_to_txq map or
2839  	 * XPS map for itself.
2840  	 */
2841  	while (count--)
2842  		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2843  
2844  	return 0;
2845  }
2846  EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2847  
2848  int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2849  {
2850  	/* Do not use a multiqueue device to represent a subordinate channel */
2851  	if (netif_is_multiqueue(dev))
2852  		return -ENODEV;
2853  
2854  	/* We allow channels 1 - 32767 to be used for subordinate channels.
2855  	 * Channel 0 is meant to be "native" mode and used only to represent
2856  	 * the main root device. We allow writing 0 to reset the device back
2857  	 * to normal mode after being used as a subordinate channel.
2858  	 */
2859  	if (channel > S16_MAX)
2860  		return -EINVAL;
2861  
2862  	dev->num_tc = -channel;
2863  
2864  	return 0;
2865  }
2866  EXPORT_SYMBOL(netdev_set_sb_channel);
2867  
2868  /*
2869   * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2870   * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
2871   */
2872  int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2873  {
2874  	bool disabling;
2875  	int rc;
2876  
2877  	disabling = txq < dev->real_num_tx_queues;
2878  
2879  	if (txq < 1 || txq > dev->num_tx_queues)
2880  		return -EINVAL;
2881  
2882  	if (dev->reg_state == NETREG_REGISTERED ||
2883  	    dev->reg_state == NETREG_UNREGISTERING) {
2884  		ASSERT_RTNL();
2885  
2886  		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2887  						  txq);
2888  		if (rc)
2889  			return rc;
2890  
2891  		if (dev->num_tc)
2892  			netif_setup_tc(dev, txq);
2893  
2894  		dev_qdisc_change_real_num_tx(dev, txq);
2895  
2896  		dev->real_num_tx_queues = txq;
2897  
2898  		if (disabling) {
2899  			synchronize_net();
2900  			qdisc_reset_all_tx_gt(dev, txq);
2901  #ifdef CONFIG_XPS
2902  			netif_reset_xps_queues_gt(dev, txq);
2903  #endif
2904  		}
2905  	} else {
2906  		dev->real_num_tx_queues = txq;
2907  	}
2908  
2909  	return 0;
2910  }
2911  EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2912  
2913  #ifdef CONFIG_SYSFS
2914  /**
2915   *	netif_set_real_num_rx_queues - set actual number of RX queues used
2916   *	@dev: Network device
2917   *	@rxq: Actual number of RX queues
2918   *
2919   *	This must be called either with the rtnl_lock held or before
2920   *	registration of the net device.  Returns 0 on success, or a
2921   *	negative error code.  If called before registration, it always
2922   *	succeeds.
2923   */
2924  int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2925  {
2926  	int rc;
2927  
2928  	if (rxq < 1 || rxq > dev->num_rx_queues)
2929  		return -EINVAL;
2930  
2931  	if (dev->reg_state == NETREG_REGISTERED) {
2932  		ASSERT_RTNL();
2933  
2934  		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2935  						  rxq);
2936  		if (rc)
2937  			return rc;
2938  	}
2939  
2940  	dev->real_num_rx_queues = rxq;
2941  	return 0;
2942  }
2943  EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2944  #endif
2945  
2946  /**
2947   *	netif_set_real_num_queues - set actual number of RX and TX queues used
2948   *	@dev: Network device
2949   *	@txq: Actual number of TX queues
2950   *	@rxq: Actual number of RX queues
2951   *
2952   *	Set the real number of both TX and RX queues.
2953   *	Does nothing if the number of queues is already correct.
2954   */
2955  int netif_set_real_num_queues(struct net_device *dev,
2956  			      unsigned int txq, unsigned int rxq)
2957  {
2958  	unsigned int old_rxq = dev->real_num_rx_queues;
2959  	int err;
2960  
2961  	if (txq < 1 || txq > dev->num_tx_queues ||
2962  	    rxq < 1 || rxq > dev->num_rx_queues)
2963  		return -EINVAL;
2964  
2965  	/* Start from increases, so the error path only does decreases -
2966  	 * decreases can't fail.
2967  	 */
2968  	if (rxq > dev->real_num_rx_queues) {
2969  		err = netif_set_real_num_rx_queues(dev, rxq);
2970  		if (err)
2971  			return err;
2972  	}
2973  	if (txq > dev->real_num_tx_queues) {
2974  		err = netif_set_real_num_tx_queues(dev, txq);
2975  		if (err)
2976  			goto undo_rx;
2977  	}
2978  	if (rxq < dev->real_num_rx_queues)
2979  		WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
2980  	if (txq < dev->real_num_tx_queues)
2981  		WARN_ON(netif_set_real_num_tx_queues(dev, txq));
2982  
2983  	return 0;
2984  undo_rx:
2985  	WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
2986  	return err;
2987  }
2988  EXPORT_SYMBOL(netif_set_real_num_queues);
2989  
2990  /**
2991   * netif_get_num_default_rss_queues - default number of RSS queues
2992   *
2993   * Default value is the number of physical cores if there are only 1 or 2, or
2994   * divided by 2 if there are more.
2995   */
2996  int netif_get_num_default_rss_queues(void)
2997  {
2998  	cpumask_var_t cpus;
2999  	int cpu, count = 0;
3000  
3001  	if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
3002  		return 1;
3003  
3004  	cpumask_copy(cpus, cpu_online_mask);
3005  	for_each_cpu(cpu, cpus) {
3006  		++count;
3007  		cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
3008  	}
3009  	free_cpumask_var(cpus);
3010  
3011  	return count > 2 ? DIV_ROUND_UP(count, 2) : count;
3012  }
3013  EXPORT_SYMBOL(netif_get_num_default_rss_queues);
3014  
3015  static void __netif_reschedule(struct Qdisc *q)
3016  {
3017  	struct softnet_data *sd;
3018  	unsigned long flags;
3019  
3020  	local_irq_save(flags);
3021  	sd = this_cpu_ptr(&softnet_data);
3022  	q->next_sched = NULL;
3023  	*sd->output_queue_tailp = q;
3024  	sd->output_queue_tailp = &q->next_sched;
3025  	raise_softirq_irqoff(NET_TX_SOFTIRQ);
3026  	local_irq_restore(flags);
3027  }
3028  
3029  void __netif_schedule(struct Qdisc *q)
3030  {
3031  	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3032  		__netif_reschedule(q);
3033  }
3034  EXPORT_SYMBOL(__netif_schedule);
3035  
3036  struct dev_kfree_skb_cb {
3037  	enum skb_free_reason reason;
3038  };
3039  
3040  static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3041  {
3042  	return (struct dev_kfree_skb_cb *)skb->cb;
3043  }
3044  
3045  void netif_schedule_queue(struct netdev_queue *txq)
3046  {
3047  	rcu_read_lock();
3048  	if (!netif_xmit_stopped(txq)) {
3049  		struct Qdisc *q = rcu_dereference(txq->qdisc);
3050  
3051  		__netif_schedule(q);
3052  	}
3053  	rcu_read_unlock();
3054  }
3055  EXPORT_SYMBOL(netif_schedule_queue);
3056  
3057  void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3058  {
3059  	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3060  		struct Qdisc *q;
3061  
3062  		rcu_read_lock();
3063  		q = rcu_dereference(dev_queue->qdisc);
3064  		__netif_schedule(q);
3065  		rcu_read_unlock();
3066  	}
3067  }
3068  EXPORT_SYMBOL(netif_tx_wake_queue);
3069  
3070  void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
3071  {
3072  	unsigned long flags;
3073  
3074  	if (unlikely(!skb))
3075  		return;
3076  
3077  	if (likely(refcount_read(&skb->users) == 1)) {
3078  		smp_rmb();
3079  		refcount_set(&skb->users, 0);
3080  	} else if (likely(!refcount_dec_and_test(&skb->users))) {
3081  		return;
3082  	}
3083  	get_kfree_skb_cb(skb)->reason = reason;
3084  	local_irq_save(flags);
3085  	skb->next = __this_cpu_read(softnet_data.completion_queue);
3086  	__this_cpu_write(softnet_data.completion_queue, skb);
3087  	raise_softirq_irqoff(NET_TX_SOFTIRQ);
3088  	local_irq_restore(flags);
3089  }
3090  EXPORT_SYMBOL(__dev_kfree_skb_irq);
3091  
3092  void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
3093  {
3094  	if (in_hardirq() || irqs_disabled())
3095  		__dev_kfree_skb_irq(skb, reason);
3096  	else
3097  		dev_kfree_skb(skb);
3098  }
3099  EXPORT_SYMBOL(__dev_kfree_skb_any);
3100  
3101  
3102  /**
3103   * netif_device_detach - mark device as removed
3104   * @dev: network device
3105   *
3106   * Mark device as removed from system and therefore no longer available.
3107   */
3108  void netif_device_detach(struct net_device *dev)
3109  {
3110  	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3111  	    netif_running(dev)) {
3112  		netif_tx_stop_all_queues(dev);
3113  	}
3114  }
3115  EXPORT_SYMBOL(netif_device_detach);
3116  
3117  /**
3118   * netif_device_attach - mark device as attached
3119   * @dev: network device
3120   *
3121   * Mark device as attached from system and restart if needed.
3122   */
3123  void netif_device_attach(struct net_device *dev)
3124  {
3125  	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3126  	    netif_running(dev)) {
3127  		netif_tx_wake_all_queues(dev);
3128  		__netdev_watchdog_up(dev);
3129  	}
3130  }
3131  EXPORT_SYMBOL(netif_device_attach);
3132  
3133  /*
3134   * Returns a Tx hash based on the given packet descriptor a Tx queues' number
3135   * to be used as a distribution range.
3136   */
3137  static u16 skb_tx_hash(const struct net_device *dev,
3138  		       const struct net_device *sb_dev,
3139  		       struct sk_buff *skb)
3140  {
3141  	u32 hash;
3142  	u16 qoffset = 0;
3143  	u16 qcount = dev->real_num_tx_queues;
3144  
3145  	if (dev->num_tc) {
3146  		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3147  
3148  		qoffset = sb_dev->tc_to_txq[tc].offset;
3149  		qcount = sb_dev->tc_to_txq[tc].count;
3150  		if (unlikely(!qcount)) {
3151  			net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
3152  					     sb_dev->name, qoffset, tc);
3153  			qoffset = 0;
3154  			qcount = dev->real_num_tx_queues;
3155  		}
3156  	}
3157  
3158  	if (skb_rx_queue_recorded(skb)) {
3159  		hash = skb_get_rx_queue(skb);
3160  		if (hash >= qoffset)
3161  			hash -= qoffset;
3162  		while (unlikely(hash >= qcount))
3163  			hash -= qcount;
3164  		return hash + qoffset;
3165  	}
3166  
3167  	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3168  }
3169  
3170  static void skb_warn_bad_offload(const struct sk_buff *skb)
3171  {
3172  	static const netdev_features_t null_features;
3173  	struct net_device *dev = skb->dev;
3174  	const char *name = "";
3175  
3176  	if (!net_ratelimit())
3177  		return;
3178  
3179  	if (dev) {
3180  		if (dev->dev.parent)
3181  			name = dev_driver_string(dev->dev.parent);
3182  		else
3183  			name = netdev_name(dev);
3184  	}
3185  	skb_dump(KERN_WARNING, skb, false);
3186  	WARN(1, "%s: caps=(%pNF, %pNF)\n",
3187  	     name, dev ? &dev->features : &null_features,
3188  	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
3189  }
3190  
3191  /*
3192   * Invalidate hardware checksum when packet is to be mangled, and
3193   * complete checksum manually on outgoing path.
3194   */
3195  int skb_checksum_help(struct sk_buff *skb)
3196  {
3197  	__wsum csum;
3198  	int ret = 0, offset;
3199  
3200  	if (skb->ip_summed == CHECKSUM_COMPLETE)
3201  		goto out_set_summed;
3202  
3203  	if (unlikely(skb_is_gso(skb))) {
3204  		skb_warn_bad_offload(skb);
3205  		return -EINVAL;
3206  	}
3207  
3208  	/* Before computing a checksum, we should make sure no frag could
3209  	 * be modified by an external entity : checksum could be wrong.
3210  	 */
3211  	if (skb_has_shared_frag(skb)) {
3212  		ret = __skb_linearize(skb);
3213  		if (ret)
3214  			goto out;
3215  	}
3216  
3217  	offset = skb_checksum_start_offset(skb);
3218  	BUG_ON(offset >= skb_headlen(skb));
3219  	csum = skb_checksum(skb, offset, skb->len - offset, 0);
3220  
3221  	offset += skb->csum_offset;
3222  	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
3223  
3224  	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3225  	if (ret)
3226  		goto out;
3227  
3228  	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3229  out_set_summed:
3230  	skb->ip_summed = CHECKSUM_NONE;
3231  out:
3232  	return ret;
3233  }
3234  EXPORT_SYMBOL(skb_checksum_help);
3235  
3236  int skb_crc32c_csum_help(struct sk_buff *skb)
3237  {
3238  	__le32 crc32c_csum;
3239  	int ret = 0, offset, start;
3240  
3241  	if (skb->ip_summed != CHECKSUM_PARTIAL)
3242  		goto out;
3243  
3244  	if (unlikely(skb_is_gso(skb)))
3245  		goto out;
3246  
3247  	/* Before computing a checksum, we should make sure no frag could
3248  	 * be modified by an external entity : checksum could be wrong.
3249  	 */
3250  	if (unlikely(skb_has_shared_frag(skb))) {
3251  		ret = __skb_linearize(skb);
3252  		if (ret)
3253  			goto out;
3254  	}
3255  	start = skb_checksum_start_offset(skb);
3256  	offset = start + offsetof(struct sctphdr, checksum);
3257  	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3258  		ret = -EINVAL;
3259  		goto out;
3260  	}
3261  
3262  	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3263  	if (ret)
3264  		goto out;
3265  
3266  	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3267  						  skb->len - start, ~(__u32)0,
3268  						  crc32c_csum_stub));
3269  	*(__le32 *)(skb->data + offset) = crc32c_csum;
3270  	skb->ip_summed = CHECKSUM_NONE;
3271  	skb->csum_not_inet = 0;
3272  out:
3273  	return ret;
3274  }
3275  
3276  __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3277  {
3278  	__be16 type = skb->protocol;
3279  
3280  	/* Tunnel gso handlers can set protocol to ethernet. */
3281  	if (type == htons(ETH_P_TEB)) {
3282  		struct ethhdr *eth;
3283  
3284  		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3285  			return 0;
3286  
3287  		eth = (struct ethhdr *)skb->data;
3288  		type = eth->h_proto;
3289  	}
3290  
3291  	return __vlan_get_protocol(skb, type, depth);
3292  }
3293  
3294  /* openvswitch calls this on rx path, so we need a different check.
3295   */
3296  static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
3297  {
3298  	if (tx_path)
3299  		return skb->ip_summed != CHECKSUM_PARTIAL &&
3300  		       skb->ip_summed != CHECKSUM_UNNECESSARY;
3301  
3302  	return skb->ip_summed == CHECKSUM_NONE;
3303  }
3304  
3305  /**
3306   *	__skb_gso_segment - Perform segmentation on skb.
3307   *	@skb: buffer to segment
3308   *	@features: features for the output path (see dev->features)
3309   *	@tx_path: whether it is called in TX path
3310   *
3311   *	This function segments the given skb and returns a list of segments.
3312   *
3313   *	It may return NULL if the skb requires no segmentation.  This is
3314   *	only possible when GSO is used for verifying header integrity.
3315   *
3316   *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
3317   */
3318  struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
3319  				  netdev_features_t features, bool tx_path)
3320  {
3321  	struct sk_buff *segs;
3322  
3323  	if (unlikely(skb_needs_check(skb, tx_path))) {
3324  		int err;
3325  
3326  		/* We're going to init ->check field in TCP or UDP header */
3327  		err = skb_cow_head(skb, 0);
3328  		if (err < 0)
3329  			return ERR_PTR(err);
3330  	}
3331  
3332  	/* Only report GSO partial support if it will enable us to
3333  	 * support segmentation on this frame without needing additional
3334  	 * work.
3335  	 */
3336  	if (features & NETIF_F_GSO_PARTIAL) {
3337  		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3338  		struct net_device *dev = skb->dev;
3339  
3340  		partial_features |= dev->features & dev->gso_partial_features;
3341  		if (!skb_gso_ok(skb, features | partial_features))
3342  			features &= ~NETIF_F_GSO_PARTIAL;
3343  	}
3344  
3345  	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
3346  		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3347  
3348  	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3349  	SKB_GSO_CB(skb)->encap_level = 0;
3350  
3351  	skb_reset_mac_header(skb);
3352  	skb_reset_mac_len(skb);
3353  
3354  	segs = skb_mac_gso_segment(skb, features);
3355  
3356  	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3357  		skb_warn_bad_offload(skb);
3358  
3359  	return segs;
3360  }
3361  EXPORT_SYMBOL(__skb_gso_segment);
3362  
3363  /* Take action when hardware reception checksum errors are detected. */
3364  #ifdef CONFIG_BUG
3365  static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3366  {
3367  	netdev_err(dev, "hw csum failure\n");
3368  	skb_dump(KERN_ERR, skb, true);
3369  	dump_stack();
3370  }
3371  
3372  void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3373  {
3374  	DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
3375  }
3376  EXPORT_SYMBOL(netdev_rx_csum_fault);
3377  #endif
3378  
3379  /* XXX: check that highmem exists at all on the given machine. */
3380  static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3381  {
3382  #ifdef CONFIG_HIGHMEM
3383  	int i;
3384  
3385  	if (!(dev->features & NETIF_F_HIGHDMA)) {
3386  		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3387  			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3388  
3389  			if (PageHighMem(skb_frag_page(frag)))
3390  				return 1;
3391  		}
3392  	}
3393  #endif
3394  	return 0;
3395  }
3396  
3397  /* If MPLS offload request, verify we are testing hardware MPLS features
3398   * instead of standard features for the netdev.
3399   */
3400  #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3401  static netdev_features_t net_mpls_features(struct sk_buff *skb,
3402  					   netdev_features_t features,
3403  					   __be16 type)
3404  {
3405  	if (eth_p_mpls(type))
3406  		features &= skb->dev->mpls_features;
3407  
3408  	return features;
3409  }
3410  #else
3411  static netdev_features_t net_mpls_features(struct sk_buff *skb,
3412  					   netdev_features_t features,
3413  					   __be16 type)
3414  {
3415  	return features;
3416  }
3417  #endif
3418  
3419  static netdev_features_t harmonize_features(struct sk_buff *skb,
3420  	netdev_features_t features)
3421  {
3422  	__be16 type;
3423  
3424  	type = skb_network_protocol(skb, NULL);
3425  	features = net_mpls_features(skb, features, type);
3426  
3427  	if (skb->ip_summed != CHECKSUM_NONE &&
3428  	    !can_checksum_protocol(features, type)) {
3429  		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3430  	}
3431  	if (illegal_highdma(skb->dev, skb))
3432  		features &= ~NETIF_F_SG;
3433  
3434  	return features;
3435  }
3436  
3437  netdev_features_t passthru_features_check(struct sk_buff *skb,
3438  					  struct net_device *dev,
3439  					  netdev_features_t features)
3440  {
3441  	return features;
3442  }
3443  EXPORT_SYMBOL(passthru_features_check);
3444  
3445  static netdev_features_t dflt_features_check(struct sk_buff *skb,
3446  					     struct net_device *dev,
3447  					     netdev_features_t features)
3448  {
3449  	return vlan_features_check(skb, features);
3450  }
3451  
3452  static netdev_features_t gso_features_check(const struct sk_buff *skb,
3453  					    struct net_device *dev,
3454  					    netdev_features_t features)
3455  {
3456  	u16 gso_segs = skb_shinfo(skb)->gso_segs;
3457  
3458  	if (gso_segs > READ_ONCE(dev->gso_max_segs))
3459  		return features & ~NETIF_F_GSO_MASK;
3460  
3461  	if (!skb_shinfo(skb)->gso_type) {
3462  		skb_warn_bad_offload(skb);
3463  		return features & ~NETIF_F_GSO_MASK;
3464  	}
3465  
3466  	/* Support for GSO partial features requires software
3467  	 * intervention before we can actually process the packets
3468  	 * so we need to strip support for any partial features now
3469  	 * and we can pull them back in after we have partially
3470  	 * segmented the frame.
3471  	 */
3472  	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3473  		features &= ~dev->gso_partial_features;
3474  
3475  	/* Make sure to clear the IPv4 ID mangling feature if the
3476  	 * IPv4 header has the potential to be fragmented.
3477  	 */
3478  	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3479  		struct iphdr *iph = skb->encapsulation ?
3480  				    inner_ip_hdr(skb) : ip_hdr(skb);
3481  
3482  		if (!(iph->frag_off & htons(IP_DF)))
3483  			features &= ~NETIF_F_TSO_MANGLEID;
3484  	}
3485  
3486  	return features;
3487  }
3488  
3489  netdev_features_t netif_skb_features(struct sk_buff *skb)
3490  {
3491  	struct net_device *dev = skb->dev;
3492  	netdev_features_t features = dev->features;
3493  
3494  	if (skb_is_gso(skb))
3495  		features = gso_features_check(skb, dev, features);
3496  
3497  	/* If encapsulation offload request, verify we are testing
3498  	 * hardware encapsulation features instead of standard
3499  	 * features for the netdev
3500  	 */
3501  	if (skb->encapsulation)
3502  		features &= dev->hw_enc_features;
3503  
3504  	if (skb_vlan_tagged(skb))
3505  		features = netdev_intersect_features(features,
3506  						     dev->vlan_features |
3507  						     NETIF_F_HW_VLAN_CTAG_TX |
3508  						     NETIF_F_HW_VLAN_STAG_TX);
3509  
3510  	if (dev->netdev_ops->ndo_features_check)
3511  		features &= dev->netdev_ops->ndo_features_check(skb, dev,
3512  								features);
3513  	else
3514  		features &= dflt_features_check(skb, dev, features);
3515  
3516  	return harmonize_features(skb, features);
3517  }
3518  EXPORT_SYMBOL(netif_skb_features);
3519  
3520  static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3521  		    struct netdev_queue *txq, bool more)
3522  {
3523  	unsigned int len;
3524  	int rc;
3525  
3526  	if (dev_nit_active(dev))
3527  		dev_queue_xmit_nit(skb, dev);
3528  
3529  	len = skb->len;
3530  	PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
3531  	trace_net_dev_start_xmit(skb, dev);
3532  	rc = netdev_start_xmit(skb, dev, txq, more);
3533  	trace_net_dev_xmit(skb, rc, dev, len);
3534  
3535  	return rc;
3536  }
3537  
3538  struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3539  				    struct netdev_queue *txq, int *ret)
3540  {
3541  	struct sk_buff *skb = first;
3542  	int rc = NETDEV_TX_OK;
3543  
3544  	while (skb) {
3545  		struct sk_buff *next = skb->next;
3546  
3547  		skb_mark_not_on_list(skb);
3548  		rc = xmit_one(skb, dev, txq, next != NULL);
3549  		if (unlikely(!dev_xmit_complete(rc))) {
3550  			skb->next = next;
3551  			goto out;
3552  		}
3553  
3554  		skb = next;
3555  		if (netif_tx_queue_stopped(txq) && skb) {
3556  			rc = NETDEV_TX_BUSY;
3557  			break;
3558  		}
3559  	}
3560  
3561  out:
3562  	*ret = rc;
3563  	return skb;
3564  }
3565  
3566  static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3567  					  netdev_features_t features)
3568  {
3569  	if (skb_vlan_tag_present(skb) &&
3570  	    !vlan_hw_offload_capable(features, skb->vlan_proto))
3571  		skb = __vlan_hwaccel_push_inside(skb);
3572  	return skb;
3573  }
3574  
3575  int skb_csum_hwoffload_help(struct sk_buff *skb,
3576  			    const netdev_features_t features)
3577  {
3578  	if (unlikely(skb_csum_is_sctp(skb)))
3579  		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3580  			skb_crc32c_csum_help(skb);
3581  
3582  	if (features & NETIF_F_HW_CSUM)
3583  		return 0;
3584  
3585  	if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
3586  		switch (skb->csum_offset) {
3587  		case offsetof(struct tcphdr, check):
3588  		case offsetof(struct udphdr, check):
3589  			return 0;
3590  		}
3591  	}
3592  
3593  	return skb_checksum_help(skb);
3594  }
3595  EXPORT_SYMBOL(skb_csum_hwoffload_help);
3596  
3597  static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3598  {
3599  	netdev_features_t features;
3600  
3601  	features = netif_skb_features(skb);
3602  	skb = validate_xmit_vlan(skb, features);
3603  	if (unlikely(!skb))
3604  		goto out_null;
3605  
3606  	skb = sk_validate_xmit_skb(skb, dev);
3607  	if (unlikely(!skb))
3608  		goto out_null;
3609  
3610  	if (netif_needs_gso(skb, features)) {
3611  		struct sk_buff *segs;
3612  
3613  		segs = skb_gso_segment(skb, features);
3614  		if (IS_ERR(segs)) {
3615  			goto out_kfree_skb;
3616  		} else if (segs) {
3617  			consume_skb(skb);
3618  			skb = segs;
3619  		}
3620  	} else {
3621  		if (skb_needs_linearize(skb, features) &&
3622  		    __skb_linearize(skb))
3623  			goto out_kfree_skb;
3624  
3625  		/* If packet is not checksummed and device does not
3626  		 * support checksumming for this protocol, complete
3627  		 * checksumming here.
3628  		 */
3629  		if (skb->ip_summed == CHECKSUM_PARTIAL) {
3630  			if (skb->encapsulation)
3631  				skb_set_inner_transport_header(skb,
3632  							       skb_checksum_start_offset(skb));
3633  			else
3634  				skb_set_transport_header(skb,
3635  							 skb_checksum_start_offset(skb));
3636  			if (skb_csum_hwoffload_help(skb, features))
3637  				goto out_kfree_skb;
3638  		}
3639  	}
3640  
3641  	skb = validate_xmit_xfrm(skb, features, again);
3642  
3643  	return skb;
3644  
3645  out_kfree_skb:
3646  	kfree_skb(skb);
3647  out_null:
3648  	dev_core_stats_tx_dropped_inc(dev);
3649  	return NULL;
3650  }
3651  
3652  struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3653  {
3654  	struct sk_buff *next, *head = NULL, *tail;
3655  
3656  	for (; skb != NULL; skb = next) {
3657  		next = skb->next;
3658  		skb_mark_not_on_list(skb);
3659  
3660  		/* in case skb wont be segmented, point to itself */
3661  		skb->prev = skb;
3662  
3663  		skb = validate_xmit_skb(skb, dev, again);
3664  		if (!skb)
3665  			continue;
3666  
3667  		if (!head)
3668  			head = skb;
3669  		else
3670  			tail->next = skb;
3671  		/* If skb was segmented, skb->prev points to
3672  		 * the last segment. If not, it still contains skb.
3673  		 */
3674  		tail = skb->prev;
3675  	}
3676  	return head;
3677  }
3678  EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3679  
3680  static void qdisc_pkt_len_init(struct sk_buff *skb)
3681  {
3682  	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3683  
3684  	qdisc_skb_cb(skb)->pkt_len = skb->len;
3685  
3686  	/* To get more precise estimation of bytes sent on wire,
3687  	 * we add to pkt_len the headers size of all segments
3688  	 */
3689  	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3690  		unsigned int hdr_len;
3691  		u16 gso_segs = shinfo->gso_segs;
3692  
3693  		/* mac layer + network layer */
3694  		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3695  
3696  		/* + transport layer */
3697  		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3698  			const struct tcphdr *th;
3699  			struct tcphdr _tcphdr;
3700  
3701  			th = skb_header_pointer(skb, skb_transport_offset(skb),
3702  						sizeof(_tcphdr), &_tcphdr);
3703  			if (likely(th))
3704  				hdr_len += __tcp_hdrlen(th);
3705  		} else {
3706  			struct udphdr _udphdr;
3707  
3708  			if (skb_header_pointer(skb, skb_transport_offset(skb),
3709  					       sizeof(_udphdr), &_udphdr))
3710  				hdr_len += sizeof(struct udphdr);
3711  		}
3712  
3713  		if (shinfo->gso_type & SKB_GSO_DODGY)
3714  			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3715  						shinfo->gso_size);
3716  
3717  		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3718  	}
3719  }
3720  
3721  static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
3722  			     struct sk_buff **to_free,
3723  			     struct netdev_queue *txq)
3724  {
3725  	int rc;
3726  
3727  	rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
3728  	if (rc == NET_XMIT_SUCCESS)
3729  		trace_qdisc_enqueue(q, txq, skb);
3730  	return rc;
3731  }
3732  
3733  static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3734  				 struct net_device *dev,
3735  				 struct netdev_queue *txq)
3736  {
3737  	spinlock_t *root_lock = qdisc_lock(q);
3738  	struct sk_buff *to_free = NULL;
3739  	bool contended;
3740  	int rc;
3741  
3742  	qdisc_calculate_pkt_len(skb, q);
3743  
3744  	if (q->flags & TCQ_F_NOLOCK) {
3745  		if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
3746  		    qdisc_run_begin(q)) {
3747  			/* Retest nolock_qdisc_is_empty() within the protection
3748  			 * of q->seqlock to protect from racing with requeuing.
3749  			 */
3750  			if (unlikely(!nolock_qdisc_is_empty(q))) {
3751  				rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3752  				__qdisc_run(q);
3753  				qdisc_run_end(q);
3754  
3755  				goto no_lock_out;
3756  			}
3757  
3758  			qdisc_bstats_cpu_update(q, skb);
3759  			if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
3760  			    !nolock_qdisc_is_empty(q))
3761  				__qdisc_run(q);
3762  
3763  			qdisc_run_end(q);
3764  			return NET_XMIT_SUCCESS;
3765  		}
3766  
3767  		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3768  		qdisc_run(q);
3769  
3770  no_lock_out:
3771  		if (unlikely(to_free))
3772  			kfree_skb_list_reason(to_free,
3773  					      SKB_DROP_REASON_QDISC_DROP);
3774  		return rc;
3775  	}
3776  
3777  	/*
3778  	 * Heuristic to force contended enqueues to serialize on a
3779  	 * separate lock before trying to get qdisc main lock.
3780  	 * This permits qdisc->running owner to get the lock more
3781  	 * often and dequeue packets faster.
3782  	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
3783  	 * and then other tasks will only enqueue packets. The packets will be
3784  	 * sent after the qdisc owner is scheduled again. To prevent this
3785  	 * scenario the task always serialize on the lock.
3786  	 */
3787  	contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
3788  	if (unlikely(contended))
3789  		spin_lock(&q->busylock);
3790  
3791  	spin_lock(root_lock);
3792  	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3793  		__qdisc_drop(skb, &to_free);
3794  		rc = NET_XMIT_DROP;
3795  	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3796  		   qdisc_run_begin(q)) {
3797  		/*
3798  		 * This is a work-conserving queue; there are no old skbs
3799  		 * waiting to be sent out; and the qdisc is not running -
3800  		 * xmit the skb directly.
3801  		 */
3802  
3803  		qdisc_bstats_update(q, skb);
3804  
3805  		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3806  			if (unlikely(contended)) {
3807  				spin_unlock(&q->busylock);
3808  				contended = false;
3809  			}
3810  			__qdisc_run(q);
3811  		}
3812  
3813  		qdisc_run_end(q);
3814  		rc = NET_XMIT_SUCCESS;
3815  	} else {
3816  		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3817  		if (qdisc_run_begin(q)) {
3818  			if (unlikely(contended)) {
3819  				spin_unlock(&q->busylock);
3820  				contended = false;
3821  			}
3822  			__qdisc_run(q);
3823  			qdisc_run_end(q);
3824  		}
3825  	}
3826  	spin_unlock(root_lock);
3827  	if (unlikely(to_free))
3828  		kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP);
3829  	if (unlikely(contended))
3830  		spin_unlock(&q->busylock);
3831  	return rc;
3832  }
3833  
3834  #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3835  static void skb_update_prio(struct sk_buff *skb)
3836  {
3837  	const struct netprio_map *map;
3838  	const struct sock *sk;
3839  	unsigned int prioidx;
3840  
3841  	if (skb->priority)
3842  		return;
3843  	map = rcu_dereference_bh(skb->dev->priomap);
3844  	if (!map)
3845  		return;
3846  	sk = skb_to_full_sk(skb);
3847  	if (!sk)
3848  		return;
3849  
3850  	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3851  
3852  	if (prioidx < map->priomap_len)
3853  		skb->priority = map->priomap[prioidx];
3854  }
3855  #else
3856  #define skb_update_prio(skb)
3857  #endif
3858  
3859  /**
3860   *	dev_loopback_xmit - loop back @skb
3861   *	@net: network namespace this loopback is happening in
3862   *	@sk:  sk needed to be a netfilter okfn
3863   *	@skb: buffer to transmit
3864   */
3865  int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3866  {
3867  	skb_reset_mac_header(skb);
3868  	__skb_pull(skb, skb_network_offset(skb));
3869  	skb->pkt_type = PACKET_LOOPBACK;
3870  	if (skb->ip_summed == CHECKSUM_NONE)
3871  		skb->ip_summed = CHECKSUM_UNNECESSARY;
3872  	WARN_ON(!skb_dst(skb));
3873  	skb_dst_force(skb);
3874  	netif_rx(skb);
3875  	return 0;
3876  }
3877  EXPORT_SYMBOL(dev_loopback_xmit);
3878  
3879  #ifdef CONFIG_NET_EGRESS
3880  static struct sk_buff *
3881  sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3882  {
3883  #ifdef CONFIG_NET_CLS_ACT
3884  	struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3885  	struct tcf_result cl_res;
3886  
3887  	if (!miniq)
3888  		return skb;
3889  
3890  	/* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3891  	tc_skb_cb(skb)->mru = 0;
3892  	tc_skb_cb(skb)->post_ct = false;
3893  	mini_qdisc_bstats_cpu_update(miniq, skb);
3894  
3895  	switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
3896  	case TC_ACT_OK:
3897  	case TC_ACT_RECLASSIFY:
3898  		skb->tc_index = TC_H_MIN(cl_res.classid);
3899  		break;
3900  	case TC_ACT_SHOT:
3901  		mini_qdisc_qstats_cpu_drop(miniq);
3902  		*ret = NET_XMIT_DROP;
3903  		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
3904  		return NULL;
3905  	case TC_ACT_STOLEN:
3906  	case TC_ACT_QUEUED:
3907  	case TC_ACT_TRAP:
3908  		*ret = NET_XMIT_SUCCESS;
3909  		consume_skb(skb);
3910  		return NULL;
3911  	case TC_ACT_REDIRECT:
3912  		/* No need to push/pop skb's mac_header here on egress! */
3913  		skb_do_redirect(skb);
3914  		*ret = NET_XMIT_SUCCESS;
3915  		return NULL;
3916  	default:
3917  		break;
3918  	}
3919  #endif /* CONFIG_NET_CLS_ACT */
3920  
3921  	return skb;
3922  }
3923  #endif /* CONFIG_NET_EGRESS */
3924  
3925  #ifdef CONFIG_XPS
3926  static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3927  			       struct xps_dev_maps *dev_maps, unsigned int tci)
3928  {
3929  	int tc = netdev_get_prio_tc_map(dev, skb->priority);
3930  	struct xps_map *map;
3931  	int queue_index = -1;
3932  
3933  	if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
3934  		return queue_index;
3935  
3936  	tci *= dev_maps->num_tc;
3937  	tci += tc;
3938  
3939  	map = rcu_dereference(dev_maps->attr_map[tci]);
3940  	if (map) {
3941  		if (map->len == 1)
3942  			queue_index = map->queues[0];
3943  		else
3944  			queue_index = map->queues[reciprocal_scale(
3945  						skb_get_hash(skb), map->len)];
3946  		if (unlikely(queue_index >= dev->real_num_tx_queues))
3947  			queue_index = -1;
3948  	}
3949  	return queue_index;
3950  }
3951  #endif
3952  
3953  static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
3954  			 struct sk_buff *skb)
3955  {
3956  #ifdef CONFIG_XPS
3957  	struct xps_dev_maps *dev_maps;
3958  	struct sock *sk = skb->sk;
3959  	int queue_index = -1;
3960  
3961  	if (!static_key_false(&xps_needed))
3962  		return -1;
3963  
3964  	rcu_read_lock();
3965  	if (!static_key_false(&xps_rxqs_needed))
3966  		goto get_cpus_map;
3967  
3968  	dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
3969  	if (dev_maps) {
3970  		int tci = sk_rx_queue_get(sk);
3971  
3972  		if (tci >= 0)
3973  			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3974  							  tci);
3975  	}
3976  
3977  get_cpus_map:
3978  	if (queue_index < 0) {
3979  		dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
3980  		if (dev_maps) {
3981  			unsigned int tci = skb->sender_cpu - 1;
3982  
3983  			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3984  							  tci);
3985  		}
3986  	}
3987  	rcu_read_unlock();
3988  
3989  	return queue_index;
3990  #else
3991  	return -1;
3992  #endif
3993  }
3994  
3995  u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
3996  		     struct net_device *sb_dev)
3997  {
3998  	return 0;
3999  }
4000  EXPORT_SYMBOL(dev_pick_tx_zero);
4001  
4002  u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
4003  		       struct net_device *sb_dev)
4004  {
4005  	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
4006  }
4007  EXPORT_SYMBOL(dev_pick_tx_cpu_id);
4008  
4009  u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4010  		     struct net_device *sb_dev)
4011  {
4012  	struct sock *sk = skb->sk;
4013  	int queue_index = sk_tx_queue_get(sk);
4014  
4015  	sb_dev = sb_dev ? : dev;
4016  
4017  	if (queue_index < 0 || skb->ooo_okay ||
4018  	    queue_index >= dev->real_num_tx_queues) {
4019  		int new_index = get_xps_queue(dev, sb_dev, skb);
4020  
4021  		if (new_index < 0)
4022  			new_index = skb_tx_hash(dev, sb_dev, skb);
4023  
4024  		if (queue_index != new_index && sk &&
4025  		    sk_fullsock(sk) &&
4026  		    rcu_access_pointer(sk->sk_dst_cache))
4027  			sk_tx_queue_set(sk, new_index);
4028  
4029  		queue_index = new_index;
4030  	}
4031  
4032  	return queue_index;
4033  }
4034  EXPORT_SYMBOL(netdev_pick_tx);
4035  
4036  struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4037  					 struct sk_buff *skb,
4038  					 struct net_device *sb_dev)
4039  {
4040  	int queue_index = 0;
4041  
4042  #ifdef CONFIG_XPS
4043  	u32 sender_cpu = skb->sender_cpu - 1;
4044  
4045  	if (sender_cpu >= (u32)NR_CPUS)
4046  		skb->sender_cpu = raw_smp_processor_id() + 1;
4047  #endif
4048  
4049  	if (dev->real_num_tx_queues != 1) {
4050  		const struct net_device_ops *ops = dev->netdev_ops;
4051  
4052  		if (ops->ndo_select_queue)
4053  			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4054  		else
4055  			queue_index = netdev_pick_tx(dev, skb, sb_dev);
4056  
4057  		queue_index = netdev_cap_txqueue(dev, queue_index);
4058  	}
4059  
4060  	skb_set_queue_mapping(skb, queue_index);
4061  	return netdev_get_tx_queue(dev, queue_index);
4062  }
4063  
4064  /**
4065   *	__dev_queue_xmit - transmit a buffer
4066   *	@skb: buffer to transmit
4067   *	@sb_dev: suboordinate device used for L2 forwarding offload
4068   *
4069   *	Queue a buffer for transmission to a network device. The caller must
4070   *	have set the device and priority and built the buffer before calling
4071   *	this function. The function can be called from an interrupt.
4072   *
4073   *	A negative errno code is returned on a failure. A success does not
4074   *	guarantee the frame will be transmitted as it may be dropped due
4075   *	to congestion or traffic shaping.
4076   *
4077   * -----------------------------------------------------------------------------------
4078   *      I notice this method can also return errors from the queue disciplines,
4079   *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
4080   *      be positive.
4081   *
4082   *      Regardless of the return value, the skb is consumed, so it is currently
4083   *      difficult to retry a send to this method.  (You can bump the ref count
4084   *      before sending to hold a reference for retry if you are careful.)
4085   *
4086   *      When calling this method, interrupts MUST be enabled.  This is because
4087   *      the BH enable code must have IRQs enabled so that it will not deadlock.
4088   *          --BLG
4089   */
4090  static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4091  {
4092  	struct net_device *dev = skb->dev;
4093  	struct netdev_queue *txq;
4094  	struct Qdisc *q;
4095  	int rc = -ENOMEM;
4096  	bool again = false;
4097  
4098  	skb_reset_mac_header(skb);
4099  
4100  	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
4101  		__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
4102  
4103  	/* Disable soft irqs for various locks below. Also
4104  	 * stops preemption for RCU.
4105  	 */
4106  	rcu_read_lock_bh();
4107  
4108  	skb_update_prio(skb);
4109  
4110  	qdisc_pkt_len_init(skb);
4111  #ifdef CONFIG_NET_CLS_ACT
4112  	skb->tc_at_ingress = 0;
4113  #endif
4114  #ifdef CONFIG_NET_EGRESS
4115  	if (static_branch_unlikely(&egress_needed_key)) {
4116  		if (nf_hook_egress_active()) {
4117  			skb = nf_hook_egress(skb, &rc, dev);
4118  			if (!skb)
4119  				goto out;
4120  		}
4121  		nf_skip_egress(skb, true);
4122  		skb = sch_handle_egress(skb, &rc, dev);
4123  		if (!skb)
4124  			goto out;
4125  		nf_skip_egress(skb, false);
4126  	}
4127  #endif
4128  	/* If device/qdisc don't need skb->dst, release it right now while
4129  	 * its hot in this cpu cache.
4130  	 */
4131  	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4132  		skb_dst_drop(skb);
4133  	else
4134  		skb_dst_force(skb);
4135  
4136  	txq = netdev_core_pick_tx(dev, skb, sb_dev);
4137  	q = rcu_dereference_bh(txq->qdisc);
4138  
4139  	trace_net_dev_queue(skb);
4140  	if (q->enqueue) {
4141  		rc = __dev_xmit_skb(skb, q, dev, txq);
4142  		goto out;
4143  	}
4144  
4145  	/* The device has no queue. Common case for software devices:
4146  	 * loopback, all the sorts of tunnels...
4147  
4148  	 * Really, it is unlikely that netif_tx_lock protection is necessary
4149  	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
4150  	 * counters.)
4151  	 * However, it is possible, that they rely on protection
4152  	 * made by us here.
4153  
4154  	 * Check this and shot the lock. It is not prone from deadlocks.
4155  	 *Either shot noqueue qdisc, it is even simpler 8)
4156  	 */
4157  	if (dev->flags & IFF_UP) {
4158  		int cpu = smp_processor_id(); /* ok because BHs are off */
4159  
4160  		/* Other cpus might concurrently change txq->xmit_lock_owner
4161  		 * to -1 or to their cpu id, but not to our id.
4162  		 */
4163  		if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
4164  			if (dev_xmit_recursion())
4165  				goto recursion_alert;
4166  
4167  			skb = validate_xmit_skb(skb, dev, &again);
4168  			if (!skb)
4169  				goto out;
4170  
4171  			PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
4172  			HARD_TX_LOCK(dev, txq, cpu);
4173  
4174  			if (!netif_xmit_stopped(txq)) {
4175  				dev_xmit_recursion_inc();
4176  				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4177  				dev_xmit_recursion_dec();
4178  				if (dev_xmit_complete(rc)) {
4179  					HARD_TX_UNLOCK(dev, txq);
4180  					goto out;
4181  				}
4182  			}
4183  			HARD_TX_UNLOCK(dev, txq);
4184  			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4185  					     dev->name);
4186  		} else {
4187  			/* Recursion is detected! It is possible,
4188  			 * unfortunately
4189  			 */
4190  recursion_alert:
4191  			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4192  					     dev->name);
4193  		}
4194  	}
4195  
4196  	rc = -ENETDOWN;
4197  	rcu_read_unlock_bh();
4198  
4199  	dev_core_stats_tx_dropped_inc(dev);
4200  	kfree_skb_list(skb);
4201  	return rc;
4202  out:
4203  	rcu_read_unlock_bh();
4204  	return rc;
4205  }
4206  
4207  int dev_queue_xmit(struct sk_buff *skb)
4208  {
4209  	return __dev_queue_xmit(skb, NULL);
4210  }
4211  EXPORT_SYMBOL(dev_queue_xmit);
4212  
4213  int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
4214  {
4215  	return __dev_queue_xmit(skb, sb_dev);
4216  }
4217  EXPORT_SYMBOL(dev_queue_xmit_accel);
4218  
4219  int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4220  {
4221  	struct net_device *dev = skb->dev;
4222  	struct sk_buff *orig_skb = skb;
4223  	struct netdev_queue *txq;
4224  	int ret = NETDEV_TX_BUSY;
4225  	bool again = false;
4226  
4227  	if (unlikely(!netif_running(dev) ||
4228  		     !netif_carrier_ok(dev)))
4229  		goto drop;
4230  
4231  	skb = validate_xmit_skb_list(skb, dev, &again);
4232  	if (skb != orig_skb)
4233  		goto drop;
4234  
4235  	skb_set_queue_mapping(skb, queue_id);
4236  	txq = skb_get_tx_queue(dev, skb);
4237  	PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
4238  
4239  	local_bh_disable();
4240  
4241  	dev_xmit_recursion_inc();
4242  	HARD_TX_LOCK(dev, txq, smp_processor_id());
4243  	if (!netif_xmit_frozen_or_drv_stopped(txq))
4244  		ret = netdev_start_xmit(skb, dev, txq, false);
4245  	HARD_TX_UNLOCK(dev, txq);
4246  	dev_xmit_recursion_dec();
4247  
4248  	local_bh_enable();
4249  	return ret;
4250  drop:
4251  	dev_core_stats_tx_dropped_inc(dev);
4252  	kfree_skb_list(skb);
4253  	return NET_XMIT_DROP;
4254  }
4255  EXPORT_SYMBOL(__dev_direct_xmit);
4256  
4257  /*************************************************************************
4258   *			Receiver routines
4259   *************************************************************************/
4260  
4261  int netdev_max_backlog __read_mostly = 1000;
4262  EXPORT_SYMBOL(netdev_max_backlog);
4263  
4264  int netdev_tstamp_prequeue __read_mostly = 1;
4265  int netdev_budget __read_mostly = 300;
4266  /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
4267  unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
4268  int weight_p __read_mostly = 64;           /* old backlog weight */
4269  int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
4270  int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
4271  int dev_rx_weight __read_mostly = 64;
4272  int dev_tx_weight __read_mostly = 64;
4273  
4274  /* Called with irq disabled */
4275  static inline void ____napi_schedule(struct softnet_data *sd,
4276  				     struct napi_struct *napi)
4277  {
4278  	struct task_struct *thread;
4279  
4280  	lockdep_assert_irqs_disabled();
4281  
4282  	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
4283  		/* Paired with smp_mb__before_atomic() in
4284  		 * napi_enable()/dev_set_threaded().
4285  		 * Use READ_ONCE() to guarantee a complete
4286  		 * read on napi->thread. Only call
4287  		 * wake_up_process() when it's not NULL.
4288  		 */
4289  		thread = READ_ONCE(napi->thread);
4290  		if (thread) {
4291  			/* Avoid doing set_bit() if the thread is in
4292  			 * INTERRUPTIBLE state, cause napi_thread_wait()
4293  			 * makes sure to proceed with napi polling
4294  			 * if the thread is explicitly woken from here.
4295  			 */
4296  			if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
4297  				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
4298  			wake_up_process(thread);
4299  			return;
4300  		}
4301  	}
4302  
4303  	list_add_tail(&napi->poll_list, &sd->poll_list);
4304  	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4305  }
4306  
4307  #ifdef CONFIG_RPS
4308  
4309  /* One global table that all flow-based protocols share. */
4310  struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
4311  EXPORT_SYMBOL(rps_sock_flow_table);
4312  u32 rps_cpu_mask __read_mostly;
4313  EXPORT_SYMBOL(rps_cpu_mask);
4314  
4315  struct static_key_false rps_needed __read_mostly;
4316  EXPORT_SYMBOL(rps_needed);
4317  struct static_key_false rfs_needed __read_mostly;
4318  EXPORT_SYMBOL(rfs_needed);
4319  
4320  static struct rps_dev_flow *
4321  set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4322  	    struct rps_dev_flow *rflow, u16 next_cpu)
4323  {
4324  	if (next_cpu < nr_cpu_ids) {
4325  #ifdef CONFIG_RFS_ACCEL
4326  		struct netdev_rx_queue *rxqueue;
4327  		struct rps_dev_flow_table *flow_table;
4328  		struct rps_dev_flow *old_rflow;
4329  		u32 flow_id;
4330  		u16 rxq_index;
4331  		int rc;
4332  
4333  		/* Should we steer this flow to a different hardware queue? */
4334  		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4335  		    !(dev->features & NETIF_F_NTUPLE))
4336  			goto out;
4337  		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4338  		if (rxq_index == skb_get_rx_queue(skb))
4339  			goto out;
4340  
4341  		rxqueue = dev->_rx + rxq_index;
4342  		flow_table = rcu_dereference(rxqueue->rps_flow_table);
4343  		if (!flow_table)
4344  			goto out;
4345  		flow_id = skb_get_hash(skb) & flow_table->mask;
4346  		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4347  							rxq_index, flow_id);
4348  		if (rc < 0)
4349  			goto out;
4350  		old_rflow = rflow;
4351  		rflow = &flow_table->flows[flow_id];
4352  		rflow->filter = rc;
4353  		if (old_rflow->filter == rflow->filter)
4354  			old_rflow->filter = RPS_NO_FILTER;
4355  	out:
4356  #endif
4357  		rflow->last_qtail =
4358  			per_cpu(softnet_data, next_cpu).input_queue_head;
4359  	}
4360  
4361  	rflow->cpu = next_cpu;
4362  	return rflow;
4363  }
4364  
4365  /*
4366   * get_rps_cpu is called from netif_receive_skb and returns the target
4367   * CPU from the RPS map of the receiving queue for a given skb.
4368   * rcu_read_lock must be held on entry.
4369   */
4370  static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4371  		       struct rps_dev_flow **rflowp)
4372  {
4373  	const struct rps_sock_flow_table *sock_flow_table;
4374  	struct netdev_rx_queue *rxqueue = dev->_rx;
4375  	struct rps_dev_flow_table *flow_table;
4376  	struct rps_map *map;
4377  	int cpu = -1;
4378  	u32 tcpu;
4379  	u32 hash;
4380  
4381  	if (skb_rx_queue_recorded(skb)) {
4382  		u16 index = skb_get_rx_queue(skb);
4383  
4384  		if (unlikely(index >= dev->real_num_rx_queues)) {
4385  			WARN_ONCE(dev->real_num_rx_queues > 1,
4386  				  "%s received packet on queue %u, but number "
4387  				  "of RX queues is %u\n",
4388  				  dev->name, index, dev->real_num_rx_queues);
4389  			goto done;
4390  		}
4391  		rxqueue += index;
4392  	}
4393  
4394  	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4395  
4396  	flow_table = rcu_dereference(rxqueue->rps_flow_table);
4397  	map = rcu_dereference(rxqueue->rps_map);
4398  	if (!flow_table && !map)
4399  		goto done;
4400  
4401  	skb_reset_network_header(skb);
4402  	hash = skb_get_hash(skb);
4403  	if (!hash)
4404  		goto done;
4405  
4406  	sock_flow_table = rcu_dereference(rps_sock_flow_table);
4407  	if (flow_table && sock_flow_table) {
4408  		struct rps_dev_flow *rflow;
4409  		u32 next_cpu;
4410  		u32 ident;
4411  
4412  		/* First check into global flow table if there is a match */
4413  		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
4414  		if ((ident ^ hash) & ~rps_cpu_mask)
4415  			goto try_rps;
4416  
4417  		next_cpu = ident & rps_cpu_mask;
4418  
4419  		/* OK, now we know there is a match,
4420  		 * we can look at the local (per receive queue) flow table
4421  		 */
4422  		rflow = &flow_table->flows[hash & flow_table->mask];
4423  		tcpu = rflow->cpu;
4424  
4425  		/*
4426  		 * If the desired CPU (where last recvmsg was done) is
4427  		 * different from current CPU (one in the rx-queue flow
4428  		 * table entry), switch if one of the following holds:
4429  		 *   - Current CPU is unset (>= nr_cpu_ids).
4430  		 *   - Current CPU is offline.
4431  		 *   - The current CPU's queue tail has advanced beyond the
4432  		 *     last packet that was enqueued using this table entry.
4433  		 *     This guarantees that all previous packets for the flow
4434  		 *     have been dequeued, thus preserving in order delivery.
4435  		 */
4436  		if (unlikely(tcpu != next_cpu) &&
4437  		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4438  		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4439  		      rflow->last_qtail)) >= 0)) {
4440  			tcpu = next_cpu;
4441  			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4442  		}
4443  
4444  		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4445  			*rflowp = rflow;
4446  			cpu = tcpu;
4447  			goto done;
4448  		}
4449  	}
4450  
4451  try_rps:
4452  
4453  	if (map) {
4454  		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4455  		if (cpu_online(tcpu)) {
4456  			cpu = tcpu;
4457  			goto done;
4458  		}
4459  	}
4460  
4461  done:
4462  	return cpu;
4463  }
4464  
4465  #ifdef CONFIG_RFS_ACCEL
4466  
4467  /**
4468   * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4469   * @dev: Device on which the filter was set
4470   * @rxq_index: RX queue index
4471   * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4472   * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4473   *
4474   * Drivers that implement ndo_rx_flow_steer() should periodically call
4475   * this function for each installed filter and remove the filters for
4476   * which it returns %true.
4477   */
4478  bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4479  			 u32 flow_id, u16 filter_id)
4480  {
4481  	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4482  	struct rps_dev_flow_table *flow_table;
4483  	struct rps_dev_flow *rflow;
4484  	bool expire = true;
4485  	unsigned int cpu;
4486  
4487  	rcu_read_lock();
4488  	flow_table = rcu_dereference(rxqueue->rps_flow_table);
4489  	if (flow_table && flow_id <= flow_table->mask) {
4490  		rflow = &flow_table->flows[flow_id];
4491  		cpu = READ_ONCE(rflow->cpu);
4492  		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4493  		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4494  			   rflow->last_qtail) <
4495  		     (int)(10 * flow_table->mask)))
4496  			expire = false;
4497  	}
4498  	rcu_read_unlock();
4499  	return expire;
4500  }
4501  EXPORT_SYMBOL(rps_may_expire_flow);
4502  
4503  #endif /* CONFIG_RFS_ACCEL */
4504  
4505  /* Called from hardirq (IPI) context */
4506  static void rps_trigger_softirq(void *data)
4507  {
4508  	struct softnet_data *sd = data;
4509  
4510  	____napi_schedule(sd, &sd->backlog);
4511  	sd->received_rps++;
4512  }
4513  
4514  #endif /* CONFIG_RPS */
4515  
4516  /*
4517   * Check if this softnet_data structure is another cpu one
4518   * If yes, queue it to our IPI list and return 1
4519   * If no, return 0
4520   */
4521  static int napi_schedule_rps(struct softnet_data *sd)
4522  {
4523  	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4524  
4525  #ifdef CONFIG_RPS
4526  	if (sd != mysd) {
4527  		sd->rps_ipi_next = mysd->rps_ipi_list;
4528  		mysd->rps_ipi_list = sd;
4529  
4530  		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4531  		return 1;
4532  	}
4533  #endif /* CONFIG_RPS */
4534  	__napi_schedule_irqoff(&mysd->backlog);
4535  	return 0;
4536  }
4537  
4538  #ifdef CONFIG_NET_FLOW_LIMIT
4539  int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4540  #endif
4541  
4542  static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4543  {
4544  #ifdef CONFIG_NET_FLOW_LIMIT
4545  	struct sd_flow_limit *fl;
4546  	struct softnet_data *sd;
4547  	unsigned int old_flow, new_flow;
4548  
4549  	if (qlen < (netdev_max_backlog >> 1))
4550  		return false;
4551  
4552  	sd = this_cpu_ptr(&softnet_data);
4553  
4554  	rcu_read_lock();
4555  	fl = rcu_dereference(sd->flow_limit);
4556  	if (fl) {
4557  		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4558  		old_flow = fl->history[fl->history_head];
4559  		fl->history[fl->history_head] = new_flow;
4560  
4561  		fl->history_head++;
4562  		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4563  
4564  		if (likely(fl->buckets[old_flow]))
4565  			fl->buckets[old_flow]--;
4566  
4567  		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4568  			fl->count++;
4569  			rcu_read_unlock();
4570  			return true;
4571  		}
4572  	}
4573  	rcu_read_unlock();
4574  #endif
4575  	return false;
4576  }
4577  
4578  /*
4579   * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4580   * queue (may be a remote CPU queue).
4581   */
4582  static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4583  			      unsigned int *qtail)
4584  {
4585  	enum skb_drop_reason reason;
4586  	struct softnet_data *sd;
4587  	unsigned long flags;
4588  	unsigned int qlen;
4589  
4590  	reason = SKB_DROP_REASON_NOT_SPECIFIED;
4591  	sd = &per_cpu(softnet_data, cpu);
4592  
4593  	rps_lock_irqsave(sd, &flags);
4594  	if (!netif_running(skb->dev))
4595  		goto drop;
4596  	qlen = skb_queue_len(&sd->input_pkt_queue);
4597  	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
4598  		if (qlen) {
4599  enqueue:
4600  			__skb_queue_tail(&sd->input_pkt_queue, skb);
4601  			input_queue_tail_incr_save(sd, qtail);
4602  			rps_unlock_irq_restore(sd, &flags);
4603  			return NET_RX_SUCCESS;
4604  		}
4605  
4606  		/* Schedule NAPI for backlog device
4607  		 * We can use non atomic operation since we own the queue lock
4608  		 */
4609  		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
4610  			napi_schedule_rps(sd);
4611  		goto enqueue;
4612  	}
4613  	reason = SKB_DROP_REASON_CPU_BACKLOG;
4614  
4615  drop:
4616  	sd->dropped++;
4617  	rps_unlock_irq_restore(sd, &flags);
4618  
4619  	dev_core_stats_rx_dropped_inc(skb->dev);
4620  	kfree_skb_reason(skb, reason);
4621  	return NET_RX_DROP;
4622  }
4623  
4624  static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4625  {
4626  	struct net_device *dev = skb->dev;
4627  	struct netdev_rx_queue *rxqueue;
4628  
4629  	rxqueue = dev->_rx;
4630  
4631  	if (skb_rx_queue_recorded(skb)) {
4632  		u16 index = skb_get_rx_queue(skb);
4633  
4634  		if (unlikely(index >= dev->real_num_rx_queues)) {
4635  			WARN_ONCE(dev->real_num_rx_queues > 1,
4636  				  "%s received packet on queue %u, but number "
4637  				  "of RX queues is %u\n",
4638  				  dev->name, index, dev->real_num_rx_queues);
4639  
4640  			return rxqueue; /* Return first rxqueue */
4641  		}
4642  		rxqueue += index;
4643  	}
4644  	return rxqueue;
4645  }
4646  
4647  u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
4648  			     struct bpf_prog *xdp_prog)
4649  {
4650  	void *orig_data, *orig_data_end, *hard_start;
4651  	struct netdev_rx_queue *rxqueue;
4652  	bool orig_bcast, orig_host;
4653  	u32 mac_len, frame_sz;
4654  	__be16 orig_eth_type;
4655  	struct ethhdr *eth;
4656  	u32 metalen, act;
4657  	int off;
4658  
4659  	/* The XDP program wants to see the packet starting at the MAC
4660  	 * header.
4661  	 */
4662  	mac_len = skb->data - skb_mac_header(skb);
4663  	hard_start = skb->data - skb_headroom(skb);
4664  
4665  	/* SKB "head" area always have tailroom for skb_shared_info */
4666  	frame_sz = (void *)skb_end_pointer(skb) - hard_start;
4667  	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4668  
4669  	rxqueue = netif_get_rxqueue(skb);
4670  	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
4671  	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
4672  			 skb_headlen(skb) + mac_len, true);
4673  
4674  	orig_data_end = xdp->data_end;
4675  	orig_data = xdp->data;
4676  	eth = (struct ethhdr *)xdp->data;
4677  	orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
4678  	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4679  	orig_eth_type = eth->h_proto;
4680  
4681  	act = bpf_prog_run_xdp(xdp_prog, xdp);
4682  
4683  	/* check if bpf_xdp_adjust_head was used */
4684  	off = xdp->data - orig_data;
4685  	if (off) {
4686  		if (off > 0)
4687  			__skb_pull(skb, off);
4688  		else if (off < 0)
4689  			__skb_push(skb, -off);
4690  
4691  		skb->mac_header += off;
4692  		skb_reset_network_header(skb);
4693  	}
4694  
4695  	/* check if bpf_xdp_adjust_tail was used */
4696  	off = xdp->data_end - orig_data_end;
4697  	if (off != 0) {
4698  		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4699  		skb->len += off; /* positive on grow, negative on shrink */
4700  	}
4701  
4702  	/* check if XDP changed eth hdr such SKB needs update */
4703  	eth = (struct ethhdr *)xdp->data;
4704  	if ((orig_eth_type != eth->h_proto) ||
4705  	    (orig_host != ether_addr_equal_64bits(eth->h_dest,
4706  						  skb->dev->dev_addr)) ||
4707  	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4708  		__skb_push(skb, ETH_HLEN);
4709  		skb->pkt_type = PACKET_HOST;
4710  		skb->protocol = eth_type_trans(skb, skb->dev);
4711  	}
4712  
4713  	/* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
4714  	 * before calling us again on redirect path. We do not call do_redirect
4715  	 * as we leave that up to the caller.
4716  	 *
4717  	 * Caller is responsible for managing lifetime of skb (i.e. calling
4718  	 * kfree_skb in response to actions it cannot handle/XDP_DROP).
4719  	 */
4720  	switch (act) {
4721  	case XDP_REDIRECT:
4722  	case XDP_TX:
4723  		__skb_push(skb, mac_len);
4724  		break;
4725  	case XDP_PASS:
4726  		metalen = xdp->data - xdp->data_meta;
4727  		if (metalen)
4728  			skb_metadata_set(skb, metalen);
4729  		break;
4730  	}
4731  
4732  	return act;
4733  }
4734  
4735  static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4736  				     struct xdp_buff *xdp,
4737  				     struct bpf_prog *xdp_prog)
4738  {
4739  	u32 act = XDP_DROP;
4740  
4741  	/* Reinjected packets coming from act_mirred or similar should
4742  	 * not get XDP generic processing.
4743  	 */
4744  	if (skb_is_redirected(skb))
4745  		return XDP_PASS;
4746  
4747  	/* XDP packets must be linear and must have sufficient headroom
4748  	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
4749  	 * native XDP provides, thus we need to do it here as well.
4750  	 */
4751  	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
4752  	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4753  		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4754  		int troom = skb->tail + skb->data_len - skb->end;
4755  
4756  		/* In case we have to go down the path and also linearize,
4757  		 * then lets do the pskb_expand_head() work just once here.
4758  		 */
4759  		if (pskb_expand_head(skb,
4760  				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4761  				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4762  			goto do_drop;
4763  		if (skb_linearize(skb))
4764  			goto do_drop;
4765  	}
4766  
4767  	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
4768  	switch (act) {
4769  	case XDP_REDIRECT:
4770  	case XDP_TX:
4771  	case XDP_PASS:
4772  		break;
4773  	default:
4774  		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
4775  		fallthrough;
4776  	case XDP_ABORTED:
4777  		trace_xdp_exception(skb->dev, xdp_prog, act);
4778  		fallthrough;
4779  	case XDP_DROP:
4780  	do_drop:
4781  		kfree_skb(skb);
4782  		break;
4783  	}
4784  
4785  	return act;
4786  }
4787  
4788  /* When doing generic XDP we have to bypass the qdisc layer and the
4789   * network taps in order to match in-driver-XDP behavior.
4790   */
4791  void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4792  {
4793  	struct net_device *dev = skb->dev;
4794  	struct netdev_queue *txq;
4795  	bool free_skb = true;
4796  	int cpu, rc;
4797  
4798  	txq = netdev_core_pick_tx(dev, skb, NULL);
4799  	cpu = smp_processor_id();
4800  	HARD_TX_LOCK(dev, txq, cpu);
4801  	if (!netif_xmit_stopped(txq)) {
4802  		rc = netdev_start_xmit(skb, dev, txq, 0);
4803  		if (dev_xmit_complete(rc))
4804  			free_skb = false;
4805  	}
4806  	HARD_TX_UNLOCK(dev, txq);
4807  	if (free_skb) {
4808  		trace_xdp_exception(dev, xdp_prog, XDP_TX);
4809  		kfree_skb(skb);
4810  	}
4811  }
4812  
4813  static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4814  
4815  int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4816  {
4817  	if (xdp_prog) {
4818  		struct xdp_buff xdp;
4819  		u32 act;
4820  		int err;
4821  
4822  		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4823  		if (act != XDP_PASS) {
4824  			switch (act) {
4825  			case XDP_REDIRECT:
4826  				err = xdp_do_generic_redirect(skb->dev, skb,
4827  							      &xdp, xdp_prog);
4828  				if (err)
4829  					goto out_redir;
4830  				break;
4831  			case XDP_TX:
4832  				generic_xdp_tx(skb, xdp_prog);
4833  				break;
4834  			}
4835  			return XDP_DROP;
4836  		}
4837  	}
4838  	return XDP_PASS;
4839  out_redir:
4840  	kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
4841  	return XDP_DROP;
4842  }
4843  EXPORT_SYMBOL_GPL(do_xdp_generic);
4844  
4845  static int netif_rx_internal(struct sk_buff *skb)
4846  {
4847  	int ret;
4848  
4849  	net_timestamp_check(netdev_tstamp_prequeue, skb);
4850  
4851  	trace_netif_rx(skb);
4852  
4853  #ifdef CONFIG_RPS
4854  	if (static_branch_unlikely(&rps_needed)) {
4855  		struct rps_dev_flow voidflow, *rflow = &voidflow;
4856  		int cpu;
4857  
4858  		rcu_read_lock();
4859  
4860  		cpu = get_rps_cpu(skb->dev, skb, &rflow);
4861  		if (cpu < 0)
4862  			cpu = smp_processor_id();
4863  
4864  		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4865  
4866  		rcu_read_unlock();
4867  	} else
4868  #endif
4869  	{
4870  		unsigned int qtail;
4871  
4872  		ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
4873  	}
4874  	return ret;
4875  }
4876  
4877  /**
4878   *	__netif_rx	-	Slightly optimized version of netif_rx
4879   *	@skb: buffer to post
4880   *
4881   *	This behaves as netif_rx except that it does not disable bottom halves.
4882   *	As a result this function may only be invoked from the interrupt context
4883   *	(either hard or soft interrupt).
4884   */
4885  int __netif_rx(struct sk_buff *skb)
4886  {
4887  	int ret;
4888  
4889  	lockdep_assert_once(hardirq_count() | softirq_count());
4890  
4891  	trace_netif_rx_entry(skb);
4892  	ret = netif_rx_internal(skb);
4893  	trace_netif_rx_exit(ret);
4894  	return ret;
4895  }
4896  EXPORT_SYMBOL(__netif_rx);
4897  
4898  /**
4899   *	netif_rx	-	post buffer to the network code
4900   *	@skb: buffer to post
4901   *
4902   *	This function receives a packet from a device driver and queues it for
4903   *	the upper (protocol) levels to process via the backlog NAPI device. It
4904   *	always succeeds. The buffer may be dropped during processing for
4905   *	congestion control or by the protocol layers.
4906   *	The network buffer is passed via the backlog NAPI device. Modern NIC
4907   *	driver should use NAPI and GRO.
4908   *	This function can used from interrupt and from process context. The
4909   *	caller from process context must not disable interrupts before invoking
4910   *	this function.
4911   *
4912   *	return values:
4913   *	NET_RX_SUCCESS	(no congestion)
4914   *	NET_RX_DROP     (packet was dropped)
4915   *
4916   */
4917  int netif_rx(struct sk_buff *skb)
4918  {
4919  	bool need_bh_off = !(hardirq_count() | softirq_count());
4920  	int ret;
4921  
4922  	if (need_bh_off)
4923  		local_bh_disable();
4924  	trace_netif_rx_entry(skb);
4925  	ret = netif_rx_internal(skb);
4926  	trace_netif_rx_exit(ret);
4927  	if (need_bh_off)
4928  		local_bh_enable();
4929  	return ret;
4930  }
4931  EXPORT_SYMBOL(netif_rx);
4932  
4933  static __latent_entropy void net_tx_action(struct softirq_action *h)
4934  {
4935  	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4936  
4937  	if (sd->completion_queue) {
4938  		struct sk_buff *clist;
4939  
4940  		local_irq_disable();
4941  		clist = sd->completion_queue;
4942  		sd->completion_queue = NULL;
4943  		local_irq_enable();
4944  
4945  		while (clist) {
4946  			struct sk_buff *skb = clist;
4947  
4948  			clist = clist->next;
4949  
4950  			WARN_ON(refcount_read(&skb->users));
4951  			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4952  				trace_consume_skb(skb);
4953  			else
4954  				trace_kfree_skb(skb, net_tx_action,
4955  						SKB_DROP_REASON_NOT_SPECIFIED);
4956  
4957  			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4958  				__kfree_skb(skb);
4959  			else
4960  				__kfree_skb_defer(skb);
4961  		}
4962  	}
4963  
4964  	if (sd->output_queue) {
4965  		struct Qdisc *head;
4966  
4967  		local_irq_disable();
4968  		head = sd->output_queue;
4969  		sd->output_queue = NULL;
4970  		sd->output_queue_tailp = &sd->output_queue;
4971  		local_irq_enable();
4972  
4973  		rcu_read_lock();
4974  
4975  		while (head) {
4976  			struct Qdisc *q = head;
4977  			spinlock_t *root_lock = NULL;
4978  
4979  			head = head->next_sched;
4980  
4981  			/* We need to make sure head->next_sched is read
4982  			 * before clearing __QDISC_STATE_SCHED
4983  			 */
4984  			smp_mb__before_atomic();
4985  
4986  			if (!(q->flags & TCQ_F_NOLOCK)) {
4987  				root_lock = qdisc_lock(q);
4988  				spin_lock(root_lock);
4989  			} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
4990  						     &q->state))) {
4991  				/* There is a synchronize_net() between
4992  				 * STATE_DEACTIVATED flag being set and
4993  				 * qdisc_reset()/some_qdisc_is_busy() in
4994  				 * dev_deactivate(), so we can safely bail out
4995  				 * early here to avoid data race between
4996  				 * qdisc_deactivate() and some_qdisc_is_busy()
4997  				 * for lockless qdisc.
4998  				 */
4999  				clear_bit(__QDISC_STATE_SCHED, &q->state);
5000  				continue;
5001  			}
5002  
5003  			clear_bit(__QDISC_STATE_SCHED, &q->state);
5004  			qdisc_run(q);
5005  			if (root_lock)
5006  				spin_unlock(root_lock);
5007  		}
5008  
5009  		rcu_read_unlock();
5010  	}
5011  
5012  	xfrm_dev_backlog(sd);
5013  }
5014  
5015  #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
5016  /* This hook is defined here for ATM LANE */
5017  int (*br_fdb_test_addr_hook)(struct net_device *dev,
5018  			     unsigned char *addr) __read_mostly;
5019  EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
5020  #endif
5021  
5022  static inline struct sk_buff *
5023  sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
5024  		   struct net_device *orig_dev, bool *another)
5025  {
5026  #ifdef CONFIG_NET_CLS_ACT
5027  	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
5028  	struct tcf_result cl_res;
5029  
5030  	/* If there's at least one ingress present somewhere (so
5031  	 * we get here via enabled static key), remaining devices
5032  	 * that are not configured with an ingress qdisc will bail
5033  	 * out here.
5034  	 */
5035  	if (!miniq)
5036  		return skb;
5037  
5038  	if (*pt_prev) {
5039  		*ret = deliver_skb(skb, *pt_prev, orig_dev);
5040  		*pt_prev = NULL;
5041  	}
5042  
5043  	qdisc_skb_cb(skb)->pkt_len = skb->len;
5044  	tc_skb_cb(skb)->mru = 0;
5045  	tc_skb_cb(skb)->post_ct = false;
5046  	skb->tc_at_ingress = 1;
5047  	mini_qdisc_bstats_cpu_update(miniq, skb);
5048  
5049  	switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) {
5050  	case TC_ACT_OK:
5051  	case TC_ACT_RECLASSIFY:
5052  		skb->tc_index = TC_H_MIN(cl_res.classid);
5053  		break;
5054  	case TC_ACT_SHOT:
5055  		mini_qdisc_qstats_cpu_drop(miniq);
5056  		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
5057  		return NULL;
5058  	case TC_ACT_STOLEN:
5059  	case TC_ACT_QUEUED:
5060  	case TC_ACT_TRAP:
5061  		consume_skb(skb);
5062  		return NULL;
5063  	case TC_ACT_REDIRECT:
5064  		/* skb_mac_header check was done by cls/act_bpf, so
5065  		 * we can safely push the L2 header back before
5066  		 * redirecting to another netdev
5067  		 */
5068  		__skb_push(skb, skb->mac_len);
5069  		if (skb_do_redirect(skb) == -EAGAIN) {
5070  			__skb_pull(skb, skb->mac_len);
5071  			*another = true;
5072  			break;
5073  		}
5074  		return NULL;
5075  	case TC_ACT_CONSUMED:
5076  		return NULL;
5077  	default:
5078  		break;
5079  	}
5080  #endif /* CONFIG_NET_CLS_ACT */
5081  	return skb;
5082  }
5083  
5084  /**
5085   *	netdev_is_rx_handler_busy - check if receive handler is registered
5086   *	@dev: device to check
5087   *
5088   *	Check if a receive handler is already registered for a given device.
5089   *	Return true if there one.
5090   *
5091   *	The caller must hold the rtnl_mutex.
5092   */
5093  bool netdev_is_rx_handler_busy(struct net_device *dev)
5094  {
5095  	ASSERT_RTNL();
5096  	return dev && rtnl_dereference(dev->rx_handler);
5097  }
5098  EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
5099  
5100  /**
5101   *	netdev_rx_handler_register - register receive handler
5102   *	@dev: device to register a handler for
5103   *	@rx_handler: receive handler to register
5104   *	@rx_handler_data: data pointer that is used by rx handler
5105   *
5106   *	Register a receive handler for a device. This handler will then be
5107   *	called from __netif_receive_skb. A negative errno code is returned
5108   *	on a failure.
5109   *
5110   *	The caller must hold the rtnl_mutex.
5111   *
5112   *	For a general description of rx_handler, see enum rx_handler_result.
5113   */
5114  int netdev_rx_handler_register(struct net_device *dev,
5115  			       rx_handler_func_t *rx_handler,
5116  			       void *rx_handler_data)
5117  {
5118  	if (netdev_is_rx_handler_busy(dev))
5119  		return -EBUSY;
5120  
5121  	if (dev->priv_flags & IFF_NO_RX_HANDLER)
5122  		return -EINVAL;
5123  
5124  	/* Note: rx_handler_data must be set before rx_handler */
5125  	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5126  	rcu_assign_pointer(dev->rx_handler, rx_handler);
5127  
5128  	return 0;
5129  }
5130  EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
5131  
5132  /**
5133   *	netdev_rx_handler_unregister - unregister receive handler
5134   *	@dev: device to unregister a handler from
5135   *
5136   *	Unregister a receive handler from a device.
5137   *
5138   *	The caller must hold the rtnl_mutex.
5139   */
5140  void netdev_rx_handler_unregister(struct net_device *dev)
5141  {
5142  
5143  	ASSERT_RTNL();
5144  	RCU_INIT_POINTER(dev->rx_handler, NULL);
5145  	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
5146  	 * section has a guarantee to see a non NULL rx_handler_data
5147  	 * as well.
5148  	 */
5149  	synchronize_net();
5150  	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5151  }
5152  EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5153  
5154  /*
5155   * Limit the use of PFMEMALLOC reserves to those protocols that implement
5156   * the special handling of PFMEMALLOC skbs.
5157   */
5158  static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5159  {
5160  	switch (skb->protocol) {
5161  	case htons(ETH_P_ARP):
5162  	case htons(ETH_P_IP):
5163  	case htons(ETH_P_IPV6):
5164  	case htons(ETH_P_8021Q):
5165  	case htons(ETH_P_8021AD):
5166  		return true;
5167  	default:
5168  		return false;
5169  	}
5170  }
5171  
5172  static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5173  			     int *ret, struct net_device *orig_dev)
5174  {
5175  	if (nf_hook_ingress_active(skb)) {
5176  		int ingress_retval;
5177  
5178  		if (*pt_prev) {
5179  			*ret = deliver_skb(skb, *pt_prev, orig_dev);
5180  			*pt_prev = NULL;
5181  		}
5182  
5183  		rcu_read_lock();
5184  		ingress_retval = nf_hook_ingress(skb);
5185  		rcu_read_unlock();
5186  		return ingress_retval;
5187  	}
5188  	return 0;
5189  }
5190  
5191  static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5192  				    struct packet_type **ppt_prev)
5193  {
5194  	struct packet_type *ptype, *pt_prev;
5195  	rx_handler_func_t *rx_handler;
5196  	struct sk_buff *skb = *pskb;
5197  	struct net_device *orig_dev;
5198  	bool deliver_exact = false;
5199  	int ret = NET_RX_DROP;
5200  	__be16 type;
5201  
5202  	net_timestamp_check(!netdev_tstamp_prequeue, skb);
5203  
5204  	trace_netif_receive_skb(skb);
5205  
5206  	orig_dev = skb->dev;
5207  
5208  	skb_reset_network_header(skb);
5209  	if (!skb_transport_header_was_set(skb))
5210  		skb_reset_transport_header(skb);
5211  	skb_reset_mac_len(skb);
5212  
5213  	pt_prev = NULL;
5214  
5215  another_round:
5216  	skb->skb_iif = skb->dev->ifindex;
5217  
5218  	__this_cpu_inc(softnet_data.processed);
5219  
5220  	if (static_branch_unlikely(&generic_xdp_needed_key)) {
5221  		int ret2;
5222  
5223  		migrate_disable();
5224  		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
5225  		migrate_enable();
5226  
5227  		if (ret2 != XDP_PASS) {
5228  			ret = NET_RX_DROP;
5229  			goto out;
5230  		}
5231  	}
5232  
5233  	if (eth_type_vlan(skb->protocol)) {
5234  		skb = skb_vlan_untag(skb);
5235  		if (unlikely(!skb))
5236  			goto out;
5237  	}
5238  
5239  	if (skb_skip_tc_classify(skb))
5240  		goto skip_classify;
5241  
5242  	if (pfmemalloc)
5243  		goto skip_taps;
5244  
5245  	list_for_each_entry_rcu(ptype, &ptype_all, list) {
5246  		if (pt_prev)
5247  			ret = deliver_skb(skb, pt_prev, orig_dev);
5248  		pt_prev = ptype;
5249  	}
5250  
5251  	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5252  		if (pt_prev)
5253  			ret = deliver_skb(skb, pt_prev, orig_dev);
5254  		pt_prev = ptype;
5255  	}
5256  
5257  skip_taps:
5258  #ifdef CONFIG_NET_INGRESS
5259  	if (static_branch_unlikely(&ingress_needed_key)) {
5260  		bool another = false;
5261  
5262  		nf_skip_egress(skb, true);
5263  		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5264  					 &another);
5265  		if (another)
5266  			goto another_round;
5267  		if (!skb)
5268  			goto out;
5269  
5270  		nf_skip_egress(skb, false);
5271  		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5272  			goto out;
5273  	}
5274  #endif
5275  	skb_reset_redirect(skb);
5276  skip_classify:
5277  	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5278  		goto drop;
5279  
5280  	if (skb_vlan_tag_present(skb)) {
5281  		if (pt_prev) {
5282  			ret = deliver_skb(skb, pt_prev, orig_dev);
5283  			pt_prev = NULL;
5284  		}
5285  		if (vlan_do_receive(&skb))
5286  			goto another_round;
5287  		else if (unlikely(!skb))
5288  			goto out;
5289  	}
5290  
5291  	rx_handler = rcu_dereference(skb->dev->rx_handler);
5292  	if (rx_handler) {
5293  		if (pt_prev) {
5294  			ret = deliver_skb(skb, pt_prev, orig_dev);
5295  			pt_prev = NULL;
5296  		}
5297  		switch (rx_handler(&skb)) {
5298  		case RX_HANDLER_CONSUMED:
5299  			ret = NET_RX_SUCCESS;
5300  			goto out;
5301  		case RX_HANDLER_ANOTHER:
5302  			goto another_round;
5303  		case RX_HANDLER_EXACT:
5304  			deliver_exact = true;
5305  			break;
5306  		case RX_HANDLER_PASS:
5307  			break;
5308  		default:
5309  			BUG();
5310  		}
5311  	}
5312  
5313  	if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
5314  check_vlan_id:
5315  		if (skb_vlan_tag_get_id(skb)) {
5316  			/* Vlan id is non 0 and vlan_do_receive() above couldn't
5317  			 * find vlan device.
5318  			 */
5319  			skb->pkt_type = PACKET_OTHERHOST;
5320  		} else if (eth_type_vlan(skb->protocol)) {
5321  			/* Outer header is 802.1P with vlan 0, inner header is
5322  			 * 802.1Q or 802.1AD and vlan_do_receive() above could
5323  			 * not find vlan dev for vlan id 0.
5324  			 */
5325  			__vlan_hwaccel_clear_tag(skb);
5326  			skb = skb_vlan_untag(skb);
5327  			if (unlikely(!skb))
5328  				goto out;
5329  			if (vlan_do_receive(&skb))
5330  				/* After stripping off 802.1P header with vlan 0
5331  				 * vlan dev is found for inner header.
5332  				 */
5333  				goto another_round;
5334  			else if (unlikely(!skb))
5335  				goto out;
5336  			else
5337  				/* We have stripped outer 802.1P vlan 0 header.
5338  				 * But could not find vlan dev.
5339  				 * check again for vlan id to set OTHERHOST.
5340  				 */
5341  				goto check_vlan_id;
5342  		}
5343  		/* Note: we might in the future use prio bits
5344  		 * and set skb->priority like in vlan_do_receive()
5345  		 * For the time being, just ignore Priority Code Point
5346  		 */
5347  		__vlan_hwaccel_clear_tag(skb);
5348  	}
5349  
5350  	type = skb->protocol;
5351  
5352  	/* deliver only exact match when indicated */
5353  	if (likely(!deliver_exact)) {
5354  		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5355  				       &ptype_base[ntohs(type) &
5356  						   PTYPE_HASH_MASK]);
5357  	}
5358  
5359  	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5360  			       &orig_dev->ptype_specific);
5361  
5362  	if (unlikely(skb->dev != orig_dev)) {
5363  		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5364  				       &skb->dev->ptype_specific);
5365  	}
5366  
5367  	if (pt_prev) {
5368  		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5369  			goto drop;
5370  		*ppt_prev = pt_prev;
5371  	} else {
5372  drop:
5373  		if (!deliver_exact) {
5374  			dev_core_stats_rx_dropped_inc(skb->dev);
5375  			kfree_skb_reason(skb, SKB_DROP_REASON_PTYPE_ABSENT);
5376  		} else {
5377  			dev_core_stats_rx_nohandler_inc(skb->dev);
5378  			kfree_skb(skb);
5379  		}
5380  		/* Jamal, now you will not able to escape explaining
5381  		 * me how you were going to use this. :-)
5382  		 */
5383  		ret = NET_RX_DROP;
5384  	}
5385  
5386  out:
5387  	/* The invariant here is that if *ppt_prev is not NULL
5388  	 * then skb should also be non-NULL.
5389  	 *
5390  	 * Apparently *ppt_prev assignment above holds this invariant due to
5391  	 * skb dereferencing near it.
5392  	 */
5393  	*pskb = skb;
5394  	return ret;
5395  }
5396  
5397  static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5398  {
5399  	struct net_device *orig_dev = skb->dev;
5400  	struct packet_type *pt_prev = NULL;
5401  	int ret;
5402  
5403  	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5404  	if (pt_prev)
5405  		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5406  					 skb->dev, pt_prev, orig_dev);
5407  	return ret;
5408  }
5409  
5410  /**
5411   *	netif_receive_skb_core - special purpose version of netif_receive_skb
5412   *	@skb: buffer to process
5413   *
5414   *	More direct receive version of netif_receive_skb().  It should
5415   *	only be used by callers that have a need to skip RPS and Generic XDP.
5416   *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
5417   *
5418   *	This function may only be called from softirq context and interrupts
5419   *	should be enabled.
5420   *
5421   *	Return values (usually ignored):
5422   *	NET_RX_SUCCESS: no congestion
5423   *	NET_RX_DROP: packet was dropped
5424   */
5425  int netif_receive_skb_core(struct sk_buff *skb)
5426  {
5427  	int ret;
5428  
5429  	rcu_read_lock();
5430  	ret = __netif_receive_skb_one_core(skb, false);
5431  	rcu_read_unlock();
5432  
5433  	return ret;
5434  }
5435  EXPORT_SYMBOL(netif_receive_skb_core);
5436  
5437  static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5438  						  struct packet_type *pt_prev,
5439  						  struct net_device *orig_dev)
5440  {
5441  	struct sk_buff *skb, *next;
5442  
5443  	if (!pt_prev)
5444  		return;
5445  	if (list_empty(head))
5446  		return;
5447  	if (pt_prev->list_func != NULL)
5448  		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5449  				   ip_list_rcv, head, pt_prev, orig_dev);
5450  	else
5451  		list_for_each_entry_safe(skb, next, head, list) {
5452  			skb_list_del_init(skb);
5453  			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5454  		}
5455  }
5456  
5457  static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5458  {
5459  	/* Fast-path assumptions:
5460  	 * - There is no RX handler.
5461  	 * - Only one packet_type matches.
5462  	 * If either of these fails, we will end up doing some per-packet
5463  	 * processing in-line, then handling the 'last ptype' for the whole
5464  	 * sublist.  This can't cause out-of-order delivery to any single ptype,
5465  	 * because the 'last ptype' must be constant across the sublist, and all
5466  	 * other ptypes are handled per-packet.
5467  	 */
5468  	/* Current (common) ptype of sublist */
5469  	struct packet_type *pt_curr = NULL;
5470  	/* Current (common) orig_dev of sublist */
5471  	struct net_device *od_curr = NULL;
5472  	struct list_head sublist;
5473  	struct sk_buff *skb, *next;
5474  
5475  	INIT_LIST_HEAD(&sublist);
5476  	list_for_each_entry_safe(skb, next, head, list) {
5477  		struct net_device *orig_dev = skb->dev;
5478  		struct packet_type *pt_prev = NULL;
5479  
5480  		skb_list_del_init(skb);
5481  		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5482  		if (!pt_prev)
5483  			continue;
5484  		if (pt_curr != pt_prev || od_curr != orig_dev) {
5485  			/* dispatch old sublist */
5486  			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5487  			/* start new sublist */
5488  			INIT_LIST_HEAD(&sublist);
5489  			pt_curr = pt_prev;
5490  			od_curr = orig_dev;
5491  		}
5492  		list_add_tail(&skb->list, &sublist);
5493  	}
5494  
5495  	/* dispatch final sublist */
5496  	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5497  }
5498  
5499  static int __netif_receive_skb(struct sk_buff *skb)
5500  {
5501  	int ret;
5502  
5503  	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5504  		unsigned int noreclaim_flag;
5505  
5506  		/*
5507  		 * PFMEMALLOC skbs are special, they should
5508  		 * - be delivered to SOCK_MEMALLOC sockets only
5509  		 * - stay away from userspace
5510  		 * - have bounded memory usage
5511  		 *
5512  		 * Use PF_MEMALLOC as this saves us from propagating the allocation
5513  		 * context down to all allocation sites.
5514  		 */
5515  		noreclaim_flag = memalloc_noreclaim_save();
5516  		ret = __netif_receive_skb_one_core(skb, true);
5517  		memalloc_noreclaim_restore(noreclaim_flag);
5518  	} else
5519  		ret = __netif_receive_skb_one_core(skb, false);
5520  
5521  	return ret;
5522  }
5523  
5524  static void __netif_receive_skb_list(struct list_head *head)
5525  {
5526  	unsigned long noreclaim_flag = 0;
5527  	struct sk_buff *skb, *next;
5528  	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5529  
5530  	list_for_each_entry_safe(skb, next, head, list) {
5531  		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5532  			struct list_head sublist;
5533  
5534  			/* Handle the previous sublist */
5535  			list_cut_before(&sublist, head, &skb->list);
5536  			if (!list_empty(&sublist))
5537  				__netif_receive_skb_list_core(&sublist, pfmemalloc);
5538  			pfmemalloc = !pfmemalloc;
5539  			/* See comments in __netif_receive_skb */
5540  			if (pfmemalloc)
5541  				noreclaim_flag = memalloc_noreclaim_save();
5542  			else
5543  				memalloc_noreclaim_restore(noreclaim_flag);
5544  		}
5545  	}
5546  	/* Handle the remaining sublist */
5547  	if (!list_empty(head))
5548  		__netif_receive_skb_list_core(head, pfmemalloc);
5549  	/* Restore pflags */
5550  	if (pfmemalloc)
5551  		memalloc_noreclaim_restore(noreclaim_flag);
5552  }
5553  
5554  static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5555  {
5556  	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5557  	struct bpf_prog *new = xdp->prog;
5558  	int ret = 0;
5559  
5560  	switch (xdp->command) {
5561  	case XDP_SETUP_PROG:
5562  		rcu_assign_pointer(dev->xdp_prog, new);
5563  		if (old)
5564  			bpf_prog_put(old);
5565  
5566  		if (old && !new) {
5567  			static_branch_dec(&generic_xdp_needed_key);
5568  		} else if (new && !old) {
5569  			static_branch_inc(&generic_xdp_needed_key);
5570  			dev_disable_lro(dev);
5571  			dev_disable_gro_hw(dev);
5572  		}
5573  		break;
5574  
5575  	default:
5576  		ret = -EINVAL;
5577  		break;
5578  	}
5579  
5580  	return ret;
5581  }
5582  
5583  static int netif_receive_skb_internal(struct sk_buff *skb)
5584  {
5585  	int ret;
5586  
5587  	net_timestamp_check(netdev_tstamp_prequeue, skb);
5588  
5589  	if (skb_defer_rx_timestamp(skb))
5590  		return NET_RX_SUCCESS;
5591  
5592  	rcu_read_lock();
5593  #ifdef CONFIG_RPS
5594  	if (static_branch_unlikely(&rps_needed)) {
5595  		struct rps_dev_flow voidflow, *rflow = &voidflow;
5596  		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5597  
5598  		if (cpu >= 0) {
5599  			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5600  			rcu_read_unlock();
5601  			return ret;
5602  		}
5603  	}
5604  #endif
5605  	ret = __netif_receive_skb(skb);
5606  	rcu_read_unlock();
5607  	return ret;
5608  }
5609  
5610  void netif_receive_skb_list_internal(struct list_head *head)
5611  {
5612  	struct sk_buff *skb, *next;
5613  	struct list_head sublist;
5614  
5615  	INIT_LIST_HEAD(&sublist);
5616  	list_for_each_entry_safe(skb, next, head, list) {
5617  		net_timestamp_check(netdev_tstamp_prequeue, skb);
5618  		skb_list_del_init(skb);
5619  		if (!skb_defer_rx_timestamp(skb))
5620  			list_add_tail(&skb->list, &sublist);
5621  	}
5622  	list_splice_init(&sublist, head);
5623  
5624  	rcu_read_lock();
5625  #ifdef CONFIG_RPS
5626  	if (static_branch_unlikely(&rps_needed)) {
5627  		list_for_each_entry_safe(skb, next, head, list) {
5628  			struct rps_dev_flow voidflow, *rflow = &voidflow;
5629  			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5630  
5631  			if (cpu >= 0) {
5632  				/* Will be handled, remove from list */
5633  				skb_list_del_init(skb);
5634  				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5635  			}
5636  		}
5637  	}
5638  #endif
5639  	__netif_receive_skb_list(head);
5640  	rcu_read_unlock();
5641  }
5642  
5643  /**
5644   *	netif_receive_skb - process receive buffer from network
5645   *	@skb: buffer to process
5646   *
5647   *	netif_receive_skb() is the main receive data processing function.
5648   *	It always succeeds. The buffer may be dropped during processing
5649   *	for congestion control or by the protocol layers.
5650   *
5651   *	This function may only be called from softirq context and interrupts
5652   *	should be enabled.
5653   *
5654   *	Return values (usually ignored):
5655   *	NET_RX_SUCCESS: no congestion
5656   *	NET_RX_DROP: packet was dropped
5657   */
5658  int netif_receive_skb(struct sk_buff *skb)
5659  {
5660  	int ret;
5661  
5662  	trace_netif_receive_skb_entry(skb);
5663  
5664  	ret = netif_receive_skb_internal(skb);
5665  	trace_netif_receive_skb_exit(ret);
5666  
5667  	return ret;
5668  }
5669  EXPORT_SYMBOL(netif_receive_skb);
5670  
5671  /**
5672   *	netif_receive_skb_list - process many receive buffers from network
5673   *	@head: list of skbs to process.
5674   *
5675   *	Since return value of netif_receive_skb() is normally ignored, and
5676   *	wouldn't be meaningful for a list, this function returns void.
5677   *
5678   *	This function may only be called from softirq context and interrupts
5679   *	should be enabled.
5680   */
5681  void netif_receive_skb_list(struct list_head *head)
5682  {
5683  	struct sk_buff *skb;
5684  
5685  	if (list_empty(head))
5686  		return;
5687  	if (trace_netif_receive_skb_list_entry_enabled()) {
5688  		list_for_each_entry(skb, head, list)
5689  			trace_netif_receive_skb_list_entry(skb);
5690  	}
5691  	netif_receive_skb_list_internal(head);
5692  	trace_netif_receive_skb_list_exit(0);
5693  }
5694  EXPORT_SYMBOL(netif_receive_skb_list);
5695  
5696  static DEFINE_PER_CPU(struct work_struct, flush_works);
5697  
5698  /* Network device is going away, flush any packets still pending */
5699  static void flush_backlog(struct work_struct *work)
5700  {
5701  	struct sk_buff *skb, *tmp;
5702  	struct softnet_data *sd;
5703  
5704  	local_bh_disable();
5705  	sd = this_cpu_ptr(&softnet_data);
5706  
5707  	rps_lock_irq_disable(sd);
5708  	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5709  		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5710  			__skb_unlink(skb, &sd->input_pkt_queue);
5711  			dev_kfree_skb_irq(skb);
5712  			input_queue_head_incr(sd);
5713  		}
5714  	}
5715  	rps_unlock_irq_enable(sd);
5716  
5717  	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5718  		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5719  			__skb_unlink(skb, &sd->process_queue);
5720  			kfree_skb(skb);
5721  			input_queue_head_incr(sd);
5722  		}
5723  	}
5724  	local_bh_enable();
5725  }
5726  
5727  static bool flush_required(int cpu)
5728  {
5729  #if IS_ENABLED(CONFIG_RPS)
5730  	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5731  	bool do_flush;
5732  
5733  	rps_lock_irq_disable(sd);
5734  
5735  	/* as insertion into process_queue happens with the rps lock held,
5736  	 * process_queue access may race only with dequeue
5737  	 */
5738  	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
5739  		   !skb_queue_empty_lockless(&sd->process_queue);
5740  	rps_unlock_irq_enable(sd);
5741  
5742  	return do_flush;
5743  #endif
5744  	/* without RPS we can't safely check input_pkt_queue: during a
5745  	 * concurrent remote skb_queue_splice() we can detect as empty both
5746  	 * input_pkt_queue and process_queue even if the latter could end-up
5747  	 * containing a lot of packets.
5748  	 */
5749  	return true;
5750  }
5751  
5752  static void flush_all_backlogs(void)
5753  {
5754  	static cpumask_t flush_cpus;
5755  	unsigned int cpu;
5756  
5757  	/* since we are under rtnl lock protection we can use static data
5758  	 * for the cpumask and avoid allocating on stack the possibly
5759  	 * large mask
5760  	 */
5761  	ASSERT_RTNL();
5762  
5763  	cpus_read_lock();
5764  
5765  	cpumask_clear(&flush_cpus);
5766  	for_each_online_cpu(cpu) {
5767  		if (flush_required(cpu)) {
5768  			queue_work_on(cpu, system_highpri_wq,
5769  				      per_cpu_ptr(&flush_works, cpu));
5770  			cpumask_set_cpu(cpu, &flush_cpus);
5771  		}
5772  	}
5773  
5774  	/* we can have in flight packet[s] on the cpus we are not flushing,
5775  	 * synchronize_net() in unregister_netdevice_many() will take care of
5776  	 * them
5777  	 */
5778  	for_each_cpu(cpu, &flush_cpus)
5779  		flush_work(per_cpu_ptr(&flush_works, cpu));
5780  
5781  	cpus_read_unlock();
5782  }
5783  
5784  static void net_rps_send_ipi(struct softnet_data *remsd)
5785  {
5786  #ifdef CONFIG_RPS
5787  	while (remsd) {
5788  		struct softnet_data *next = remsd->rps_ipi_next;
5789  
5790  		if (cpu_online(remsd->cpu))
5791  			smp_call_function_single_async(remsd->cpu, &remsd->csd);
5792  		remsd = next;
5793  	}
5794  #endif
5795  }
5796  
5797  /*
5798   * net_rps_action_and_irq_enable sends any pending IPI's for rps.
5799   * Note: called with local irq disabled, but exits with local irq enabled.
5800   */
5801  static void net_rps_action_and_irq_enable(struct softnet_data *sd)
5802  {
5803  #ifdef CONFIG_RPS
5804  	struct softnet_data *remsd = sd->rps_ipi_list;
5805  
5806  	if (remsd) {
5807  		sd->rps_ipi_list = NULL;
5808  
5809  		local_irq_enable();
5810  
5811  		/* Send pending IPI's to kick RPS processing on remote cpus. */
5812  		net_rps_send_ipi(remsd);
5813  	} else
5814  #endif
5815  		local_irq_enable();
5816  }
5817  
5818  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
5819  {
5820  #ifdef CONFIG_RPS
5821  	return sd->rps_ipi_list != NULL;
5822  #else
5823  	return false;
5824  #endif
5825  }
5826  
5827  static int process_backlog(struct napi_struct *napi, int quota)
5828  {
5829  	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
5830  	bool again = true;
5831  	int work = 0;
5832  
5833  	/* Check if we have pending ipi, its better to send them now,
5834  	 * not waiting net_rx_action() end.
5835  	 */
5836  	if (sd_has_rps_ipi_waiting(sd)) {
5837  		local_irq_disable();
5838  		net_rps_action_and_irq_enable(sd);
5839  	}
5840  
5841  	napi->weight = dev_rx_weight;
5842  	while (again) {
5843  		struct sk_buff *skb;
5844  
5845  		while ((skb = __skb_dequeue(&sd->process_queue))) {
5846  			rcu_read_lock();
5847  			__netif_receive_skb(skb);
5848  			rcu_read_unlock();
5849  			input_queue_head_incr(sd);
5850  			if (++work >= quota)
5851  				return work;
5852  
5853  		}
5854  
5855  		rps_lock_irq_disable(sd);
5856  		if (skb_queue_empty(&sd->input_pkt_queue)) {
5857  			/*
5858  			 * Inline a custom version of __napi_complete().
5859  			 * only current cpu owns and manipulates this napi,
5860  			 * and NAPI_STATE_SCHED is the only possible flag set
5861  			 * on backlog.
5862  			 * We can use a plain write instead of clear_bit(),
5863  			 * and we dont need an smp_mb() memory barrier.
5864  			 */
5865  			napi->state = 0;
5866  			again = false;
5867  		} else {
5868  			skb_queue_splice_tail_init(&sd->input_pkt_queue,
5869  						   &sd->process_queue);
5870  		}
5871  		rps_unlock_irq_enable(sd);
5872  	}
5873  
5874  	return work;
5875  }
5876  
5877  /**
5878   * __napi_schedule - schedule for receive
5879   * @n: entry to schedule
5880   *
5881   * The entry's receive function will be scheduled to run.
5882   * Consider using __napi_schedule_irqoff() if hard irqs are masked.
5883   */
5884  void __napi_schedule(struct napi_struct *n)
5885  {
5886  	unsigned long flags;
5887  
5888  	local_irq_save(flags);
5889  	____napi_schedule(this_cpu_ptr(&softnet_data), n);
5890  	local_irq_restore(flags);
5891  }
5892  EXPORT_SYMBOL(__napi_schedule);
5893  
5894  /**
5895   *	napi_schedule_prep - check if napi can be scheduled
5896   *	@n: napi context
5897   *
5898   * Test if NAPI routine is already running, and if not mark
5899   * it as running.  This is used as a condition variable to
5900   * insure only one NAPI poll instance runs.  We also make
5901   * sure there is no pending NAPI disable.
5902   */
5903  bool napi_schedule_prep(struct napi_struct *n)
5904  {
5905  	unsigned long val, new;
5906  
5907  	do {
5908  		val = READ_ONCE(n->state);
5909  		if (unlikely(val & NAPIF_STATE_DISABLE))
5910  			return false;
5911  		new = val | NAPIF_STATE_SCHED;
5912  
5913  		/* Sets STATE_MISSED bit if STATE_SCHED was already set
5914  		 * This was suggested by Alexander Duyck, as compiler
5915  		 * emits better code than :
5916  		 * if (val & NAPIF_STATE_SCHED)
5917  		 *     new |= NAPIF_STATE_MISSED;
5918  		 */
5919  		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
5920  						   NAPIF_STATE_MISSED;
5921  	} while (cmpxchg(&n->state, val, new) != val);
5922  
5923  	return !(val & NAPIF_STATE_SCHED);
5924  }
5925  EXPORT_SYMBOL(napi_schedule_prep);
5926  
5927  /**
5928   * __napi_schedule_irqoff - schedule for receive
5929   * @n: entry to schedule
5930   *
5931   * Variant of __napi_schedule() assuming hard irqs are masked.
5932   *
5933   * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
5934   * because the interrupt disabled assumption might not be true
5935   * due to force-threaded interrupts and spinlock substitution.
5936   */
5937  void __napi_schedule_irqoff(struct napi_struct *n)
5938  {
5939  	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
5940  		____napi_schedule(this_cpu_ptr(&softnet_data), n);
5941  	else
5942  		__napi_schedule(n);
5943  }
5944  EXPORT_SYMBOL(__napi_schedule_irqoff);
5945  
5946  bool napi_complete_done(struct napi_struct *n, int work_done)
5947  {
5948  	unsigned long flags, val, new, timeout = 0;
5949  	bool ret = true;
5950  
5951  	/*
5952  	 * 1) Don't let napi dequeue from the cpu poll list
5953  	 *    just in case its running on a different cpu.
5954  	 * 2) If we are busy polling, do nothing here, we have
5955  	 *    the guarantee we will be called later.
5956  	 */
5957  	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
5958  				 NAPIF_STATE_IN_BUSY_POLL)))
5959  		return false;
5960  
5961  	if (work_done) {
5962  		if (n->gro_bitmask)
5963  			timeout = READ_ONCE(n->dev->gro_flush_timeout);
5964  		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
5965  	}
5966  	if (n->defer_hard_irqs_count > 0) {
5967  		n->defer_hard_irqs_count--;
5968  		timeout = READ_ONCE(n->dev->gro_flush_timeout);
5969  		if (timeout)
5970  			ret = false;
5971  	}
5972  	if (n->gro_bitmask) {
5973  		/* When the NAPI instance uses a timeout and keeps postponing
5974  		 * it, we need to bound somehow the time packets are kept in
5975  		 * the GRO layer
5976  		 */
5977  		napi_gro_flush(n, !!timeout);
5978  	}
5979  
5980  	gro_normal_list(n);
5981  
5982  	if (unlikely(!list_empty(&n->poll_list))) {
5983  		/* If n->poll_list is not empty, we need to mask irqs */
5984  		local_irq_save(flags);
5985  		list_del_init(&n->poll_list);
5986  		local_irq_restore(flags);
5987  	}
5988  
5989  	do {
5990  		val = READ_ONCE(n->state);
5991  
5992  		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
5993  
5994  		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
5995  			      NAPIF_STATE_SCHED_THREADED |
5996  			      NAPIF_STATE_PREFER_BUSY_POLL);
5997  
5998  		/* If STATE_MISSED was set, leave STATE_SCHED set,
5999  		 * because we will call napi->poll() one more time.
6000  		 * This C code was suggested by Alexander Duyck to help gcc.
6001  		 */
6002  		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6003  						    NAPIF_STATE_SCHED;
6004  	} while (cmpxchg(&n->state, val, new) != val);
6005  
6006  	if (unlikely(val & NAPIF_STATE_MISSED)) {
6007  		__napi_schedule(n);
6008  		return false;
6009  	}
6010  
6011  	if (timeout)
6012  		hrtimer_start(&n->timer, ns_to_ktime(timeout),
6013  			      HRTIMER_MODE_REL_PINNED);
6014  	return ret;
6015  }
6016  EXPORT_SYMBOL(napi_complete_done);
6017  
6018  /* must be called under rcu_read_lock(), as we dont take a reference */
6019  static struct napi_struct *napi_by_id(unsigned int napi_id)
6020  {
6021  	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6022  	struct napi_struct *napi;
6023  
6024  	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6025  		if (napi->napi_id == napi_id)
6026  			return napi;
6027  
6028  	return NULL;
6029  }
6030  
6031  #if defined(CONFIG_NET_RX_BUSY_POLL)
6032  
6033  static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
6034  {
6035  	if (!skip_schedule) {
6036  		gro_normal_list(napi);
6037  		__napi_schedule(napi);
6038  		return;
6039  	}
6040  
6041  	if (napi->gro_bitmask) {
6042  		/* flush too old packets
6043  		 * If HZ < 1000, flush all packets.
6044  		 */
6045  		napi_gro_flush(napi, HZ >= 1000);
6046  	}
6047  
6048  	gro_normal_list(napi);
6049  	clear_bit(NAPI_STATE_SCHED, &napi->state);
6050  }
6051  
6052  static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
6053  			   u16 budget)
6054  {
6055  	bool skip_schedule = false;
6056  	unsigned long timeout;
6057  	int rc;
6058  
6059  	/* Busy polling means there is a high chance device driver hard irq
6060  	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6061  	 * set in napi_schedule_prep().
6062  	 * Since we are about to call napi->poll() once more, we can safely
6063  	 * clear NAPI_STATE_MISSED.
6064  	 *
6065  	 * Note: x86 could use a single "lock and ..." instruction
6066  	 * to perform these two clear_bit()
6067  	 */
6068  	clear_bit(NAPI_STATE_MISSED, &napi->state);
6069  	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6070  
6071  	local_bh_disable();
6072  
6073  	if (prefer_busy_poll) {
6074  		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
6075  		timeout = READ_ONCE(napi->dev->gro_flush_timeout);
6076  		if (napi->defer_hard_irqs_count && timeout) {
6077  			hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
6078  			skip_schedule = true;
6079  		}
6080  	}
6081  
6082  	/* All we really want here is to re-enable device interrupts.
6083  	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6084  	 */
6085  	rc = napi->poll(napi, budget);
6086  	/* We can't gro_normal_list() here, because napi->poll() might have
6087  	 * rearmed the napi (napi_complete_done()) in which case it could
6088  	 * already be running on another CPU.
6089  	 */
6090  	trace_napi_poll(napi, rc, budget);
6091  	netpoll_poll_unlock(have_poll_lock);
6092  	if (rc == budget)
6093  		__busy_poll_stop(napi, skip_schedule);
6094  	local_bh_enable();
6095  }
6096  
6097  void napi_busy_loop(unsigned int napi_id,
6098  		    bool (*loop_end)(void *, unsigned long),
6099  		    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6100  {
6101  	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6102  	int (*napi_poll)(struct napi_struct *napi, int budget);
6103  	void *have_poll_lock = NULL;
6104  	struct napi_struct *napi;
6105  
6106  restart:
6107  	napi_poll = NULL;
6108  
6109  	rcu_read_lock();
6110  
6111  	napi = napi_by_id(napi_id);
6112  	if (!napi)
6113  		goto out;
6114  
6115  	preempt_disable();
6116  	for (;;) {
6117  		int work = 0;
6118  
6119  		local_bh_disable();
6120  		if (!napi_poll) {
6121  			unsigned long val = READ_ONCE(napi->state);
6122  
6123  			/* If multiple threads are competing for this napi,
6124  			 * we avoid dirtying napi->state as much as we can.
6125  			 */
6126  			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6127  				   NAPIF_STATE_IN_BUSY_POLL)) {
6128  				if (prefer_busy_poll)
6129  					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6130  				goto count;
6131  			}
6132  			if (cmpxchg(&napi->state, val,
6133  				    val | NAPIF_STATE_IN_BUSY_POLL |
6134  					  NAPIF_STATE_SCHED) != val) {
6135  				if (prefer_busy_poll)
6136  					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6137  				goto count;
6138  			}
6139  			have_poll_lock = netpoll_poll_lock(napi);
6140  			napi_poll = napi->poll;
6141  		}
6142  		work = napi_poll(napi, budget);
6143  		trace_napi_poll(napi, work, budget);
6144  		gro_normal_list(napi);
6145  count:
6146  		if (work > 0)
6147  			__NET_ADD_STATS(dev_net(napi->dev),
6148  					LINUX_MIB_BUSYPOLLRXPACKETS, work);
6149  		local_bh_enable();
6150  
6151  		if (!loop_end || loop_end(loop_end_arg, start_time))
6152  			break;
6153  
6154  		if (unlikely(need_resched())) {
6155  			if (napi_poll)
6156  				busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
6157  			preempt_enable();
6158  			rcu_read_unlock();
6159  			cond_resched();
6160  			if (loop_end(loop_end_arg, start_time))
6161  				return;
6162  			goto restart;
6163  		}
6164  		cpu_relax();
6165  	}
6166  	if (napi_poll)
6167  		busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
6168  	preempt_enable();
6169  out:
6170  	rcu_read_unlock();
6171  }
6172  EXPORT_SYMBOL(napi_busy_loop);
6173  
6174  #endif /* CONFIG_NET_RX_BUSY_POLL */
6175  
6176  static void napi_hash_add(struct napi_struct *napi)
6177  {
6178  	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
6179  		return;
6180  
6181  	spin_lock(&napi_hash_lock);
6182  
6183  	/* 0..NR_CPUS range is reserved for sender_cpu use */
6184  	do {
6185  		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6186  			napi_gen_id = MIN_NAPI_ID;
6187  	} while (napi_by_id(napi_gen_id));
6188  	napi->napi_id = napi_gen_id;
6189  
6190  	hlist_add_head_rcu(&napi->napi_hash_node,
6191  			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6192  
6193  	spin_unlock(&napi_hash_lock);
6194  }
6195  
6196  /* Warning : caller is responsible to make sure rcu grace period
6197   * is respected before freeing memory containing @napi
6198   */
6199  static void napi_hash_del(struct napi_struct *napi)
6200  {
6201  	spin_lock(&napi_hash_lock);
6202  
6203  	hlist_del_init_rcu(&napi->napi_hash_node);
6204  
6205  	spin_unlock(&napi_hash_lock);
6206  }
6207  
6208  static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6209  {
6210  	struct napi_struct *napi;
6211  
6212  	napi = container_of(timer, struct napi_struct, timer);
6213  
6214  	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
6215  	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6216  	 */
6217  	if (!napi_disable_pending(napi) &&
6218  	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
6219  		clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6220  		__napi_schedule_irqoff(napi);
6221  	}
6222  
6223  	return HRTIMER_NORESTART;
6224  }
6225  
6226  static void init_gro_hash(struct napi_struct *napi)
6227  {
6228  	int i;
6229  
6230  	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6231  		INIT_LIST_HEAD(&napi->gro_hash[i].list);
6232  		napi->gro_hash[i].count = 0;
6233  	}
6234  	napi->gro_bitmask = 0;
6235  }
6236  
6237  int dev_set_threaded(struct net_device *dev, bool threaded)
6238  {
6239  	struct napi_struct *napi;
6240  	int err = 0;
6241  
6242  	if (dev->threaded == threaded)
6243  		return 0;
6244  
6245  	if (threaded) {
6246  		list_for_each_entry(napi, &dev->napi_list, dev_list) {
6247  			if (!napi->thread) {
6248  				err = napi_kthread_create(napi);
6249  				if (err) {
6250  					threaded = false;
6251  					break;
6252  				}
6253  			}
6254  		}
6255  	}
6256  
6257  	dev->threaded = threaded;
6258  
6259  	/* Make sure kthread is created before THREADED bit
6260  	 * is set.
6261  	 */
6262  	smp_mb__before_atomic();
6263  
6264  	/* Setting/unsetting threaded mode on a napi might not immediately
6265  	 * take effect, if the current napi instance is actively being
6266  	 * polled. In this case, the switch between threaded mode and
6267  	 * softirq mode will happen in the next round of napi_schedule().
6268  	 * This should not cause hiccups/stalls to the live traffic.
6269  	 */
6270  	list_for_each_entry(napi, &dev->napi_list, dev_list) {
6271  		if (threaded)
6272  			set_bit(NAPI_STATE_THREADED, &napi->state);
6273  		else
6274  			clear_bit(NAPI_STATE_THREADED, &napi->state);
6275  	}
6276  
6277  	return err;
6278  }
6279  EXPORT_SYMBOL(dev_set_threaded);
6280  
6281  void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
6282  		    int (*poll)(struct napi_struct *, int), int weight)
6283  {
6284  	if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
6285  		return;
6286  
6287  	INIT_LIST_HEAD(&napi->poll_list);
6288  	INIT_HLIST_NODE(&napi->napi_hash_node);
6289  	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6290  	napi->timer.function = napi_watchdog;
6291  	init_gro_hash(napi);
6292  	napi->skb = NULL;
6293  	INIT_LIST_HEAD(&napi->rx_list);
6294  	napi->rx_count = 0;
6295  	napi->poll = poll;
6296  	if (weight > NAPI_POLL_WEIGHT)
6297  		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6298  				weight);
6299  	napi->weight = weight;
6300  	napi->dev = dev;
6301  #ifdef CONFIG_NETPOLL
6302  	napi->poll_owner = -1;
6303  #endif
6304  	set_bit(NAPI_STATE_SCHED, &napi->state);
6305  	set_bit(NAPI_STATE_NPSVC, &napi->state);
6306  	list_add_rcu(&napi->dev_list, &dev->napi_list);
6307  	napi_hash_add(napi);
6308  	/* Create kthread for this napi if dev->threaded is set.
6309  	 * Clear dev->threaded if kthread creation failed so that
6310  	 * threaded mode will not be enabled in napi_enable().
6311  	 */
6312  	if (dev->threaded && napi_kthread_create(napi))
6313  		dev->threaded = 0;
6314  }
6315  EXPORT_SYMBOL(netif_napi_add);
6316  
6317  void napi_disable(struct napi_struct *n)
6318  {
6319  	unsigned long val, new;
6320  
6321  	might_sleep();
6322  	set_bit(NAPI_STATE_DISABLE, &n->state);
6323  
6324  	for ( ; ; ) {
6325  		val = READ_ONCE(n->state);
6326  		if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
6327  			usleep_range(20, 200);
6328  			continue;
6329  		}
6330  
6331  		new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
6332  		new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
6333  
6334  		if (cmpxchg(&n->state, val, new) == val)
6335  			break;
6336  	}
6337  
6338  	hrtimer_cancel(&n->timer);
6339  
6340  	clear_bit(NAPI_STATE_DISABLE, &n->state);
6341  }
6342  EXPORT_SYMBOL(napi_disable);
6343  
6344  /**
6345   *	napi_enable - enable NAPI scheduling
6346   *	@n: NAPI context
6347   *
6348   * Resume NAPI from being scheduled on this context.
6349   * Must be paired with napi_disable.
6350   */
6351  void napi_enable(struct napi_struct *n)
6352  {
6353  	unsigned long val, new;
6354  
6355  	do {
6356  		val = READ_ONCE(n->state);
6357  		BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
6358  
6359  		new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
6360  		if (n->dev->threaded && n->thread)
6361  			new |= NAPIF_STATE_THREADED;
6362  	} while (cmpxchg(&n->state, val, new) != val);
6363  }
6364  EXPORT_SYMBOL(napi_enable);
6365  
6366  static void flush_gro_hash(struct napi_struct *napi)
6367  {
6368  	int i;
6369  
6370  	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6371  		struct sk_buff *skb, *n;
6372  
6373  		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6374  			kfree_skb(skb);
6375  		napi->gro_hash[i].count = 0;
6376  	}
6377  }
6378  
6379  /* Must be called in process context */
6380  void __netif_napi_del(struct napi_struct *napi)
6381  {
6382  	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
6383  		return;
6384  
6385  	napi_hash_del(napi);
6386  	list_del_rcu(&napi->dev_list);
6387  	napi_free_frags(napi);
6388  
6389  	flush_gro_hash(napi);
6390  	napi->gro_bitmask = 0;
6391  
6392  	if (napi->thread) {
6393  		kthread_stop(napi->thread);
6394  		napi->thread = NULL;
6395  	}
6396  }
6397  EXPORT_SYMBOL(__netif_napi_del);
6398  
6399  static int __napi_poll(struct napi_struct *n, bool *repoll)
6400  {
6401  	int work, weight;
6402  
6403  	weight = n->weight;
6404  
6405  	/* This NAPI_STATE_SCHED test is for avoiding a race
6406  	 * with netpoll's poll_napi().  Only the entity which
6407  	 * obtains the lock and sees NAPI_STATE_SCHED set will
6408  	 * actually make the ->poll() call.  Therefore we avoid
6409  	 * accidentally calling ->poll() when NAPI is not scheduled.
6410  	 */
6411  	work = 0;
6412  	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6413  		work = n->poll(n, weight);
6414  		trace_napi_poll(n, work, weight);
6415  	}
6416  
6417  	if (unlikely(work > weight))
6418  		netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
6419  				n->poll, work, weight);
6420  
6421  	if (likely(work < weight))
6422  		return work;
6423  
6424  	/* Drivers must not modify the NAPI state if they
6425  	 * consume the entire weight.  In such cases this code
6426  	 * still "owns" the NAPI instance and therefore can
6427  	 * move the instance around on the list at-will.
6428  	 */
6429  	if (unlikely(napi_disable_pending(n))) {
6430  		napi_complete(n);
6431  		return work;
6432  	}
6433  
6434  	/* The NAPI context has more processing work, but busy-polling
6435  	 * is preferred. Exit early.
6436  	 */
6437  	if (napi_prefer_busy_poll(n)) {
6438  		if (napi_complete_done(n, work)) {
6439  			/* If timeout is not set, we need to make sure
6440  			 * that the NAPI is re-scheduled.
6441  			 */
6442  			napi_schedule(n);
6443  		}
6444  		return work;
6445  	}
6446  
6447  	if (n->gro_bitmask) {
6448  		/* flush too old packets
6449  		 * If HZ < 1000, flush all packets.
6450  		 */
6451  		napi_gro_flush(n, HZ >= 1000);
6452  	}
6453  
6454  	gro_normal_list(n);
6455  
6456  	/* Some drivers may have called napi_schedule
6457  	 * prior to exhausting their budget.
6458  	 */
6459  	if (unlikely(!list_empty(&n->poll_list))) {
6460  		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6461  			     n->dev ? n->dev->name : "backlog");
6462  		return work;
6463  	}
6464  
6465  	*repoll = true;
6466  
6467  	return work;
6468  }
6469  
6470  static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6471  {
6472  	bool do_repoll = false;
6473  	void *have;
6474  	int work;
6475  
6476  	list_del_init(&n->poll_list);
6477  
6478  	have = netpoll_poll_lock(n);
6479  
6480  	work = __napi_poll(n, &do_repoll);
6481  
6482  	if (do_repoll)
6483  		list_add_tail(&n->poll_list, repoll);
6484  
6485  	netpoll_poll_unlock(have);
6486  
6487  	return work;
6488  }
6489  
6490  static int napi_thread_wait(struct napi_struct *napi)
6491  {
6492  	bool woken = false;
6493  
6494  	set_current_state(TASK_INTERRUPTIBLE);
6495  
6496  	while (!kthread_should_stop()) {
6497  		/* Testing SCHED_THREADED bit here to make sure the current
6498  		 * kthread owns this napi and could poll on this napi.
6499  		 * Testing SCHED bit is not enough because SCHED bit might be
6500  		 * set by some other busy poll thread or by napi_disable().
6501  		 */
6502  		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
6503  			WARN_ON(!list_empty(&napi->poll_list));
6504  			__set_current_state(TASK_RUNNING);
6505  			return 0;
6506  		}
6507  
6508  		schedule();
6509  		/* woken being true indicates this thread owns this napi. */
6510  		woken = true;
6511  		set_current_state(TASK_INTERRUPTIBLE);
6512  	}
6513  	__set_current_state(TASK_RUNNING);
6514  
6515  	return -1;
6516  }
6517  
6518  static int napi_threaded_poll(void *data)
6519  {
6520  	struct napi_struct *napi = data;
6521  	void *have;
6522  
6523  	while (!napi_thread_wait(napi)) {
6524  		for (;;) {
6525  			bool repoll = false;
6526  
6527  			local_bh_disable();
6528  
6529  			have = netpoll_poll_lock(napi);
6530  			__napi_poll(napi, &repoll);
6531  			netpoll_poll_unlock(have);
6532  
6533  			local_bh_enable();
6534  
6535  			if (!repoll)
6536  				break;
6537  
6538  			cond_resched();
6539  		}
6540  	}
6541  	return 0;
6542  }
6543  
6544  static __latent_entropy void net_rx_action(struct softirq_action *h)
6545  {
6546  	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6547  	unsigned long time_limit = jiffies +
6548  		usecs_to_jiffies(netdev_budget_usecs);
6549  	int budget = netdev_budget;
6550  	LIST_HEAD(list);
6551  	LIST_HEAD(repoll);
6552  
6553  	local_irq_disable();
6554  	list_splice_init(&sd->poll_list, &list);
6555  	local_irq_enable();
6556  
6557  	for (;;) {
6558  		struct napi_struct *n;
6559  
6560  		if (list_empty(&list)) {
6561  			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
6562  				return;
6563  			break;
6564  		}
6565  
6566  		n = list_first_entry(&list, struct napi_struct, poll_list);
6567  		budget -= napi_poll(n, &repoll);
6568  
6569  		/* If softirq window is exhausted then punt.
6570  		 * Allow this to run for 2 jiffies since which will allow
6571  		 * an average latency of 1.5/HZ.
6572  		 */
6573  		if (unlikely(budget <= 0 ||
6574  			     time_after_eq(jiffies, time_limit))) {
6575  			sd->time_squeeze++;
6576  			break;
6577  		}
6578  	}
6579  
6580  	local_irq_disable();
6581  
6582  	list_splice_tail_init(&sd->poll_list, &list);
6583  	list_splice_tail(&repoll, &list);
6584  	list_splice(&list, &sd->poll_list);
6585  	if (!list_empty(&sd->poll_list))
6586  		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
6587  
6588  	net_rps_action_and_irq_enable(sd);
6589  }
6590  
6591  struct netdev_adjacent {
6592  	struct net_device *dev;
6593  	netdevice_tracker dev_tracker;
6594  
6595  	/* upper master flag, there can only be one master device per list */
6596  	bool master;
6597  
6598  	/* lookup ignore flag */
6599  	bool ignore;
6600  
6601  	/* counter for the number of times this device was added to us */
6602  	u16 ref_nr;
6603  
6604  	/* private field for the users */
6605  	void *private;
6606  
6607  	struct list_head list;
6608  	struct rcu_head rcu;
6609  };
6610  
6611  static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6612  						 struct list_head *adj_list)
6613  {
6614  	struct netdev_adjacent *adj;
6615  
6616  	list_for_each_entry(adj, adj_list, list) {
6617  		if (adj->dev == adj_dev)
6618  			return adj;
6619  	}
6620  	return NULL;
6621  }
6622  
6623  static int ____netdev_has_upper_dev(struct net_device *upper_dev,
6624  				    struct netdev_nested_priv *priv)
6625  {
6626  	struct net_device *dev = (struct net_device *)priv->data;
6627  
6628  	return upper_dev == dev;
6629  }
6630  
6631  /**
6632   * netdev_has_upper_dev - Check if device is linked to an upper device
6633   * @dev: device
6634   * @upper_dev: upper device to check
6635   *
6636   * Find out if a device is linked to specified upper device and return true
6637   * in case it is. Note that this checks only immediate upper device,
6638   * not through a complete stack of devices. The caller must hold the RTNL lock.
6639   */
6640  bool netdev_has_upper_dev(struct net_device *dev,
6641  			  struct net_device *upper_dev)
6642  {
6643  	struct netdev_nested_priv priv = {
6644  		.data = (void *)upper_dev,
6645  	};
6646  
6647  	ASSERT_RTNL();
6648  
6649  	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6650  					     &priv);
6651  }
6652  EXPORT_SYMBOL(netdev_has_upper_dev);
6653  
6654  /**
6655   * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
6656   * @dev: device
6657   * @upper_dev: upper device to check
6658   *
6659   * Find out if a device is linked to specified upper device and return true
6660   * in case it is. Note that this checks the entire upper device chain.
6661   * The caller must hold rcu lock.
6662   */
6663  
6664  bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6665  				  struct net_device *upper_dev)
6666  {
6667  	struct netdev_nested_priv priv = {
6668  		.data = (void *)upper_dev,
6669  	};
6670  
6671  	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6672  					       &priv);
6673  }
6674  EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
6675  
6676  /**
6677   * netdev_has_any_upper_dev - Check if device is linked to some device
6678   * @dev: device
6679   *
6680   * Find out if a device is linked to an upper device and return true in case
6681   * it is. The caller must hold the RTNL lock.
6682   */
6683  bool netdev_has_any_upper_dev(struct net_device *dev)
6684  {
6685  	ASSERT_RTNL();
6686  
6687  	return !list_empty(&dev->adj_list.upper);
6688  }
6689  EXPORT_SYMBOL(netdev_has_any_upper_dev);
6690  
6691  /**
6692   * netdev_master_upper_dev_get - Get master upper device
6693   * @dev: device
6694   *
6695   * Find a master upper device and return pointer to it or NULL in case
6696   * it's not there. The caller must hold the RTNL lock.
6697   */
6698  struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6699  {
6700  	struct netdev_adjacent *upper;
6701  
6702  	ASSERT_RTNL();
6703  
6704  	if (list_empty(&dev->adj_list.upper))
6705  		return NULL;
6706  
6707  	upper = list_first_entry(&dev->adj_list.upper,
6708  				 struct netdev_adjacent, list);
6709  	if (likely(upper->master))
6710  		return upper->dev;
6711  	return NULL;
6712  }
6713  EXPORT_SYMBOL(netdev_master_upper_dev_get);
6714  
6715  static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
6716  {
6717  	struct netdev_adjacent *upper;
6718  
6719  	ASSERT_RTNL();
6720  
6721  	if (list_empty(&dev->adj_list.upper))
6722  		return NULL;
6723  
6724  	upper = list_first_entry(&dev->adj_list.upper,
6725  				 struct netdev_adjacent, list);
6726  	if (likely(upper->master) && !upper->ignore)
6727  		return upper->dev;
6728  	return NULL;
6729  }
6730  
6731  /**
6732   * netdev_has_any_lower_dev - Check if device is linked to some device
6733   * @dev: device
6734   *
6735   * Find out if a device is linked to a lower device and return true in case
6736   * it is. The caller must hold the RTNL lock.
6737   */
6738  static bool netdev_has_any_lower_dev(struct net_device *dev)
6739  {
6740  	ASSERT_RTNL();
6741  
6742  	return !list_empty(&dev->adj_list.lower);
6743  }
6744  
6745  void *netdev_adjacent_get_private(struct list_head *adj_list)
6746  {
6747  	struct netdev_adjacent *adj;
6748  
6749  	adj = list_entry(adj_list, struct netdev_adjacent, list);
6750  
6751  	return adj->private;
6752  }
6753  EXPORT_SYMBOL(netdev_adjacent_get_private);
6754  
6755  /**
6756   * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
6757   * @dev: device
6758   * @iter: list_head ** of the current position
6759   *
6760   * Gets the next device from the dev's upper list, starting from iter
6761   * position. The caller must hold RCU read lock.
6762   */
6763  struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
6764  						 struct list_head **iter)
6765  {
6766  	struct netdev_adjacent *upper;
6767  
6768  	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6769  
6770  	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6771  
6772  	if (&upper->list == &dev->adj_list.upper)
6773  		return NULL;
6774  
6775  	*iter = &upper->list;
6776  
6777  	return upper->dev;
6778  }
6779  EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
6780  
6781  static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
6782  						  struct list_head **iter,
6783  						  bool *ignore)
6784  {
6785  	struct netdev_adjacent *upper;
6786  
6787  	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
6788  
6789  	if (&upper->list == &dev->adj_list.upper)
6790  		return NULL;
6791  
6792  	*iter = &upper->list;
6793  	*ignore = upper->ignore;
6794  
6795  	return upper->dev;
6796  }
6797  
6798  static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
6799  						    struct list_head **iter)
6800  {
6801  	struct netdev_adjacent *upper;
6802  
6803  	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
6804  
6805  	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6806  
6807  	if (&upper->list == &dev->adj_list.upper)
6808  		return NULL;
6809  
6810  	*iter = &upper->list;
6811  
6812  	return upper->dev;
6813  }
6814  
6815  static int __netdev_walk_all_upper_dev(struct net_device *dev,
6816  				       int (*fn)(struct net_device *dev,
6817  					 struct netdev_nested_priv *priv),
6818  				       struct netdev_nested_priv *priv)
6819  {
6820  	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
6821  	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
6822  	int ret, cur = 0;
6823  	bool ignore;
6824  
6825  	now = dev;
6826  	iter = &dev->adj_list.upper;
6827  
6828  	while (1) {
6829  		if (now != dev) {
6830  			ret = fn(now, priv);
6831  			if (ret)
6832  				return ret;
6833  		}
6834  
6835  		next = NULL;
6836  		while (1) {
6837  			udev = __netdev_next_upper_dev(now, &iter, &ignore);
6838  			if (!udev)
6839  				break;
6840  			if (ignore)
6841  				continue;
6842  
6843  			next = udev;
6844  			niter = &udev->adj_list.upper;
6845  			dev_stack[cur] = now;
6846  			iter_stack[cur++] = iter;
6847  			break;
6848  		}
6849  
6850  		if (!next) {
6851  			if (!cur)
6852  				return 0;
6853  			next = dev_stack[--cur];
6854  			niter = iter_stack[cur];
6855  		}
6856  
6857  		now = next;
6858  		iter = niter;
6859  	}
6860  
6861  	return 0;
6862  }
6863  
6864  int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
6865  				  int (*fn)(struct net_device *dev,
6866  					    struct netdev_nested_priv *priv),
6867  				  struct netdev_nested_priv *priv)
6868  {
6869  	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
6870  	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
6871  	int ret, cur = 0;
6872  
6873  	now = dev;
6874  	iter = &dev->adj_list.upper;
6875  
6876  	while (1) {
6877  		if (now != dev) {
6878  			ret = fn(now, priv);
6879  			if (ret)
6880  				return ret;
6881  		}
6882  
6883  		next = NULL;
6884  		while (1) {
6885  			udev = netdev_next_upper_dev_rcu(now, &iter);
6886  			if (!udev)
6887  				break;
6888  
6889  			next = udev;
6890  			niter = &udev->adj_list.upper;
6891  			dev_stack[cur] = now;
6892  			iter_stack[cur++] = iter;
6893  			break;
6894  		}
6895  
6896  		if (!next) {
6897  			if (!cur)
6898  				return 0;
6899  			next = dev_stack[--cur];
6900  			niter = iter_stack[cur];
6901  		}
6902  
6903  		now = next;
6904  		iter = niter;
6905  	}
6906  
6907  	return 0;
6908  }
6909  EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
6910  
6911  static bool __netdev_has_upper_dev(struct net_device *dev,
6912  				   struct net_device *upper_dev)
6913  {
6914  	struct netdev_nested_priv priv = {
6915  		.flags = 0,
6916  		.data = (void *)upper_dev,
6917  	};
6918  
6919  	ASSERT_RTNL();
6920  
6921  	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
6922  					   &priv);
6923  }
6924  
6925  /**
6926   * netdev_lower_get_next_private - Get the next ->private from the
6927   *				   lower neighbour list
6928   * @dev: device
6929   * @iter: list_head ** of the current position
6930   *
6931   * Gets the next netdev_adjacent->private from the dev's lower neighbour
6932   * list, starting from iter position. The caller must hold either hold the
6933   * RTNL lock or its own locking that guarantees that the neighbour lower
6934   * list will remain unchanged.
6935   */
6936  void *netdev_lower_get_next_private(struct net_device *dev,
6937  				    struct list_head **iter)
6938  {
6939  	struct netdev_adjacent *lower;
6940  
6941  	lower = list_entry(*iter, struct netdev_adjacent, list);
6942  
6943  	if (&lower->list == &dev->adj_list.lower)
6944  		return NULL;
6945  
6946  	*iter = lower->list.next;
6947  
6948  	return lower->private;
6949  }
6950  EXPORT_SYMBOL(netdev_lower_get_next_private);
6951  
6952  /**
6953   * netdev_lower_get_next_private_rcu - Get the next ->private from the
6954   *				       lower neighbour list, RCU
6955   *				       variant
6956   * @dev: device
6957   * @iter: list_head ** of the current position
6958   *
6959   * Gets the next netdev_adjacent->private from the dev's lower neighbour
6960   * list, starting from iter position. The caller must hold RCU read lock.
6961   */
6962  void *netdev_lower_get_next_private_rcu(struct net_device *dev,
6963  					struct list_head **iter)
6964  {
6965  	struct netdev_adjacent *lower;
6966  
6967  	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
6968  
6969  	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6970  
6971  	if (&lower->list == &dev->adj_list.lower)
6972  		return NULL;
6973  
6974  	*iter = &lower->list;
6975  
6976  	return lower->private;
6977  }
6978  EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
6979  
6980  /**
6981   * netdev_lower_get_next - Get the next device from the lower neighbour
6982   *                         list
6983   * @dev: device
6984   * @iter: list_head ** of the current position
6985   *
6986   * Gets the next netdev_adjacent from the dev's lower neighbour
6987   * list, starting from iter position. The caller must hold RTNL lock or
6988   * its own locking that guarantees that the neighbour lower
6989   * list will remain unchanged.
6990   */
6991  void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
6992  {
6993  	struct netdev_adjacent *lower;
6994  
6995  	lower = list_entry(*iter, struct netdev_adjacent, list);
6996  
6997  	if (&lower->list == &dev->adj_list.lower)
6998  		return NULL;
6999  
7000  	*iter = lower->list.next;
7001  
7002  	return lower->dev;
7003  }
7004  EXPORT_SYMBOL(netdev_lower_get_next);
7005  
7006  static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7007  						struct list_head **iter)
7008  {
7009  	struct netdev_adjacent *lower;
7010  
7011  	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7012  
7013  	if (&lower->list == &dev->adj_list.lower)
7014  		return NULL;
7015  
7016  	*iter = &lower->list;
7017  
7018  	return lower->dev;
7019  }
7020  
7021  static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7022  						  struct list_head **iter,
7023  						  bool *ignore)
7024  {
7025  	struct netdev_adjacent *lower;
7026  
7027  	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7028  
7029  	if (&lower->list == &dev->adj_list.lower)
7030  		return NULL;
7031  
7032  	*iter = &lower->list;
7033  	*ignore = lower->ignore;
7034  
7035  	return lower->dev;
7036  }
7037  
7038  int netdev_walk_all_lower_dev(struct net_device *dev,
7039  			      int (*fn)(struct net_device *dev,
7040  					struct netdev_nested_priv *priv),
7041  			      struct netdev_nested_priv *priv)
7042  {
7043  	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7044  	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7045  	int ret, cur = 0;
7046  
7047  	now = dev;
7048  	iter = &dev->adj_list.lower;
7049  
7050  	while (1) {
7051  		if (now != dev) {
7052  			ret = fn(now, priv);
7053  			if (ret)
7054  				return ret;
7055  		}
7056  
7057  		next = NULL;
7058  		while (1) {
7059  			ldev = netdev_next_lower_dev(now, &iter);
7060  			if (!ldev)
7061  				break;
7062  
7063  			next = ldev;
7064  			niter = &ldev->adj_list.lower;
7065  			dev_stack[cur] = now;
7066  			iter_stack[cur++] = iter;
7067  			break;
7068  		}
7069  
7070  		if (!next) {
7071  			if (!cur)
7072  				return 0;
7073  			next = dev_stack[--cur];
7074  			niter = iter_stack[cur];
7075  		}
7076  
7077  		now = next;
7078  		iter = niter;
7079  	}
7080  
7081  	return 0;
7082  }
7083  EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
7084  
7085  static int __netdev_walk_all_lower_dev(struct net_device *dev,
7086  				       int (*fn)(struct net_device *dev,
7087  					 struct netdev_nested_priv *priv),
7088  				       struct netdev_nested_priv *priv)
7089  {
7090  	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7091  	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7092  	int ret, cur = 0;
7093  	bool ignore;
7094  
7095  	now = dev;
7096  	iter = &dev->adj_list.lower;
7097  
7098  	while (1) {
7099  		if (now != dev) {
7100  			ret = fn(now, priv);
7101  			if (ret)
7102  				return ret;
7103  		}
7104  
7105  		next = NULL;
7106  		while (1) {
7107  			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7108  			if (!ldev)
7109  				break;
7110  			if (ignore)
7111  				continue;
7112  
7113  			next = ldev;
7114  			niter = &ldev->adj_list.lower;
7115  			dev_stack[cur] = now;
7116  			iter_stack[cur++] = iter;
7117  			break;
7118  		}
7119  
7120  		if (!next) {
7121  			if (!cur)
7122  				return 0;
7123  			next = dev_stack[--cur];
7124  			niter = iter_stack[cur];
7125  		}
7126  
7127  		now = next;
7128  		iter = niter;
7129  	}
7130  
7131  	return 0;
7132  }
7133  
7134  struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7135  					     struct list_head **iter)
7136  {
7137  	struct netdev_adjacent *lower;
7138  
7139  	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7140  	if (&lower->list == &dev->adj_list.lower)
7141  		return NULL;
7142  
7143  	*iter = &lower->list;
7144  
7145  	return lower->dev;
7146  }
7147  EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7148  
7149  static u8 __netdev_upper_depth(struct net_device *dev)
7150  {
7151  	struct net_device *udev;
7152  	struct list_head *iter;
7153  	u8 max_depth = 0;
7154  	bool ignore;
7155  
7156  	for (iter = &dev->adj_list.upper,
7157  	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7158  	     udev;
7159  	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7160  		if (ignore)
7161  			continue;
7162  		if (max_depth < udev->upper_level)
7163  			max_depth = udev->upper_level;
7164  	}
7165  
7166  	return max_depth;
7167  }
7168  
7169  static u8 __netdev_lower_depth(struct net_device *dev)
7170  {
7171  	struct net_device *ldev;
7172  	struct list_head *iter;
7173  	u8 max_depth = 0;
7174  	bool ignore;
7175  
7176  	for (iter = &dev->adj_list.lower,
7177  	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7178  	     ldev;
7179  	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7180  		if (ignore)
7181  			continue;
7182  		if (max_depth < ldev->lower_level)
7183  			max_depth = ldev->lower_level;
7184  	}
7185  
7186  	return max_depth;
7187  }
7188  
7189  static int __netdev_update_upper_level(struct net_device *dev,
7190  				       struct netdev_nested_priv *__unused)
7191  {
7192  	dev->upper_level = __netdev_upper_depth(dev) + 1;
7193  	return 0;
7194  }
7195  
7196  static int __netdev_update_lower_level(struct net_device *dev,
7197  				       struct netdev_nested_priv *priv)
7198  {
7199  	dev->lower_level = __netdev_lower_depth(dev) + 1;
7200  
7201  #ifdef CONFIG_LOCKDEP
7202  	if (!priv)
7203  		return 0;
7204  
7205  	if (priv->flags & NESTED_SYNC_IMM)
7206  		dev->nested_level = dev->lower_level - 1;
7207  	if (priv->flags & NESTED_SYNC_TODO)
7208  		net_unlink_todo(dev);
7209  #endif
7210  	return 0;
7211  }
7212  
7213  int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7214  				  int (*fn)(struct net_device *dev,
7215  					    struct netdev_nested_priv *priv),
7216  				  struct netdev_nested_priv *priv)
7217  {
7218  	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7219  	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7220  	int ret, cur = 0;
7221  
7222  	now = dev;
7223  	iter = &dev->adj_list.lower;
7224  
7225  	while (1) {
7226  		if (now != dev) {
7227  			ret = fn(now, priv);
7228  			if (ret)
7229  				return ret;
7230  		}
7231  
7232  		next = NULL;
7233  		while (1) {
7234  			ldev = netdev_next_lower_dev_rcu(now, &iter);
7235  			if (!ldev)
7236  				break;
7237  
7238  			next = ldev;
7239  			niter = &ldev->adj_list.lower;
7240  			dev_stack[cur] = now;
7241  			iter_stack[cur++] = iter;
7242  			break;
7243  		}
7244  
7245  		if (!next) {
7246  			if (!cur)
7247  				return 0;
7248  			next = dev_stack[--cur];
7249  			niter = iter_stack[cur];
7250  		}
7251  
7252  		now = next;
7253  		iter = niter;
7254  	}
7255  
7256  	return 0;
7257  }
7258  EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7259  
7260  /**
7261   * netdev_lower_get_first_private_rcu - Get the first ->private from the
7262   *				       lower neighbour list, RCU
7263   *				       variant
7264   * @dev: device
7265   *
7266   * Gets the first netdev_adjacent->private from the dev's lower neighbour
7267   * list. The caller must hold RCU read lock.
7268   */
7269  void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7270  {
7271  	struct netdev_adjacent *lower;
7272  
7273  	lower = list_first_or_null_rcu(&dev->adj_list.lower,
7274  			struct netdev_adjacent, list);
7275  	if (lower)
7276  		return lower->private;
7277  	return NULL;
7278  }
7279  EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7280  
7281  /**
7282   * netdev_master_upper_dev_get_rcu - Get master upper device
7283   * @dev: device
7284   *
7285   * Find a master upper device and return pointer to it or NULL in case
7286   * it's not there. The caller must hold the RCU read lock.
7287   */
7288  struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7289  {
7290  	struct netdev_adjacent *upper;
7291  
7292  	upper = list_first_or_null_rcu(&dev->adj_list.upper,
7293  				       struct netdev_adjacent, list);
7294  	if (upper && likely(upper->master))
7295  		return upper->dev;
7296  	return NULL;
7297  }
7298  EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7299  
7300  static int netdev_adjacent_sysfs_add(struct net_device *dev,
7301  			      struct net_device *adj_dev,
7302  			      struct list_head *dev_list)
7303  {
7304  	char linkname[IFNAMSIZ+7];
7305  
7306  	sprintf(linkname, dev_list == &dev->adj_list.upper ?
7307  		"upper_%s" : "lower_%s", adj_dev->name);
7308  	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7309  				 linkname);
7310  }
7311  static void netdev_adjacent_sysfs_del(struct net_device *dev,
7312  			       char *name,
7313  			       struct list_head *dev_list)
7314  {
7315  	char linkname[IFNAMSIZ+7];
7316  
7317  	sprintf(linkname, dev_list == &dev->adj_list.upper ?
7318  		"upper_%s" : "lower_%s", name);
7319  	sysfs_remove_link(&(dev->dev.kobj), linkname);
7320  }
7321  
7322  static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7323  						 struct net_device *adj_dev,
7324  						 struct list_head *dev_list)
7325  {
7326  	return (dev_list == &dev->adj_list.upper ||
7327  		dev_list == &dev->adj_list.lower) &&
7328  		net_eq(dev_net(dev), dev_net(adj_dev));
7329  }
7330  
7331  static int __netdev_adjacent_dev_insert(struct net_device *dev,
7332  					struct net_device *adj_dev,
7333  					struct list_head *dev_list,
7334  					void *private, bool master)
7335  {
7336  	struct netdev_adjacent *adj;
7337  	int ret;
7338  
7339  	adj = __netdev_find_adj(adj_dev, dev_list);
7340  
7341  	if (adj) {
7342  		adj->ref_nr += 1;
7343  		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7344  			 dev->name, adj_dev->name, adj->ref_nr);
7345  
7346  		return 0;
7347  	}
7348  
7349  	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7350  	if (!adj)
7351  		return -ENOMEM;
7352  
7353  	adj->dev = adj_dev;
7354  	adj->master = master;
7355  	adj->ref_nr = 1;
7356  	adj->private = private;
7357  	adj->ignore = false;
7358  	dev_hold_track(adj_dev, &adj->dev_tracker, GFP_KERNEL);
7359  
7360  	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7361  		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7362  
7363  	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7364  		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7365  		if (ret)
7366  			goto free_adj;
7367  	}
7368  
7369  	/* Ensure that master link is always the first item in list. */
7370  	if (master) {
7371  		ret = sysfs_create_link(&(dev->dev.kobj),
7372  					&(adj_dev->dev.kobj), "master");
7373  		if (ret)
7374  			goto remove_symlinks;
7375  
7376  		list_add_rcu(&adj->list, dev_list);
7377  	} else {
7378  		list_add_tail_rcu(&adj->list, dev_list);
7379  	}
7380  
7381  	return 0;
7382  
7383  remove_symlinks:
7384  	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7385  		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7386  free_adj:
7387  	dev_put_track(adj_dev, &adj->dev_tracker);
7388  	kfree(adj);
7389  
7390  	return ret;
7391  }
7392  
7393  static void __netdev_adjacent_dev_remove(struct net_device *dev,
7394  					 struct net_device *adj_dev,
7395  					 u16 ref_nr,
7396  					 struct list_head *dev_list)
7397  {
7398  	struct netdev_adjacent *adj;
7399  
7400  	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7401  		 dev->name, adj_dev->name, ref_nr);
7402  
7403  	adj = __netdev_find_adj(adj_dev, dev_list);
7404  
7405  	if (!adj) {
7406  		pr_err("Adjacency does not exist for device %s from %s\n",
7407  		       dev->name, adj_dev->name);
7408  		WARN_ON(1);
7409  		return;
7410  	}
7411  
7412  	if (adj->ref_nr > ref_nr) {
7413  		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7414  			 dev->name, adj_dev->name, ref_nr,
7415  			 adj->ref_nr - ref_nr);
7416  		adj->ref_nr -= ref_nr;
7417  		return;
7418  	}
7419  
7420  	if (adj->master)
7421  		sysfs_remove_link(&(dev->dev.kobj), "master");
7422  
7423  	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7424  		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7425  
7426  	list_del_rcu(&adj->list);
7427  	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7428  		 adj_dev->name, dev->name, adj_dev->name);
7429  	dev_put_track(adj_dev, &adj->dev_tracker);
7430  	kfree_rcu(adj, rcu);
7431  }
7432  
7433  static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7434  					    struct net_device *upper_dev,
7435  					    struct list_head *up_list,
7436  					    struct list_head *down_list,
7437  					    void *private, bool master)
7438  {
7439  	int ret;
7440  
7441  	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7442  					   private, master);
7443  	if (ret)
7444  		return ret;
7445  
7446  	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7447  					   private, false);
7448  	if (ret) {
7449  		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7450  		return ret;
7451  	}
7452  
7453  	return 0;
7454  }
7455  
7456  static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7457  					       struct net_device *upper_dev,
7458  					       u16 ref_nr,
7459  					       struct list_head *up_list,
7460  					       struct list_head *down_list)
7461  {
7462  	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7463  	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7464  }
7465  
7466  static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7467  						struct net_device *upper_dev,
7468  						void *private, bool master)
7469  {
7470  	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7471  						&dev->adj_list.upper,
7472  						&upper_dev->adj_list.lower,
7473  						private, master);
7474  }
7475  
7476  static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7477  						   struct net_device *upper_dev)
7478  {
7479  	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7480  					   &dev->adj_list.upper,
7481  					   &upper_dev->adj_list.lower);
7482  }
7483  
7484  static int __netdev_upper_dev_link(struct net_device *dev,
7485  				   struct net_device *upper_dev, bool master,
7486  				   void *upper_priv, void *upper_info,
7487  				   struct netdev_nested_priv *priv,
7488  				   struct netlink_ext_ack *extack)
7489  {
7490  	struct netdev_notifier_changeupper_info changeupper_info = {
7491  		.info = {
7492  			.dev = dev,
7493  			.extack = extack,
7494  		},
7495  		.upper_dev = upper_dev,
7496  		.master = master,
7497  		.linking = true,
7498  		.upper_info = upper_info,
7499  	};
7500  	struct net_device *master_dev;
7501  	int ret = 0;
7502  
7503  	ASSERT_RTNL();
7504  
7505  	if (dev == upper_dev)
7506  		return -EBUSY;
7507  
7508  	/* To prevent loops, check if dev is not upper device to upper_dev. */
7509  	if (__netdev_has_upper_dev(upper_dev, dev))
7510  		return -EBUSY;
7511  
7512  	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7513  		return -EMLINK;
7514  
7515  	if (!master) {
7516  		if (__netdev_has_upper_dev(dev, upper_dev))
7517  			return -EEXIST;
7518  	} else {
7519  		master_dev = __netdev_master_upper_dev_get(dev);
7520  		if (master_dev)
7521  			return master_dev == upper_dev ? -EEXIST : -EBUSY;
7522  	}
7523  
7524  	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7525  					    &changeupper_info.info);
7526  	ret = notifier_to_errno(ret);
7527  	if (ret)
7528  		return ret;
7529  
7530  	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7531  						   master);
7532  	if (ret)
7533  		return ret;
7534  
7535  	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7536  					    &changeupper_info.info);
7537  	ret = notifier_to_errno(ret);
7538  	if (ret)
7539  		goto rollback;
7540  
7541  	__netdev_update_upper_level(dev, NULL);
7542  	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7543  
7544  	__netdev_update_lower_level(upper_dev, priv);
7545  	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7546  				    priv);
7547  
7548  	return 0;
7549  
7550  rollback:
7551  	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7552  
7553  	return ret;
7554  }
7555  
7556  /**
7557   * netdev_upper_dev_link - Add a link to the upper device
7558   * @dev: device
7559   * @upper_dev: new upper device
7560   * @extack: netlink extended ack
7561   *
7562   * Adds a link to device which is upper to this one. The caller must hold
7563   * the RTNL lock. On a failure a negative errno code is returned.
7564   * On success the reference counts are adjusted and the function
7565   * returns zero.
7566   */
7567  int netdev_upper_dev_link(struct net_device *dev,
7568  			  struct net_device *upper_dev,
7569  			  struct netlink_ext_ack *extack)
7570  {
7571  	struct netdev_nested_priv priv = {
7572  		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7573  		.data = NULL,
7574  	};
7575  
7576  	return __netdev_upper_dev_link(dev, upper_dev, false,
7577  				       NULL, NULL, &priv, extack);
7578  }
7579  EXPORT_SYMBOL(netdev_upper_dev_link);
7580  
7581  /**
7582   * netdev_master_upper_dev_link - Add a master link to the upper device
7583   * @dev: device
7584   * @upper_dev: new upper device
7585   * @upper_priv: upper device private
7586   * @upper_info: upper info to be passed down via notifier
7587   * @extack: netlink extended ack
7588   *
7589   * Adds a link to device which is upper to this one. In this case, only
7590   * one master upper device can be linked, although other non-master devices
7591   * might be linked as well. The caller must hold the RTNL lock.
7592   * On a failure a negative errno code is returned. On success the reference
7593   * counts are adjusted and the function returns zero.
7594   */
7595  int netdev_master_upper_dev_link(struct net_device *dev,
7596  				 struct net_device *upper_dev,
7597  				 void *upper_priv, void *upper_info,
7598  				 struct netlink_ext_ack *extack)
7599  {
7600  	struct netdev_nested_priv priv = {
7601  		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7602  		.data = NULL,
7603  	};
7604  
7605  	return __netdev_upper_dev_link(dev, upper_dev, true,
7606  				       upper_priv, upper_info, &priv, extack);
7607  }
7608  EXPORT_SYMBOL(netdev_master_upper_dev_link);
7609  
7610  static void __netdev_upper_dev_unlink(struct net_device *dev,
7611  				      struct net_device *upper_dev,
7612  				      struct netdev_nested_priv *priv)
7613  {
7614  	struct netdev_notifier_changeupper_info changeupper_info = {
7615  		.info = {
7616  			.dev = dev,
7617  		},
7618  		.upper_dev = upper_dev,
7619  		.linking = false,
7620  	};
7621  
7622  	ASSERT_RTNL();
7623  
7624  	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7625  
7626  	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7627  				      &changeupper_info.info);
7628  
7629  	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7630  
7631  	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7632  				      &changeupper_info.info);
7633  
7634  	__netdev_update_upper_level(dev, NULL);
7635  	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7636  
7637  	__netdev_update_lower_level(upper_dev, priv);
7638  	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7639  				    priv);
7640  }
7641  
7642  /**
7643   * netdev_upper_dev_unlink - Removes a link to upper device
7644   * @dev: device
7645   * @upper_dev: new upper device
7646   *
7647   * Removes a link to device which is upper to this one. The caller must hold
7648   * the RTNL lock.
7649   */
7650  void netdev_upper_dev_unlink(struct net_device *dev,
7651  			     struct net_device *upper_dev)
7652  {
7653  	struct netdev_nested_priv priv = {
7654  		.flags = NESTED_SYNC_TODO,
7655  		.data = NULL,
7656  	};
7657  
7658  	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
7659  }
7660  EXPORT_SYMBOL(netdev_upper_dev_unlink);
7661  
7662  static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
7663  				      struct net_device *lower_dev,
7664  				      bool val)
7665  {
7666  	struct netdev_adjacent *adj;
7667  
7668  	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
7669  	if (adj)
7670  		adj->ignore = val;
7671  
7672  	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
7673  	if (adj)
7674  		adj->ignore = val;
7675  }
7676  
7677  static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
7678  					struct net_device *lower_dev)
7679  {
7680  	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
7681  }
7682  
7683  static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
7684  				       struct net_device *lower_dev)
7685  {
7686  	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
7687  }
7688  
7689  int netdev_adjacent_change_prepare(struct net_device *old_dev,
7690  				   struct net_device *new_dev,
7691  				   struct net_device *dev,
7692  				   struct netlink_ext_ack *extack)
7693  {
7694  	struct netdev_nested_priv priv = {
7695  		.flags = 0,
7696  		.data = NULL,
7697  	};
7698  	int err;
7699  
7700  	if (!new_dev)
7701  		return 0;
7702  
7703  	if (old_dev && new_dev != old_dev)
7704  		netdev_adjacent_dev_disable(dev, old_dev);
7705  	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
7706  				      extack);
7707  	if (err) {
7708  		if (old_dev && new_dev != old_dev)
7709  			netdev_adjacent_dev_enable(dev, old_dev);
7710  		return err;
7711  	}
7712  
7713  	return 0;
7714  }
7715  EXPORT_SYMBOL(netdev_adjacent_change_prepare);
7716  
7717  void netdev_adjacent_change_commit(struct net_device *old_dev,
7718  				   struct net_device *new_dev,
7719  				   struct net_device *dev)
7720  {
7721  	struct netdev_nested_priv priv = {
7722  		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7723  		.data = NULL,
7724  	};
7725  
7726  	if (!new_dev || !old_dev)
7727  		return;
7728  
7729  	if (new_dev == old_dev)
7730  		return;
7731  
7732  	netdev_adjacent_dev_enable(dev, old_dev);
7733  	__netdev_upper_dev_unlink(old_dev, dev, &priv);
7734  }
7735  EXPORT_SYMBOL(netdev_adjacent_change_commit);
7736  
7737  void netdev_adjacent_change_abort(struct net_device *old_dev,
7738  				  struct net_device *new_dev,
7739  				  struct net_device *dev)
7740  {
7741  	struct netdev_nested_priv priv = {
7742  		.flags = 0,
7743  		.data = NULL,
7744  	};
7745  
7746  	if (!new_dev)
7747  		return;
7748  
7749  	if (old_dev && new_dev != old_dev)
7750  		netdev_adjacent_dev_enable(dev, old_dev);
7751  
7752  	__netdev_upper_dev_unlink(new_dev, dev, &priv);
7753  }
7754  EXPORT_SYMBOL(netdev_adjacent_change_abort);
7755  
7756  /**
7757   * netdev_bonding_info_change - Dispatch event about slave change
7758   * @dev: device
7759   * @bonding_info: info to dispatch
7760   *
7761   * Send NETDEV_BONDING_INFO to netdev notifiers with info.
7762   * The caller must hold the RTNL lock.
7763   */
7764  void netdev_bonding_info_change(struct net_device *dev,
7765  				struct netdev_bonding_info *bonding_info)
7766  {
7767  	struct netdev_notifier_bonding_info info = {
7768  		.info.dev = dev,
7769  	};
7770  
7771  	memcpy(&info.bonding_info, bonding_info,
7772  	       sizeof(struct netdev_bonding_info));
7773  	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
7774  				      &info.info);
7775  }
7776  EXPORT_SYMBOL(netdev_bonding_info_change);
7777  
7778  static int netdev_offload_xstats_enable_l3(struct net_device *dev,
7779  					   struct netlink_ext_ack *extack)
7780  {
7781  	struct netdev_notifier_offload_xstats_info info = {
7782  		.info.dev = dev,
7783  		.info.extack = extack,
7784  		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
7785  	};
7786  	int err;
7787  	int rc;
7788  
7789  	dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
7790  					 GFP_KERNEL);
7791  	if (!dev->offload_xstats_l3)
7792  		return -ENOMEM;
7793  
7794  	rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
7795  						  NETDEV_OFFLOAD_XSTATS_DISABLE,
7796  						  &info.info);
7797  	err = notifier_to_errno(rc);
7798  	if (err)
7799  		goto free_stats;
7800  
7801  	return 0;
7802  
7803  free_stats:
7804  	kfree(dev->offload_xstats_l3);
7805  	dev->offload_xstats_l3 = NULL;
7806  	return err;
7807  }
7808  
7809  int netdev_offload_xstats_enable(struct net_device *dev,
7810  				 enum netdev_offload_xstats_type type,
7811  				 struct netlink_ext_ack *extack)
7812  {
7813  	ASSERT_RTNL();
7814  
7815  	if (netdev_offload_xstats_enabled(dev, type))
7816  		return -EALREADY;
7817  
7818  	switch (type) {
7819  	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
7820  		return netdev_offload_xstats_enable_l3(dev, extack);
7821  	}
7822  
7823  	WARN_ON(1);
7824  	return -EINVAL;
7825  }
7826  EXPORT_SYMBOL(netdev_offload_xstats_enable);
7827  
7828  static void netdev_offload_xstats_disable_l3(struct net_device *dev)
7829  {
7830  	struct netdev_notifier_offload_xstats_info info = {
7831  		.info.dev = dev,
7832  		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
7833  	};
7834  
7835  	call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
7836  				      &info.info);
7837  	kfree(dev->offload_xstats_l3);
7838  	dev->offload_xstats_l3 = NULL;
7839  }
7840  
7841  int netdev_offload_xstats_disable(struct net_device *dev,
7842  				  enum netdev_offload_xstats_type type)
7843  {
7844  	ASSERT_RTNL();
7845  
7846  	if (!netdev_offload_xstats_enabled(dev, type))
7847  		return -EALREADY;
7848  
7849  	switch (type) {
7850  	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
7851  		netdev_offload_xstats_disable_l3(dev);
7852  		return 0;
7853  	}
7854  
7855  	WARN_ON(1);
7856  	return -EINVAL;
7857  }
7858  EXPORT_SYMBOL(netdev_offload_xstats_disable);
7859  
7860  static void netdev_offload_xstats_disable_all(struct net_device *dev)
7861  {
7862  	netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
7863  }
7864  
7865  static struct rtnl_hw_stats64 *
7866  netdev_offload_xstats_get_ptr(const struct net_device *dev,
7867  			      enum netdev_offload_xstats_type type)
7868  {
7869  	switch (type) {
7870  	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
7871  		return dev->offload_xstats_l3;
7872  	}
7873  
7874  	WARN_ON(1);
7875  	return NULL;
7876  }
7877  
7878  bool netdev_offload_xstats_enabled(const struct net_device *dev,
7879  				   enum netdev_offload_xstats_type type)
7880  {
7881  	ASSERT_RTNL();
7882  
7883  	return netdev_offload_xstats_get_ptr(dev, type);
7884  }
7885  EXPORT_SYMBOL(netdev_offload_xstats_enabled);
7886  
7887  struct netdev_notifier_offload_xstats_ru {
7888  	bool used;
7889  };
7890  
7891  struct netdev_notifier_offload_xstats_rd {
7892  	struct rtnl_hw_stats64 stats;
7893  	bool used;
7894  };
7895  
7896  static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
7897  				  const struct rtnl_hw_stats64 *src)
7898  {
7899  	dest->rx_packets	  += src->rx_packets;
7900  	dest->tx_packets	  += src->tx_packets;
7901  	dest->rx_bytes		  += src->rx_bytes;
7902  	dest->tx_bytes		  += src->tx_bytes;
7903  	dest->rx_errors		  += src->rx_errors;
7904  	dest->tx_errors		  += src->tx_errors;
7905  	dest->rx_dropped	  += src->rx_dropped;
7906  	dest->tx_dropped	  += src->tx_dropped;
7907  	dest->multicast		  += src->multicast;
7908  }
7909  
7910  static int netdev_offload_xstats_get_used(struct net_device *dev,
7911  					  enum netdev_offload_xstats_type type,
7912  					  bool *p_used,
7913  					  struct netlink_ext_ack *extack)
7914  {
7915  	struct netdev_notifier_offload_xstats_ru report_used = {};
7916  	struct netdev_notifier_offload_xstats_info info = {
7917  		.info.dev = dev,
7918  		.info.extack = extack,
7919  		.type = type,
7920  		.report_used = &report_used,
7921  	};
7922  	int rc;
7923  
7924  	WARN_ON(!netdev_offload_xstats_enabled(dev, type));
7925  	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
7926  					   &info.info);
7927  	*p_used = report_used.used;
7928  	return notifier_to_errno(rc);
7929  }
7930  
7931  static int netdev_offload_xstats_get_stats(struct net_device *dev,
7932  					   enum netdev_offload_xstats_type type,
7933  					   struct rtnl_hw_stats64 *p_stats,
7934  					   bool *p_used,
7935  					   struct netlink_ext_ack *extack)
7936  {
7937  	struct netdev_notifier_offload_xstats_rd report_delta = {};
7938  	struct netdev_notifier_offload_xstats_info info = {
7939  		.info.dev = dev,
7940  		.info.extack = extack,
7941  		.type = type,
7942  		.report_delta = &report_delta,
7943  	};
7944  	struct rtnl_hw_stats64 *stats;
7945  	int rc;
7946  
7947  	stats = netdev_offload_xstats_get_ptr(dev, type);
7948  	if (WARN_ON(!stats))
7949  		return -EINVAL;
7950  
7951  	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
7952  					   &info.info);
7953  
7954  	/* Cache whatever we got, even if there was an error, otherwise the
7955  	 * successful stats retrievals would get lost.
7956  	 */
7957  	netdev_hw_stats64_add(stats, &report_delta.stats);
7958  
7959  	if (p_stats)
7960  		*p_stats = *stats;
7961  	*p_used = report_delta.used;
7962  
7963  	return notifier_to_errno(rc);
7964  }
7965  
7966  int netdev_offload_xstats_get(struct net_device *dev,
7967  			      enum netdev_offload_xstats_type type,
7968  			      struct rtnl_hw_stats64 *p_stats, bool *p_used,
7969  			      struct netlink_ext_ack *extack)
7970  {
7971  	ASSERT_RTNL();
7972  
7973  	if (p_stats)
7974  		return netdev_offload_xstats_get_stats(dev, type, p_stats,
7975  						       p_used, extack);
7976  	else
7977  		return netdev_offload_xstats_get_used(dev, type, p_used,
7978  						      extack);
7979  }
7980  EXPORT_SYMBOL(netdev_offload_xstats_get);
7981  
7982  void
7983  netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
7984  				   const struct rtnl_hw_stats64 *stats)
7985  {
7986  	report_delta->used = true;
7987  	netdev_hw_stats64_add(&report_delta->stats, stats);
7988  }
7989  EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
7990  
7991  void
7992  netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
7993  {
7994  	report_used->used = true;
7995  }
7996  EXPORT_SYMBOL(netdev_offload_xstats_report_used);
7997  
7998  void netdev_offload_xstats_push_delta(struct net_device *dev,
7999  				      enum netdev_offload_xstats_type type,
8000  				      const struct rtnl_hw_stats64 *p_stats)
8001  {
8002  	struct rtnl_hw_stats64 *stats;
8003  
8004  	ASSERT_RTNL();
8005  
8006  	stats = netdev_offload_xstats_get_ptr(dev, type);
8007  	if (WARN_ON(!stats))
8008  		return;
8009  
8010  	netdev_hw_stats64_add(stats, p_stats);
8011  }
8012  EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
8013  
8014  /**
8015   * netdev_get_xmit_slave - Get the xmit slave of master device
8016   * @dev: device
8017   * @skb: The packet
8018   * @all_slaves: assume all the slaves are active
8019   *
8020   * The reference counters are not incremented so the caller must be
8021   * careful with locks. The caller must hold RCU lock.
8022   * %NULL is returned if no slave is found.
8023   */
8024  
8025  struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8026  					 struct sk_buff *skb,
8027  					 bool all_slaves)
8028  {
8029  	const struct net_device_ops *ops = dev->netdev_ops;
8030  
8031  	if (!ops->ndo_get_xmit_slave)
8032  		return NULL;
8033  	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8034  }
8035  EXPORT_SYMBOL(netdev_get_xmit_slave);
8036  
8037  static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
8038  						  struct sock *sk)
8039  {
8040  	const struct net_device_ops *ops = dev->netdev_ops;
8041  
8042  	if (!ops->ndo_sk_get_lower_dev)
8043  		return NULL;
8044  	return ops->ndo_sk_get_lower_dev(dev, sk);
8045  }
8046  
8047  /**
8048   * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
8049   * @dev: device
8050   * @sk: the socket
8051   *
8052   * %NULL is returned if no lower device is found.
8053   */
8054  
8055  struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
8056  					    struct sock *sk)
8057  {
8058  	struct net_device *lower;
8059  
8060  	lower = netdev_sk_get_lower_dev(dev, sk);
8061  	while (lower) {
8062  		dev = lower;
8063  		lower = netdev_sk_get_lower_dev(dev, sk);
8064  	}
8065  
8066  	return dev;
8067  }
8068  EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
8069  
8070  static void netdev_adjacent_add_links(struct net_device *dev)
8071  {
8072  	struct netdev_adjacent *iter;
8073  
8074  	struct net *net = dev_net(dev);
8075  
8076  	list_for_each_entry(iter, &dev->adj_list.upper, list) {
8077  		if (!net_eq(net, dev_net(iter->dev)))
8078  			continue;
8079  		netdev_adjacent_sysfs_add(iter->dev, dev,
8080  					  &iter->dev->adj_list.lower);
8081  		netdev_adjacent_sysfs_add(dev, iter->dev,
8082  					  &dev->adj_list.upper);
8083  	}
8084  
8085  	list_for_each_entry(iter, &dev->adj_list.lower, list) {
8086  		if (!net_eq(net, dev_net(iter->dev)))
8087  			continue;
8088  		netdev_adjacent_sysfs_add(iter->dev, dev,
8089  					  &iter->dev->adj_list.upper);
8090  		netdev_adjacent_sysfs_add(dev, iter->dev,
8091  					  &dev->adj_list.lower);
8092  	}
8093  }
8094  
8095  static void netdev_adjacent_del_links(struct net_device *dev)
8096  {
8097  	struct netdev_adjacent *iter;
8098  
8099  	struct net *net = dev_net(dev);
8100  
8101  	list_for_each_entry(iter, &dev->adj_list.upper, list) {
8102  		if (!net_eq(net, dev_net(iter->dev)))
8103  			continue;
8104  		netdev_adjacent_sysfs_del(iter->dev, dev->name,
8105  					  &iter->dev->adj_list.lower);
8106  		netdev_adjacent_sysfs_del(dev, iter->dev->name,
8107  					  &dev->adj_list.upper);
8108  	}
8109  
8110  	list_for_each_entry(iter, &dev->adj_list.lower, list) {
8111  		if (!net_eq(net, dev_net(iter->dev)))
8112  			continue;
8113  		netdev_adjacent_sysfs_del(iter->dev, dev->name,
8114  					  &iter->dev->adj_list.upper);
8115  		netdev_adjacent_sysfs_del(dev, iter->dev->name,
8116  					  &dev->adj_list.lower);
8117  	}
8118  }
8119  
8120  void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
8121  {
8122  	struct netdev_adjacent *iter;
8123  
8124  	struct net *net = dev_net(dev);
8125  
8126  	list_for_each_entry(iter, &dev->adj_list.upper, list) {
8127  		if (!net_eq(net, dev_net(iter->dev)))
8128  			continue;
8129  		netdev_adjacent_sysfs_del(iter->dev, oldname,
8130  					  &iter->dev->adj_list.lower);
8131  		netdev_adjacent_sysfs_add(iter->dev, dev,
8132  					  &iter->dev->adj_list.lower);
8133  	}
8134  
8135  	list_for_each_entry(iter, &dev->adj_list.lower, list) {
8136  		if (!net_eq(net, dev_net(iter->dev)))
8137  			continue;
8138  		netdev_adjacent_sysfs_del(iter->dev, oldname,
8139  					  &iter->dev->adj_list.upper);
8140  		netdev_adjacent_sysfs_add(iter->dev, dev,
8141  					  &iter->dev->adj_list.upper);
8142  	}
8143  }
8144  
8145  void *netdev_lower_dev_get_private(struct net_device *dev,
8146  				   struct net_device *lower_dev)
8147  {
8148  	struct netdev_adjacent *lower;
8149  
8150  	if (!lower_dev)
8151  		return NULL;
8152  	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8153  	if (!lower)
8154  		return NULL;
8155  
8156  	return lower->private;
8157  }
8158  EXPORT_SYMBOL(netdev_lower_dev_get_private);
8159  
8160  
8161  /**
8162   * netdev_lower_state_changed - Dispatch event about lower device state change
8163   * @lower_dev: device
8164   * @lower_state_info: state to dispatch
8165   *
8166   * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8167   * The caller must hold the RTNL lock.
8168   */
8169  void netdev_lower_state_changed(struct net_device *lower_dev,
8170  				void *lower_state_info)
8171  {
8172  	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
8173  		.info.dev = lower_dev,
8174  	};
8175  
8176  	ASSERT_RTNL();
8177  	changelowerstate_info.lower_state_info = lower_state_info;
8178  	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8179  				      &changelowerstate_info.info);
8180  }
8181  EXPORT_SYMBOL(netdev_lower_state_changed);
8182  
8183  static void dev_change_rx_flags(struct net_device *dev, int flags)
8184  {
8185  	const struct net_device_ops *ops = dev->netdev_ops;
8186  
8187  	if (ops->ndo_change_rx_flags)
8188  		ops->ndo_change_rx_flags(dev, flags);
8189  }
8190  
8191  static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
8192  {
8193  	unsigned int old_flags = dev->flags;
8194  	kuid_t uid;
8195  	kgid_t gid;
8196  
8197  	ASSERT_RTNL();
8198  
8199  	dev->flags |= IFF_PROMISC;
8200  	dev->promiscuity += inc;
8201  	if (dev->promiscuity == 0) {
8202  		/*
8203  		 * Avoid overflow.
8204  		 * If inc causes overflow, untouch promisc and return error.
8205  		 */
8206  		if (inc < 0)
8207  			dev->flags &= ~IFF_PROMISC;
8208  		else {
8209  			dev->promiscuity -= inc;
8210  			netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
8211  			return -EOVERFLOW;
8212  		}
8213  	}
8214  	if (dev->flags != old_flags) {
8215  		pr_info("device %s %s promiscuous mode\n",
8216  			dev->name,
8217  			dev->flags & IFF_PROMISC ? "entered" : "left");
8218  		if (audit_enabled) {
8219  			current_uid_gid(&uid, &gid);
8220  			audit_log(audit_context(), GFP_ATOMIC,
8221  				  AUDIT_ANOM_PROMISCUOUS,
8222  				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
8223  				  dev->name, (dev->flags & IFF_PROMISC),
8224  				  (old_flags & IFF_PROMISC),
8225  				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
8226  				  from_kuid(&init_user_ns, uid),
8227  				  from_kgid(&init_user_ns, gid),
8228  				  audit_get_sessionid(current));
8229  		}
8230  
8231  		dev_change_rx_flags(dev, IFF_PROMISC);
8232  	}
8233  	if (notify)
8234  		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
8235  	return 0;
8236  }
8237  
8238  /**
8239   *	dev_set_promiscuity	- update promiscuity count on a device
8240   *	@dev: device
8241   *	@inc: modifier
8242   *
8243   *	Add or remove promiscuity from a device. While the count in the device
8244   *	remains above zero the interface remains promiscuous. Once it hits zero
8245   *	the device reverts back to normal filtering operation. A negative inc
8246   *	value is used to drop promiscuity on the device.
8247   *	Return 0 if successful or a negative errno code on error.
8248   */
8249  int dev_set_promiscuity(struct net_device *dev, int inc)
8250  {
8251  	unsigned int old_flags = dev->flags;
8252  	int err;
8253  
8254  	err = __dev_set_promiscuity(dev, inc, true);
8255  	if (err < 0)
8256  		return err;
8257  	if (dev->flags != old_flags)
8258  		dev_set_rx_mode(dev);
8259  	return err;
8260  }
8261  EXPORT_SYMBOL(dev_set_promiscuity);
8262  
8263  static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
8264  {
8265  	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
8266  
8267  	ASSERT_RTNL();
8268  
8269  	dev->flags |= IFF_ALLMULTI;
8270  	dev->allmulti += inc;
8271  	if (dev->allmulti == 0) {
8272  		/*
8273  		 * Avoid overflow.
8274  		 * If inc causes overflow, untouch allmulti and return error.
8275  		 */
8276  		if (inc < 0)
8277  			dev->flags &= ~IFF_ALLMULTI;
8278  		else {
8279  			dev->allmulti -= inc;
8280  			netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
8281  			return -EOVERFLOW;
8282  		}
8283  	}
8284  	if (dev->flags ^ old_flags) {
8285  		dev_change_rx_flags(dev, IFF_ALLMULTI);
8286  		dev_set_rx_mode(dev);
8287  		if (notify)
8288  			__dev_notify_flags(dev, old_flags,
8289  					   dev->gflags ^ old_gflags);
8290  	}
8291  	return 0;
8292  }
8293  
8294  /**
8295   *	dev_set_allmulti	- update allmulti count on a device
8296   *	@dev: device
8297   *	@inc: modifier
8298   *
8299   *	Add or remove reception of all multicast frames to a device. While the
8300   *	count in the device remains above zero the interface remains listening
8301   *	to all interfaces. Once it hits zero the device reverts back to normal
8302   *	filtering operation. A negative @inc value is used to drop the counter
8303   *	when releasing a resource needing all multicasts.
8304   *	Return 0 if successful or a negative errno code on error.
8305   */
8306  
8307  int dev_set_allmulti(struct net_device *dev, int inc)
8308  {
8309  	return __dev_set_allmulti(dev, inc, true);
8310  }
8311  EXPORT_SYMBOL(dev_set_allmulti);
8312  
8313  /*
8314   *	Upload unicast and multicast address lists to device and
8315   *	configure RX filtering. When the device doesn't support unicast
8316   *	filtering it is put in promiscuous mode while unicast addresses
8317   *	are present.
8318   */
8319  void __dev_set_rx_mode(struct net_device *dev)
8320  {
8321  	const struct net_device_ops *ops = dev->netdev_ops;
8322  
8323  	/* dev_open will call this function so the list will stay sane. */
8324  	if (!(dev->flags&IFF_UP))
8325  		return;
8326  
8327  	if (!netif_device_present(dev))
8328  		return;
8329  
8330  	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8331  		/* Unicast addresses changes may only happen under the rtnl,
8332  		 * therefore calling __dev_set_promiscuity here is safe.
8333  		 */
8334  		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8335  			__dev_set_promiscuity(dev, 1, false);
8336  			dev->uc_promisc = true;
8337  		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8338  			__dev_set_promiscuity(dev, -1, false);
8339  			dev->uc_promisc = false;
8340  		}
8341  	}
8342  
8343  	if (ops->ndo_set_rx_mode)
8344  		ops->ndo_set_rx_mode(dev);
8345  }
8346  
8347  void dev_set_rx_mode(struct net_device *dev)
8348  {
8349  	netif_addr_lock_bh(dev);
8350  	__dev_set_rx_mode(dev);
8351  	netif_addr_unlock_bh(dev);
8352  }
8353  
8354  /**
8355   *	dev_get_flags - get flags reported to userspace
8356   *	@dev: device
8357   *
8358   *	Get the combination of flag bits exported through APIs to userspace.
8359   */
8360  unsigned int dev_get_flags(const struct net_device *dev)
8361  {
8362  	unsigned int flags;
8363  
8364  	flags = (dev->flags & ~(IFF_PROMISC |
8365  				IFF_ALLMULTI |
8366  				IFF_RUNNING |
8367  				IFF_LOWER_UP |
8368  				IFF_DORMANT)) |
8369  		(dev->gflags & (IFF_PROMISC |
8370  				IFF_ALLMULTI));
8371  
8372  	if (netif_running(dev)) {
8373  		if (netif_oper_up(dev))
8374  			flags |= IFF_RUNNING;
8375  		if (netif_carrier_ok(dev))
8376  			flags |= IFF_LOWER_UP;
8377  		if (netif_dormant(dev))
8378  			flags |= IFF_DORMANT;
8379  	}
8380  
8381  	return flags;
8382  }
8383  EXPORT_SYMBOL(dev_get_flags);
8384  
8385  int __dev_change_flags(struct net_device *dev, unsigned int flags,
8386  		       struct netlink_ext_ack *extack)
8387  {
8388  	unsigned int old_flags = dev->flags;
8389  	int ret;
8390  
8391  	ASSERT_RTNL();
8392  
8393  	/*
8394  	 *	Set the flags on our device.
8395  	 */
8396  
8397  	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8398  			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8399  			       IFF_AUTOMEDIA)) |
8400  		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8401  				    IFF_ALLMULTI));
8402  
8403  	/*
8404  	 *	Load in the correct multicast list now the flags have changed.
8405  	 */
8406  
8407  	if ((old_flags ^ flags) & IFF_MULTICAST)
8408  		dev_change_rx_flags(dev, IFF_MULTICAST);
8409  
8410  	dev_set_rx_mode(dev);
8411  
8412  	/*
8413  	 *	Have we downed the interface. We handle IFF_UP ourselves
8414  	 *	according to user attempts to set it, rather than blindly
8415  	 *	setting it.
8416  	 */
8417  
8418  	ret = 0;
8419  	if ((old_flags ^ flags) & IFF_UP) {
8420  		if (old_flags & IFF_UP)
8421  			__dev_close(dev);
8422  		else
8423  			ret = __dev_open(dev, extack);
8424  	}
8425  
8426  	if ((flags ^ dev->gflags) & IFF_PROMISC) {
8427  		int inc = (flags & IFF_PROMISC) ? 1 : -1;
8428  		unsigned int old_flags = dev->flags;
8429  
8430  		dev->gflags ^= IFF_PROMISC;
8431  
8432  		if (__dev_set_promiscuity(dev, inc, false) >= 0)
8433  			if (dev->flags != old_flags)
8434  				dev_set_rx_mode(dev);
8435  	}
8436  
8437  	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8438  	 * is important. Some (broken) drivers set IFF_PROMISC, when
8439  	 * IFF_ALLMULTI is requested not asking us and not reporting.
8440  	 */
8441  	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8442  		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8443  
8444  		dev->gflags ^= IFF_ALLMULTI;
8445  		__dev_set_allmulti(dev, inc, false);
8446  	}
8447  
8448  	return ret;
8449  }
8450  
8451  void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8452  			unsigned int gchanges)
8453  {
8454  	unsigned int changes = dev->flags ^ old_flags;
8455  
8456  	if (gchanges)
8457  		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
8458  
8459  	if (changes & IFF_UP) {
8460  		if (dev->flags & IFF_UP)
8461  			call_netdevice_notifiers(NETDEV_UP, dev);
8462  		else
8463  			call_netdevice_notifiers(NETDEV_DOWN, dev);
8464  	}
8465  
8466  	if (dev->flags & IFF_UP &&
8467  	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8468  		struct netdev_notifier_change_info change_info = {
8469  			.info = {
8470  				.dev = dev,
8471  			},
8472  			.flags_changed = changes,
8473  		};
8474  
8475  		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8476  	}
8477  }
8478  
8479  /**
8480   *	dev_change_flags - change device settings
8481   *	@dev: device
8482   *	@flags: device state flags
8483   *	@extack: netlink extended ack
8484   *
8485   *	Change settings on device based state flags. The flags are
8486   *	in the userspace exported format.
8487   */
8488  int dev_change_flags(struct net_device *dev, unsigned int flags,
8489  		     struct netlink_ext_ack *extack)
8490  {
8491  	int ret;
8492  	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8493  
8494  	ret = __dev_change_flags(dev, flags, extack);
8495  	if (ret < 0)
8496  		return ret;
8497  
8498  	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8499  	__dev_notify_flags(dev, old_flags, changes);
8500  	return ret;
8501  }
8502  EXPORT_SYMBOL(dev_change_flags);
8503  
8504  int __dev_set_mtu(struct net_device *dev, int new_mtu)
8505  {
8506  	const struct net_device_ops *ops = dev->netdev_ops;
8507  
8508  	if (ops->ndo_change_mtu)
8509  		return ops->ndo_change_mtu(dev, new_mtu);
8510  
8511  	/* Pairs with all the lockless reads of dev->mtu in the stack */
8512  	WRITE_ONCE(dev->mtu, new_mtu);
8513  	return 0;
8514  }
8515  EXPORT_SYMBOL(__dev_set_mtu);
8516  
8517  int dev_validate_mtu(struct net_device *dev, int new_mtu,
8518  		     struct netlink_ext_ack *extack)
8519  {
8520  	/* MTU must be positive, and in range */
8521  	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8522  		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8523  		return -EINVAL;
8524  	}
8525  
8526  	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8527  		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8528  		return -EINVAL;
8529  	}
8530  	return 0;
8531  }
8532  
8533  /**
8534   *	dev_set_mtu_ext - Change maximum transfer unit
8535   *	@dev: device
8536   *	@new_mtu: new transfer unit
8537   *	@extack: netlink extended ack
8538   *
8539   *	Change the maximum transfer size of the network device.
8540   */
8541  int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8542  		    struct netlink_ext_ack *extack)
8543  {
8544  	int err, orig_mtu;
8545  
8546  	if (new_mtu == dev->mtu)
8547  		return 0;
8548  
8549  	err = dev_validate_mtu(dev, new_mtu, extack);
8550  	if (err)
8551  		return err;
8552  
8553  	if (!netif_device_present(dev))
8554  		return -ENODEV;
8555  
8556  	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8557  	err = notifier_to_errno(err);
8558  	if (err)
8559  		return err;
8560  
8561  	orig_mtu = dev->mtu;
8562  	err = __dev_set_mtu(dev, new_mtu);
8563  
8564  	if (!err) {
8565  		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8566  						   orig_mtu);
8567  		err = notifier_to_errno(err);
8568  		if (err) {
8569  			/* setting mtu back and notifying everyone again,
8570  			 * so that they have a chance to revert changes.
8571  			 */
8572  			__dev_set_mtu(dev, orig_mtu);
8573  			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8574  						     new_mtu);
8575  		}
8576  	}
8577  	return err;
8578  }
8579  
8580  int dev_set_mtu(struct net_device *dev, int new_mtu)
8581  {
8582  	struct netlink_ext_ack extack;
8583  	int err;
8584  
8585  	memset(&extack, 0, sizeof(extack));
8586  	err = dev_set_mtu_ext(dev, new_mtu, &extack);
8587  	if (err && extack._msg)
8588  		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8589  	return err;
8590  }
8591  EXPORT_SYMBOL(dev_set_mtu);
8592  
8593  /**
8594   *	dev_change_tx_queue_len - Change TX queue length of a netdevice
8595   *	@dev: device
8596   *	@new_len: new tx queue length
8597   */
8598  int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8599  {
8600  	unsigned int orig_len = dev->tx_queue_len;
8601  	int res;
8602  
8603  	if (new_len != (unsigned int)new_len)
8604  		return -ERANGE;
8605  
8606  	if (new_len != orig_len) {
8607  		dev->tx_queue_len = new_len;
8608  		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8609  		res = notifier_to_errno(res);
8610  		if (res)
8611  			goto err_rollback;
8612  		res = dev_qdisc_change_tx_queue_len(dev);
8613  		if (res)
8614  			goto err_rollback;
8615  	}
8616  
8617  	return 0;
8618  
8619  err_rollback:
8620  	netdev_err(dev, "refused to change device tx_queue_len\n");
8621  	dev->tx_queue_len = orig_len;
8622  	return res;
8623  }
8624  
8625  /**
8626   *	dev_set_group - Change group this device belongs to
8627   *	@dev: device
8628   *	@new_group: group this device should belong to
8629   */
8630  void dev_set_group(struct net_device *dev, int new_group)
8631  {
8632  	dev->group = new_group;
8633  }
8634  EXPORT_SYMBOL(dev_set_group);
8635  
8636  /**
8637   *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8638   *	@dev: device
8639   *	@addr: new address
8640   *	@extack: netlink extended ack
8641   */
8642  int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8643  			      struct netlink_ext_ack *extack)
8644  {
8645  	struct netdev_notifier_pre_changeaddr_info info = {
8646  		.info.dev = dev,
8647  		.info.extack = extack,
8648  		.dev_addr = addr,
8649  	};
8650  	int rc;
8651  
8652  	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8653  	return notifier_to_errno(rc);
8654  }
8655  EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8656  
8657  /**
8658   *	dev_set_mac_address - Change Media Access Control Address
8659   *	@dev: device
8660   *	@sa: new address
8661   *	@extack: netlink extended ack
8662   *
8663   *	Change the hardware (MAC) address of the device
8664   */
8665  int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8666  			struct netlink_ext_ack *extack)
8667  {
8668  	const struct net_device_ops *ops = dev->netdev_ops;
8669  	int err;
8670  
8671  	if (!ops->ndo_set_mac_address)
8672  		return -EOPNOTSUPP;
8673  	if (sa->sa_family != dev->type)
8674  		return -EINVAL;
8675  	if (!netif_device_present(dev))
8676  		return -ENODEV;
8677  	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8678  	if (err)
8679  		return err;
8680  	err = ops->ndo_set_mac_address(dev, sa);
8681  	if (err)
8682  		return err;
8683  	dev->addr_assign_type = NET_ADDR_SET;
8684  	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8685  	add_device_randomness(dev->dev_addr, dev->addr_len);
8686  	return 0;
8687  }
8688  EXPORT_SYMBOL(dev_set_mac_address);
8689  
8690  static DECLARE_RWSEM(dev_addr_sem);
8691  
8692  int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
8693  			     struct netlink_ext_ack *extack)
8694  {
8695  	int ret;
8696  
8697  	down_write(&dev_addr_sem);
8698  	ret = dev_set_mac_address(dev, sa, extack);
8699  	up_write(&dev_addr_sem);
8700  	return ret;
8701  }
8702  EXPORT_SYMBOL(dev_set_mac_address_user);
8703  
8704  int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
8705  {
8706  	size_t size = sizeof(sa->sa_data);
8707  	struct net_device *dev;
8708  	int ret = 0;
8709  
8710  	down_read(&dev_addr_sem);
8711  	rcu_read_lock();
8712  
8713  	dev = dev_get_by_name_rcu(net, dev_name);
8714  	if (!dev) {
8715  		ret = -ENODEV;
8716  		goto unlock;
8717  	}
8718  	if (!dev->addr_len)
8719  		memset(sa->sa_data, 0, size);
8720  	else
8721  		memcpy(sa->sa_data, dev->dev_addr,
8722  		       min_t(size_t, size, dev->addr_len));
8723  	sa->sa_family = dev->type;
8724  
8725  unlock:
8726  	rcu_read_unlock();
8727  	up_read(&dev_addr_sem);
8728  	return ret;
8729  }
8730  EXPORT_SYMBOL(dev_get_mac_address);
8731  
8732  /**
8733   *	dev_change_carrier - Change device carrier
8734   *	@dev: device
8735   *	@new_carrier: new value
8736   *
8737   *	Change device carrier
8738   */
8739  int dev_change_carrier(struct net_device *dev, bool new_carrier)
8740  {
8741  	const struct net_device_ops *ops = dev->netdev_ops;
8742  
8743  	if (!ops->ndo_change_carrier)
8744  		return -EOPNOTSUPP;
8745  	if (!netif_device_present(dev))
8746  		return -ENODEV;
8747  	return ops->ndo_change_carrier(dev, new_carrier);
8748  }
8749  EXPORT_SYMBOL(dev_change_carrier);
8750  
8751  /**
8752   *	dev_get_phys_port_id - Get device physical port ID
8753   *	@dev: device
8754   *	@ppid: port ID
8755   *
8756   *	Get device physical port ID
8757   */
8758  int dev_get_phys_port_id(struct net_device *dev,
8759  			 struct netdev_phys_item_id *ppid)
8760  {
8761  	const struct net_device_ops *ops = dev->netdev_ops;
8762  
8763  	if (!ops->ndo_get_phys_port_id)
8764  		return -EOPNOTSUPP;
8765  	return ops->ndo_get_phys_port_id(dev, ppid);
8766  }
8767  EXPORT_SYMBOL(dev_get_phys_port_id);
8768  
8769  /**
8770   *	dev_get_phys_port_name - Get device physical port name
8771   *	@dev: device
8772   *	@name: port name
8773   *	@len: limit of bytes to copy to name
8774   *
8775   *	Get device physical port name
8776   */
8777  int dev_get_phys_port_name(struct net_device *dev,
8778  			   char *name, size_t len)
8779  {
8780  	const struct net_device_ops *ops = dev->netdev_ops;
8781  	int err;
8782  
8783  	if (ops->ndo_get_phys_port_name) {
8784  		err = ops->ndo_get_phys_port_name(dev, name, len);
8785  		if (err != -EOPNOTSUPP)
8786  			return err;
8787  	}
8788  	return devlink_compat_phys_port_name_get(dev, name, len);
8789  }
8790  EXPORT_SYMBOL(dev_get_phys_port_name);
8791  
8792  /**
8793   *	dev_get_port_parent_id - Get the device's port parent identifier
8794   *	@dev: network device
8795   *	@ppid: pointer to a storage for the port's parent identifier
8796   *	@recurse: allow/disallow recursion to lower devices
8797   *
8798   *	Get the devices's port parent identifier
8799   */
8800  int dev_get_port_parent_id(struct net_device *dev,
8801  			   struct netdev_phys_item_id *ppid,
8802  			   bool recurse)
8803  {
8804  	const struct net_device_ops *ops = dev->netdev_ops;
8805  	struct netdev_phys_item_id first = { };
8806  	struct net_device *lower_dev;
8807  	struct list_head *iter;
8808  	int err;
8809  
8810  	if (ops->ndo_get_port_parent_id) {
8811  		err = ops->ndo_get_port_parent_id(dev, ppid);
8812  		if (err != -EOPNOTSUPP)
8813  			return err;
8814  	}
8815  
8816  	err = devlink_compat_switch_id_get(dev, ppid);
8817  	if (!recurse || err != -EOPNOTSUPP)
8818  		return err;
8819  
8820  	netdev_for_each_lower_dev(dev, lower_dev, iter) {
8821  		err = dev_get_port_parent_id(lower_dev, ppid, true);
8822  		if (err)
8823  			break;
8824  		if (!first.id_len)
8825  			first = *ppid;
8826  		else if (memcmp(&first, ppid, sizeof(*ppid)))
8827  			return -EOPNOTSUPP;
8828  	}
8829  
8830  	return err;
8831  }
8832  EXPORT_SYMBOL(dev_get_port_parent_id);
8833  
8834  /**
8835   *	netdev_port_same_parent_id - Indicate if two network devices have
8836   *	the same port parent identifier
8837   *	@a: first network device
8838   *	@b: second network device
8839   */
8840  bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
8841  {
8842  	struct netdev_phys_item_id a_id = { };
8843  	struct netdev_phys_item_id b_id = { };
8844  
8845  	if (dev_get_port_parent_id(a, &a_id, true) ||
8846  	    dev_get_port_parent_id(b, &b_id, true))
8847  		return false;
8848  
8849  	return netdev_phys_item_id_same(&a_id, &b_id);
8850  }
8851  EXPORT_SYMBOL(netdev_port_same_parent_id);
8852  
8853  /**
8854   *	dev_change_proto_down - set carrier according to proto_down.
8855   *
8856   *	@dev: device
8857   *	@proto_down: new value
8858   */
8859  int dev_change_proto_down(struct net_device *dev, bool proto_down)
8860  {
8861  	if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
8862  		return -EOPNOTSUPP;
8863  	if (!netif_device_present(dev))
8864  		return -ENODEV;
8865  	if (proto_down)
8866  		netif_carrier_off(dev);
8867  	else
8868  		netif_carrier_on(dev);
8869  	dev->proto_down = proto_down;
8870  	return 0;
8871  }
8872  EXPORT_SYMBOL(dev_change_proto_down);
8873  
8874  /**
8875   *	dev_change_proto_down_reason - proto down reason
8876   *
8877   *	@dev: device
8878   *	@mask: proto down mask
8879   *	@value: proto down value
8880   */
8881  void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
8882  				  u32 value)
8883  {
8884  	int b;
8885  
8886  	if (!mask) {
8887  		dev->proto_down_reason = value;
8888  	} else {
8889  		for_each_set_bit(b, &mask, 32) {
8890  			if (value & (1 << b))
8891  				dev->proto_down_reason |= BIT(b);
8892  			else
8893  				dev->proto_down_reason &= ~BIT(b);
8894  		}
8895  	}
8896  }
8897  EXPORT_SYMBOL(dev_change_proto_down_reason);
8898  
8899  struct bpf_xdp_link {
8900  	struct bpf_link link;
8901  	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
8902  	int flags;
8903  };
8904  
8905  static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
8906  {
8907  	if (flags & XDP_FLAGS_HW_MODE)
8908  		return XDP_MODE_HW;
8909  	if (flags & XDP_FLAGS_DRV_MODE)
8910  		return XDP_MODE_DRV;
8911  	if (flags & XDP_FLAGS_SKB_MODE)
8912  		return XDP_MODE_SKB;
8913  	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
8914  }
8915  
8916  static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
8917  {
8918  	switch (mode) {
8919  	case XDP_MODE_SKB:
8920  		return generic_xdp_install;
8921  	case XDP_MODE_DRV:
8922  	case XDP_MODE_HW:
8923  		return dev->netdev_ops->ndo_bpf;
8924  	default:
8925  		return NULL;
8926  	}
8927  }
8928  
8929  static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
8930  					 enum bpf_xdp_mode mode)
8931  {
8932  	return dev->xdp_state[mode].link;
8933  }
8934  
8935  static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
8936  				     enum bpf_xdp_mode mode)
8937  {
8938  	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
8939  
8940  	if (link)
8941  		return link->link.prog;
8942  	return dev->xdp_state[mode].prog;
8943  }
8944  
8945  u8 dev_xdp_prog_count(struct net_device *dev)
8946  {
8947  	u8 count = 0;
8948  	int i;
8949  
8950  	for (i = 0; i < __MAX_XDP_MODE; i++)
8951  		if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
8952  			count++;
8953  	return count;
8954  }
8955  EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
8956  
8957  u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
8958  {
8959  	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
8960  
8961  	return prog ? prog->aux->id : 0;
8962  }
8963  
8964  static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
8965  			     struct bpf_xdp_link *link)
8966  {
8967  	dev->xdp_state[mode].link = link;
8968  	dev->xdp_state[mode].prog = NULL;
8969  }
8970  
8971  static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
8972  			     struct bpf_prog *prog)
8973  {
8974  	dev->xdp_state[mode].link = NULL;
8975  	dev->xdp_state[mode].prog = prog;
8976  }
8977  
8978  static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
8979  			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
8980  			   u32 flags, struct bpf_prog *prog)
8981  {
8982  	struct netdev_bpf xdp;
8983  	int err;
8984  
8985  	memset(&xdp, 0, sizeof(xdp));
8986  	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
8987  	xdp.extack = extack;
8988  	xdp.flags = flags;
8989  	xdp.prog = prog;
8990  
8991  	/* Drivers assume refcnt is already incremented (i.e, prog pointer is
8992  	 * "moved" into driver), so they don't increment it on their own, but
8993  	 * they do decrement refcnt when program is detached or replaced.
8994  	 * Given net_device also owns link/prog, we need to bump refcnt here
8995  	 * to prevent drivers from underflowing it.
8996  	 */
8997  	if (prog)
8998  		bpf_prog_inc(prog);
8999  	err = bpf_op(dev, &xdp);
9000  	if (err) {
9001  		if (prog)
9002  			bpf_prog_put(prog);
9003  		return err;
9004  	}
9005  
9006  	if (mode != XDP_MODE_HW)
9007  		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9008  
9009  	return 0;
9010  }
9011  
9012  static void dev_xdp_uninstall(struct net_device *dev)
9013  {
9014  	struct bpf_xdp_link *link;
9015  	struct bpf_prog *prog;
9016  	enum bpf_xdp_mode mode;
9017  	bpf_op_t bpf_op;
9018  
9019  	ASSERT_RTNL();
9020  
9021  	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9022  		prog = dev_xdp_prog(dev, mode);
9023  		if (!prog)
9024  			continue;
9025  
9026  		bpf_op = dev_xdp_bpf_op(dev, mode);
9027  		if (!bpf_op)
9028  			continue;
9029  
9030  		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9031  
9032  		/* auto-detach link from net device */
9033  		link = dev_xdp_link(dev, mode);
9034  		if (link)
9035  			link->dev = NULL;
9036  		else
9037  			bpf_prog_put(prog);
9038  
9039  		dev_xdp_set_link(dev, mode, NULL);
9040  	}
9041  }
9042  
9043  static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9044  			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9045  			  struct bpf_prog *old_prog, u32 flags)
9046  {
9047  	unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9048  	struct bpf_prog *cur_prog;
9049  	struct net_device *upper;
9050  	struct list_head *iter;
9051  	enum bpf_xdp_mode mode;
9052  	bpf_op_t bpf_op;
9053  	int err;
9054  
9055  	ASSERT_RTNL();
9056  
9057  	/* either link or prog attachment, never both */
9058  	if (link && (new_prog || old_prog))
9059  		return -EINVAL;
9060  	/* link supports only XDP mode flags */
9061  	if (link && (flags & ~XDP_FLAGS_MODES)) {
9062  		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9063  		return -EINVAL;
9064  	}
9065  	/* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9066  	if (num_modes > 1) {
9067  		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9068  		return -EINVAL;
9069  	}
9070  	/* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9071  	if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9072  		NL_SET_ERR_MSG(extack,
9073  			       "More than one program loaded, unset mode is ambiguous");
9074  		return -EINVAL;
9075  	}
9076  	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9077  	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9078  		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9079  		return -EINVAL;
9080  	}
9081  
9082  	mode = dev_xdp_mode(dev, flags);
9083  	/* can't replace attached link */
9084  	if (dev_xdp_link(dev, mode)) {
9085  		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9086  		return -EBUSY;
9087  	}
9088  
9089  	/* don't allow if an upper device already has a program */
9090  	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
9091  		if (dev_xdp_prog_count(upper) > 0) {
9092  			NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
9093  			return -EEXIST;
9094  		}
9095  	}
9096  
9097  	cur_prog = dev_xdp_prog(dev, mode);
9098  	/* can't replace attached prog with link */
9099  	if (link && cur_prog) {
9100  		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9101  		return -EBUSY;
9102  	}
9103  	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9104  		NL_SET_ERR_MSG(extack, "Active program does not match expected");
9105  		return -EEXIST;
9106  	}
9107  
9108  	/* put effective new program into new_prog */
9109  	if (link)
9110  		new_prog = link->link.prog;
9111  
9112  	if (new_prog) {
9113  		bool offload = mode == XDP_MODE_HW;
9114  		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9115  					       ? XDP_MODE_DRV : XDP_MODE_SKB;
9116  
9117  		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
9118  			NL_SET_ERR_MSG(extack, "XDP program already attached");
9119  			return -EBUSY;
9120  		}
9121  		if (!offload && dev_xdp_prog(dev, other_mode)) {
9122  			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
9123  			return -EEXIST;
9124  		}
9125  		if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
9126  			NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
9127  			return -EINVAL;
9128  		}
9129  		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9130  			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9131  			return -EINVAL;
9132  		}
9133  		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9134  			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9135  			return -EINVAL;
9136  		}
9137  	}
9138  
9139  	/* don't call drivers if the effective program didn't change */
9140  	if (new_prog != cur_prog) {
9141  		bpf_op = dev_xdp_bpf_op(dev, mode);
9142  		if (!bpf_op) {
9143  			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9144  			return -EOPNOTSUPP;
9145  		}
9146  
9147  		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9148  		if (err)
9149  			return err;
9150  	}
9151  
9152  	if (link)
9153  		dev_xdp_set_link(dev, mode, link);
9154  	else
9155  		dev_xdp_set_prog(dev, mode, new_prog);
9156  	if (cur_prog)
9157  		bpf_prog_put(cur_prog);
9158  
9159  	return 0;
9160  }
9161  
9162  static int dev_xdp_attach_link(struct net_device *dev,
9163  			       struct netlink_ext_ack *extack,
9164  			       struct bpf_xdp_link *link)
9165  {
9166  	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9167  }
9168  
9169  static int dev_xdp_detach_link(struct net_device *dev,
9170  			       struct netlink_ext_ack *extack,
9171  			       struct bpf_xdp_link *link)
9172  {
9173  	enum bpf_xdp_mode mode;
9174  	bpf_op_t bpf_op;
9175  
9176  	ASSERT_RTNL();
9177  
9178  	mode = dev_xdp_mode(dev, link->flags);
9179  	if (dev_xdp_link(dev, mode) != link)
9180  		return -EINVAL;
9181  
9182  	bpf_op = dev_xdp_bpf_op(dev, mode);
9183  	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9184  	dev_xdp_set_link(dev, mode, NULL);
9185  	return 0;
9186  }
9187  
9188  static void bpf_xdp_link_release(struct bpf_link *link)
9189  {
9190  	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9191  
9192  	rtnl_lock();
9193  
9194  	/* if racing with net_device's tear down, xdp_link->dev might be
9195  	 * already NULL, in which case link was already auto-detached
9196  	 */
9197  	if (xdp_link->dev) {
9198  		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9199  		xdp_link->dev = NULL;
9200  	}
9201  
9202  	rtnl_unlock();
9203  }
9204  
9205  static int bpf_xdp_link_detach(struct bpf_link *link)
9206  {
9207  	bpf_xdp_link_release(link);
9208  	return 0;
9209  }
9210  
9211  static void bpf_xdp_link_dealloc(struct bpf_link *link)
9212  {
9213  	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9214  
9215  	kfree(xdp_link);
9216  }
9217  
9218  static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9219  				     struct seq_file *seq)
9220  {
9221  	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9222  	u32 ifindex = 0;
9223  
9224  	rtnl_lock();
9225  	if (xdp_link->dev)
9226  		ifindex = xdp_link->dev->ifindex;
9227  	rtnl_unlock();
9228  
9229  	seq_printf(seq, "ifindex:\t%u\n", ifindex);
9230  }
9231  
9232  static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9233  				       struct bpf_link_info *info)
9234  {
9235  	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9236  	u32 ifindex = 0;
9237  
9238  	rtnl_lock();
9239  	if (xdp_link->dev)
9240  		ifindex = xdp_link->dev->ifindex;
9241  	rtnl_unlock();
9242  
9243  	info->xdp.ifindex = ifindex;
9244  	return 0;
9245  }
9246  
9247  static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9248  			       struct bpf_prog *old_prog)
9249  {
9250  	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9251  	enum bpf_xdp_mode mode;
9252  	bpf_op_t bpf_op;
9253  	int err = 0;
9254  
9255  	rtnl_lock();
9256  
9257  	/* link might have been auto-released already, so fail */
9258  	if (!xdp_link->dev) {
9259  		err = -ENOLINK;
9260  		goto out_unlock;
9261  	}
9262  
9263  	if (old_prog && link->prog != old_prog) {
9264  		err = -EPERM;
9265  		goto out_unlock;
9266  	}
9267  	old_prog = link->prog;
9268  	if (old_prog->type != new_prog->type ||
9269  	    old_prog->expected_attach_type != new_prog->expected_attach_type) {
9270  		err = -EINVAL;
9271  		goto out_unlock;
9272  	}
9273  
9274  	if (old_prog == new_prog) {
9275  		/* no-op, don't disturb drivers */
9276  		bpf_prog_put(new_prog);
9277  		goto out_unlock;
9278  	}
9279  
9280  	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9281  	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9282  	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9283  			      xdp_link->flags, new_prog);
9284  	if (err)
9285  		goto out_unlock;
9286  
9287  	old_prog = xchg(&link->prog, new_prog);
9288  	bpf_prog_put(old_prog);
9289  
9290  out_unlock:
9291  	rtnl_unlock();
9292  	return err;
9293  }
9294  
9295  static const struct bpf_link_ops bpf_xdp_link_lops = {
9296  	.release = bpf_xdp_link_release,
9297  	.dealloc = bpf_xdp_link_dealloc,
9298  	.detach = bpf_xdp_link_detach,
9299  	.show_fdinfo = bpf_xdp_link_show_fdinfo,
9300  	.fill_link_info = bpf_xdp_link_fill_link_info,
9301  	.update_prog = bpf_xdp_link_update,
9302  };
9303  
9304  int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9305  {
9306  	struct net *net = current->nsproxy->net_ns;
9307  	struct bpf_link_primer link_primer;
9308  	struct bpf_xdp_link *link;
9309  	struct net_device *dev;
9310  	int err, fd;
9311  
9312  	rtnl_lock();
9313  	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9314  	if (!dev) {
9315  		rtnl_unlock();
9316  		return -EINVAL;
9317  	}
9318  
9319  	link = kzalloc(sizeof(*link), GFP_USER);
9320  	if (!link) {
9321  		err = -ENOMEM;
9322  		goto unlock;
9323  	}
9324  
9325  	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9326  	link->dev = dev;
9327  	link->flags = attr->link_create.flags;
9328  
9329  	err = bpf_link_prime(&link->link, &link_primer);
9330  	if (err) {
9331  		kfree(link);
9332  		goto unlock;
9333  	}
9334  
9335  	err = dev_xdp_attach_link(dev, NULL, link);
9336  	rtnl_unlock();
9337  
9338  	if (err) {
9339  		link->dev = NULL;
9340  		bpf_link_cleanup(&link_primer);
9341  		goto out_put_dev;
9342  	}
9343  
9344  	fd = bpf_link_settle(&link_primer);
9345  	/* link itself doesn't hold dev's refcnt to not complicate shutdown */
9346  	dev_put(dev);
9347  	return fd;
9348  
9349  unlock:
9350  	rtnl_unlock();
9351  
9352  out_put_dev:
9353  	dev_put(dev);
9354  	return err;
9355  }
9356  
9357  /**
9358   *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
9359   *	@dev: device
9360   *	@extack: netlink extended ack
9361   *	@fd: new program fd or negative value to clear
9362   *	@expected_fd: old program fd that userspace expects to replace or clear
9363   *	@flags: xdp-related flags
9364   *
9365   *	Set or clear a bpf program for a device
9366   */
9367  int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
9368  		      int fd, int expected_fd, u32 flags)
9369  {
9370  	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9371  	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
9372  	int err;
9373  
9374  	ASSERT_RTNL();
9375  
9376  	if (fd >= 0) {
9377  		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9378  						 mode != XDP_MODE_SKB);
9379  		if (IS_ERR(new_prog))
9380  			return PTR_ERR(new_prog);
9381  	}
9382  
9383  	if (expected_fd >= 0) {
9384  		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9385  						 mode != XDP_MODE_SKB);
9386  		if (IS_ERR(old_prog)) {
9387  			err = PTR_ERR(old_prog);
9388  			old_prog = NULL;
9389  			goto err_out;
9390  		}
9391  	}
9392  
9393  	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
9394  
9395  err_out:
9396  	if (err && new_prog)
9397  		bpf_prog_put(new_prog);
9398  	if (old_prog)
9399  		bpf_prog_put(old_prog);
9400  	return err;
9401  }
9402  
9403  /**
9404   *	dev_new_index	-	allocate an ifindex
9405   *	@net: the applicable net namespace
9406   *
9407   *	Returns a suitable unique value for a new device interface
9408   *	number.  The caller must hold the rtnl semaphore or the
9409   *	dev_base_lock to be sure it remains unique.
9410   */
9411  static int dev_new_index(struct net *net)
9412  {
9413  	int ifindex = net->ifindex;
9414  
9415  	for (;;) {
9416  		if (++ifindex <= 0)
9417  			ifindex = 1;
9418  		if (!__dev_get_by_index(net, ifindex))
9419  			return net->ifindex = ifindex;
9420  	}
9421  }
9422  
9423  /* Delayed registration/unregisteration */
9424  static LIST_HEAD(net_todo_list);
9425  DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
9426  
9427  static void net_set_todo(struct net_device *dev)
9428  {
9429  	list_add_tail(&dev->todo_list, &net_todo_list);
9430  	atomic_inc(&dev_net(dev)->dev_unreg_count);
9431  }
9432  
9433  static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
9434  	struct net_device *upper, netdev_features_t features)
9435  {
9436  	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9437  	netdev_features_t feature;
9438  	int feature_bit;
9439  
9440  	for_each_netdev_feature(upper_disables, feature_bit) {
9441  		feature = __NETIF_F_BIT(feature_bit);
9442  		if (!(upper->wanted_features & feature)
9443  		    && (features & feature)) {
9444  			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
9445  				   &feature, upper->name);
9446  			features &= ~feature;
9447  		}
9448  	}
9449  
9450  	return features;
9451  }
9452  
9453  static void netdev_sync_lower_features(struct net_device *upper,
9454  	struct net_device *lower, netdev_features_t features)
9455  {
9456  	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9457  	netdev_features_t feature;
9458  	int feature_bit;
9459  
9460  	for_each_netdev_feature(upper_disables, feature_bit) {
9461  		feature = __NETIF_F_BIT(feature_bit);
9462  		if (!(features & feature) && (lower->features & feature)) {
9463  			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
9464  				   &feature, lower->name);
9465  			lower->wanted_features &= ~feature;
9466  			__netdev_update_features(lower);
9467  
9468  			if (unlikely(lower->features & feature))
9469  				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
9470  					    &feature, lower->name);
9471  			else
9472  				netdev_features_change(lower);
9473  		}
9474  	}
9475  }
9476  
9477  static netdev_features_t netdev_fix_features(struct net_device *dev,
9478  	netdev_features_t features)
9479  {
9480  	/* Fix illegal checksum combinations */
9481  	if ((features & NETIF_F_HW_CSUM) &&
9482  	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9483  		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9484  		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
9485  	}
9486  
9487  	/* TSO requires that SG is present as well. */
9488  	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9489  		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9490  		features &= ~NETIF_F_ALL_TSO;
9491  	}
9492  
9493  	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
9494  					!(features & NETIF_F_IP_CSUM)) {
9495  		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
9496  		features &= ~NETIF_F_TSO;
9497  		features &= ~NETIF_F_TSO_ECN;
9498  	}
9499  
9500  	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
9501  					 !(features & NETIF_F_IPV6_CSUM)) {
9502  		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
9503  		features &= ~NETIF_F_TSO6;
9504  	}
9505  
9506  	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
9507  	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
9508  		features &= ~NETIF_F_TSO_MANGLEID;
9509  
9510  	/* TSO ECN requires that TSO is present as well. */
9511  	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
9512  		features &= ~NETIF_F_TSO_ECN;
9513  
9514  	/* Software GSO depends on SG. */
9515  	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9516  		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9517  		features &= ~NETIF_F_GSO;
9518  	}
9519  
9520  	/* GSO partial features require GSO partial be set */
9521  	if ((features & dev->gso_partial_features) &&
9522  	    !(features & NETIF_F_GSO_PARTIAL)) {
9523  		netdev_dbg(dev,
9524  			   "Dropping partially supported GSO features since no GSO partial.\n");
9525  		features &= ~dev->gso_partial_features;
9526  	}
9527  
9528  	if (!(features & NETIF_F_RXCSUM)) {
9529  		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
9530  		 * successfully merged by hardware must also have the
9531  		 * checksum verified by hardware.  If the user does not
9532  		 * want to enable RXCSUM, logically, we should disable GRO_HW.
9533  		 */
9534  		if (features & NETIF_F_GRO_HW) {
9535  			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
9536  			features &= ~NETIF_F_GRO_HW;
9537  		}
9538  	}
9539  
9540  	/* LRO/HW-GRO features cannot be combined with RX-FCS */
9541  	if (features & NETIF_F_RXFCS) {
9542  		if (features & NETIF_F_LRO) {
9543  			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
9544  			features &= ~NETIF_F_LRO;
9545  		}
9546  
9547  		if (features & NETIF_F_GRO_HW) {
9548  			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
9549  			features &= ~NETIF_F_GRO_HW;
9550  		}
9551  	}
9552  
9553  	if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
9554  		netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
9555  		features &= ~NETIF_F_LRO;
9556  	}
9557  
9558  	if (features & NETIF_F_HW_TLS_TX) {
9559  		bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
9560  			(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
9561  		bool hw_csum = features & NETIF_F_HW_CSUM;
9562  
9563  		if (!ip_csum && !hw_csum) {
9564  			netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
9565  			features &= ~NETIF_F_HW_TLS_TX;
9566  		}
9567  	}
9568  
9569  	if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
9570  		netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
9571  		features &= ~NETIF_F_HW_TLS_RX;
9572  	}
9573  
9574  	return features;
9575  }
9576  
9577  int __netdev_update_features(struct net_device *dev)
9578  {
9579  	struct net_device *upper, *lower;
9580  	netdev_features_t features;
9581  	struct list_head *iter;
9582  	int err = -1;
9583  
9584  	ASSERT_RTNL();
9585  
9586  	features = netdev_get_wanted_features(dev);
9587  
9588  	if (dev->netdev_ops->ndo_fix_features)
9589  		features = dev->netdev_ops->ndo_fix_features(dev, features);
9590  
9591  	/* driver might be less strict about feature dependencies */
9592  	features = netdev_fix_features(dev, features);
9593  
9594  	/* some features can't be enabled if they're off on an upper device */
9595  	netdev_for_each_upper_dev_rcu(dev, upper, iter)
9596  		features = netdev_sync_upper_features(dev, upper, features);
9597  
9598  	if (dev->features == features)
9599  		goto sync_lower;
9600  
9601  	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
9602  		&dev->features, &features);
9603  
9604  	if (dev->netdev_ops->ndo_set_features)
9605  		err = dev->netdev_ops->ndo_set_features(dev, features);
9606  	else
9607  		err = 0;
9608  
9609  	if (unlikely(err < 0)) {
9610  		netdev_err(dev,
9611  			"set_features() failed (%d); wanted %pNF, left %pNF\n",
9612  			err, &features, &dev->features);
9613  		/* return non-0 since some features might have changed and
9614  		 * it's better to fire a spurious notification than miss it
9615  		 */
9616  		return -1;
9617  	}
9618  
9619  sync_lower:
9620  	/* some features must be disabled on lower devices when disabled
9621  	 * on an upper device (think: bonding master or bridge)
9622  	 */
9623  	netdev_for_each_lower_dev(dev, lower, iter)
9624  		netdev_sync_lower_features(dev, lower, features);
9625  
9626  	if (!err) {
9627  		netdev_features_t diff = features ^ dev->features;
9628  
9629  		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
9630  			/* udp_tunnel_{get,drop}_rx_info both need
9631  			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
9632  			 * device, or they won't do anything.
9633  			 * Thus we need to update dev->features
9634  			 * *before* calling udp_tunnel_get_rx_info,
9635  			 * but *after* calling udp_tunnel_drop_rx_info.
9636  			 */
9637  			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
9638  				dev->features = features;
9639  				udp_tunnel_get_rx_info(dev);
9640  			} else {
9641  				udp_tunnel_drop_rx_info(dev);
9642  			}
9643  		}
9644  
9645  		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
9646  			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
9647  				dev->features = features;
9648  				err |= vlan_get_rx_ctag_filter_info(dev);
9649  			} else {
9650  				vlan_drop_rx_ctag_filter_info(dev);
9651  			}
9652  		}
9653  
9654  		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
9655  			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
9656  				dev->features = features;
9657  				err |= vlan_get_rx_stag_filter_info(dev);
9658  			} else {
9659  				vlan_drop_rx_stag_filter_info(dev);
9660  			}
9661  		}
9662  
9663  		dev->features = features;
9664  	}
9665  
9666  	return err < 0 ? 0 : 1;
9667  }
9668  
9669  /**
9670   *	netdev_update_features - recalculate device features
9671   *	@dev: the device to check
9672   *
9673   *	Recalculate dev->features set and send notifications if it
9674   *	has changed. Should be called after driver or hardware dependent
9675   *	conditions might have changed that influence the features.
9676   */
9677  void netdev_update_features(struct net_device *dev)
9678  {
9679  	if (__netdev_update_features(dev))
9680  		netdev_features_change(dev);
9681  }
9682  EXPORT_SYMBOL(netdev_update_features);
9683  
9684  /**
9685   *	netdev_change_features - recalculate device features
9686   *	@dev: the device to check
9687   *
9688   *	Recalculate dev->features set and send notifications even
9689   *	if they have not changed. Should be called instead of
9690   *	netdev_update_features() if also dev->vlan_features might
9691   *	have changed to allow the changes to be propagated to stacked
9692   *	VLAN devices.
9693   */
9694  void netdev_change_features(struct net_device *dev)
9695  {
9696  	__netdev_update_features(dev);
9697  	netdev_features_change(dev);
9698  }
9699  EXPORT_SYMBOL(netdev_change_features);
9700  
9701  /**
9702   *	netif_stacked_transfer_operstate -	transfer operstate
9703   *	@rootdev: the root or lower level device to transfer state from
9704   *	@dev: the device to transfer operstate to
9705   *
9706   *	Transfer operational state from root to device. This is normally
9707   *	called when a stacking relationship exists between the root
9708   *	device and the device(a leaf device).
9709   */
9710  void netif_stacked_transfer_operstate(const struct net_device *rootdev,
9711  					struct net_device *dev)
9712  {
9713  	if (rootdev->operstate == IF_OPER_DORMANT)
9714  		netif_dormant_on(dev);
9715  	else
9716  		netif_dormant_off(dev);
9717  
9718  	if (rootdev->operstate == IF_OPER_TESTING)
9719  		netif_testing_on(dev);
9720  	else
9721  		netif_testing_off(dev);
9722  
9723  	if (netif_carrier_ok(rootdev))
9724  		netif_carrier_on(dev);
9725  	else
9726  		netif_carrier_off(dev);
9727  }
9728  EXPORT_SYMBOL(netif_stacked_transfer_operstate);
9729  
9730  static int netif_alloc_rx_queues(struct net_device *dev)
9731  {
9732  	unsigned int i, count = dev->num_rx_queues;
9733  	struct netdev_rx_queue *rx;
9734  	size_t sz = count * sizeof(*rx);
9735  	int err = 0;
9736  
9737  	BUG_ON(count < 1);
9738  
9739  	rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
9740  	if (!rx)
9741  		return -ENOMEM;
9742  
9743  	dev->_rx = rx;
9744  
9745  	for (i = 0; i < count; i++) {
9746  		rx[i].dev = dev;
9747  
9748  		/* XDP RX-queue setup */
9749  		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
9750  		if (err < 0)
9751  			goto err_rxq_info;
9752  	}
9753  	return 0;
9754  
9755  err_rxq_info:
9756  	/* Rollback successful reg's and free other resources */
9757  	while (i--)
9758  		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
9759  	kvfree(dev->_rx);
9760  	dev->_rx = NULL;
9761  	return err;
9762  }
9763  
9764  static void netif_free_rx_queues(struct net_device *dev)
9765  {
9766  	unsigned int i, count = dev->num_rx_queues;
9767  
9768  	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
9769  	if (!dev->_rx)
9770  		return;
9771  
9772  	for (i = 0; i < count; i++)
9773  		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
9774  
9775  	kvfree(dev->_rx);
9776  }
9777  
9778  static void netdev_init_one_queue(struct net_device *dev,
9779  				  struct netdev_queue *queue, void *_unused)
9780  {
9781  	/* Initialize queue lock */
9782  	spin_lock_init(&queue->_xmit_lock);
9783  	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
9784  	queue->xmit_lock_owner = -1;
9785  	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
9786  	queue->dev = dev;
9787  #ifdef CONFIG_BQL
9788  	dql_init(&queue->dql, HZ);
9789  #endif
9790  }
9791  
9792  static void netif_free_tx_queues(struct net_device *dev)
9793  {
9794  	kvfree(dev->_tx);
9795  }
9796  
9797  static int netif_alloc_netdev_queues(struct net_device *dev)
9798  {
9799  	unsigned int count = dev->num_tx_queues;
9800  	struct netdev_queue *tx;
9801  	size_t sz = count * sizeof(*tx);
9802  
9803  	if (count < 1 || count > 0xffff)
9804  		return -EINVAL;
9805  
9806  	tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
9807  	if (!tx)
9808  		return -ENOMEM;
9809  
9810  	dev->_tx = tx;
9811  
9812  	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
9813  	spin_lock_init(&dev->tx_global_lock);
9814  
9815  	return 0;
9816  }
9817  
9818  void netif_tx_stop_all_queues(struct net_device *dev)
9819  {
9820  	unsigned int i;
9821  
9822  	for (i = 0; i < dev->num_tx_queues; i++) {
9823  		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
9824  
9825  		netif_tx_stop_queue(txq);
9826  	}
9827  }
9828  EXPORT_SYMBOL(netif_tx_stop_all_queues);
9829  
9830  /**
9831   *	register_netdevice	- register a network device
9832   *	@dev: device to register
9833   *
9834   *	Take a completed network device structure and add it to the kernel
9835   *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
9836   *	chain. 0 is returned on success. A negative errno code is returned
9837   *	on a failure to set up the device, or if the name is a duplicate.
9838   *
9839   *	Callers must hold the rtnl semaphore. You may want
9840   *	register_netdev() instead of this.
9841   *
9842   *	BUGS:
9843   *	The locking appears insufficient to guarantee two parallel registers
9844   *	will not get the same name.
9845   */
9846  
9847  int register_netdevice(struct net_device *dev)
9848  {
9849  	int ret;
9850  	struct net *net = dev_net(dev);
9851  
9852  	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
9853  		     NETDEV_FEATURE_COUNT);
9854  	BUG_ON(dev_boot_phase);
9855  	ASSERT_RTNL();
9856  
9857  	might_sleep();
9858  
9859  	/* When net_device's are persistent, this will be fatal. */
9860  	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
9861  	BUG_ON(!net);
9862  
9863  	ret = ethtool_check_ops(dev->ethtool_ops);
9864  	if (ret)
9865  		return ret;
9866  
9867  	spin_lock_init(&dev->addr_list_lock);
9868  	netdev_set_addr_lockdep_class(dev);
9869  
9870  	ret = dev_get_valid_name(net, dev, dev->name);
9871  	if (ret < 0)
9872  		goto out;
9873  
9874  	ret = -ENOMEM;
9875  	dev->name_node = netdev_name_node_head_alloc(dev);
9876  	if (!dev->name_node)
9877  		goto out;
9878  
9879  	/* Init, if this function is available */
9880  	if (dev->netdev_ops->ndo_init) {
9881  		ret = dev->netdev_ops->ndo_init(dev);
9882  		if (ret) {
9883  			if (ret > 0)
9884  				ret = -EIO;
9885  			goto err_free_name;
9886  		}
9887  	}
9888  
9889  	if (((dev->hw_features | dev->features) &
9890  	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
9891  	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
9892  	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
9893  		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
9894  		ret = -EINVAL;
9895  		goto err_uninit;
9896  	}
9897  
9898  	ret = -EBUSY;
9899  	if (!dev->ifindex)
9900  		dev->ifindex = dev_new_index(net);
9901  	else if (__dev_get_by_index(net, dev->ifindex))
9902  		goto err_uninit;
9903  
9904  	/* Transfer changeable features to wanted_features and enable
9905  	 * software offloads (GSO and GRO).
9906  	 */
9907  	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
9908  	dev->features |= NETIF_F_SOFT_FEATURES;
9909  
9910  	if (dev->udp_tunnel_nic_info) {
9911  		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9912  		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
9913  	}
9914  
9915  	dev->wanted_features = dev->features & dev->hw_features;
9916  
9917  	if (!(dev->flags & IFF_LOOPBACK))
9918  		dev->hw_features |= NETIF_F_NOCACHE_COPY;
9919  
9920  	/* If IPv4 TCP segmentation offload is supported we should also
9921  	 * allow the device to enable segmenting the frame with the option
9922  	 * of ignoring a static IP ID value.  This doesn't enable the
9923  	 * feature itself but allows the user to enable it later.
9924  	 */
9925  	if (dev->hw_features & NETIF_F_TSO)
9926  		dev->hw_features |= NETIF_F_TSO_MANGLEID;
9927  	if (dev->vlan_features & NETIF_F_TSO)
9928  		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
9929  	if (dev->mpls_features & NETIF_F_TSO)
9930  		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
9931  	if (dev->hw_enc_features & NETIF_F_TSO)
9932  		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
9933  
9934  	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
9935  	 */
9936  	dev->vlan_features |= NETIF_F_HIGHDMA;
9937  
9938  	/* Make NETIF_F_SG inheritable to tunnel devices.
9939  	 */
9940  	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
9941  
9942  	/* Make NETIF_F_SG inheritable to MPLS.
9943  	 */
9944  	dev->mpls_features |= NETIF_F_SG;
9945  
9946  	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
9947  	ret = notifier_to_errno(ret);
9948  	if (ret)
9949  		goto err_uninit;
9950  
9951  	ret = netdev_register_kobject(dev);
9952  	if (ret) {
9953  		dev->reg_state = NETREG_UNREGISTERED;
9954  		goto err_uninit;
9955  	}
9956  	dev->reg_state = NETREG_REGISTERED;
9957  
9958  	__netdev_update_features(dev);
9959  
9960  	/*
9961  	 *	Default initial state at registry is that the
9962  	 *	device is present.
9963  	 */
9964  
9965  	set_bit(__LINK_STATE_PRESENT, &dev->state);
9966  
9967  	linkwatch_init_dev(dev);
9968  
9969  	dev_init_scheduler(dev);
9970  
9971  	dev_hold_track(dev, &dev->dev_registered_tracker, GFP_KERNEL);
9972  	list_netdevice(dev);
9973  
9974  	add_device_randomness(dev->dev_addr, dev->addr_len);
9975  
9976  	/* If the device has permanent device address, driver should
9977  	 * set dev_addr and also addr_assign_type should be set to
9978  	 * NET_ADDR_PERM (default value).
9979  	 */
9980  	if (dev->addr_assign_type == NET_ADDR_PERM)
9981  		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
9982  
9983  	/* Notify protocols, that a new device appeared. */
9984  	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
9985  	ret = notifier_to_errno(ret);
9986  	if (ret) {
9987  		/* Expect explicit free_netdev() on failure */
9988  		dev->needs_free_netdev = false;
9989  		unregister_netdevice_queue(dev, NULL);
9990  		goto out;
9991  	}
9992  	/*
9993  	 *	Prevent userspace races by waiting until the network
9994  	 *	device is fully setup before sending notifications.
9995  	 */
9996  	if (!dev->rtnl_link_ops ||
9997  	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
9998  		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
9999  
10000  out:
10001  	return ret;
10002  
10003  err_uninit:
10004  	if (dev->netdev_ops->ndo_uninit)
10005  		dev->netdev_ops->ndo_uninit(dev);
10006  	if (dev->priv_destructor)
10007  		dev->priv_destructor(dev);
10008  err_free_name:
10009  	netdev_name_node_free(dev->name_node);
10010  	goto out;
10011  }
10012  EXPORT_SYMBOL(register_netdevice);
10013  
10014  /**
10015   *	init_dummy_netdev	- init a dummy network device for NAPI
10016   *	@dev: device to init
10017   *
10018   *	This takes a network device structure and initialize the minimum
10019   *	amount of fields so it can be used to schedule NAPI polls without
10020   *	registering a full blown interface. This is to be used by drivers
10021   *	that need to tie several hardware interfaces to a single NAPI
10022   *	poll scheduler due to HW limitations.
10023   */
10024  int init_dummy_netdev(struct net_device *dev)
10025  {
10026  	/* Clear everything. Note we don't initialize spinlocks
10027  	 * are they aren't supposed to be taken by any of the
10028  	 * NAPI code and this dummy netdev is supposed to be
10029  	 * only ever used for NAPI polls
10030  	 */
10031  	memset(dev, 0, sizeof(struct net_device));
10032  
10033  	/* make sure we BUG if trying to hit standard
10034  	 * register/unregister code path
10035  	 */
10036  	dev->reg_state = NETREG_DUMMY;
10037  
10038  	/* NAPI wants this */
10039  	INIT_LIST_HEAD(&dev->napi_list);
10040  
10041  	/* a dummy interface is started by default */
10042  	set_bit(__LINK_STATE_PRESENT, &dev->state);
10043  	set_bit(__LINK_STATE_START, &dev->state);
10044  
10045  	/* napi_busy_loop stats accounting wants this */
10046  	dev_net_set(dev, &init_net);
10047  
10048  	/* Note : We dont allocate pcpu_refcnt for dummy devices,
10049  	 * because users of this 'device' dont need to change
10050  	 * its refcount.
10051  	 */
10052  
10053  	return 0;
10054  }
10055  EXPORT_SYMBOL_GPL(init_dummy_netdev);
10056  
10057  
10058  /**
10059   *	register_netdev	- register a network device
10060   *	@dev: device to register
10061   *
10062   *	Take a completed network device structure and add it to the kernel
10063   *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10064   *	chain. 0 is returned on success. A negative errno code is returned
10065   *	on a failure to set up the device, or if the name is a duplicate.
10066   *
10067   *	This is a wrapper around register_netdevice that takes the rtnl semaphore
10068   *	and expands the device name if you passed a format string to
10069   *	alloc_netdev.
10070   */
10071  int register_netdev(struct net_device *dev)
10072  {
10073  	int err;
10074  
10075  	if (rtnl_lock_killable())
10076  		return -EINTR;
10077  	err = register_netdevice(dev);
10078  	rtnl_unlock();
10079  	return err;
10080  }
10081  EXPORT_SYMBOL(register_netdev);
10082  
10083  int netdev_refcnt_read(const struct net_device *dev)
10084  {
10085  #ifdef CONFIG_PCPU_DEV_REFCNT
10086  	int i, refcnt = 0;
10087  
10088  	for_each_possible_cpu(i)
10089  		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10090  	return refcnt;
10091  #else
10092  	return refcount_read(&dev->dev_refcnt);
10093  #endif
10094  }
10095  EXPORT_SYMBOL(netdev_refcnt_read);
10096  
10097  int netdev_unregister_timeout_secs __read_mostly = 10;
10098  
10099  #define WAIT_REFS_MIN_MSECS 1
10100  #define WAIT_REFS_MAX_MSECS 250
10101  /**
10102   * netdev_wait_allrefs_any - wait until all references are gone.
10103   * @list: list of net_devices to wait on
10104   *
10105   * This is called when unregistering network devices.
10106   *
10107   * Any protocol or device that holds a reference should register
10108   * for netdevice notification, and cleanup and put back the
10109   * reference if they receive an UNREGISTER event.
10110   * We can get stuck here if buggy protocols don't correctly
10111   * call dev_put.
10112   */
10113  static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
10114  {
10115  	unsigned long rebroadcast_time, warning_time;
10116  	struct net_device *dev;
10117  	int wait = 0;
10118  
10119  	rebroadcast_time = warning_time = jiffies;
10120  
10121  	list_for_each_entry(dev, list, todo_list)
10122  		if (netdev_refcnt_read(dev) == 1)
10123  			return dev;
10124  
10125  	while (true) {
10126  		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10127  			rtnl_lock();
10128  
10129  			/* Rebroadcast unregister notification */
10130  			list_for_each_entry(dev, list, todo_list)
10131  				call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10132  
10133  			__rtnl_unlock();
10134  			rcu_barrier();
10135  			rtnl_lock();
10136  
10137  			list_for_each_entry(dev, list, todo_list)
10138  				if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10139  					     &dev->state)) {
10140  					/* We must not have linkwatch events
10141  					 * pending on unregister. If this
10142  					 * happens, we simply run the queue
10143  					 * unscheduled, resulting in a noop
10144  					 * for this device.
10145  					 */
10146  					linkwatch_run_queue();
10147  					break;
10148  				}
10149  
10150  			__rtnl_unlock();
10151  
10152  			rebroadcast_time = jiffies;
10153  		}
10154  
10155  		if (!wait) {
10156  			rcu_barrier();
10157  			wait = WAIT_REFS_MIN_MSECS;
10158  		} else {
10159  			msleep(wait);
10160  			wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10161  		}
10162  
10163  		list_for_each_entry(dev, list, todo_list)
10164  			if (netdev_refcnt_read(dev) == 1)
10165  				return dev;
10166  
10167  		if (time_after(jiffies, warning_time +
10168  			       netdev_unregister_timeout_secs * HZ)) {
10169  			list_for_each_entry(dev, list, todo_list) {
10170  				pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10171  					 dev->name, netdev_refcnt_read(dev));
10172  				ref_tracker_dir_print(&dev->refcnt_tracker, 10);
10173  			}
10174  
10175  			warning_time = jiffies;
10176  		}
10177  	}
10178  }
10179  
10180  /* The sequence is:
10181   *
10182   *	rtnl_lock();
10183   *	...
10184   *	register_netdevice(x1);
10185   *	register_netdevice(x2);
10186   *	...
10187   *	unregister_netdevice(y1);
10188   *	unregister_netdevice(y2);
10189   *      ...
10190   *	rtnl_unlock();
10191   *	free_netdev(y1);
10192   *	free_netdev(y2);
10193   *
10194   * We are invoked by rtnl_unlock().
10195   * This allows us to deal with problems:
10196   * 1) We can delete sysfs objects which invoke hotplug
10197   *    without deadlocking with linkwatch via keventd.
10198   * 2) Since we run with the RTNL semaphore not held, we can sleep
10199   *    safely in order to wait for the netdev refcnt to drop to zero.
10200   *
10201   * We must not return until all unregister events added during
10202   * the interval the lock was held have been completed.
10203   */
10204  void netdev_run_todo(void)
10205  {
10206  	struct net_device *dev, *tmp;
10207  	struct list_head list;
10208  #ifdef CONFIG_LOCKDEP
10209  	struct list_head unlink_list;
10210  
10211  	list_replace_init(&net_unlink_list, &unlink_list);
10212  
10213  	while (!list_empty(&unlink_list)) {
10214  		struct net_device *dev = list_first_entry(&unlink_list,
10215  							  struct net_device,
10216  							  unlink_list);
10217  		list_del_init(&dev->unlink_list);
10218  		dev->nested_level = dev->lower_level - 1;
10219  	}
10220  #endif
10221  
10222  	/* Snapshot list, allow later requests */
10223  	list_replace_init(&net_todo_list, &list);
10224  
10225  	__rtnl_unlock();
10226  
10227  	/* Wait for rcu callbacks to finish before next phase */
10228  	if (!list_empty(&list))
10229  		rcu_barrier();
10230  
10231  	list_for_each_entry_safe(dev, tmp, &list, todo_list) {
10232  		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10233  			netdev_WARN(dev, "run_todo but not unregistering\n");
10234  			list_del(&dev->todo_list);
10235  			continue;
10236  		}
10237  
10238  		dev->reg_state = NETREG_UNREGISTERED;
10239  		linkwatch_forget_dev(dev);
10240  	}
10241  
10242  	while (!list_empty(&list)) {
10243  		dev = netdev_wait_allrefs_any(&list);
10244  		list_del(&dev->todo_list);
10245  
10246  		/* paranoia */
10247  		BUG_ON(netdev_refcnt_read(dev) != 1);
10248  		BUG_ON(!list_empty(&dev->ptype_all));
10249  		BUG_ON(!list_empty(&dev->ptype_specific));
10250  		WARN_ON(rcu_access_pointer(dev->ip_ptr));
10251  		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10252  #if IS_ENABLED(CONFIG_DECNET)
10253  		WARN_ON(dev->dn_ptr);
10254  #endif
10255  		if (dev->priv_destructor)
10256  			dev->priv_destructor(dev);
10257  		if (dev->needs_free_netdev)
10258  			free_netdev(dev);
10259  
10260  		if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
10261  			wake_up(&netdev_unregistering_wq);
10262  
10263  		/* Free network device */
10264  		kobject_put(&dev->dev.kobj);
10265  	}
10266  }
10267  
10268  /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10269   * all the same fields in the same order as net_device_stats, with only
10270   * the type differing, but rtnl_link_stats64 may have additional fields
10271   * at the end for newer counters.
10272   */
10273  void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10274  			     const struct net_device_stats *netdev_stats)
10275  {
10276  #if BITS_PER_LONG == 64
10277  	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
10278  	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
10279  	/* zero out counters that only exist in rtnl_link_stats64 */
10280  	memset((char *)stats64 + sizeof(*netdev_stats), 0,
10281  	       sizeof(*stats64) - sizeof(*netdev_stats));
10282  #else
10283  	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
10284  	const unsigned long *src = (const unsigned long *)netdev_stats;
10285  	u64 *dst = (u64 *)stats64;
10286  
10287  	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10288  	for (i = 0; i < n; i++)
10289  		dst[i] = src[i];
10290  	/* zero out counters that only exist in rtnl_link_stats64 */
10291  	memset((char *)stats64 + n * sizeof(u64), 0,
10292  	       sizeof(*stats64) - n * sizeof(u64));
10293  #endif
10294  }
10295  EXPORT_SYMBOL(netdev_stats_to_stats64);
10296  
10297  struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev)
10298  {
10299  	struct net_device_core_stats __percpu *p;
10300  
10301  	p = alloc_percpu_gfp(struct net_device_core_stats,
10302  			     GFP_ATOMIC | __GFP_NOWARN);
10303  
10304  	if (p && cmpxchg(&dev->core_stats, NULL, p))
10305  		free_percpu(p);
10306  
10307  	/* This READ_ONCE() pairs with the cmpxchg() above */
10308  	p = READ_ONCE(dev->core_stats);
10309  	if (!p)
10310  		return NULL;
10311  
10312  	return this_cpu_ptr(p);
10313  }
10314  EXPORT_SYMBOL(netdev_core_stats_alloc);
10315  
10316  /**
10317   *	dev_get_stats	- get network device statistics
10318   *	@dev: device to get statistics from
10319   *	@storage: place to store stats
10320   *
10321   *	Get network statistics from device. Return @storage.
10322   *	The device driver may provide its own method by setting
10323   *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10324   *	otherwise the internal statistics structure is used.
10325   */
10326  struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10327  					struct rtnl_link_stats64 *storage)
10328  {
10329  	const struct net_device_ops *ops = dev->netdev_ops;
10330  	const struct net_device_core_stats __percpu *p;
10331  
10332  	if (ops->ndo_get_stats64) {
10333  		memset(storage, 0, sizeof(*storage));
10334  		ops->ndo_get_stats64(dev, storage);
10335  	} else if (ops->ndo_get_stats) {
10336  		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10337  	} else {
10338  		netdev_stats_to_stats64(storage, &dev->stats);
10339  	}
10340  
10341  	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10342  	p = READ_ONCE(dev->core_stats);
10343  	if (p) {
10344  		const struct net_device_core_stats *core_stats;
10345  		int i;
10346  
10347  		for_each_possible_cpu(i) {
10348  			core_stats = per_cpu_ptr(p, i);
10349  			storage->rx_dropped += local_read(&core_stats->rx_dropped);
10350  			storage->tx_dropped += local_read(&core_stats->tx_dropped);
10351  			storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
10352  		}
10353  	}
10354  	return storage;
10355  }
10356  EXPORT_SYMBOL(dev_get_stats);
10357  
10358  /**
10359   *	dev_fetch_sw_netstats - get per-cpu network device statistics
10360   *	@s: place to store stats
10361   *	@netstats: per-cpu network stats to read from
10362   *
10363   *	Read per-cpu network statistics and populate the related fields in @s.
10364   */
10365  void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10366  			   const struct pcpu_sw_netstats __percpu *netstats)
10367  {
10368  	int cpu;
10369  
10370  	for_each_possible_cpu(cpu) {
10371  		const struct pcpu_sw_netstats *stats;
10372  		struct pcpu_sw_netstats tmp;
10373  		unsigned int start;
10374  
10375  		stats = per_cpu_ptr(netstats, cpu);
10376  		do {
10377  			start = u64_stats_fetch_begin_irq(&stats->syncp);
10378  			tmp.rx_packets = stats->rx_packets;
10379  			tmp.rx_bytes   = stats->rx_bytes;
10380  			tmp.tx_packets = stats->tx_packets;
10381  			tmp.tx_bytes   = stats->tx_bytes;
10382  		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
10383  
10384  		s->rx_packets += tmp.rx_packets;
10385  		s->rx_bytes   += tmp.rx_bytes;
10386  		s->tx_packets += tmp.tx_packets;
10387  		s->tx_bytes   += tmp.tx_bytes;
10388  	}
10389  }
10390  EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10391  
10392  /**
10393   *	dev_get_tstats64 - ndo_get_stats64 implementation
10394   *	@dev: device to get statistics from
10395   *	@s: place to store stats
10396   *
10397   *	Populate @s from dev->stats and dev->tstats. Can be used as
10398   *	ndo_get_stats64() callback.
10399   */
10400  void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
10401  {
10402  	netdev_stats_to_stats64(s, &dev->stats);
10403  	dev_fetch_sw_netstats(s, dev->tstats);
10404  }
10405  EXPORT_SYMBOL_GPL(dev_get_tstats64);
10406  
10407  struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
10408  {
10409  	struct netdev_queue *queue = dev_ingress_queue(dev);
10410  
10411  #ifdef CONFIG_NET_CLS_ACT
10412  	if (queue)
10413  		return queue;
10414  	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
10415  	if (!queue)
10416  		return NULL;
10417  	netdev_init_one_queue(dev, queue, NULL);
10418  	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
10419  	queue->qdisc_sleeping = &noop_qdisc;
10420  	rcu_assign_pointer(dev->ingress_queue, queue);
10421  #endif
10422  	return queue;
10423  }
10424  
10425  static const struct ethtool_ops default_ethtool_ops;
10426  
10427  void netdev_set_default_ethtool_ops(struct net_device *dev,
10428  				    const struct ethtool_ops *ops)
10429  {
10430  	if (dev->ethtool_ops == &default_ethtool_ops)
10431  		dev->ethtool_ops = ops;
10432  }
10433  EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10434  
10435  void netdev_freemem(struct net_device *dev)
10436  {
10437  	char *addr = (char *)dev - dev->padded;
10438  
10439  	kvfree(addr);
10440  }
10441  
10442  /**
10443   * alloc_netdev_mqs - allocate network device
10444   * @sizeof_priv: size of private data to allocate space for
10445   * @name: device name format string
10446   * @name_assign_type: origin of device name
10447   * @setup: callback to initialize device
10448   * @txqs: the number of TX subqueues to allocate
10449   * @rxqs: the number of RX subqueues to allocate
10450   *
10451   * Allocates a struct net_device with private data area for driver use
10452   * and performs basic initialization.  Also allocates subqueue structs
10453   * for each queue on the device.
10454   */
10455  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10456  		unsigned char name_assign_type,
10457  		void (*setup)(struct net_device *),
10458  		unsigned int txqs, unsigned int rxqs)
10459  {
10460  	struct net_device *dev;
10461  	unsigned int alloc_size;
10462  	struct net_device *p;
10463  
10464  	BUG_ON(strlen(name) >= sizeof(dev->name));
10465  
10466  	if (txqs < 1) {
10467  		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10468  		return NULL;
10469  	}
10470  
10471  	if (rxqs < 1) {
10472  		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10473  		return NULL;
10474  	}
10475  
10476  	alloc_size = sizeof(struct net_device);
10477  	if (sizeof_priv) {
10478  		/* ensure 32-byte alignment of private area */
10479  		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10480  		alloc_size += sizeof_priv;
10481  	}
10482  	/* ensure 32-byte alignment of whole construct */
10483  	alloc_size += NETDEV_ALIGN - 1;
10484  
10485  	p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10486  	if (!p)
10487  		return NULL;
10488  
10489  	dev = PTR_ALIGN(p, NETDEV_ALIGN);
10490  	dev->padded = (char *)dev - (char *)p;
10491  
10492  	ref_tracker_dir_init(&dev->refcnt_tracker, 128);
10493  #ifdef CONFIG_PCPU_DEV_REFCNT
10494  	dev->pcpu_refcnt = alloc_percpu(int);
10495  	if (!dev->pcpu_refcnt)
10496  		goto free_dev;
10497  	__dev_hold(dev);
10498  #else
10499  	refcount_set(&dev->dev_refcnt, 1);
10500  #endif
10501  
10502  	if (dev_addr_init(dev))
10503  		goto free_pcpu;
10504  
10505  	dev_mc_init(dev);
10506  	dev_uc_init(dev);
10507  
10508  	dev_net_set(dev, &init_net);
10509  
10510  	dev->gso_max_size = GSO_MAX_SIZE;
10511  	dev->gso_max_segs = GSO_MAX_SEGS;
10512  	dev->gro_max_size = GRO_MAX_SIZE;
10513  	dev->upper_level = 1;
10514  	dev->lower_level = 1;
10515  #ifdef CONFIG_LOCKDEP
10516  	dev->nested_level = 0;
10517  	INIT_LIST_HEAD(&dev->unlink_list);
10518  #endif
10519  
10520  	INIT_LIST_HEAD(&dev->napi_list);
10521  	INIT_LIST_HEAD(&dev->unreg_list);
10522  	INIT_LIST_HEAD(&dev->close_list);
10523  	INIT_LIST_HEAD(&dev->link_watch_list);
10524  	INIT_LIST_HEAD(&dev->adj_list.upper);
10525  	INIT_LIST_HEAD(&dev->adj_list.lower);
10526  	INIT_LIST_HEAD(&dev->ptype_all);
10527  	INIT_LIST_HEAD(&dev->ptype_specific);
10528  	INIT_LIST_HEAD(&dev->net_notifier_list);
10529  #ifdef CONFIG_NET_SCHED
10530  	hash_init(dev->qdisc_hash);
10531  #endif
10532  	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10533  	setup(dev);
10534  
10535  	if (!dev->tx_queue_len) {
10536  		dev->priv_flags |= IFF_NO_QUEUE;
10537  		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10538  	}
10539  
10540  	dev->num_tx_queues = txqs;
10541  	dev->real_num_tx_queues = txqs;
10542  	if (netif_alloc_netdev_queues(dev))
10543  		goto free_all;
10544  
10545  	dev->num_rx_queues = rxqs;
10546  	dev->real_num_rx_queues = rxqs;
10547  	if (netif_alloc_rx_queues(dev))
10548  		goto free_all;
10549  
10550  	strcpy(dev->name, name);
10551  	dev->name_assign_type = name_assign_type;
10552  	dev->group = INIT_NETDEV_GROUP;
10553  	if (!dev->ethtool_ops)
10554  		dev->ethtool_ops = &default_ethtool_ops;
10555  
10556  	nf_hook_netdev_init(dev);
10557  
10558  	return dev;
10559  
10560  free_all:
10561  	free_netdev(dev);
10562  	return NULL;
10563  
10564  free_pcpu:
10565  #ifdef CONFIG_PCPU_DEV_REFCNT
10566  	free_percpu(dev->pcpu_refcnt);
10567  free_dev:
10568  #endif
10569  	netdev_freemem(dev);
10570  	return NULL;
10571  }
10572  EXPORT_SYMBOL(alloc_netdev_mqs);
10573  
10574  /**
10575   * free_netdev - free network device
10576   * @dev: device
10577   *
10578   * This function does the last stage of destroying an allocated device
10579   * interface. The reference to the device object is released. If this
10580   * is the last reference then it will be freed.Must be called in process
10581   * context.
10582   */
10583  void free_netdev(struct net_device *dev)
10584  {
10585  	struct napi_struct *p, *n;
10586  
10587  	might_sleep();
10588  
10589  	/* When called immediately after register_netdevice() failed the unwind
10590  	 * handling may still be dismantling the device. Handle that case by
10591  	 * deferring the free.
10592  	 */
10593  	if (dev->reg_state == NETREG_UNREGISTERING) {
10594  		ASSERT_RTNL();
10595  		dev->needs_free_netdev = true;
10596  		return;
10597  	}
10598  
10599  	netif_free_tx_queues(dev);
10600  	netif_free_rx_queues(dev);
10601  
10602  	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10603  
10604  	/* Flush device addresses */
10605  	dev_addr_flush(dev);
10606  
10607  	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10608  		netif_napi_del(p);
10609  
10610  	ref_tracker_dir_exit(&dev->refcnt_tracker);
10611  #ifdef CONFIG_PCPU_DEV_REFCNT
10612  	free_percpu(dev->pcpu_refcnt);
10613  	dev->pcpu_refcnt = NULL;
10614  #endif
10615  	free_percpu(dev->core_stats);
10616  	dev->core_stats = NULL;
10617  	free_percpu(dev->xdp_bulkq);
10618  	dev->xdp_bulkq = NULL;
10619  
10620  	/*  Compatibility with error handling in drivers */
10621  	if (dev->reg_state == NETREG_UNINITIALIZED) {
10622  		netdev_freemem(dev);
10623  		return;
10624  	}
10625  
10626  	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10627  	dev->reg_state = NETREG_RELEASED;
10628  
10629  	/* will free via device release */
10630  	put_device(&dev->dev);
10631  }
10632  EXPORT_SYMBOL(free_netdev);
10633  
10634  /**
10635   *	synchronize_net -  Synchronize with packet receive processing
10636   *
10637   *	Wait for packets currently being received to be done.
10638   *	Does not block later packets from starting.
10639   */
10640  void synchronize_net(void)
10641  {
10642  	might_sleep();
10643  	if (rtnl_is_locked())
10644  		synchronize_rcu_expedited();
10645  	else
10646  		synchronize_rcu();
10647  }
10648  EXPORT_SYMBOL(synchronize_net);
10649  
10650  /**
10651   *	unregister_netdevice_queue - remove device from the kernel
10652   *	@dev: device
10653   *	@head: list
10654   *
10655   *	This function shuts down a device interface and removes it
10656   *	from the kernel tables.
10657   *	If head not NULL, device is queued to be unregistered later.
10658   *
10659   *	Callers must hold the rtnl semaphore.  You may want
10660   *	unregister_netdev() instead of this.
10661   */
10662  
10663  void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10664  {
10665  	ASSERT_RTNL();
10666  
10667  	if (head) {
10668  		list_move_tail(&dev->unreg_list, head);
10669  	} else {
10670  		LIST_HEAD(single);
10671  
10672  		list_add(&dev->unreg_list, &single);
10673  		unregister_netdevice_many(&single);
10674  	}
10675  }
10676  EXPORT_SYMBOL(unregister_netdevice_queue);
10677  
10678  /**
10679   *	unregister_netdevice_many - unregister many devices
10680   *	@head: list of devices
10681   *
10682   *  Note: As most callers use a stack allocated list_head,
10683   *  we force a list_del() to make sure stack wont be corrupted later.
10684   */
10685  void unregister_netdevice_many(struct list_head *head)
10686  {
10687  	struct net_device *dev, *tmp;
10688  	LIST_HEAD(close_head);
10689  
10690  	BUG_ON(dev_boot_phase);
10691  	ASSERT_RTNL();
10692  
10693  	if (list_empty(head))
10694  		return;
10695  
10696  	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
10697  		/* Some devices call without registering
10698  		 * for initialization unwind. Remove those
10699  		 * devices and proceed with the remaining.
10700  		 */
10701  		if (dev->reg_state == NETREG_UNINITIALIZED) {
10702  			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
10703  				 dev->name, dev);
10704  
10705  			WARN_ON(1);
10706  			list_del(&dev->unreg_list);
10707  			continue;
10708  		}
10709  		dev->dismantle = true;
10710  		BUG_ON(dev->reg_state != NETREG_REGISTERED);
10711  	}
10712  
10713  	/* If device is running, close it first. */
10714  	list_for_each_entry(dev, head, unreg_list)
10715  		list_add_tail(&dev->close_list, &close_head);
10716  	dev_close_many(&close_head, true);
10717  
10718  	list_for_each_entry(dev, head, unreg_list) {
10719  		/* And unlink it from device chain. */
10720  		unlist_netdevice(dev);
10721  
10722  		dev->reg_state = NETREG_UNREGISTERING;
10723  	}
10724  	flush_all_backlogs();
10725  
10726  	synchronize_net();
10727  
10728  	list_for_each_entry(dev, head, unreg_list) {
10729  		struct sk_buff *skb = NULL;
10730  
10731  		/* Shutdown queueing discipline. */
10732  		dev_shutdown(dev);
10733  
10734  		dev_xdp_uninstall(dev);
10735  
10736  		netdev_offload_xstats_disable_all(dev);
10737  
10738  		/* Notify protocols, that we are about to destroy
10739  		 * this device. They should clean all the things.
10740  		 */
10741  		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10742  
10743  		if (!dev->rtnl_link_ops ||
10744  		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10745  			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
10746  						     GFP_KERNEL, NULL, 0);
10747  
10748  		/*
10749  		 *	Flush the unicast and multicast chains
10750  		 */
10751  		dev_uc_flush(dev);
10752  		dev_mc_flush(dev);
10753  
10754  		netdev_name_node_alt_flush(dev);
10755  		netdev_name_node_free(dev->name_node);
10756  
10757  		if (dev->netdev_ops->ndo_uninit)
10758  			dev->netdev_ops->ndo_uninit(dev);
10759  
10760  		if (skb)
10761  			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
10762  
10763  		/* Notifier chain MUST detach us all upper devices. */
10764  		WARN_ON(netdev_has_any_upper_dev(dev));
10765  		WARN_ON(netdev_has_any_lower_dev(dev));
10766  
10767  		/* Remove entries from kobject tree */
10768  		netdev_unregister_kobject(dev);
10769  #ifdef CONFIG_XPS
10770  		/* Remove XPS queueing entries */
10771  		netif_reset_xps_queues_gt(dev, 0);
10772  #endif
10773  	}
10774  
10775  	synchronize_net();
10776  
10777  	list_for_each_entry(dev, head, unreg_list) {
10778  		dev_put_track(dev, &dev->dev_registered_tracker);
10779  		net_set_todo(dev);
10780  	}
10781  
10782  	list_del(head);
10783  }
10784  EXPORT_SYMBOL(unregister_netdevice_many);
10785  
10786  /**
10787   *	unregister_netdev - remove device from the kernel
10788   *	@dev: device
10789   *
10790   *	This function shuts down a device interface and removes it
10791   *	from the kernel tables.
10792   *
10793   *	This is just a wrapper for unregister_netdevice that takes
10794   *	the rtnl semaphore.  In general you want to use this and not
10795   *	unregister_netdevice.
10796   */
10797  void unregister_netdev(struct net_device *dev)
10798  {
10799  	rtnl_lock();
10800  	unregister_netdevice(dev);
10801  	rtnl_unlock();
10802  }
10803  EXPORT_SYMBOL(unregister_netdev);
10804  
10805  /**
10806   *	__dev_change_net_namespace - move device to different nethost namespace
10807   *	@dev: device
10808   *	@net: network namespace
10809   *	@pat: If not NULL name pattern to try if the current device name
10810   *	      is already taken in the destination network namespace.
10811   *	@new_ifindex: If not zero, specifies device index in the target
10812   *	              namespace.
10813   *
10814   *	This function shuts down a device interface and moves it
10815   *	to a new network namespace. On success 0 is returned, on
10816   *	a failure a netagive errno code is returned.
10817   *
10818   *	Callers must hold the rtnl semaphore.
10819   */
10820  
10821  int __dev_change_net_namespace(struct net_device *dev, struct net *net,
10822  			       const char *pat, int new_ifindex)
10823  {
10824  	struct net *net_old = dev_net(dev);
10825  	int err, new_nsid;
10826  
10827  	ASSERT_RTNL();
10828  
10829  	/* Don't allow namespace local devices to be moved. */
10830  	err = -EINVAL;
10831  	if (dev->features & NETIF_F_NETNS_LOCAL)
10832  		goto out;
10833  
10834  	/* Ensure the device has been registrered */
10835  	if (dev->reg_state != NETREG_REGISTERED)
10836  		goto out;
10837  
10838  	/* Get out if there is nothing todo */
10839  	err = 0;
10840  	if (net_eq(net_old, net))
10841  		goto out;
10842  
10843  	/* Pick the destination device name, and ensure
10844  	 * we can use it in the destination network namespace.
10845  	 */
10846  	err = -EEXIST;
10847  	if (netdev_name_in_use(net, dev->name)) {
10848  		/* We get here if we can't use the current device name */
10849  		if (!pat)
10850  			goto out;
10851  		err = dev_get_valid_name(net, dev, pat);
10852  		if (err < 0)
10853  			goto out;
10854  	}
10855  
10856  	/* Check that new_ifindex isn't used yet. */
10857  	err = -EBUSY;
10858  	if (new_ifindex && __dev_get_by_index(net, new_ifindex))
10859  		goto out;
10860  
10861  	/*
10862  	 * And now a mini version of register_netdevice unregister_netdevice.
10863  	 */
10864  
10865  	/* If device is running close it first. */
10866  	dev_close(dev);
10867  
10868  	/* And unlink it from device chain */
10869  	unlist_netdevice(dev);
10870  
10871  	synchronize_net();
10872  
10873  	/* Shutdown queueing discipline. */
10874  	dev_shutdown(dev);
10875  
10876  	/* Notify protocols, that we are about to destroy
10877  	 * this device. They should clean all the things.
10878  	 *
10879  	 * Note that dev->reg_state stays at NETREG_REGISTERED.
10880  	 * This is wanted because this way 8021q and macvlan know
10881  	 * the device is just moving and can keep their slaves up.
10882  	 */
10883  	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10884  	rcu_barrier();
10885  
10886  	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
10887  	/* If there is an ifindex conflict assign a new one */
10888  	if (!new_ifindex) {
10889  		if (__dev_get_by_index(net, dev->ifindex))
10890  			new_ifindex = dev_new_index(net);
10891  		else
10892  			new_ifindex = dev->ifindex;
10893  	}
10894  
10895  	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
10896  			    new_ifindex);
10897  
10898  	/*
10899  	 *	Flush the unicast and multicast chains
10900  	 */
10901  	dev_uc_flush(dev);
10902  	dev_mc_flush(dev);
10903  
10904  	/* Send a netdev-removed uevent to the old namespace */
10905  	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
10906  	netdev_adjacent_del_links(dev);
10907  
10908  	/* Move per-net netdevice notifiers that are following the netdevice */
10909  	move_netdevice_notifiers_dev_net(dev, net);
10910  
10911  	/* Actually switch the network namespace */
10912  	dev_net_set(dev, net);
10913  	dev->ifindex = new_ifindex;
10914  
10915  	/* Send a netdev-add uevent to the new namespace */
10916  	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
10917  	netdev_adjacent_add_links(dev);
10918  
10919  	/* Fixup kobjects */
10920  	err = device_rename(&dev->dev, dev->name);
10921  	WARN_ON(err);
10922  
10923  	/* Adapt owner in case owning user namespace of target network
10924  	 * namespace is different from the original one.
10925  	 */
10926  	err = netdev_change_owner(dev, net_old, net);
10927  	WARN_ON(err);
10928  
10929  	/* Add the device back in the hashes */
10930  	list_netdevice(dev);
10931  
10932  	/* Notify protocols, that a new device appeared. */
10933  	call_netdevice_notifiers(NETDEV_REGISTER, dev);
10934  
10935  	/*
10936  	 *	Prevent userspace races by waiting until the network
10937  	 *	device is fully setup before sending notifications.
10938  	 */
10939  	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10940  
10941  	synchronize_net();
10942  	err = 0;
10943  out:
10944  	return err;
10945  }
10946  EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
10947  
10948  static int dev_cpu_dead(unsigned int oldcpu)
10949  {
10950  	struct sk_buff **list_skb;
10951  	struct sk_buff *skb;
10952  	unsigned int cpu;
10953  	struct softnet_data *sd, *oldsd, *remsd = NULL;
10954  
10955  	local_irq_disable();
10956  	cpu = smp_processor_id();
10957  	sd = &per_cpu(softnet_data, cpu);
10958  	oldsd = &per_cpu(softnet_data, oldcpu);
10959  
10960  	/* Find end of our completion_queue. */
10961  	list_skb = &sd->completion_queue;
10962  	while (*list_skb)
10963  		list_skb = &(*list_skb)->next;
10964  	/* Append completion queue from offline CPU. */
10965  	*list_skb = oldsd->completion_queue;
10966  	oldsd->completion_queue = NULL;
10967  
10968  	/* Append output queue from offline CPU. */
10969  	if (oldsd->output_queue) {
10970  		*sd->output_queue_tailp = oldsd->output_queue;
10971  		sd->output_queue_tailp = oldsd->output_queue_tailp;
10972  		oldsd->output_queue = NULL;
10973  		oldsd->output_queue_tailp = &oldsd->output_queue;
10974  	}
10975  	/* Append NAPI poll list from offline CPU, with one exception :
10976  	 * process_backlog() must be called by cpu owning percpu backlog.
10977  	 * We properly handle process_queue & input_pkt_queue later.
10978  	 */
10979  	while (!list_empty(&oldsd->poll_list)) {
10980  		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
10981  							    struct napi_struct,
10982  							    poll_list);
10983  
10984  		list_del_init(&napi->poll_list);
10985  		if (napi->poll == process_backlog)
10986  			napi->state = 0;
10987  		else
10988  			____napi_schedule(sd, napi);
10989  	}
10990  
10991  	raise_softirq_irqoff(NET_TX_SOFTIRQ);
10992  	local_irq_enable();
10993  
10994  #ifdef CONFIG_RPS
10995  	remsd = oldsd->rps_ipi_list;
10996  	oldsd->rps_ipi_list = NULL;
10997  #endif
10998  	/* send out pending IPI's on offline CPU */
10999  	net_rps_send_ipi(remsd);
11000  
11001  	/* Process offline CPU's input_pkt_queue */
11002  	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
11003  		netif_rx(skb);
11004  		input_queue_head_incr(oldsd);
11005  	}
11006  	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
11007  		netif_rx(skb);
11008  		input_queue_head_incr(oldsd);
11009  	}
11010  
11011  	return 0;
11012  }
11013  
11014  /**
11015   *	netdev_increment_features - increment feature set by one
11016   *	@all: current feature set
11017   *	@one: new feature set
11018   *	@mask: mask feature set
11019   *
11020   *	Computes a new feature set after adding a device with feature set
11021   *	@one to the master device with current feature set @all.  Will not
11022   *	enable anything that is off in @mask. Returns the new feature set.
11023   */
11024  netdev_features_t netdev_increment_features(netdev_features_t all,
11025  	netdev_features_t one, netdev_features_t mask)
11026  {
11027  	if (mask & NETIF_F_HW_CSUM)
11028  		mask |= NETIF_F_CSUM_MASK;
11029  	mask |= NETIF_F_VLAN_CHALLENGED;
11030  
11031  	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11032  	all &= one | ~NETIF_F_ALL_FOR_ALL;
11033  
11034  	/* If one device supports hw checksumming, set for all. */
11035  	if (all & NETIF_F_HW_CSUM)
11036  		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11037  
11038  	return all;
11039  }
11040  EXPORT_SYMBOL(netdev_increment_features);
11041  
11042  static struct hlist_head * __net_init netdev_create_hash(void)
11043  {
11044  	int i;
11045  	struct hlist_head *hash;
11046  
11047  	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11048  	if (hash != NULL)
11049  		for (i = 0; i < NETDEV_HASHENTRIES; i++)
11050  			INIT_HLIST_HEAD(&hash[i]);
11051  
11052  	return hash;
11053  }
11054  
11055  /* Initialize per network namespace state */
11056  static int __net_init netdev_init(struct net *net)
11057  {
11058  	BUILD_BUG_ON(GRO_HASH_BUCKETS >
11059  		     8 * sizeof_field(struct napi_struct, gro_bitmask));
11060  
11061  	INIT_LIST_HEAD(&net->dev_base_head);
11062  
11063  	net->dev_name_head = netdev_create_hash();
11064  	if (net->dev_name_head == NULL)
11065  		goto err_name;
11066  
11067  	net->dev_index_head = netdev_create_hash();
11068  	if (net->dev_index_head == NULL)
11069  		goto err_idx;
11070  
11071  	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11072  
11073  	return 0;
11074  
11075  err_idx:
11076  	kfree(net->dev_name_head);
11077  err_name:
11078  	return -ENOMEM;
11079  }
11080  
11081  /**
11082   *	netdev_drivername - network driver for the device
11083   *	@dev: network device
11084   *
11085   *	Determine network driver for device.
11086   */
11087  const char *netdev_drivername(const struct net_device *dev)
11088  {
11089  	const struct device_driver *driver;
11090  	const struct device *parent;
11091  	const char *empty = "";
11092  
11093  	parent = dev->dev.parent;
11094  	if (!parent)
11095  		return empty;
11096  
11097  	driver = parent->driver;
11098  	if (driver && driver->name)
11099  		return driver->name;
11100  	return empty;
11101  }
11102  
11103  static void __netdev_printk(const char *level, const struct net_device *dev,
11104  			    struct va_format *vaf)
11105  {
11106  	if (dev && dev->dev.parent) {
11107  		dev_printk_emit(level[1] - '0',
11108  				dev->dev.parent,
11109  				"%s %s %s%s: %pV",
11110  				dev_driver_string(dev->dev.parent),
11111  				dev_name(dev->dev.parent),
11112  				netdev_name(dev), netdev_reg_state(dev),
11113  				vaf);
11114  	} else if (dev) {
11115  		printk("%s%s%s: %pV",
11116  		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
11117  	} else {
11118  		printk("%s(NULL net_device): %pV", level, vaf);
11119  	}
11120  }
11121  
11122  void netdev_printk(const char *level, const struct net_device *dev,
11123  		   const char *format, ...)
11124  {
11125  	struct va_format vaf;
11126  	va_list args;
11127  
11128  	va_start(args, format);
11129  
11130  	vaf.fmt = format;
11131  	vaf.va = &args;
11132  
11133  	__netdev_printk(level, dev, &vaf);
11134  
11135  	va_end(args);
11136  }
11137  EXPORT_SYMBOL(netdev_printk);
11138  
11139  #define define_netdev_printk_level(func, level)			\
11140  void func(const struct net_device *dev, const char *fmt, ...)	\
11141  {								\
11142  	struct va_format vaf;					\
11143  	va_list args;						\
11144  								\
11145  	va_start(args, fmt);					\
11146  								\
11147  	vaf.fmt = fmt;						\
11148  	vaf.va = &args;						\
11149  								\
11150  	__netdev_printk(level, dev, &vaf);			\
11151  								\
11152  	va_end(args);						\
11153  }								\
11154  EXPORT_SYMBOL(func);
11155  
11156  define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11157  define_netdev_printk_level(netdev_alert, KERN_ALERT);
11158  define_netdev_printk_level(netdev_crit, KERN_CRIT);
11159  define_netdev_printk_level(netdev_err, KERN_ERR);
11160  define_netdev_printk_level(netdev_warn, KERN_WARNING);
11161  define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11162  define_netdev_printk_level(netdev_info, KERN_INFO);
11163  
11164  static void __net_exit netdev_exit(struct net *net)
11165  {
11166  	kfree(net->dev_name_head);
11167  	kfree(net->dev_index_head);
11168  	if (net != &init_net)
11169  		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11170  }
11171  
11172  static struct pernet_operations __net_initdata netdev_net_ops = {
11173  	.init = netdev_init,
11174  	.exit = netdev_exit,
11175  };
11176  
11177  static void __net_exit default_device_exit_net(struct net *net)
11178  {
11179  	struct net_device *dev, *aux;
11180  	/*
11181  	 * Push all migratable network devices back to the
11182  	 * initial network namespace
11183  	 */
11184  	ASSERT_RTNL();
11185  	for_each_netdev_safe(net, dev, aux) {
11186  		int err;
11187  		char fb_name[IFNAMSIZ];
11188  
11189  		/* Ignore unmoveable devices (i.e. loopback) */
11190  		if (dev->features & NETIF_F_NETNS_LOCAL)
11191  			continue;
11192  
11193  		/* Leave virtual devices for the generic cleanup */
11194  		if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
11195  			continue;
11196  
11197  		/* Push remaining network devices to init_net */
11198  		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11199  		if (netdev_name_in_use(&init_net, fb_name))
11200  			snprintf(fb_name, IFNAMSIZ, "dev%%d");
11201  		err = dev_change_net_namespace(dev, &init_net, fb_name);
11202  		if (err) {
11203  			pr_emerg("%s: failed to move %s to init_net: %d\n",
11204  				 __func__, dev->name, err);
11205  			BUG();
11206  		}
11207  	}
11208  }
11209  
11210  static void __net_exit default_device_exit_batch(struct list_head *net_list)
11211  {
11212  	/* At exit all network devices most be removed from a network
11213  	 * namespace.  Do this in the reverse order of registration.
11214  	 * Do this across as many network namespaces as possible to
11215  	 * improve batching efficiency.
11216  	 */
11217  	struct net_device *dev;
11218  	struct net *net;
11219  	LIST_HEAD(dev_kill_list);
11220  
11221  	rtnl_lock();
11222  	list_for_each_entry(net, net_list, exit_list) {
11223  		default_device_exit_net(net);
11224  		cond_resched();
11225  	}
11226  
11227  	list_for_each_entry(net, net_list, exit_list) {
11228  		for_each_netdev_reverse(net, dev) {
11229  			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11230  				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11231  			else
11232  				unregister_netdevice_queue(dev, &dev_kill_list);
11233  		}
11234  	}
11235  	unregister_netdevice_many(&dev_kill_list);
11236  	rtnl_unlock();
11237  }
11238  
11239  static struct pernet_operations __net_initdata default_device_ops = {
11240  	.exit_batch = default_device_exit_batch,
11241  };
11242  
11243  /*
11244   *	Initialize the DEV module. At boot time this walks the device list and
11245   *	unhooks any devices that fail to initialise (normally hardware not
11246   *	present) and leaves us with a valid list of present and active devices.
11247   *
11248   */
11249  
11250  /*
11251   *       This is called single threaded during boot, so no need
11252   *       to take the rtnl semaphore.
11253   */
11254  static int __init net_dev_init(void)
11255  {
11256  	int i, rc = -ENOMEM;
11257  
11258  	BUG_ON(!dev_boot_phase);
11259  
11260  	if (dev_proc_init())
11261  		goto out;
11262  
11263  	if (netdev_kobject_init())
11264  		goto out;
11265  
11266  	INIT_LIST_HEAD(&ptype_all);
11267  	for (i = 0; i < PTYPE_HASH_SIZE; i++)
11268  		INIT_LIST_HEAD(&ptype_base[i]);
11269  
11270  	if (register_pernet_subsys(&netdev_net_ops))
11271  		goto out;
11272  
11273  	/*
11274  	 *	Initialise the packet receive queues.
11275  	 */
11276  
11277  	for_each_possible_cpu(i) {
11278  		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11279  		struct softnet_data *sd = &per_cpu(softnet_data, i);
11280  
11281  		INIT_WORK(flush, flush_backlog);
11282  
11283  		skb_queue_head_init(&sd->input_pkt_queue);
11284  		skb_queue_head_init(&sd->process_queue);
11285  #ifdef CONFIG_XFRM_OFFLOAD
11286  		skb_queue_head_init(&sd->xfrm_backlog);
11287  #endif
11288  		INIT_LIST_HEAD(&sd->poll_list);
11289  		sd->output_queue_tailp = &sd->output_queue;
11290  #ifdef CONFIG_RPS
11291  		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
11292  		sd->cpu = i;
11293  #endif
11294  
11295  		init_gro_hash(&sd->backlog);
11296  		sd->backlog.poll = process_backlog;
11297  		sd->backlog.weight = weight_p;
11298  	}
11299  
11300  	dev_boot_phase = 0;
11301  
11302  	/* The loopback device is special if any other network devices
11303  	 * is present in a network namespace the loopback device must
11304  	 * be present. Since we now dynamically allocate and free the
11305  	 * loopback device ensure this invariant is maintained by
11306  	 * keeping the loopback device as the first device on the
11307  	 * list of network devices.  Ensuring the loopback devices
11308  	 * is the first device that appears and the last network device
11309  	 * that disappears.
11310  	 */
11311  	if (register_pernet_device(&loopback_net_ops))
11312  		goto out;
11313  
11314  	if (register_pernet_device(&default_device_ops))
11315  		goto out;
11316  
11317  	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11318  	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11319  
11320  	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11321  				       NULL, dev_cpu_dead);
11322  	WARN_ON(rc < 0);
11323  	rc = 0;
11324  out:
11325  	return rc;
11326  }
11327  
11328  subsys_initcall(net_dev_init);
11329