xref: /linux/net/core/netpoll.c (revision 6c537b845c99e32312a1bd84d4c95cdb26efb577)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Common framework for low-level network console, dump, and debugger code
4  *
5  * Sep 8 2003  Matt Mackall <mpm@selenic.com>
6  *
7  * based on the netconsole code from:
8  *
9  * Copyright (C) 2001  Ingo Molnar <mingo@redhat.com>
10  * Copyright (C) 2002  Red Hat, Inc.
11  */
12 
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 
15 #include <linux/moduleparam.h>
16 #include <linux/kernel.h>
17 #include <linux/netdevice.h>
18 #include <linux/etherdevice.h>
19 #include <linux/string.h>
20 #include <linux/if_arp.h>
21 #include <linux/inetdevice.h>
22 #include <linux/inet.h>
23 #include <linux/interrupt.h>
24 #include <linux/netpoll.h>
25 #include <linux/sched.h>
26 #include <linux/delay.h>
27 #include <linux/rcupdate.h>
28 #include <linux/workqueue.h>
29 #include <linux/slab.h>
30 #include <linux/export.h>
31 #include <linux/if_vlan.h>
32 #include <linux/udp.h>
33 #include <net/tcp.h>
34 #include <net/addrconf.h>
35 #include <net/ndisc.h>
36 #include <trace/events/napi.h>
37 #include <linux/kconfig.h>
38 
/*
 * We maintain a small pool of fully-sized skbs, to make sure the
 * message gets out even in extreme OOM situations.
 */

/* Number of skbs refill_skbs() tries to keep queued in a netpoll's pool. */
#define MAX_SKBS 32
/*
 * Delay, in microseconds, between two transmit attempts in
 * __netpoll_send_skb(); also used there to derive how many attempts
 * fit into one jiffy.
 */
#define USEC_PER_POLL	50

/*
 * How long, in seconds, netpoll_setup() waits for carrier after it had
 * to bring the device up itself (see netpoll_wait_carrier()).
 */
static unsigned int carrier_timeout = 4;
module_param(carrier_timeout, uint, 0644);
49 
50 static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb,
51 				      struct net_device *dev,
52 				      struct netdev_queue *txq)
53 {
54 	netdev_tx_t status = NETDEV_TX_OK;
55 	netdev_features_t features;
56 
57 	features = netif_skb_features(skb);
58 
59 	if (skb_vlan_tag_present(skb) &&
60 	    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
61 		skb = __vlan_hwaccel_push_inside(skb);
62 		if (unlikely(!skb)) {
63 			/* This is actually a packet drop, but we
64 			 * don't want the code that calls this
65 			 * function to try and operate on a NULL skb.
66 			 */
67 			goto out;
68 		}
69 	}
70 
71 	status = netdev_start_xmit(skb, dev, txq, false);
72 
73 out:
74 	return status;
75 }
76 
77 static void queue_process(struct work_struct *work)
78 {
79 	struct netpoll_info *npinfo =
80 		container_of(work, struct netpoll_info, tx_work.work);
81 	struct sk_buff *skb;
82 	unsigned long flags;
83 
84 	while ((skb = skb_dequeue(&npinfo->txq))) {
85 		struct net_device *dev = skb->dev;
86 		struct netdev_queue *txq;
87 		unsigned int q_index;
88 
89 		if (!netif_device_present(dev) || !netif_running(dev)) {
90 			kfree_skb(skb);
91 			continue;
92 		}
93 
94 		local_irq_save(flags);
95 		/* check if skb->queue_mapping is still valid */
96 		q_index = skb_get_queue_mapping(skb);
97 		if (unlikely(q_index >= dev->real_num_tx_queues)) {
98 			q_index = q_index % dev->real_num_tx_queues;
99 			skb_set_queue_mapping(skb, q_index);
100 		}
101 		txq = netdev_get_tx_queue(dev, q_index);
102 		HARD_TX_LOCK(dev, txq, smp_processor_id());
103 		if (netif_xmit_frozen_or_stopped(txq) ||
104 		    !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) {
105 			skb_queue_head(&npinfo->txq, skb);
106 			HARD_TX_UNLOCK(dev, txq);
107 			local_irq_restore(flags);
108 
109 			schedule_delayed_work(&npinfo->tx_work, HZ/10);
110 			return;
111 		}
112 		HARD_TX_UNLOCK(dev, txq);
113 		local_irq_restore(flags);
114 	}
115 }
116 
117 static int netif_local_xmit_active(struct net_device *dev)
118 {
119 	int i;
120 
121 	for (i = 0; i < dev->num_tx_queues; i++) {
122 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
123 
124 		if (netif_tx_owned(txq, smp_processor_id()))
125 			return 1;
126 	}
127 
128 	return 0;
129 }
130 
131 static void poll_one_napi(struct napi_struct *napi)
132 {
133 	int work;
134 
135 	/* If we set this bit but see that it has already been set,
136 	 * that indicates that napi has been disabled and we need
137 	 * to abort this operation
138 	 */
139 	if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state))
140 		return;
141 
142 	/* We explicitly pass the polling call a budget of 0 to
143 	 * indicate that we are clearing the Tx path only.
144 	 */
145 	work = napi->poll(napi, 0);
146 	WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll);
147 	trace_napi_poll(napi, work, 0);
148 
149 	clear_bit(NAPI_STATE_NPSVC, &napi->state);
150 }
151 
152 static void poll_napi(struct net_device *dev)
153 {
154 	struct napi_struct *napi;
155 	int cpu = smp_processor_id();
156 
157 	list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
158 		if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
159 			poll_one_napi(napi);
160 			smp_store_release(&napi->poll_owner, -1);
161 		}
162 	}
163 }
164 
165 void netpoll_poll_dev(struct net_device *dev)
166 {
167 	struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
168 	const struct net_device_ops *ops;
169 
170 	/* Don't do any rx activity if the dev_lock mutex is held
171 	 * the dev_open/close paths use this to block netpoll activity
172 	 * while changing device state
173 	 */
174 	if (!ni || down_trylock(&ni->dev_lock))
175 		return;
176 
177 	/* Some drivers will take the same locks in poll and xmit,
178 	 * we can't poll if local CPU is already in xmit.
179 	 */
180 	if (!netif_running(dev) || netif_local_xmit_active(dev)) {
181 		up(&ni->dev_lock);
182 		return;
183 	}
184 
185 	ops = dev->netdev_ops;
186 	if (ops->ndo_poll_controller)
187 		ops->ndo_poll_controller(dev);
188 
189 	poll_napi(dev);
190 
191 	up(&ni->dev_lock);
192 
193 	netpoll_zap_completion_queue();
194 }
195 EXPORT_SYMBOL(netpoll_poll_dev);
196 
197 void netpoll_poll_disable(struct net_device *dev)
198 {
199 	struct netpoll_info *ni;
200 
201 	might_sleep();
202 	ni = rtnl_dereference(dev->npinfo);
203 	if (ni)
204 		down(&ni->dev_lock);
205 }
206 
207 void netpoll_poll_enable(struct net_device *dev)
208 {
209 	struct netpoll_info *ni;
210 
211 	ni = rtnl_dereference(dev->npinfo);
212 	if (ni)
213 		up(&ni->dev_lock);
214 }
215 
216 static void refill_skbs(struct netpoll *np)
217 {
218 	struct sk_buff_head *skb_pool;
219 	struct sk_buff *skb;
220 
221 	skb_pool = &np->skb_pool;
222 
223 	while (READ_ONCE(skb_pool->qlen) < MAX_SKBS) {
224 		skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
225 		if (!skb)
226 			break;
227 
228 		skb_queue_tail(skb_pool, skb);
229 	}
230 }
231 
232 void netpoll_zap_completion_queue(void)
233 {
234 	unsigned long flags;
235 	struct softnet_data *sd = &get_cpu_var(softnet_data);
236 
237 	if (sd->completion_queue) {
238 		struct sk_buff *clist;
239 
240 		local_irq_save(flags);
241 		clist = sd->completion_queue;
242 		sd->completion_queue = NULL;
243 		local_irq_restore(flags);
244 
245 		while (clist != NULL) {
246 			struct sk_buff *skb = clist;
247 			clist = clist->next;
248 			if (!skb_irq_freeable(skb)) {
249 				refcount_set(&skb->users, 1);
250 				dev_kfree_skb_any(skb); /* put this one back */
251 			} else {
252 				__kfree_skb(skb);
253 			}
254 		}
255 	}
256 
257 	put_cpu_var(softnet_data);
258 }
259 EXPORT_SYMBOL_NS_GPL(netpoll_zap_completion_queue, "NETDEV_INTERNAL");
260 
261 static int netpoll_owner_active(struct net_device *dev)
262 {
263 	struct napi_struct *napi;
264 
265 	list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
266 		if (READ_ONCE(napi->poll_owner) == smp_processor_id())
267 			return 1;
268 	}
269 	return 0;
270 }
271 
272 /* call with IRQ disabled */
273 static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
274 {
275 	netdev_tx_t status = NETDEV_TX_BUSY;
276 	netdev_tx_t ret = NET_XMIT_DROP;
277 	struct net_device *dev;
278 	unsigned long tries;
279 	/* It is up to the caller to keep npinfo alive. */
280 	struct netpoll_info *npinfo;
281 
282 	lockdep_assert_irqs_disabled();
283 
284 	dev = np->dev;
285 	/* npinfo->txq belongs to np->dev, so retries must stay bound to it. */
286 	skb->dev = dev;
287 	rcu_read_lock();
288 	npinfo = rcu_dereference_bh(dev->npinfo);
289 
290 	if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
291 		dev_kfree_skb_irq(skb);
292 		goto out;
293 	}
294 
295 	/* don't get messages out of order, and no recursion */
296 	if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
297 		struct netdev_queue *txq;
298 
299 		txq = netdev_core_pick_tx(dev, skb, NULL);
300 
301 		/* try until next clock tick */
302 		for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
303 		     tries > 0; --tries) {
304 			if (HARD_TX_TRYLOCK(dev, txq)) {
305 				if (!netif_xmit_stopped(txq))
306 					status = netpoll_start_xmit(skb, dev, txq);
307 
308 				HARD_TX_UNLOCK(dev, txq);
309 
310 				if (dev_xmit_complete(status))
311 					break;
312 
313 			}
314 
315 			/* tickle device maybe there is some cleanup */
316 			netpoll_poll_dev(np->dev);
317 
318 			udelay(USEC_PER_POLL);
319 		}
320 
321 		WARN_ONCE(!irqs_disabled(),
322 			"netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n",
323 			dev->name, dev->netdev_ops->ndo_start_xmit);
324 
325 	}
326 
327 	if (!dev_xmit_complete(status)) {
328 		skb_queue_tail(&npinfo->txq, skb);
329 		schedule_delayed_work(&npinfo->tx_work,0);
330 	}
331 	ret = NETDEV_TX_OK;
332 out:
333 	rcu_read_unlock();
334 	return ret;
335 }
336 
337 netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
338 {
339 	unsigned long flags;
340 	netdev_tx_t ret;
341 
342 	if (unlikely(!np)) {
343 		dev_kfree_skb_irq(skb);
344 		ret = NET_XMIT_DROP;
345 	} else {
346 		local_irq_save(flags);
347 		ret = __netpoll_send_skb(np, skb);
348 		local_irq_restore(flags);
349 	}
350 	return ret;
351 }
352 EXPORT_SYMBOL(netpoll_send_skb);
353 
354 static void skb_pool_flush(struct netpoll *np)
355 {
356 	struct sk_buff_head *skb_pool;
357 
358 	cancel_work_sync(&np->refill_wq);
359 	skb_pool = &np->skb_pool;
360 	skb_queue_purge_reason(skb_pool, SKB_CONSUMED);
361 }
362 
363 static void refill_skbs_work_handler(struct work_struct *work)
364 {
365 	struct netpoll *np =
366 		container_of(work, struct netpoll, refill_wq);
367 
368 	refill_skbs(np);
369 }
370 
371 int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
372 {
373 	struct netpoll_info *npinfo;
374 	const struct net_device_ops *ops;
375 	int err;
376 
377 	skb_queue_head_init(&np->skb_pool);
378 	INIT_WORK(&np->refill_wq, refill_skbs_work_handler);
379 
380 	if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
381 		np_err(np, "%s doesn't support polling, aborting\n",
382 		       ndev->name);
383 		err = -ENOTSUPP;
384 		goto out;
385 	}
386 
387 	npinfo = rtnl_dereference(ndev->npinfo);
388 	if (!npinfo) {
389 		npinfo = kmalloc_obj(*npinfo);
390 		if (!npinfo) {
391 			err = -ENOMEM;
392 			goto out;
393 		}
394 
395 		sema_init(&npinfo->dev_lock, 1);
396 		skb_queue_head_init(&npinfo->txq);
397 		INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
398 
399 		refcount_set(&npinfo->refcnt, 1);
400 
401 		ops = ndev->netdev_ops;
402 		if (ops->ndo_netpoll_setup) {
403 			err = ops->ndo_netpoll_setup(ndev);
404 			if (err)
405 				goto free_npinfo;
406 		}
407 	} else {
408 		refcount_inc(&npinfo->refcnt);
409 	}
410 
411 	np->dev = ndev;
412 	strscpy(np->dev_name, ndev->name, IFNAMSIZ);
413 
414 	/* fill up the skb queue */
415 	refill_skbs(np);
416 
417 	/* last thing to do is link it to the net device structure */
418 	rcu_assign_pointer(ndev->npinfo, npinfo);
419 
420 	return 0;
421 
422 free_npinfo:
423 	kfree(npinfo);
424 out:
425 	return err;
426 }
427 EXPORT_SYMBOL_GPL(__netpoll_setup);
428 
429 /*
430  * Returns a pointer to a string representation of the identifier used
431  * to select the egress interface for the given netpoll instance. buf
432  * is used to format np->dev_mac when np->dev_name is empty; bufsz must
433  * be at least MAC_ADDR_STR_LEN + 1 to fit the formatted MAC address
434  * and its NUL terminator.
435  */
436 static char *egress_dev(struct netpoll *np, char *buf, size_t bufsz)
437 {
438 	if (np->dev_name[0])
439 		return np->dev_name;
440 
441 	snprintf(buf, bufsz, "%pM", np->dev_mac);
442 	return buf;
443 }
444 
445 static void netpoll_wait_carrier(struct netpoll *np, struct net_device *ndev,
446 				 unsigned int timeout)
447 {
448 	unsigned long atmost;
449 
450 	atmost = jiffies + timeout * HZ;
451 	while (!netif_carrier_ok(ndev)) {
452 		if (time_after(jiffies, atmost)) {
453 			np_notice(np, "timeout waiting for carrier\n");
454 			break;
455 		}
456 		msleep(1);
457 	}
458 }
459 
460 /*
461  * Take the IPv6 from ndev and populate local_ip structure in netpoll
462  */
463 static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev)
464 {
465 	char buf[MAC_ADDR_STR_LEN + 1];
466 	int err = -EDESTADDRREQ;
467 	struct inet6_dev *idev;
468 
469 	if (!IS_ENABLED(CONFIG_IPV6)) {
470 		np_err(np, "IPv6 is not supported %s, aborting\n",
471 		       egress_dev(np, buf, sizeof(buf)));
472 		return -EINVAL;
473 	}
474 
475 	idev = __in6_dev_get(ndev);
476 	if (idev) {
477 		struct inet6_ifaddr *ifp;
478 
479 		read_lock_bh(&idev->lock);
480 		list_for_each_entry(ifp, &idev->addr_list, if_list) {
481 			if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) !=
482 				!!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL))
483 				continue;
484 			/* Got the IP, let's return */
485 			np->local_ip.in6 = ifp->addr;
486 			err = 0;
487 			break;
488 		}
489 		read_unlock_bh(&idev->lock);
490 	}
491 	if (err) {
492 		np_err(np, "no IPv6 address for %s, aborting\n",
493 		       egress_dev(np, buf, sizeof(buf)));
494 		return err;
495 	}
496 
497 	np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6);
498 	return 0;
499 }
500 
501 /*
502  * Take the IPv4 from ndev and populate local_ip structure in netpoll
503  */
504 static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev)
505 {
506 	char buf[MAC_ADDR_STR_LEN + 1];
507 	const struct in_ifaddr *ifa;
508 	struct in_device *in_dev;
509 
510 	in_dev = __in_dev_get_rtnl(ndev);
511 	if (!in_dev) {
512 		np_err(np, "no IP address for %s, aborting\n",
513 		       egress_dev(np, buf, sizeof(buf)));
514 		return -EDESTADDRREQ;
515 	}
516 
517 	ifa = rtnl_dereference(in_dev->ifa_list);
518 	if (!ifa) {
519 		np_err(np, "no IP address for %s, aborting\n",
520 		       egress_dev(np, buf, sizeof(buf)));
521 		return -EDESTADDRREQ;
522 	}
523 
524 	np->local_ip.ip = ifa->ifa_local;
525 	np_info(np, "local IP %pI4\n", &np->local_ip.ip);
526 
527 	return 0;
528 }
529 
530 /*
531  * Test whether the caller left np->local_ip unset, so that
532  * netpoll_setup() should auto-populate it from the egress device.
533  *
534  * np->local_ip is a union of __be32 (IPv4) and struct in6_addr (IPv6),
535  * so an IPv6 address whose first 4 bytes are zero (e.g. ::1, ::2,
536  * IPv4-mapped ::ffff:a.b.c.d) must not be tested via the IPv4 arm —
537  * doing so would misclassify a caller-supplied address as unset and
538  * silently overwrite it with whatever address the device exposes.
539  */
540 static bool netpoll_local_ip_unset(const struct netpoll *np)
541 {
542 	if (np->ipv6)
543 		return ipv6_addr_any(&np->local_ip.in6);
544 	return !np->local_ip.ip;
545 }
546 
547 int netpoll_setup(struct netpoll *np)
548 {
549 	struct net *net = current->nsproxy->net_ns;
550 	char buf[MAC_ADDR_STR_LEN + 1];
551 	struct net_device *ndev = NULL;
552 	bool ip_overwritten = false;
553 	int err;
554 
555 	rtnl_lock();
556 	if (np->dev_name[0])
557 		ndev = __dev_get_by_name(net, np->dev_name);
558 	else if (is_valid_ether_addr(np->dev_mac))
559 		ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac);
560 
561 	if (!ndev) {
562 		np_err(np, "%s doesn't exist, aborting\n",
563 		       egress_dev(np, buf, sizeof(buf)));
564 		err = -ENODEV;
565 		goto unlock;
566 	}
567 	netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL);
568 
569 	if (netdev_master_upper_dev_get(ndev)) {
570 		np_err(np, "%s is a slave device, aborting\n",
571 		       egress_dev(np, buf, sizeof(buf)));
572 		err = -EBUSY;
573 		goto put;
574 	}
575 
576 	if (!netif_running(ndev)) {
577 		np_info(np, "device %s not up yet, forcing it\n",
578 			egress_dev(np, buf, sizeof(buf)));
579 
580 		err = dev_open(ndev, NULL);
581 		if (err) {
582 			np_err(np, "failed to open %s\n", ndev->name);
583 			goto put;
584 		}
585 
586 		rtnl_unlock();
587 		netpoll_wait_carrier(np, ndev, carrier_timeout);
588 		rtnl_lock();
589 	}
590 
591 	if (netpoll_local_ip_unset(np)) {
592 		if (!np->ipv6) {
593 			err = netpoll_take_ipv4(np, ndev);
594 			if (err)
595 				goto put;
596 		} else {
597 			err = netpoll_take_ipv6(np, ndev);
598 			if (err)
599 				goto put;
600 		}
601 		ip_overwritten = true;
602 	}
603 
604 	err = __netpoll_setup(np, ndev);
605 	if (err)
606 		goto flush;
607 	rtnl_unlock();
608 
609 	/* Make sure all NAPI polls which started before dev->npinfo
610 	 * was visible have exited before we start calling NAPI poll.
611 	 * NAPI skips locking if dev->npinfo is NULL.
612 	 */
613 	synchronize_rcu();
614 
615 	return 0;
616 
617 flush:
618 	skb_pool_flush(np);
619 put:
620 	DEBUG_NET_WARN_ON_ONCE(np->dev);
621 	if (ip_overwritten)
622 		memset(&np->local_ip, 0, sizeof(np->local_ip));
623 	netdev_put(ndev, &np->dev_tracker);
624 unlock:
625 	rtnl_unlock();
626 	return err;
627 }
628 EXPORT_SYMBOL(netpoll_setup);
629 
/*
 * RCU callback, queued by __netpoll_cleanup() through call_rcu(), that
 * disposes of a netpoll_info: packets still waiting on npinfo->txq are
 * dropped, the tx_work delayed work is cancelled and the structure is
 * freed.
 *
 * The purge/cancel pair is issued twice because only the non-waiting
 * cancel_delayed_work() may be used here (see below).
 * NOTE(review): presumably the second pass covers a work item that was
 * still running during the first one and touched the queue or re-armed
 * itself -- confirm.
 */
static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
{
	struct netpoll_info *npinfo =
			container_of(rcu_head, struct netpoll_info, rcu);

	skb_queue_purge(&npinfo->txq);

	/* we can't call cancel_delayed_work_sync here, as we are in softirq */
	cancel_delayed_work(&npinfo->tx_work);

	/* clean after last, unfinished work */
	__skb_queue_purge(&npinfo->txq);
	/* now cancel it again */
	cancel_delayed_work(&npinfo->tx_work);
	kfree(npinfo);
}
646 
647 static void __netpoll_cleanup(struct netpoll *np)
648 {
649 	struct netpoll_info *npinfo;
650 
651 	npinfo = rtnl_dereference(np->dev->npinfo);
652 	if (!npinfo)
653 		return;
654 
655 	/* At this point, there is a single npinfo instance per netdevice, and
656 	 * its refcnt tracks how many netpoll structures are linked to it. We
657 	 * only perform npinfo cleanup when the refcnt decrements to zero.
658 	 */
659 	if (refcount_dec_and_test(&npinfo->refcnt)) {
660 		const struct net_device_ops *ops;
661 
662 		ops = np->dev->netdev_ops;
663 		if (ops->ndo_netpoll_cleanup)
664 			ops->ndo_netpoll_cleanup(np->dev);
665 
666 		RCU_INIT_POINTER(np->dev->npinfo, NULL);
667 		call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info);
668 	}
669 
670 	skb_pool_flush(np);
671 }
672 
/*
 * Detach @np from its device and free the netpoll structure itself.
 *
 * Must be called with RTNL held (checked by ASSERT_RTNL()).  Unlike
 * do_netpoll_cleanup(), this does not drop a device reference or reset
 * np->dev; @np must not be used after the call since it is kfree()d.
 */
void __netpoll_free(struct netpoll *np)
{
	ASSERT_RTNL();

	/* Wait for transmitting packets to finish before freeing. */
	synchronize_net();
	__netpoll_cleanup(np);
	kfree(np);
}
EXPORT_SYMBOL_GPL(__netpoll_free);
683 
684 void do_netpoll_cleanup(struct netpoll *np)
685 {
686 	__netpoll_cleanup(np);
687 	netdev_put(np->dev, &np->dev_tracker);
688 	np->dev = NULL;
689 }
690 EXPORT_SYMBOL(do_netpoll_cleanup);
691 
692 void netpoll_cleanup(struct netpoll *np)
693 {
694 	rtnl_lock();
695 	if (!np->dev)
696 		goto out;
697 	do_netpoll_cleanup(np);
698 out:
699 	rtnl_unlock();
700 }
701 EXPORT_SYMBOL(netpoll_cleanup);
702