1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Common framework for low-level network console, dump, and debugger code 4 * 5 * Sep 8 2003 Matt Mackall <mpm@selenic.com> 6 * 7 * based on the netconsole code from: 8 * 9 * Copyright (C) 2001 Ingo Molnar <mingo@redhat.com> 10 * Copyright (C) 2002 Red Hat, Inc. 11 */ 12 13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 14 15 #include <linux/moduleparam.h> 16 #include <linux/kernel.h> 17 #include <linux/netdevice.h> 18 #include <linux/etherdevice.h> 19 #include <linux/string.h> 20 #include <linux/if_arp.h> 21 #include <linux/inetdevice.h> 22 #include <linux/inet.h> 23 #include <linux/interrupt.h> 24 #include <linux/netpoll.h> 25 #include <linux/sched.h> 26 #include <linux/delay.h> 27 #include <linux/rcupdate.h> 28 #include <linux/workqueue.h> 29 #include <linux/slab.h> 30 #include <linux/export.h> 31 #include <linux/if_vlan.h> 32 #include <linux/udp.h> 33 #include <net/tcp.h> 34 #include <net/addrconf.h> 35 #include <net/ndisc.h> 36 #include <trace/events/napi.h> 37 #include <linux/kconfig.h> 38 39 /* 40 * We maintain a small pool of fully-sized skbs, to make sure the 41 * message gets out even in extreme OOM situations. 42 */ 43 44 #define MAX_SKBS 32 45 #define USEC_PER_POLL 50 46 47 static unsigned int carrier_timeout = 4; 48 module_param(carrier_timeout, uint, 0644); 49 50 static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, 51 struct net_device *dev, 52 struct netdev_queue *txq) 53 { 54 netdev_tx_t status = NETDEV_TX_OK; 55 netdev_features_t features; 56 57 features = netif_skb_features(skb); 58 59 if (skb_vlan_tag_present(skb) && 60 !vlan_hw_offload_capable(features, skb->vlan_proto)) { 61 skb = __vlan_hwaccel_push_inside(skb); 62 if (unlikely(!skb)) { 63 /* This is actually a packet drop, but we 64 * don't want the code that calls this 65 * function to try and operate on a NULL skb. 66 */ 67 goto out; 68 } 69 } 70 71 status = netdev_start_xmit(skb, dev, txq, false); 72 73 out: 74 return status; 75 } 76 77 static void queue_process(struct work_struct *work) 78 { 79 struct netpoll_info *npinfo = 80 container_of(work, struct netpoll_info, tx_work.work); 81 struct sk_buff *skb; 82 unsigned long flags; 83 84 while ((skb = skb_dequeue(&npinfo->txq))) { 85 struct net_device *dev = skb->dev; 86 struct netdev_queue *txq; 87 unsigned int q_index; 88 89 if (!netif_device_present(dev) || !netif_running(dev)) { 90 kfree_skb(skb); 91 continue; 92 } 93 94 local_irq_save(flags); 95 /* check if skb->queue_mapping is still valid */ 96 q_index = skb_get_queue_mapping(skb); 97 if (unlikely(q_index >= dev->real_num_tx_queues)) { 98 q_index = q_index % dev->real_num_tx_queues; 99 skb_set_queue_mapping(skb, q_index); 100 } 101 txq = netdev_get_tx_queue(dev, q_index); 102 HARD_TX_LOCK(dev, txq, smp_processor_id()); 103 if (netif_xmit_frozen_or_stopped(txq) || 104 !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) { 105 skb_queue_head(&npinfo->txq, skb); 106 HARD_TX_UNLOCK(dev, txq); 107 local_irq_restore(flags); 108 109 schedule_delayed_work(&npinfo->tx_work, HZ/10); 110 return; 111 } 112 HARD_TX_UNLOCK(dev, txq); 113 local_irq_restore(flags); 114 } 115 } 116 117 static int netif_local_xmit_active(struct net_device *dev) 118 { 119 int i; 120 121 for (i = 0; i < dev->num_tx_queues; i++) { 122 struct netdev_queue *txq = netdev_get_tx_queue(dev, i); 123 124 if (netif_tx_owned(txq, smp_processor_id())) 125 return 1; 126 } 127 128 return 0; 129 } 130 131 static void poll_one_napi(struct napi_struct *napi) 132 { 133 int work; 134 135 /* If we set this bit but see that it has already been set, 136 * that indicates that napi has been disabled and we need 137 * to abort this operation 138 */ 139 if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) 140 return; 141 142 /* We explicitly pass the polling call a budget of 0 to 143 * indicate that we are clearing the Tx path only. 144 */ 145 work = napi->poll(napi, 0); 146 WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll); 147 trace_napi_poll(napi, work, 0); 148 149 clear_bit(NAPI_STATE_NPSVC, &napi->state); 150 } 151 152 static void poll_napi(struct net_device *dev) 153 { 154 struct napi_struct *napi; 155 int cpu = smp_processor_id(); 156 157 list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { 158 if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) { 159 poll_one_napi(napi); 160 smp_store_release(&napi->poll_owner, -1); 161 } 162 } 163 } 164 165 void netpoll_poll_dev(struct net_device *dev) 166 { 167 struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); 168 const struct net_device_ops *ops; 169 170 /* Don't do any rx activity if the dev_lock mutex is held 171 * the dev_open/close paths use this to block netpoll activity 172 * while changing device state 173 */ 174 if (!ni || down_trylock(&ni->dev_lock)) 175 return; 176 177 /* Some drivers will take the same locks in poll and xmit, 178 * we can't poll if local CPU is already in xmit. 179 */ 180 if (!netif_running(dev) || netif_local_xmit_active(dev)) { 181 up(&ni->dev_lock); 182 return; 183 } 184 185 ops = dev->netdev_ops; 186 if (ops->ndo_poll_controller) 187 ops->ndo_poll_controller(dev); 188 189 poll_napi(dev); 190 191 up(&ni->dev_lock); 192 193 netpoll_zap_completion_queue(); 194 } 195 EXPORT_SYMBOL(netpoll_poll_dev); 196 197 void netpoll_poll_disable(struct net_device *dev) 198 { 199 struct netpoll_info *ni; 200 201 might_sleep(); 202 ni = rtnl_dereference(dev->npinfo); 203 if (ni) 204 down(&ni->dev_lock); 205 } 206 207 void netpoll_poll_enable(struct net_device *dev) 208 { 209 struct netpoll_info *ni; 210 211 ni = rtnl_dereference(dev->npinfo); 212 if (ni) 213 up(&ni->dev_lock); 214 } 215 216 static void refill_skbs(struct netpoll *np) 217 { 218 struct sk_buff_head *skb_pool; 219 struct sk_buff *skb; 220 221 skb_pool = &np->skb_pool; 222 223 while (READ_ONCE(skb_pool->qlen) < MAX_SKBS) { 224 skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); 225 if (!skb) 226 break; 227 228 skb_queue_tail(skb_pool, skb); 229 } 230 } 231 232 void netpoll_zap_completion_queue(void) 233 { 234 unsigned long flags; 235 struct softnet_data *sd = &get_cpu_var(softnet_data); 236 237 if (sd->completion_queue) { 238 struct sk_buff *clist; 239 240 local_irq_save(flags); 241 clist = sd->completion_queue; 242 sd->completion_queue = NULL; 243 local_irq_restore(flags); 244 245 while (clist != NULL) { 246 struct sk_buff *skb = clist; 247 clist = clist->next; 248 if (!skb_irq_freeable(skb)) { 249 refcount_set(&skb->users, 1); 250 dev_kfree_skb_any(skb); /* put this one back */ 251 } else { 252 __kfree_skb(skb); 253 } 254 } 255 } 256 257 put_cpu_var(softnet_data); 258 } 259 EXPORT_SYMBOL_NS_GPL(netpoll_zap_completion_queue, "NETDEV_INTERNAL"); 260 261 static int netpoll_owner_active(struct net_device *dev) 262 { 263 struct napi_struct *napi; 264 265 list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) { 266 if (READ_ONCE(napi->poll_owner) == smp_processor_id()) 267 return 1; 268 } 269 return 0; 270 } 271 272 /* call with IRQ disabled */ 273 static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) 274 { 275 netdev_tx_t status = NETDEV_TX_BUSY; 276 netdev_tx_t ret = NET_XMIT_DROP; 277 struct net_device *dev; 278 unsigned long tries; 279 /* It is up to the caller to keep npinfo alive. */ 280 struct netpoll_info *npinfo; 281 282 lockdep_assert_irqs_disabled(); 283 284 dev = np->dev; 285 /* npinfo->txq belongs to np->dev, so retries must stay bound to it. */ 286 skb->dev = dev; 287 rcu_read_lock(); 288 npinfo = rcu_dereference_bh(dev->npinfo); 289 290 if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { 291 dev_kfree_skb_irq(skb); 292 goto out; 293 } 294 295 /* don't get messages out of order, and no recursion */ 296 if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { 297 struct netdev_queue *txq; 298 299 txq = netdev_core_pick_tx(dev, skb, NULL); 300 301 /* try until next clock tick */ 302 for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; 303 tries > 0; --tries) { 304 if (HARD_TX_TRYLOCK(dev, txq)) { 305 if (!netif_xmit_stopped(txq)) 306 status = netpoll_start_xmit(skb, dev, txq); 307 308 HARD_TX_UNLOCK(dev, txq); 309 310 if (dev_xmit_complete(status)) 311 break; 312 313 } 314 315 /* tickle device maybe there is some cleanup */ 316 netpoll_poll_dev(np->dev); 317 318 udelay(USEC_PER_POLL); 319 } 320 321 WARN_ONCE(!irqs_disabled(), 322 "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n", 323 dev->name, dev->netdev_ops->ndo_start_xmit); 324 325 } 326 327 if (!dev_xmit_complete(status)) { 328 skb_queue_tail(&npinfo->txq, skb); 329 schedule_delayed_work(&npinfo->tx_work,0); 330 } 331 ret = NETDEV_TX_OK; 332 out: 333 rcu_read_unlock(); 334 return ret; 335 } 336 337 netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) 338 { 339 unsigned long flags; 340 netdev_tx_t ret; 341 342 if (unlikely(!np)) { 343 dev_kfree_skb_irq(skb); 344 ret = NET_XMIT_DROP; 345 } else { 346 local_irq_save(flags); 347 ret = __netpoll_send_skb(np, skb); 348 local_irq_restore(flags); 349 } 350 return ret; 351 } 352 EXPORT_SYMBOL(netpoll_send_skb); 353 354 static void skb_pool_flush(struct netpoll *np) 355 { 356 struct sk_buff_head *skb_pool; 357 358 cancel_work_sync(&np->refill_wq); 359 skb_pool = &np->skb_pool; 360 skb_queue_purge_reason(skb_pool, SKB_CONSUMED); 361 } 362 363 static void refill_skbs_work_handler(struct work_struct *work) 364 { 365 struct netpoll *np = 366 container_of(work, struct netpoll, refill_wq); 367 368 refill_skbs(np); 369 } 370 371 int __netpoll_setup(struct netpoll *np, struct net_device *ndev) 372 { 373 struct netpoll_info *npinfo; 374 const struct net_device_ops *ops; 375 int err; 376 377 skb_queue_head_init(&np->skb_pool); 378 INIT_WORK(&np->refill_wq, refill_skbs_work_handler); 379 380 if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { 381 np_err(np, "%s doesn't support polling, aborting\n", 382 ndev->name); 383 err = -ENOTSUPP; 384 goto out; 385 } 386 387 npinfo = rtnl_dereference(ndev->npinfo); 388 if (!npinfo) { 389 npinfo = kmalloc_obj(*npinfo); 390 if (!npinfo) { 391 err = -ENOMEM; 392 goto out; 393 } 394 395 sema_init(&npinfo->dev_lock, 1); 396 skb_queue_head_init(&npinfo->txq); 397 INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); 398 399 refcount_set(&npinfo->refcnt, 1); 400 401 ops = ndev->netdev_ops; 402 if (ops->ndo_netpoll_setup) { 403 err = ops->ndo_netpoll_setup(ndev); 404 if (err) 405 goto free_npinfo; 406 } 407 } else { 408 refcount_inc(&npinfo->refcnt); 409 } 410 411 np->dev = ndev; 412 strscpy(np->dev_name, ndev->name, IFNAMSIZ); 413 414 /* fill up the skb queue */ 415 refill_skbs(np); 416 417 /* last thing to do is link it to the net device structure */ 418 rcu_assign_pointer(ndev->npinfo, npinfo); 419 420 return 0; 421 422 free_npinfo: 423 kfree(npinfo); 424 out: 425 return err; 426 } 427 EXPORT_SYMBOL_GPL(__netpoll_setup); 428 429 /* 430 * Returns a pointer to a string representation of the identifier used 431 * to select the egress interface for the given netpoll instance. buf 432 * is used to format np->dev_mac when np->dev_name is empty; bufsz must 433 * be at least MAC_ADDR_STR_LEN + 1 to fit the formatted MAC address 434 * and its NUL terminator. 435 */ 436 static char *egress_dev(struct netpoll *np, char *buf, size_t bufsz) 437 { 438 if (np->dev_name[0]) 439 return np->dev_name; 440 441 snprintf(buf, bufsz, "%pM", np->dev_mac); 442 return buf; 443 } 444 445 static void netpoll_wait_carrier(struct netpoll *np, struct net_device *ndev, 446 unsigned int timeout) 447 { 448 unsigned long atmost; 449 450 atmost = jiffies + timeout * HZ; 451 while (!netif_carrier_ok(ndev)) { 452 if (time_after(jiffies, atmost)) { 453 np_notice(np, "timeout waiting for carrier\n"); 454 break; 455 } 456 msleep(1); 457 } 458 } 459 460 /* 461 * Take the IPv6 from ndev and populate local_ip structure in netpoll 462 */ 463 static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev) 464 { 465 char buf[MAC_ADDR_STR_LEN + 1]; 466 int err = -EDESTADDRREQ; 467 struct inet6_dev *idev; 468 469 if (!IS_ENABLED(CONFIG_IPV6)) { 470 np_err(np, "IPv6 is not supported %s, aborting\n", 471 egress_dev(np, buf, sizeof(buf))); 472 return -EINVAL; 473 } 474 475 idev = __in6_dev_get(ndev); 476 if (idev) { 477 struct inet6_ifaddr *ifp; 478 479 read_lock_bh(&idev->lock); 480 list_for_each_entry(ifp, &idev->addr_list, if_list) { 481 if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) != 482 !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) 483 continue; 484 /* Got the IP, let's return */ 485 np->local_ip.in6 = ifp->addr; 486 err = 0; 487 break; 488 } 489 read_unlock_bh(&idev->lock); 490 } 491 if (err) { 492 np_err(np, "no IPv6 address for %s, aborting\n", 493 egress_dev(np, buf, sizeof(buf))); 494 return err; 495 } 496 497 np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); 498 return 0; 499 } 500 501 /* 502 * Take the IPv4 from ndev and populate local_ip structure in netpoll 503 */ 504 static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev) 505 { 506 char buf[MAC_ADDR_STR_LEN + 1]; 507 const struct in_ifaddr *ifa; 508 struct in_device *in_dev; 509 510 in_dev = __in_dev_get_rtnl(ndev); 511 if (!in_dev) { 512 np_err(np, "no IP address for %s, aborting\n", 513 egress_dev(np, buf, sizeof(buf))); 514 return -EDESTADDRREQ; 515 } 516 517 ifa = rtnl_dereference(in_dev->ifa_list); 518 if (!ifa) { 519 np_err(np, "no IP address for %s, aborting\n", 520 egress_dev(np, buf, sizeof(buf))); 521 return -EDESTADDRREQ; 522 } 523 524 np->local_ip.ip = ifa->ifa_local; 525 np_info(np, "local IP %pI4\n", &np->local_ip.ip); 526 527 return 0; 528 } 529 530 /* 531 * Test whether the caller left np->local_ip unset, so that 532 * netpoll_setup() should auto-populate it from the egress device. 533 * 534 * np->local_ip is a union of __be32 (IPv4) and struct in6_addr (IPv6), 535 * so an IPv6 address whose first 4 bytes are zero (e.g. ::1, ::2, 536 * IPv4-mapped ::ffff:a.b.c.d) must not be tested via the IPv4 arm — 537 * doing so would misclassify a caller-supplied address as unset and 538 * silently overwrite it with whatever address the device exposes. 539 */ 540 static bool netpoll_local_ip_unset(const struct netpoll *np) 541 { 542 if (np->ipv6) 543 return ipv6_addr_any(&np->local_ip.in6); 544 return !np->local_ip.ip; 545 } 546 547 int netpoll_setup(struct netpoll *np) 548 { 549 struct net *net = current->nsproxy->net_ns; 550 char buf[MAC_ADDR_STR_LEN + 1]; 551 struct net_device *ndev = NULL; 552 bool ip_overwritten = false; 553 int err; 554 555 rtnl_lock(); 556 if (np->dev_name[0]) 557 ndev = __dev_get_by_name(net, np->dev_name); 558 else if (is_valid_ether_addr(np->dev_mac)) 559 ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac); 560 561 if (!ndev) { 562 np_err(np, "%s doesn't exist, aborting\n", 563 egress_dev(np, buf, sizeof(buf))); 564 err = -ENODEV; 565 goto unlock; 566 } 567 netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL); 568 569 if (netdev_master_upper_dev_get(ndev)) { 570 np_err(np, "%s is a slave device, aborting\n", 571 egress_dev(np, buf, sizeof(buf))); 572 err = -EBUSY; 573 goto put; 574 } 575 576 if (!netif_running(ndev)) { 577 np_info(np, "device %s not up yet, forcing it\n", 578 egress_dev(np, buf, sizeof(buf))); 579 580 err = dev_open(ndev, NULL); 581 if (err) { 582 np_err(np, "failed to open %s\n", ndev->name); 583 goto put; 584 } 585 586 rtnl_unlock(); 587 netpoll_wait_carrier(np, ndev, carrier_timeout); 588 rtnl_lock(); 589 } 590 591 if (netpoll_local_ip_unset(np)) { 592 if (!np->ipv6) { 593 err = netpoll_take_ipv4(np, ndev); 594 if (err) 595 goto put; 596 } else { 597 err = netpoll_take_ipv6(np, ndev); 598 if (err) 599 goto put; 600 } 601 ip_overwritten = true; 602 } 603 604 err = __netpoll_setup(np, ndev); 605 if (err) 606 goto flush; 607 rtnl_unlock(); 608 609 /* Make sure all NAPI polls which started before dev->npinfo 610 * was visible have exited before we start calling NAPI poll. 611 * NAPI skips locking if dev->npinfo is NULL. 612 */ 613 synchronize_rcu(); 614 615 return 0; 616 617 flush: 618 skb_pool_flush(np); 619 put: 620 DEBUG_NET_WARN_ON_ONCE(np->dev); 621 if (ip_overwritten) 622 memset(&np->local_ip, 0, sizeof(np->local_ip)); 623 netdev_put(ndev, &np->dev_tracker); 624 unlock: 625 rtnl_unlock(); 626 return err; 627 } 628 EXPORT_SYMBOL(netpoll_setup); 629 630 static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) 631 { 632 struct netpoll_info *npinfo = 633 container_of(rcu_head, struct netpoll_info, rcu); 634 635 skb_queue_purge(&npinfo->txq); 636 637 /* we can't call cancel_delayed_work_sync here, as we are in softirq */ 638 cancel_delayed_work(&npinfo->tx_work); 639 640 /* clean after last, unfinished work */ 641 __skb_queue_purge(&npinfo->txq); 642 /* now cancel it again */ 643 cancel_delayed_work(&npinfo->tx_work); 644 kfree(npinfo); 645 } 646 647 static void __netpoll_cleanup(struct netpoll *np) 648 { 649 struct netpoll_info *npinfo; 650 651 npinfo = rtnl_dereference(np->dev->npinfo); 652 if (!npinfo) 653 return; 654 655 /* At this point, there is a single npinfo instance per netdevice, and 656 * its refcnt tracks how many netpoll structures are linked to it. We 657 * only perform npinfo cleanup when the refcnt decrements to zero. 658 */ 659 if (refcount_dec_and_test(&npinfo->refcnt)) { 660 const struct net_device_ops *ops; 661 662 ops = np->dev->netdev_ops; 663 if (ops->ndo_netpoll_cleanup) 664 ops->ndo_netpoll_cleanup(np->dev); 665 666 RCU_INIT_POINTER(np->dev->npinfo, NULL); 667 call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); 668 } 669 670 skb_pool_flush(np); 671 } 672 673 void __netpoll_free(struct netpoll *np) 674 { 675 ASSERT_RTNL(); 676 677 /* Wait for transmitting packets to finish before freeing. */ 678 synchronize_net(); 679 __netpoll_cleanup(np); 680 kfree(np); 681 } 682 EXPORT_SYMBOL_GPL(__netpoll_free); 683 684 void do_netpoll_cleanup(struct netpoll *np) 685 { 686 __netpoll_cleanup(np); 687 netdev_put(np->dev, &np->dev_tracker); 688 np->dev = NULL; 689 } 690 EXPORT_SYMBOL(do_netpoll_cleanup); 691 692 void netpoll_cleanup(struct netpoll *np) 693 { 694 rtnl_lock(); 695 if (!np->dev) 696 goto out; 697 do_netpoll_cleanup(np); 698 out: 699 rtnl_unlock(); 700 } 701 EXPORT_SYMBOL(netpoll_cleanup); 702