/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/error-injection.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>

#include <trace/events/page_pool.h>

#include "dev.h"
#include "mp_dmabuf_devmem.h"
#include "netmem_priv.h"
#include "page_pool_priv.h"

DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

#define BIAS_MAX        (LONG_MAX >> 1)

#ifdef CONFIG_PAGE_POOL_STATS
static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);

/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)    (pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)                                          \
        do {                                                                    \
                struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
                this_cpu_inc(s->__stat);                                        \
        } while (0)

#define recycle_stat_add(pool, __stat, val)                                     \
        do {                                                                    \
                struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
                this_cpu_add(s->__stat, val);                                   \
        } while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
        "rx_pp_alloc_fast",
        "rx_pp_alloc_slow",
        "rx_pp_alloc_slow_ho",
        "rx_pp_alloc_empty",
        "rx_pp_alloc_refill",
        "rx_pp_alloc_waive",
        "rx_pp_recycle_cached",
        "rx_pp_recycle_cache_full",
        "rx_pp_recycle_ring",
        "rx_pp_recycle_ring_full",
        "rx_pp_recycle_released_ref",
};

/**
 * page_pool_get_stats() - fetch page pool stats
 * @pool:	pool from which page was allocated
 * @stats:	struct page_pool_stats to fill in
 *
 * Retrieve statistics about the page_pool. This API is only available
 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
 * A pointer to a caller allocated struct page_pool_stats structure
 * is passed to this API which is filled in. The caller can then report
 * those stats to the user (perhaps via ethtool, debugfs, etc.).
 */
bool page_pool_get_stats(const struct page_pool *pool,
                         struct page_pool_stats *stats)
{
        int cpu = 0;

        if (!stats)
                return false;

        /* The caller is responsible for initializing stats. */
        stats->alloc_stats.fast += pool->alloc_stats.fast;
        stats->alloc_stats.slow += pool->alloc_stats.slow;
        stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
        stats->alloc_stats.empty += pool->alloc_stats.empty;
        stats->alloc_stats.refill += pool->alloc_stats.refill;
        stats->alloc_stats.waive += pool->alloc_stats.waive;

        for_each_possible_cpu(cpu) {
                const struct page_pool_recycle_stats *pcpu =
                        per_cpu_ptr(pool->recycle_stats, cpu);

                stats->recycle_stats.cached += pcpu->cached;
                stats->recycle_stats.cache_full += pcpu->cache_full;
                stats->recycle_stats.ring += pcpu->ring;
                stats->recycle_stats.ring_full += pcpu->ring_full;
                stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
        }

        return true;
}
EXPORT_SYMBOL(page_pool_get_stats);

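/* Illustrative sketch (not part of this file): a driver with one page_pool
 * per RX queue can aggregate stats from all of its pools before reporting
 * them. The priv/rx_ring structures and field names below are hypothetical;
 * only page_pool_get_stats() and struct page_pool_stats are real APIs.
 *
 *	struct page_pool_stats stats = { };
 *	int i;
 *
 *	for (i = 0; i < priv->num_rx_rings; i++)
 *		page_pool_get_stats(priv->rx_ring[i].page_pool, &stats);
 *
 * The accumulated counters can then be copied into the u64 array that
 * ethtool expects, e.g. via page_pool_ethtool_stats_get() below.
 */
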
u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
                memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
                data += ETH_GSTRING_LEN;
        }

        return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);

int page_pool_ethtool_stats_get_count(void)
{
        return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);

u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
{
        const struct page_pool_stats *pool_stats = stats;

        *data++ = pool_stats->alloc_stats.fast;
        *data++ = pool_stats->alloc_stats.slow;
        *data++ = pool_stats->alloc_stats.slow_high_order;
        *data++ = pool_stats->alloc_stats.empty;
        *data++ = pool_stats->alloc_stats.refill;
        *data++ = pool_stats->alloc_stats.waive;
        *data++ = pool_stats->recycle_stats.cached;
        *data++ = pool_stats->recycle_stats.cache_full;
        *data++ = pool_stats->recycle_stats.ring;
        *data++ = pool_stats->recycle_stats.ring_full;
        *data++ = pool_stats->recycle_stats.released_refcnt;

        return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);

#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif

static bool page_pool_producer_lock(struct page_pool *pool)
        __acquires(&pool->ring.producer_lock)
{
        bool in_softirq = in_softirq();

        if (in_softirq)
                spin_lock(&pool->ring.producer_lock);
        else
                spin_lock_bh(&pool->ring.producer_lock);

        return in_softirq;
}

static void page_pool_producer_unlock(struct page_pool *pool,
                                      bool in_softirq)
        __releases(&pool->ring.producer_lock)
{
        if (in_softirq)
                spin_unlock(&pool->ring.producer_lock);
        else
                spin_unlock_bh(&pool->ring.producer_lock);
}

static void page_pool_struct_check(void)
{
        CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
        CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
        CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
        CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
                                    PAGE_POOL_FRAG_GROUP_ALIGN);
}

static int page_pool_init(struct page_pool *pool,
                          const struct page_pool_params *params,
                          int cpuid)
{
        unsigned int ring_qsize = 1024; /* Default */
        struct netdev_rx_queue *rxq;
        int err;

        page_pool_struct_check();

        memcpy(&pool->p, &params->fast, sizeof(pool->p));
        memcpy(&pool->slow, &params->slow, sizeof(pool->slow));

        pool->cpuid = cpuid;
        pool->dma_sync_for_cpu = true;

        /* Validate only known flags were used */
        if (pool->slow.flags & ~PP_FLAG_ALL)
                return -EINVAL;

        if (pool->p.pool_size)
                ring_qsize = pool->p.pool_size;

        /* Sanity limit mem that can be pinned down */
        if (ring_qsize > 32768)
                return -E2BIG;

        /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
         * DMA_BIDIRECTIONAL allows the page to also be used for DMA sending,
         * which is the XDP_TX use-case.
         */
        if (pool->slow.flags & PP_FLAG_DMA_MAP) {
                if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
                    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
                        return -EINVAL;

                pool->dma_map = true;
        }

        if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
                /* In order to request DMA-sync-for-device the page
                 * needs to be mapped
                 */
                if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
                        return -EINVAL;

                if (!pool->p.max_len)
                        return -EINVAL;

                pool->dma_sync = true;

                /* pool->p.offset has to be set according to the address
                 * offset used by the DMA engine to start copying rx data
                 */
        }

        pool->has_init_callback = !!pool->slow.init_callback;

#ifdef CONFIG_PAGE_POOL_STATS
        if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
                pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
                if (!pool->recycle_stats)
                        return -ENOMEM;
        } else {
                /* For system page pool instance we use a singular stats object
                 * instead of allocating a separate percpu variable for each
                 * (also percpu) page pool instance.
                 */
                pool->recycle_stats = &pp_system_recycle_stats;
                pool->system = true;
        }
#endif

        if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
                if (!pool->system)
                        free_percpu(pool->recycle_stats);
#endif
                return -ENOMEM;
        }

        atomic_set(&pool->pages_state_release_cnt, 0);

        /* A driver calling page_pool_create() must also call page_pool_destroy() */
        refcount_set(&pool->user_cnt, 1);

        if (pool->dma_map)
                get_device(pool->p.dev);

        if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
                /* We rely on rtnl_lock()ing to make sure netdev_rx_queue
                 * configuration doesn't change while we're initializing
                 * the page_pool.
                 */
                ASSERT_RTNL();
                rxq = __netif_get_rx_queue(pool->slow.netdev,
                                           pool->slow.queue_idx);
                pool->mp_priv = rxq->mp_params.mp_priv;
                pool->mp_ops = rxq->mp_params.mp_ops;
        }

        if (pool->mp_ops) {
                if (!pool->dma_map || !pool->dma_sync)
                        return -EOPNOTSUPP;

                if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) {
                        err = -EFAULT;
                        goto free_ptr_ring;
                }

                err = pool->mp_ops->init(pool);
                if (err) {
                        pr_warn("%s() mem-provider init failed %d\n", __func__,
                                err);
                        goto free_ptr_ring;
                }

                static_branch_inc(&page_pool_mem_providers);
        }

        return 0;

free_ptr_ring:
        ptr_ring_cleanup(&pool->ring, NULL);
#ifdef CONFIG_PAGE_POOL_STATS
        if (!pool->system)
                free_percpu(pool->recycle_stats);
#endif
        return err;
}

static void page_pool_uninit(struct page_pool *pool)
{
        ptr_ring_cleanup(&pool->ring, NULL);

        if (pool->dma_map)
                put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
        if (!pool->system)
                free_percpu(pool->recycle_stats);
#endif
}

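/* Illustrative sketch (not part of this file) of the constraints that
 * page_pool_init() above enforces, seen from the driver side. A typical
 * RX pool that wants the page_pool to handle DMA mapping and device syncs
 * sets both PP_FLAG_DMA_MAP and PP_FLAG_DMA_SYNC_DEV, a valid dma_dir and
 * a non-zero max_len. The names rx_ring_size, pdev and rx_napi below are
 * placeholders for driver-owned objects.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= rx_ring_size,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.max_len	= PAGE_SIZE,
 *		.offset		= 0,
 *		.napi		= &rx_napi,
 *	};
 *	struct page_pool *pool = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 *
 * Violating the rules above (e.g. PP_FLAG_DMA_SYNC_DEV without
 * PP_FLAG_DMA_MAP, or max_len == 0) makes page_pool_create() fail with
 * -EINVAL.
 */
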
/**
 * page_pool_create_percpu() - create a page pool for a given cpu.
 * @params: parameters, see struct page_pool_params
 * @cpuid: cpu identifier
 */
struct page_pool *
page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
{
        struct page_pool *pool;
        int err;

        pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
        if (!pool)
                return ERR_PTR(-ENOMEM);

        err = page_pool_init(pool, params, cpuid);
        if (err < 0)
                goto err_free;

        err = page_pool_list(pool);
        if (err)
                goto err_uninit;

        return pool;

err_uninit:
        page_pool_uninit(pool);
err_free:
        pr_warn("%s() gave up with errno %d\n", __func__, err);
        kfree(pool);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(page_pool_create_percpu);

/**
 * page_pool_create() - create a page pool
 * @params: parameters, see struct page_pool_params
 */
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
        return page_pool_create_percpu(params, -1);
}
EXPORT_SYMBOL(page_pool_create);

static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);

static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
{
        struct ptr_ring *r = &pool->ring;
        netmem_ref netmem;
        int pref_nid; /* preferred NUMA node */

        /* Quicker fallback, avoid locks when ring is empty */
        if (__ptr_ring_empty(r)) {
                alloc_stat_inc(pool, empty);
                return 0;
        }

        /* Running in softirq guarantees that the CPU, and thus the NUMA node,
         * is stable. This assumes the CPU refilling the driver RX-ring also
         * runs the RX-NAPI.
         */
#ifdef CONFIG_NUMA
        pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
        /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
        pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

        /* Refill alloc array, but only if NUMA match */
        do {
                netmem = (__force netmem_ref)__ptr_ring_consume(r);
                if (unlikely(!netmem))
                        break;

                if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
                        pool->alloc.cache[pool->alloc.count++] = netmem;
                } else {
                        /* NUMA mismatch;
                         * (1) release 1 page to page-allocator and
                         * (2) break out to fall through to alloc_pages_node.
                         * This limits stress on the page buddy allocator.
                         */
                        page_pool_return_page(pool, netmem);
                        alloc_stat_inc(pool, waive);
                        netmem = 0;
                        break;
                }
        } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

        /* Return last page */
        if (likely(pool->alloc.count > 0)) {
                netmem = pool->alloc.cache[--pool->alloc.count];
                alloc_stat_inc(pool, refill);
        }

        return netmem;
}

/* fast path */
static netmem_ref __page_pool_get_cached(struct page_pool *pool)
{
        netmem_ref netmem;

        /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
        if (likely(pool->alloc.count)) {
                /* Fast-path */
                netmem = pool->alloc.cache[--pool->alloc.count];
                alloc_stat_inc(pool, fast);
        } else {
                netmem = page_pool_refill_alloc_cache(pool);
        }

        return netmem;
}

static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
                                            netmem_ref netmem,
                                            u32 dma_sync_size)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
        dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);

        dma_sync_size = min(dma_sync_size, pool->p.max_len);
        __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
                                     dma_sync_size, pool->p.dma_dir);
#endif
}

static __always_inline void
page_pool_dma_sync_for_device(const struct page_pool *pool,
                              netmem_ref netmem,
                              u32 dma_sync_size)
{
        if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
                __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
}

static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
{
        dma_addr_t dma;

        /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
         * since dma_addr_t can be either 32 or 64 bits and does not always fit
         * into page private data (i.e 32bit cpu with 64bit DMA caps)
         * This mapping is kept for lifetime of page, until leaving pool.
         */
        dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
                                 (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
                                 DMA_ATTR_SKIP_CPU_SYNC |
                                         DMA_ATTR_WEAK_ORDERING);
        if (dma_mapping_error(pool->p.dev, dma))
                return false;

        if (page_pool_set_dma_addr_netmem(netmem, dma))
                goto unmap_failed;

        page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);

        return true;

unmap_failed:
        WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
        dma_unmap_page_attrs(pool->p.dev, dma,
                             PAGE_SIZE << pool->p.order, pool->p.dma_dir,
                             DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
        return false;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
                                                 gfp_t gfp)
{
        struct page *page;

        gfp |= __GFP_COMP;
        page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
        if (unlikely(!page))
                return NULL;

        if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
                put_page(page);
                return NULL;
        }

        alloc_stat_inc(pool, slow_high_order);
        page_pool_set_pp_info(pool, page_to_netmem(page));

        /* Track how many pages are held 'in-flight' */
        pool->pages_state_hold_cnt++;
        trace_page_pool_state_hold(pool, page_to_netmem(page),
                                   pool->pages_state_hold_cnt);
        return page;
}

/* slow path */
static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
                                                        gfp_t gfp)
{
        const int bulk = PP_ALLOC_CACHE_REFILL;
        unsigned int pp_order = pool->p.order;
        bool dma_map = pool->dma_map;
        netmem_ref netmem;
        int i, nr_pages;

        /* Don't support bulk alloc for high-order pages */
        if (unlikely(pp_order))
                return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));

        /* Unnecessary as alloc cache is empty, but guarantees zero count */
        if (unlikely(pool->alloc.count > 0))
                return pool->alloc.cache[--pool->alloc.count];

        /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */
        memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

        nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk,
                                         (struct page **)pool->alloc.cache);
        if (unlikely(!nr_pages))
                return 0;

        /* Pages have been filled into the alloc.cache array, but the count is
         * zero and the pages have not yet been (possibly) DMA mapped.
         */
        for (i = 0; i < nr_pages; i++) {
                netmem = pool->alloc.cache[i];
                if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
                        put_page(netmem_to_page(netmem));
                        continue;
                }

                page_pool_set_pp_info(pool, netmem);
                pool->alloc.cache[pool->alloc.count++] = netmem;
                /* Track how many pages are held 'in-flight' */
                pool->pages_state_hold_cnt++;
                trace_page_pool_state_hold(pool, netmem,
                                           pool->pages_state_hold_cnt);
        }

        /* Return last page */
        if (likely(pool->alloc.count > 0)) {
                netmem = pool->alloc.cache[--pool->alloc.count];
                alloc_stat_inc(pool, slow);
        } else {
                netmem = 0;
        }

        /* A page that was just allocated should/must have refcnt 1. */
        return netmem;
}

/* Use page_pool to replace alloc_pages() API calls, with the added benefit
 * of a synchronization guarantee on the allocation side.
 */
netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp)
{
        netmem_ref netmem;

        /* Fast-path: Get a page from cache */
        netmem = __page_pool_get_cached(pool);
        if (netmem)
                return netmem;

        /* Slow-path: cache empty, do real allocation */
        if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
                netmem = pool->mp_ops->alloc_netmems(pool, gfp);
        else
                netmem = __page_pool_alloc_pages_slow(pool, gfp);
        return netmem;
}
EXPORT_SYMBOL(page_pool_alloc_netmems);
ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL);

struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
        return netmem_to_page(page_pool_alloc_netmems(pool, gfp));
}
EXPORT_SYMBOL(page_pool_alloc_pages);

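/* Illustrative sketch (not part of this file): the allocation side as seen
 * from a driver RX-refill loop running in NAPI/softirq context. The rx_desc
 * layout and hdr_offset name are hypothetical; page_pool_dev_alloc_pages()
 * and page_pool_get_dma_addr() are real helpers from
 * <net/page_pool/helpers.h>.
 *
 *	struct page *page = page_pool_dev_alloc_pages(pool);
 *
 *	if (!page)
 *		return -ENOMEM;
 *	rx_desc->addr = page_pool_get_dma_addr(page) + hdr_offset;
 *
 * When PP_FLAG_DMA_MAP is set the returned page is already DMA mapped, and
 * with PP_FLAG_DMA_SYNC_DEV it has also been synced for device up to
 * max_len bytes past the configured offset.
 */
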
/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b) (s32)((a) - (b))

s32 page_pool_inflight(const struct page_pool *pool, bool strict)
{
        u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
        u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
        s32 inflight;

        inflight = _distance(hold_cnt, release_cnt);

        if (strict) {
                trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
                WARN(inflight < 0, "Negative(%d) inflight packet-pages",
                     inflight);
        } else {
                inflight = max(0, inflight);
        }

        return inflight;
}

void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
{
        netmem_set_pp(netmem, pool);
        netmem_or_pp_magic(netmem, PP_SIGNATURE);

        /* Ensuring all pages have been split into one fragment initially:
         * page_pool_set_pp_info() is only called once for every page when it
         * is allocated from the page allocator and page_pool_fragment_page()
         * is dirtying the same cache line as the page->pp_magic above, so
         * the overhead is negligible.
         */
        page_pool_fragment_netmem(netmem, 1);
        if (pool->has_init_callback)
                pool->slow.init_callback(netmem, pool->slow.init_arg);
}

void page_pool_clear_pp_info(netmem_ref netmem)
{
        netmem_clear_pp_magic(netmem);
        netmem_set_pp(netmem, NULL);
}

static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
                                                         netmem_ref netmem)
{
        dma_addr_t dma;

        if (!pool->dma_map)
                /* Always account for inflight pages, even if we didn't
                 * map them
                 */
                return;

        dma = page_pool_get_dma_addr_netmem(netmem);

        /* When page is unmapped, it cannot be returned to our pool */
        dma_unmap_page_attrs(pool->p.dev, dma,
                             PAGE_SIZE << pool->p.order, pool->p.dma_dir,
                             DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
        page_pool_set_dma_addr_netmem(netmem, 0);
}

/* Disconnects a page from a page_pool. API users can need to disconnect
 * a page, to allow it to be used as a regular page (that will eventually
 * be returned to the normal page-allocator via put_page()).
 */
void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
{
        int count;
        bool put;

        put = true;
        if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
                put = pool->mp_ops->release_netmem(pool, netmem);
        else
                __page_pool_release_page_dma(pool, netmem);

        /* This may be the last page returned, releasing the pool, so
         * it is not safe to reference pool afterwards.
         */
        count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
        trace_page_pool_state_release(pool, netmem, count);

        if (put) {
                page_pool_clear_pp_info(netmem);
                put_page(netmem_to_page(netmem));
        }
        /* An optimization would be to call __free_pages(page, pool->p.order)
         * knowing page is not part of page-cache (thus avoiding a
         * __page_cache_release() call).
         */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
{
        int ret;
        /* BH protection not needed if current is softirq */
        if (in_softirq())
                ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
        else
                ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);

        if (!ret) {
                recycle_stat_inc(pool, ring);
                return true;
        }

        return false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(netmem_ref netmem,
                                       struct page_pool *pool)
{
        if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
                recycle_stat_inc(pool, cache_full);
                return false;
        }

        /* Caller MUST have verified/know (page_ref_count(page) == 1) */
        pool->alloc.cache[pool->alloc.count++] = netmem;
        recycle_stat_inc(pool, cached);
        return true;
}

static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
{
        return netmem_is_net_iov(netmem) ||
               (page_ref_count(netmem_to_page(netmem)) == 1 &&
                !page_is_pfmemalloc(netmem_to_page(netmem)));
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If pool->dma_sync is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->max_len).
 * If the page refcnt != 1, then the page will be returned to the
 * memory subsystem.
 */
static __always_inline netmem_ref
__page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
                     unsigned int dma_sync_size, bool allow_direct)
{
        lockdep_assert_no_hardirq();

        /* This allocator is optimized for the XDP mode that uses
         * one-frame-per-page, but has fallbacks that act like the
         * regular page allocator APIs.
         *
         * refcnt == 1 means page_pool owns page, and can recycle it.
         *
         * A page is NOT reusable when it was allocated while the system
         * was under memory pressure (page_is_pfmemalloc()).
         */
        if (likely(__page_pool_page_can_be_recycled(netmem))) {
                /* Read barrier done in page_ref_count / READ_ONCE */

                page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);

                if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
                        return 0;

                /* Page found as candidate for recycling */
                return netmem;
        }

        /* Fallback/non-XDP mode: the API user has an elevated refcnt.
         *
         * Many drivers split up the page into fragments, and some
         * want to keep doing this to save memory and do refcnt based
         * recycling. Support this use case too, to ease drivers
         * switching between XDP/non-XDP.
         *
         * In case page_pool maintains the DMA mapping, the API user must
         * call page_pool_put_page() once. In this elevated refcnt
         * case, the DMA is unmapped/released, as the driver is likely
         * doing refcnt-based recycle tricks, meaning another process
         * will be invoking put_page().
         */
        recycle_stat_inc(pool, released_refcnt);
        page_pool_return_page(pool, netmem);

        return 0;
}

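/* Illustrative sketch (not part of this file) of the return side described
 * above. A driver that consumed a full page and is done with it (e.g. the
 * XDP_DROP case) returns it from its NAPI poll loop; allow_direct may only
 * be true when running in the pool's NAPI/softirq context:
 *
 *	page_pool_put_full_page(pool, page, true);
 *
 * Outside that context (or when unsure), pass false and let
 * page_pool_napi_local() below decide whether the lockless alloc cache can
 * be used; the page otherwise falls back to the ptr_ring or, if that is
 * full, to the page allocator.
 */
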
static bool page_pool_napi_local(const struct page_pool *pool)
{
        const struct napi_struct *napi;
        u32 cpuid;

        if (unlikely(!in_softirq()))
                return false;

        /* Allow direct recycle if we have reasons to believe that we are
         * in the same context in which the consumer would run, so there's
         * no possible race.
         * __page_pool_put_page() makes sure we're not in hardirq context
         * and interrupts are enabled prior to accessing the cache.
         */
        cpuid = smp_processor_id();
        if (READ_ONCE(pool->cpuid) == cpuid)
                return true;

        napi = READ_ONCE(pool->p.napi);

        return napi && READ_ONCE(napi->list_owner) == cpuid;
}

void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
                                  unsigned int dma_sync_size, bool allow_direct)
{
        if (!allow_direct)
                allow_direct = page_pool_napi_local(pool);

        netmem =
                __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
        if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
                /* Cache full, fallback to free pages */
                recycle_stat_inc(pool, ring_full);
                page_pool_return_page(pool, netmem);
        }
}
EXPORT_SYMBOL(page_pool_put_unrefed_netmem);

void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
                                unsigned int dma_sync_size, bool allow_direct)
{
        page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
                                     allow_direct);
}
EXPORT_SYMBOL(page_pool_put_unrefed_page);

static void page_pool_recycle_ring_bulk(struct page_pool *pool,
                                        netmem_ref *bulk,
                                        u32 bulk_len)
{
        bool in_softirq;
        u32 i;

        /* Bulk produce into ptr_ring page_pool cache */
        in_softirq = page_pool_producer_lock(pool);

        for (i = 0; i < bulk_len; i++) {
                if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) {
                        /* ring full */
                        recycle_stat_inc(pool, ring_full);
                        break;
                }
        }

        page_pool_producer_unlock(pool, in_softirq);
        recycle_stat_add(pool, ring, i);

        /* Hopefully all pages were returned into ptr_ring */
        if (likely(i == bulk_len))
                return;

        /*
         * ptr_ring cache is full, free remaining pages outside producer lock
         * since put_page() with refcnt == 1 can be an expensive operation.
         */
        for (; i < bulk_len; i++)
                page_pool_return_page(pool, bulk[i]);
}

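/* Illustrative sketch (not part of this file) of the bulk-return pattern
 * that page_pool_put_netmem_bulk() below is designed for: an XDP_REDIRECT
 * TX completion loop batching frame returns through the xdp_frame_bulk
 * helpers, which end up in the bulk page_pool path for page_pool backed
 * frames. The completion-loop wording is pseudocode; only the xdp_frame_bulk
 * helpers are real APIs from <net/xdp.h>.
 *
 *	struct xdp_frame_bulk bq;
 *
 *	xdp_frame_bulk_init(&bq);
 *	for each completed xdp_frame *xdpf:
 *		xdp_return_frame_bulk(xdpf, &bq);
 *	xdp_flush_frame_bulk(&bq);
 */
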
/**
 * page_pool_put_netmem_bulk() - release references on multiple netmems
 * @data:	array holding netmem references
 * @count:	number of entries in @data
 *
 * Tries to refill a number of netmems into the ptr_ring cache while holding
 * the ptr_ring producer lock. If the ptr_ring is full,
 * page_pool_put_netmem_bulk() will release the leftover netmems to the memory
 * provider.
 * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx
 * completion loop for the XDP_REDIRECT use case.
 *
 * Please note the caller must not use the data area after running
 * page_pool_put_netmem_bulk(), as this function overwrites it.
 */
void page_pool_put_netmem_bulk(netmem_ref *data, u32 count)
{
        u32 bulk_len = 0;

        for (u32 i = 0; i < count; i++) {
                netmem_ref netmem = netmem_compound_head(data[i]);

                if (page_pool_unref_and_test(netmem))
                        data[bulk_len++] = netmem;
        }

        count = bulk_len;
        while (count) {
                netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
                struct page_pool *pool = NULL;
                bool allow_direct;
                u32 foreign = 0;

                bulk_len = 0;

                for (u32 i = 0; i < count; i++) {
                        struct page_pool *netmem_pp;
                        netmem_ref netmem = data[i];

                        netmem_pp = netmem_get_pp(netmem);
                        if (unlikely(!pool)) {
                                pool = netmem_pp;
                                allow_direct = page_pool_napi_local(pool);
                        } else if (netmem_pp != pool) {
                                /*
                                 * If the netmem belongs to a different
                                 * page_pool, save it for another round.
                                 */
                                data[foreign++] = netmem;
                                continue;
                        }

                        netmem = __page_pool_put_page(pool, netmem, -1,
                                                      allow_direct);
                        /* Approved for bulk recycling in ptr_ring cache */
                        if (netmem)
                                bulk[bulk_len++] = netmem;
                }

                if (bulk_len)
                        page_pool_recycle_ring_bulk(pool, bulk, bulk_len);

                count = foreign;
        }
}
EXPORT_SYMBOL(page_pool_put_netmem_bulk);

static netmem_ref page_pool_drain_frag(struct page_pool *pool,
                                       netmem_ref netmem)
{
        long drain_count = BIAS_MAX - pool->frag_users;

        /* Some user is still using the page frag */
        if (likely(page_pool_unref_netmem(netmem, drain_count)))
                return 0;

        if (__page_pool_page_can_be_recycled(netmem)) {
                page_pool_dma_sync_for_device(pool, netmem, -1);
                return netmem;
        }

        page_pool_return_page(pool, netmem);
        return 0;
}

static void page_pool_free_frag(struct page_pool *pool)
{
        long drain_count = BIAS_MAX - pool->frag_users;
        netmem_ref netmem = pool->frag_page;

        pool->frag_page = 0;

        if (!netmem || page_pool_unref_netmem(netmem, drain_count))
                return;

        page_pool_return_page(pool, netmem);
}

netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
                                       unsigned int *offset, unsigned int size,
                                       gfp_t gfp)
{
        unsigned int max_size = PAGE_SIZE << pool->p.order;
        netmem_ref netmem = pool->frag_page;

        if (WARN_ON(size > max_size))
                return 0;

        size = ALIGN(size, dma_get_cache_alignment());
        *offset = pool->frag_offset;

        if (netmem && *offset + size > max_size) {
                netmem = page_pool_drain_frag(pool, netmem);
                if (netmem) {
                        recycle_stat_inc(pool, cached);
                        alloc_stat_inc(pool, fast);
                        goto frag_reset;
                }
        }

        if (!netmem) {
                netmem = page_pool_alloc_netmems(pool, gfp);
                if (unlikely(!netmem)) {
                        pool->frag_page = 0;
                        return 0;
                }

                pool->frag_page = netmem;

frag_reset:
                pool->frag_users = 1;
                *offset = 0;
                pool->frag_offset = size;
                page_pool_fragment_netmem(netmem, BIAS_MAX);
                return netmem;
        }

        pool->frag_users++;
        pool->frag_offset = *offset + size;
        return netmem;
}
EXPORT_SYMBOL(page_pool_alloc_frag_netmem);

struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
                                  unsigned int size, gfp_t gfp)
{
        return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
                                                          gfp));
}
EXPORT_SYMBOL(page_pool_alloc_frag);

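/* Illustrative sketch (not part of this file): for small MTUs a driver can
 * pack several RX buffers into one page via the frag API above instead of
 * spending a full page per frame. The buffer size and rx_desc naming are
 * hypothetical; page_pool_dev_alloc_frag() and page_pool_get_dma_addr() are
 * real helpers from <net/page_pool/helpers.h>.
 *
 *	unsigned int offset;
 *	struct page *page;
 *
 *	page = page_pool_dev_alloc_frag(pool, &offset, 2048);
 *	if (!page)
 *		return -ENOMEM;
 *	rx_desc->addr = page_pool_get_dma_addr(page) + offset;
 *
 * The page's fragment count (seeded with BIAS_MAX above) tracks the
 * outstanding users; fragments are returned through the normal
 * page_pool_put_page()/skb recycling paths.
 */
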
static void page_pool_empty_ring(struct page_pool *pool)
{
        netmem_ref netmem;

        /* Empty recycle ring */
        while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
                /* Verify the refcnt invariant of cached pages */
                if (!(netmem_ref_count(netmem) == 1))
                        pr_crit("%s() page_pool refcnt %d violation\n",
                                __func__, netmem_ref_count(netmem));

                page_pool_return_page(pool, netmem);
        }
}

static void __page_pool_destroy(struct page_pool *pool)
{
        if (pool->disconnect)
                pool->disconnect(pool);

        page_pool_unlist(pool);
        page_pool_uninit(pool);

        if (pool->mp_ops) {
                pool->mp_ops->destroy(pool);
                static_branch_dec(&page_pool_mem_providers);
        }

        kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
        netmem_ref netmem;

        if (pool->destroy_cnt)
                return;

        /* Empty the alloc cache; assume the caller has made sure it is
         * no longer in use, and that page_pool_alloc_pages() cannot be
         * called concurrently.
         */
        while (pool->alloc.count) {
                netmem = pool->alloc.cache[--pool->alloc.count];
                page_pool_return_page(pool, netmem);
        }
}

static void page_pool_scrub(struct page_pool *pool)
{
        page_pool_empty_alloc_cache_once(pool);
        pool->destroy_cnt++;

        /* No more consumers should exist, but producers could still
         * be in-flight.
         */
        page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
        int inflight;

        page_pool_scrub(pool);
        inflight = page_pool_inflight(pool, true);
        if (!inflight)
                __page_pool_destroy(pool);

        return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
        struct delayed_work *dwq = to_delayed_work(wq);
        struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
        void *netdev;
        int inflight;

        inflight = page_pool_release(pool);
        /* In rare cases, a driver bug may cause inflight to go negative.
         * Don't reschedule release if inflight is 0 or negative:
         * - if 0, the page_pool has been destroyed
         * - if negative, we will never recover
         * In both cases no reschedule is necessary.
         */
        if (inflight <= 0)
                return;

        /* Periodic warning for page pools the user can't see */
        netdev = READ_ONCE(pool->slow.netdev);
        if (time_after_eq(jiffies, pool->defer_warn) &&
            (!netdev || netdev == NET_PTR_POISON)) {
                int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

                pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
                        __func__, pool->user.id, inflight, sec);
                pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
        }

        /* Still not ready to be disconnected, retry later */
        schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
                           const struct xdp_mem_info *mem)
{
        refcount_inc(&pool->user_cnt);
        pool->disconnect = disconnect;
        pool->xdp_mem_id = mem->id;
}

void page_pool_disable_direct_recycling(struct page_pool *pool)
{
        /* Disable direct recycling based on pool->cpuid.
         * Paired with READ_ONCE() in page_pool_napi_local().
         */
        WRITE_ONCE(pool->cpuid, -1);

        if (!pool->p.napi)
                return;

        napi_assert_will_not_race(pool->p.napi);

        mutex_lock(&page_pools_lock);
        WRITE_ONCE(pool->p.napi, NULL);
        mutex_unlock(&page_pools_lock);
}
EXPORT_SYMBOL(page_pool_disable_direct_recycling);

void page_pool_destroy(struct page_pool *pool)
{
        if (!pool)
                return;

        if (!page_pool_put(pool))
                return;

        page_pool_disable_direct_recycling(pool);
        page_pool_free_frag(pool);

        if (!page_pool_release(pool))
                return;

        page_pool_detached(pool);
        pool->defer_start = jiffies;
        pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;

        INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
        schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);

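/* Illustrative sketch (not part of this file) of the shutdown ordering that
 * page_pool_destroy() above expects from a driver: quiesce RX and disable
 * NAPI first, drop any XDP memory-model reference, then destroy the pool.
 * In-flight pages keep the pool alive via the deferred release work until
 * they are returned. The rx_napi/rx_xdp_rxq names are placeholders for
 * driver-owned objects.
 *
 *	napi_disable(&rx_napi);
 *	xdp_rxq_info_unreg(&rx_xdp_rxq);
 *	page_pool_destroy(pool);
 *
 * xdp_rxq_info_unreg() releases the user reference taken via
 * page_pool_use_xdp_mem(), and page_pool_destroy() drops the driver's own
 * reference; the pool is freed once both are gone and no pages remain
 * in flight.
 */
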
/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
        netmem_ref netmem;

        trace_page_pool_update_nid(pool, new_nid);
        pool->p.nid = new_nid;

        /* Flush pool alloc cache, as refill will check NUMA node */
        while (pool->alloc.count) {
                netmem = pool->alloc.cache[--pool->alloc.count];
                page_pool_return_page(pool, netmem);
        }
}
EXPORT_SYMBOL(page_pool_update_nid);

bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr)
{
        return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr);
}

/* Associate a niov with a page pool. Should follow with a matching
 * net_mp_niov_clear_page_pool()
 */
void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov)
{
        netmem_ref netmem = net_iov_to_netmem(niov);

        page_pool_set_pp_info(pool, netmem);

        pool->pages_state_hold_cnt++;
        trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt);
}

/* Disassociate a niov from a page pool. Should only be used in the
 * ->release_netmem() path.
 */
void net_mp_niov_clear_page_pool(struct net_iov *niov)
{
        netmem_ref netmem = net_iov_to_netmem(niov);

        page_pool_clear_pp_info(netmem);
}