1 /* SPDX-License-Identifier: GPL-2.0 2 * 3 * page_pool.c 4 * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> 5 * Copyright (C) 2016 Red Hat, Inc. 6 */ 7 8 #include <linux/error-injection.h> 9 #include <linux/types.h> 10 #include <linux/kernel.h> 11 #include <linux/slab.h> 12 #include <linux/device.h> 13 14 #include <net/page_pool/helpers.h> 15 #include <net/xdp.h> 16 17 #include <linux/dma-direction.h> 18 #include <linux/dma-mapping.h> 19 #include <linux/page-flags.h> 20 #include <linux/mm.h> /* for put_page() */ 21 #include <linux/poison.h> 22 #include <linux/ethtool.h> 23 #include <linux/netdevice.h> 24 25 #include <trace/events/page_pool.h> 26 27 #include "page_pool_priv.h" 28 29 #define DEFER_TIME (msecs_to_jiffies(1000)) 30 #define DEFER_WARN_INTERVAL (60 * HZ) 31 32 #define BIAS_MAX (LONG_MAX >> 1) 33 34 #ifdef CONFIG_PAGE_POOL_STATS 35 static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats); 36 37 /* alloc_stat_inc is intended to be used in softirq context */ 38 #define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) 39 /* recycle_stat_inc is safe to use when preemption is possible. */ 40 #define recycle_stat_inc(pool, __stat) \ 41 do { \ 42 struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ 43 this_cpu_inc(s->__stat); \ 44 } while (0) 45 46 #define recycle_stat_add(pool, __stat, val) \ 47 do { \ 48 struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ 49 this_cpu_add(s->__stat, val); \ 50 } while (0) 51 52 static const char pp_stats[][ETH_GSTRING_LEN] = { 53 "rx_pp_alloc_fast", 54 "rx_pp_alloc_slow", 55 "rx_pp_alloc_slow_ho", 56 "rx_pp_alloc_empty", 57 "rx_pp_alloc_refill", 58 "rx_pp_alloc_waive", 59 "rx_pp_recycle_cached", 60 "rx_pp_recycle_cache_full", 61 "rx_pp_recycle_ring", 62 "rx_pp_recycle_ring_full", 63 "rx_pp_recycle_released_ref", 64 }; 65 66 /** 67 * page_pool_get_stats() - fetch page pool stats 68 * @pool: pool from which page was allocated 69 * @stats: struct page_pool_stats to fill in 70 * 71 * Retrieve statistics about the page_pool. This API is only available 72 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``. 73 * A pointer to a caller allocated struct page_pool_stats structure 74 * is passed to this API which is filled in. The caller can then report 75 * those stats to the user (perhaps via ethtool, debugfs, etc.). 76 */ 77 bool page_pool_get_stats(const struct page_pool *pool, 78 struct page_pool_stats *stats) 79 { 80 int cpu = 0; 81 82 if (!stats) 83 return false; 84 85 /* The caller is responsible to initialize stats. */ 86 stats->alloc_stats.fast += pool->alloc_stats.fast; 87 stats->alloc_stats.slow += pool->alloc_stats.slow; 88 stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order; 89 stats->alloc_stats.empty += pool->alloc_stats.empty; 90 stats->alloc_stats.refill += pool->alloc_stats.refill; 91 stats->alloc_stats.waive += pool->alloc_stats.waive; 92 93 for_each_possible_cpu(cpu) { 94 const struct page_pool_recycle_stats *pcpu = 95 per_cpu_ptr(pool->recycle_stats, cpu); 96 97 stats->recycle_stats.cached += pcpu->cached; 98 stats->recycle_stats.cache_full += pcpu->cache_full; 99 stats->recycle_stats.ring += pcpu->ring; 100 stats->recycle_stats.ring_full += pcpu->ring_full; 101 stats->recycle_stats.released_refcnt += pcpu->released_refcnt; 102 } 103 104 return true; 105 } 106 EXPORT_SYMBOL(page_pool_get_stats); 107 108 u8 *page_pool_ethtool_stats_get_strings(u8 *data) 109 { 110 int i; 111 112 for (i = 0; i < ARRAY_SIZE(pp_stats); i++) { 113 memcpy(data, pp_stats[i], ETH_GSTRING_LEN); 114 data += ETH_GSTRING_LEN; 115 } 116 117 return data; 118 } 119 EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings); 120 121 int page_pool_ethtool_stats_get_count(void) 122 { 123 return ARRAY_SIZE(pp_stats); 124 } 125 EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); 126 127 u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) 128 { 129 const struct page_pool_stats *pool_stats = stats; 130 131 *data++ = pool_stats->alloc_stats.fast; 132 *data++ = pool_stats->alloc_stats.slow; 133 *data++ = pool_stats->alloc_stats.slow_high_order; 134 *data++ = pool_stats->alloc_stats.empty; 135 *data++ = pool_stats->alloc_stats.refill; 136 *data++ = pool_stats->alloc_stats.waive; 137 *data++ = pool_stats->recycle_stats.cached; 138 *data++ = pool_stats->recycle_stats.cache_full; 139 *data++ = pool_stats->recycle_stats.ring; 140 *data++ = pool_stats->recycle_stats.ring_full; 141 *data++ = pool_stats->recycle_stats.released_refcnt; 142 143 return data; 144 } 145 EXPORT_SYMBOL(page_pool_ethtool_stats_get); 146 147 #else 148 #define alloc_stat_inc(pool, __stat) 149 #define recycle_stat_inc(pool, __stat) 150 #define recycle_stat_add(pool, __stat, val) 151 #endif 152 153 static bool page_pool_producer_lock(struct page_pool *pool) 154 __acquires(&pool->ring.producer_lock) 155 { 156 bool in_softirq = in_softirq(); 157 158 if (in_softirq) 159 spin_lock(&pool->ring.producer_lock); 160 else 161 spin_lock_bh(&pool->ring.producer_lock); 162 163 return in_softirq; 164 } 165 166 static void page_pool_producer_unlock(struct page_pool *pool, 167 bool in_softirq) 168 __releases(&pool->ring.producer_lock) 169 { 170 if (in_softirq) 171 spin_unlock(&pool->ring.producer_lock); 172 else 173 spin_unlock_bh(&pool->ring.producer_lock); 174 } 175 176 static void page_pool_struct_check(void) 177 { 178 CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); 179 CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); 180 CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); 181 CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, 182 PAGE_POOL_FRAG_GROUP_ALIGN); 183 } 184 185 static int page_pool_init(struct page_pool *pool, 186 const struct page_pool_params *params, 187 int cpuid) 188 { 189 unsigned int ring_qsize = 1024; /* Default */ 190 191 page_pool_struct_check(); 192 193 memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); 194 memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); 195 196 pool->cpuid = cpuid; 197 198 /* Validate only known flags were used */ 199 if (pool->slow.flags & ~PP_FLAG_ALL) 200 return -EINVAL; 201 202 if (pool->p.pool_size) 203 ring_qsize = pool->p.pool_size; 204 205 /* Sanity limit mem that can be pinned down */ 206 if (ring_qsize > 32768) 207 return -E2BIG; 208 209 /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. 210 * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, 211 * which is the XDP_TX use-case. 212 */ 213 if (pool->slow.flags & PP_FLAG_DMA_MAP) { 214 if ((pool->p.dma_dir != DMA_FROM_DEVICE) && 215 (pool->p.dma_dir != DMA_BIDIRECTIONAL)) 216 return -EINVAL; 217 218 pool->dma_map = true; 219 } 220 221 if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { 222 /* In order to request DMA-sync-for-device the page 223 * needs to be mapped 224 */ 225 if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) 226 return -EINVAL; 227 228 if (!pool->p.max_len) 229 return -EINVAL; 230 231 pool->dma_sync = true; 232 233 /* pool->p.offset has to be set according to the address 234 * offset used by the DMA engine to start copying rx data 235 */ 236 } 237 238 pool->has_init_callback = !!pool->slow.init_callback; 239 240 #ifdef CONFIG_PAGE_POOL_STATS 241 if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) { 242 pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); 243 if (!pool->recycle_stats) 244 return -ENOMEM; 245 } else { 246 /* For system page pool instance we use a singular stats object 247 * instead of allocating a separate percpu variable for each 248 * (also percpu) page pool instance. 249 */ 250 pool->recycle_stats = &pp_system_recycle_stats; 251 pool->system = true; 252 } 253 #endif 254 255 if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { 256 #ifdef CONFIG_PAGE_POOL_STATS 257 if (!pool->system) 258 free_percpu(pool->recycle_stats); 259 #endif 260 return -ENOMEM; 261 } 262 263 atomic_set(&pool->pages_state_release_cnt, 0); 264 265 /* Driver calling page_pool_create() also call page_pool_destroy() */ 266 refcount_set(&pool->user_cnt, 1); 267 268 if (pool->dma_map) 269 get_device(pool->p.dev); 270 271 return 0; 272 } 273 274 static void page_pool_uninit(struct page_pool *pool) 275 { 276 ptr_ring_cleanup(&pool->ring, NULL); 277 278 if (pool->dma_map) 279 put_device(pool->p.dev); 280 281 #ifdef CONFIG_PAGE_POOL_STATS 282 if (!pool->system) 283 free_percpu(pool->recycle_stats); 284 #endif 285 } 286 287 /** 288 * page_pool_create_percpu() - create a page pool for a given cpu. 289 * @params: parameters, see struct page_pool_params 290 * @cpuid: cpu identifier 291 */ 292 struct page_pool * 293 page_pool_create_percpu(const struct page_pool_params *params, int cpuid) 294 { 295 struct page_pool *pool; 296 int err; 297 298 pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); 299 if (!pool) 300 return ERR_PTR(-ENOMEM); 301 302 err = page_pool_init(pool, params, cpuid); 303 if (err < 0) 304 goto err_free; 305 306 err = page_pool_list(pool); 307 if (err) 308 goto err_uninit; 309 310 return pool; 311 312 err_uninit: 313 page_pool_uninit(pool); 314 err_free: 315 pr_warn("%s() gave up with errno %d\n", __func__, err); 316 kfree(pool); 317 return ERR_PTR(err); 318 } 319 EXPORT_SYMBOL(page_pool_create_percpu); 320 321 /** 322 * page_pool_create() - create a page pool 323 * @params: parameters, see struct page_pool_params 324 */ 325 struct page_pool *page_pool_create(const struct page_pool_params *params) 326 { 327 return page_pool_create_percpu(params, -1); 328 } 329 EXPORT_SYMBOL(page_pool_create); 330 331 static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem); 332 333 static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) 334 { 335 struct ptr_ring *r = &pool->ring; 336 netmem_ref netmem; 337 int pref_nid; /* preferred NUMA node */ 338 339 /* Quicker fallback, avoid locks when ring is empty */ 340 if (__ptr_ring_empty(r)) { 341 alloc_stat_inc(pool, empty); 342 return 0; 343 } 344 345 /* Softirq guarantee CPU and thus NUMA node is stable. This, 346 * assumes CPU refilling driver RX-ring will also run RX-NAPI. 347 */ 348 #ifdef CONFIG_NUMA 349 pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; 350 #else 351 /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ 352 pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ 353 #endif 354 355 /* Refill alloc array, but only if NUMA match */ 356 do { 357 netmem = (__force netmem_ref)__ptr_ring_consume(r); 358 if (unlikely(!netmem)) 359 break; 360 361 if (likely(page_to_nid(netmem_to_page(netmem)) == pref_nid)) { 362 pool->alloc.cache[pool->alloc.count++] = netmem; 363 } else { 364 /* NUMA mismatch; 365 * (1) release 1 page to page-allocator and 366 * (2) break out to fallthrough to alloc_pages_node. 367 * This limit stress on page buddy alloactor. 368 */ 369 page_pool_return_page(pool, netmem); 370 alloc_stat_inc(pool, waive); 371 netmem = 0; 372 break; 373 } 374 } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); 375 376 /* Return last page */ 377 if (likely(pool->alloc.count > 0)) { 378 netmem = pool->alloc.cache[--pool->alloc.count]; 379 alloc_stat_inc(pool, refill); 380 } 381 382 return netmem; 383 } 384 385 /* fast path */ 386 static netmem_ref __page_pool_get_cached(struct page_pool *pool) 387 { 388 netmem_ref netmem; 389 390 /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ 391 if (likely(pool->alloc.count)) { 392 /* Fast-path */ 393 netmem = pool->alloc.cache[--pool->alloc.count]; 394 alloc_stat_inc(pool, fast); 395 } else { 396 netmem = page_pool_refill_alloc_cache(pool); 397 } 398 399 return netmem; 400 } 401 402 static void __page_pool_dma_sync_for_device(const struct page_pool *pool, 403 netmem_ref netmem, 404 u32 dma_sync_size) 405 { 406 #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) 407 dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem); 408 409 dma_sync_size = min(dma_sync_size, pool->p.max_len); 410 __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, 411 dma_sync_size, pool->p.dma_dir); 412 #endif 413 } 414 415 static __always_inline void 416 page_pool_dma_sync_for_device(const struct page_pool *pool, 417 netmem_ref netmem, 418 u32 dma_sync_size) 419 { 420 if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) 421 __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); 422 } 423 424 static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) 425 { 426 dma_addr_t dma; 427 428 /* Setup DMA mapping: use 'struct page' area for storing DMA-addr 429 * since dma_addr_t can be either 32 or 64 bits and does not always fit 430 * into page private data (i.e 32bit cpu with 64bit DMA caps) 431 * This mapping is kept for lifetime of page, until leaving pool. 432 */ 433 dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0, 434 (PAGE_SIZE << pool->p.order), pool->p.dma_dir, 435 DMA_ATTR_SKIP_CPU_SYNC | 436 DMA_ATTR_WEAK_ORDERING); 437 if (dma_mapping_error(pool->p.dev, dma)) 438 return false; 439 440 if (page_pool_set_dma_addr_netmem(netmem, dma)) 441 goto unmap_failed; 442 443 page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); 444 445 return true; 446 447 unmap_failed: 448 WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); 449 dma_unmap_page_attrs(pool->p.dev, dma, 450 PAGE_SIZE << pool->p.order, pool->p.dma_dir, 451 DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); 452 return false; 453 } 454 455 static void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) 456 { 457 struct page *page = netmem_to_page(netmem); 458 459 page->pp = pool; 460 page->pp_magic |= PP_SIGNATURE; 461 462 /* Ensuring all pages have been split into one fragment initially: 463 * page_pool_set_pp_info() is only called once for every page when it 464 * is allocated from the page allocator and page_pool_fragment_page() 465 * is dirtying the same cache line as the page->pp_magic above, so 466 * the overhead is negligible. 467 */ 468 page_pool_fragment_netmem(netmem, 1); 469 if (pool->has_init_callback) 470 pool->slow.init_callback(netmem, pool->slow.init_arg); 471 } 472 473 static void page_pool_clear_pp_info(netmem_ref netmem) 474 { 475 struct page *page = netmem_to_page(netmem); 476 477 page->pp_magic = 0; 478 page->pp = NULL; 479 } 480 481 static struct page *__page_pool_alloc_page_order(struct page_pool *pool, 482 gfp_t gfp) 483 { 484 struct page *page; 485 486 gfp |= __GFP_COMP; 487 page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); 488 if (unlikely(!page)) 489 return NULL; 490 491 if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) { 492 put_page(page); 493 return NULL; 494 } 495 496 alloc_stat_inc(pool, slow_high_order); 497 page_pool_set_pp_info(pool, page_to_netmem(page)); 498 499 /* Track how many pages are held 'in-flight' */ 500 pool->pages_state_hold_cnt++; 501 trace_page_pool_state_hold(pool, page_to_netmem(page), 502 pool->pages_state_hold_cnt); 503 return page; 504 } 505 506 /* slow path */ 507 static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, 508 gfp_t gfp) 509 { 510 const int bulk = PP_ALLOC_CACHE_REFILL; 511 unsigned int pp_order = pool->p.order; 512 bool dma_map = pool->dma_map; 513 netmem_ref netmem; 514 int i, nr_pages; 515 516 /* Don't support bulk alloc for high-order pages */ 517 if (unlikely(pp_order)) 518 return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); 519 520 /* Unnecessary as alloc cache is empty, but guarantees zero count */ 521 if (unlikely(pool->alloc.count > 0)) 522 return pool->alloc.cache[--pool->alloc.count]; 523 524 /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ 525 memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); 526 527 nr_pages = alloc_pages_bulk_array_node(gfp, 528 pool->p.nid, bulk, 529 (struct page **)pool->alloc.cache); 530 if (unlikely(!nr_pages)) 531 return 0; 532 533 /* Pages have been filled into alloc.cache array, but count is zero and 534 * page element have not been (possibly) DMA mapped. 535 */ 536 for (i = 0; i < nr_pages; i++) { 537 netmem = pool->alloc.cache[i]; 538 if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) { 539 put_page(netmem_to_page(netmem)); 540 continue; 541 } 542 543 page_pool_set_pp_info(pool, netmem); 544 pool->alloc.cache[pool->alloc.count++] = netmem; 545 /* Track how many pages are held 'in-flight' */ 546 pool->pages_state_hold_cnt++; 547 trace_page_pool_state_hold(pool, netmem, 548 pool->pages_state_hold_cnt); 549 } 550 551 /* Return last page */ 552 if (likely(pool->alloc.count > 0)) { 553 netmem = pool->alloc.cache[--pool->alloc.count]; 554 alloc_stat_inc(pool, slow); 555 } else { 556 netmem = 0; 557 } 558 559 /* When page just alloc'ed is should/must have refcnt 1. */ 560 return netmem; 561 } 562 563 /* For using page_pool replace: alloc_pages() API calls, but provide 564 * synchronization guarantee for allocation side. 565 */ 566 netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp) 567 { 568 netmem_ref netmem; 569 570 /* Fast-path: Get a page from cache */ 571 netmem = __page_pool_get_cached(pool); 572 if (netmem) 573 return netmem; 574 575 /* Slow-path: cache empty, do real allocation */ 576 netmem = __page_pool_alloc_pages_slow(pool, gfp); 577 return netmem; 578 } 579 EXPORT_SYMBOL(page_pool_alloc_netmem); 580 581 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) 582 { 583 return netmem_to_page(page_pool_alloc_netmem(pool, gfp)); 584 } 585 EXPORT_SYMBOL(page_pool_alloc_pages); 586 ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL); 587 588 /* Calculate distance between two u32 values, valid if distance is below 2^(31) 589 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution 590 */ 591 #define _distance(a, b) (s32)((a) - (b)) 592 593 s32 page_pool_inflight(const struct page_pool *pool, bool strict) 594 { 595 u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); 596 u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); 597 s32 inflight; 598 599 inflight = _distance(hold_cnt, release_cnt); 600 601 if (strict) { 602 trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); 603 WARN(inflight < 0, "Negative(%d) inflight packet-pages", 604 inflight); 605 } else { 606 inflight = max(0, inflight); 607 } 608 609 return inflight; 610 } 611 612 static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, 613 netmem_ref netmem) 614 { 615 dma_addr_t dma; 616 617 if (!pool->dma_map) 618 /* Always account for inflight pages, even if we didn't 619 * map them 620 */ 621 return; 622 623 dma = page_pool_get_dma_addr_netmem(netmem); 624 625 /* When page is unmapped, it cannot be returned to our pool */ 626 dma_unmap_page_attrs(pool->p.dev, dma, 627 PAGE_SIZE << pool->p.order, pool->p.dma_dir, 628 DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); 629 page_pool_set_dma_addr_netmem(netmem, 0); 630 } 631 632 /* Disconnects a page (from a page_pool). API users can have a need 633 * to disconnect a page (from a page_pool), to allow it to be used as 634 * a regular page (that will eventually be returned to the normal 635 * page-allocator via put_page). 636 */ 637 void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) 638 { 639 int count; 640 641 __page_pool_release_page_dma(pool, netmem); 642 643 /* This may be the last page returned, releasing the pool, so 644 * it is not safe to reference pool afterwards. 645 */ 646 count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); 647 trace_page_pool_state_release(pool, netmem, count); 648 649 page_pool_clear_pp_info(netmem); 650 put_page(netmem_to_page(netmem)); 651 /* An optimization would be to call __free_pages(page, pool->p.order) 652 * knowing page is not part of page-cache (thus avoiding a 653 * __page_cache_release() call). 654 */ 655 } 656 657 static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) 658 { 659 int ret; 660 /* BH protection not needed if current is softirq */ 661 if (in_softirq()) 662 ret = ptr_ring_produce(&pool->ring, (__force void *)netmem); 663 else 664 ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem); 665 666 if (!ret) { 667 recycle_stat_inc(pool, ring); 668 return true; 669 } 670 671 return false; 672 } 673 674 /* Only allow direct recycling in special circumstances, into the 675 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. 676 * 677 * Caller must provide appropriate safe context. 678 */ 679 static bool page_pool_recycle_in_cache(netmem_ref netmem, 680 struct page_pool *pool) 681 { 682 if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { 683 recycle_stat_inc(pool, cache_full); 684 return false; 685 } 686 687 /* Caller MUST have verified/know (page_ref_count(page) == 1) */ 688 pool->alloc.cache[pool->alloc.count++] = netmem; 689 recycle_stat_inc(pool, cached); 690 return true; 691 } 692 693 static bool __page_pool_page_can_be_recycled(netmem_ref netmem) 694 { 695 return page_ref_count(netmem_to_page(netmem)) == 1 && 696 !page_is_pfmemalloc(netmem_to_page(netmem)); 697 } 698 699 /* If the page refcnt == 1, this will try to recycle the page. 700 * If pool->dma_sync is set, we'll try to sync the DMA area for 701 * the configured size min(dma_sync_size, pool->max_len). 702 * If the page refcnt != 1, then the page will be returned to memory 703 * subsystem. 704 */ 705 static __always_inline netmem_ref 706 __page_pool_put_page(struct page_pool *pool, netmem_ref netmem, 707 unsigned int dma_sync_size, bool allow_direct) 708 { 709 lockdep_assert_no_hardirq(); 710 711 /* This allocator is optimized for the XDP mode that uses 712 * one-frame-per-page, but have fallbacks that act like the 713 * regular page allocator APIs. 714 * 715 * refcnt == 1 means page_pool owns page, and can recycle it. 716 * 717 * page is NOT reusable when allocated when system is under 718 * some pressure. (page_is_pfmemalloc) 719 */ 720 if (likely(__page_pool_page_can_be_recycled(netmem))) { 721 /* Read barrier done in page_ref_count / READ_ONCE */ 722 723 page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); 724 725 if (allow_direct && page_pool_recycle_in_cache(netmem, pool)) 726 return 0; 727 728 /* Page found as candidate for recycling */ 729 return netmem; 730 } 731 /* Fallback/non-XDP mode: API user have elevated refcnt. 732 * 733 * Many drivers split up the page into fragments, and some 734 * want to keep doing this to save memory and do refcnt based 735 * recycling. Support this use case too, to ease drivers 736 * switching between XDP/non-XDP. 737 * 738 * In-case page_pool maintains the DMA mapping, API user must 739 * call page_pool_put_page once. In this elevated refcnt 740 * case, the DMA is unmapped/released, as driver is likely 741 * doing refcnt based recycle tricks, meaning another process 742 * will be invoking put_page. 743 */ 744 recycle_stat_inc(pool, released_refcnt); 745 page_pool_return_page(pool, netmem); 746 747 return 0; 748 } 749 750 static bool page_pool_napi_local(const struct page_pool *pool) 751 { 752 const struct napi_struct *napi; 753 u32 cpuid; 754 755 if (unlikely(!in_softirq())) 756 return false; 757 758 /* Allow direct recycle if we have reasons to believe that we are 759 * in the same context as the consumer would run, so there's 760 * no possible race. 761 * __page_pool_put_page() makes sure we're not in hardirq context 762 * and interrupts are enabled prior to accessing the cache. 763 */ 764 cpuid = smp_processor_id(); 765 if (READ_ONCE(pool->cpuid) == cpuid) 766 return true; 767 768 napi = READ_ONCE(pool->p.napi); 769 770 return napi && READ_ONCE(napi->list_owner) == cpuid; 771 } 772 773 void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, 774 unsigned int dma_sync_size, bool allow_direct) 775 { 776 if (!allow_direct) 777 allow_direct = page_pool_napi_local(pool); 778 779 netmem = 780 __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct); 781 if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { 782 /* Cache full, fallback to free pages */ 783 recycle_stat_inc(pool, ring_full); 784 page_pool_return_page(pool, netmem); 785 } 786 } 787 EXPORT_SYMBOL(page_pool_put_unrefed_netmem); 788 789 void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, 790 unsigned int dma_sync_size, bool allow_direct) 791 { 792 page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size, 793 allow_direct); 794 } 795 EXPORT_SYMBOL(page_pool_put_unrefed_page); 796 797 /** 798 * page_pool_put_page_bulk() - release references on multiple pages 799 * @pool: pool from which pages were allocated 800 * @data: array holding page pointers 801 * @count: number of pages in @data 802 * 803 * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring 804 * producer lock. If the ptr_ring is full, page_pool_put_page_bulk() 805 * will release leftover pages to the page allocator. 806 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx 807 * completion loop for the XDP_REDIRECT use case. 808 * 809 * Please note the caller must not use data area after running 810 * page_pool_put_page_bulk(), as this function overwrites it. 811 */ 812 void page_pool_put_page_bulk(struct page_pool *pool, void **data, 813 int count) 814 { 815 int i, bulk_len = 0; 816 bool allow_direct; 817 bool in_softirq; 818 819 allow_direct = page_pool_napi_local(pool); 820 821 for (i = 0; i < count; i++) { 822 netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i])); 823 824 /* It is not the last user for the page frag case */ 825 if (!page_pool_is_last_ref(netmem)) 826 continue; 827 828 netmem = __page_pool_put_page(pool, netmem, -1, allow_direct); 829 /* Approved for bulk recycling in ptr_ring cache */ 830 if (netmem) 831 data[bulk_len++] = (__force void *)netmem; 832 } 833 834 if (!bulk_len) 835 return; 836 837 /* Bulk producer into ptr_ring page_pool cache */ 838 in_softirq = page_pool_producer_lock(pool); 839 for (i = 0; i < bulk_len; i++) { 840 if (__ptr_ring_produce(&pool->ring, data[i])) { 841 /* ring full */ 842 recycle_stat_inc(pool, ring_full); 843 break; 844 } 845 } 846 recycle_stat_add(pool, ring, i); 847 page_pool_producer_unlock(pool, in_softirq); 848 849 /* Hopefully all pages was return into ptr_ring */ 850 if (likely(i == bulk_len)) 851 return; 852 853 /* ptr_ring cache full, free remaining pages outside producer lock 854 * since put_page() with refcnt == 1 can be an expensive operation 855 */ 856 for (; i < bulk_len; i++) 857 page_pool_return_page(pool, (__force netmem_ref)data[i]); 858 } 859 EXPORT_SYMBOL(page_pool_put_page_bulk); 860 861 static netmem_ref page_pool_drain_frag(struct page_pool *pool, 862 netmem_ref netmem) 863 { 864 long drain_count = BIAS_MAX - pool->frag_users; 865 866 /* Some user is still using the page frag */ 867 if (likely(page_pool_unref_netmem(netmem, drain_count))) 868 return 0; 869 870 if (__page_pool_page_can_be_recycled(netmem)) { 871 page_pool_dma_sync_for_device(pool, netmem, -1); 872 return netmem; 873 } 874 875 page_pool_return_page(pool, netmem); 876 return 0; 877 } 878 879 static void page_pool_free_frag(struct page_pool *pool) 880 { 881 long drain_count = BIAS_MAX - pool->frag_users; 882 netmem_ref netmem = pool->frag_page; 883 884 pool->frag_page = 0; 885 886 if (!netmem || page_pool_unref_netmem(netmem, drain_count)) 887 return; 888 889 page_pool_return_page(pool, netmem); 890 } 891 892 netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, 893 unsigned int *offset, unsigned int size, 894 gfp_t gfp) 895 { 896 unsigned int max_size = PAGE_SIZE << pool->p.order; 897 netmem_ref netmem = pool->frag_page; 898 899 if (WARN_ON(size > max_size)) 900 return 0; 901 902 size = ALIGN(size, dma_get_cache_alignment()); 903 *offset = pool->frag_offset; 904 905 if (netmem && *offset + size > max_size) { 906 netmem = page_pool_drain_frag(pool, netmem); 907 if (netmem) { 908 alloc_stat_inc(pool, fast); 909 goto frag_reset; 910 } 911 } 912 913 if (!netmem) { 914 netmem = page_pool_alloc_netmem(pool, gfp); 915 if (unlikely(!netmem)) { 916 pool->frag_page = 0; 917 return 0; 918 } 919 920 pool->frag_page = netmem; 921 922 frag_reset: 923 pool->frag_users = 1; 924 *offset = 0; 925 pool->frag_offset = size; 926 page_pool_fragment_netmem(netmem, BIAS_MAX); 927 return netmem; 928 } 929 930 pool->frag_users++; 931 pool->frag_offset = *offset + size; 932 alloc_stat_inc(pool, fast); 933 return netmem; 934 } 935 EXPORT_SYMBOL(page_pool_alloc_frag_netmem); 936 937 struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, 938 unsigned int size, gfp_t gfp) 939 { 940 return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size, 941 gfp)); 942 } 943 EXPORT_SYMBOL(page_pool_alloc_frag); 944 945 static void page_pool_empty_ring(struct page_pool *pool) 946 { 947 netmem_ref netmem; 948 949 /* Empty recycle ring */ 950 while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { 951 /* Verify the refcnt invariant of cached pages */ 952 if (!(page_ref_count(netmem_to_page(netmem)) == 1)) 953 pr_crit("%s() page_pool refcnt %d violation\n", 954 __func__, netmem_ref_count(netmem)); 955 956 page_pool_return_page(pool, netmem); 957 } 958 } 959 960 static void __page_pool_destroy(struct page_pool *pool) 961 { 962 if (pool->disconnect) 963 pool->disconnect(pool); 964 965 page_pool_unlist(pool); 966 page_pool_uninit(pool); 967 kfree(pool); 968 } 969 970 static void page_pool_empty_alloc_cache_once(struct page_pool *pool) 971 { 972 netmem_ref netmem; 973 974 if (pool->destroy_cnt) 975 return; 976 977 /* Empty alloc cache, assume caller made sure this is 978 * no-longer in use, and page_pool_alloc_pages() cannot be 979 * call concurrently. 980 */ 981 while (pool->alloc.count) { 982 netmem = pool->alloc.cache[--pool->alloc.count]; 983 page_pool_return_page(pool, netmem); 984 } 985 } 986 987 static void page_pool_scrub(struct page_pool *pool) 988 { 989 page_pool_empty_alloc_cache_once(pool); 990 pool->destroy_cnt++; 991 992 /* No more consumers should exist, but producers could still 993 * be in-flight. 994 */ 995 page_pool_empty_ring(pool); 996 } 997 998 static int page_pool_release(struct page_pool *pool) 999 { 1000 int inflight; 1001 1002 page_pool_scrub(pool); 1003 inflight = page_pool_inflight(pool, true); 1004 if (!inflight) 1005 __page_pool_destroy(pool); 1006 1007 return inflight; 1008 } 1009 1010 static void page_pool_release_retry(struct work_struct *wq) 1011 { 1012 struct delayed_work *dwq = to_delayed_work(wq); 1013 struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); 1014 void *netdev; 1015 int inflight; 1016 1017 inflight = page_pool_release(pool); 1018 if (!inflight) 1019 return; 1020 1021 /* Periodic warning for page pools the user can't see */ 1022 netdev = READ_ONCE(pool->slow.netdev); 1023 if (time_after_eq(jiffies, pool->defer_warn) && 1024 (!netdev || netdev == NET_PTR_POISON)) { 1025 int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; 1026 1027 pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n", 1028 __func__, pool->user.id, inflight, sec); 1029 pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; 1030 } 1031 1032 /* Still not ready to be disconnected, retry later */ 1033 schedule_delayed_work(&pool->release_dw, DEFER_TIME); 1034 } 1035 1036 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), 1037 const struct xdp_mem_info *mem) 1038 { 1039 refcount_inc(&pool->user_cnt); 1040 pool->disconnect = disconnect; 1041 pool->xdp_mem_id = mem->id; 1042 } 1043 1044 void page_pool_disable_direct_recycling(struct page_pool *pool) 1045 { 1046 /* Disable direct recycling based on pool->cpuid. 1047 * Paired with READ_ONCE() in page_pool_napi_local(). 1048 */ 1049 WRITE_ONCE(pool->cpuid, -1); 1050 1051 if (!pool->p.napi) 1052 return; 1053 1054 /* To avoid races with recycling and additional barriers make sure 1055 * pool and NAPI are unlinked when NAPI is disabled. 1056 */ 1057 WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state)); 1058 WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1); 1059 1060 WRITE_ONCE(pool->p.napi, NULL); 1061 } 1062 EXPORT_SYMBOL(page_pool_disable_direct_recycling); 1063 1064 void page_pool_destroy(struct page_pool *pool) 1065 { 1066 if (!pool) 1067 return; 1068 1069 if (!page_pool_put(pool)) 1070 return; 1071 1072 page_pool_disable_direct_recycling(pool); 1073 page_pool_free_frag(pool); 1074 1075 if (!page_pool_release(pool)) 1076 return; 1077 1078 page_pool_detached(pool); 1079 pool->defer_start = jiffies; 1080 pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; 1081 1082 INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); 1083 schedule_delayed_work(&pool->release_dw, DEFER_TIME); 1084 } 1085 EXPORT_SYMBOL(page_pool_destroy); 1086 1087 /* Caller must provide appropriate safe context, e.g. NAPI. */ 1088 void page_pool_update_nid(struct page_pool *pool, int new_nid) 1089 { 1090 netmem_ref netmem; 1091 1092 trace_page_pool_update_nid(pool, new_nid); 1093 pool->p.nid = new_nid; 1094 1095 /* Flush pool alloc cache, as refill will check NUMA node */ 1096 while (pool->alloc.count) { 1097 netmem = pool->alloc.cache[--pool->alloc.count]; 1098 page_pool_return_page(pool, netmem); 1099 } 1100 } 1101 EXPORT_SYMBOL(page_pool_update_nid); 1102