/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool/helpers.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>

#include <trace/events/page_pool.h>

#include "page_pool_priv.h"

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

#define BIAS_MAX	(LONG_MAX >> 1)

#ifdef CONFIG_PAGE_POOL_STATS
static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);

/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)						\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_inc(s->__stat);					\
	} while (0)

#define recycle_stat_add(pool, __stat, val)					\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_add(s->__stat, val);					\
	} while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
	"rx_pp_alloc_fast",
	"rx_pp_alloc_slow",
	"rx_pp_alloc_slow_ho",
	"rx_pp_alloc_empty",
	"rx_pp_alloc_refill",
	"rx_pp_alloc_waive",
	"rx_pp_recycle_cached",
	"rx_pp_recycle_cache_full",
	"rx_pp_recycle_ring",
	"rx_pp_recycle_ring_full",
	"rx_pp_recycle_released_ref",
};

/**
 * page_pool_get_stats() - fetch page pool stats
 * @pool:	pool from which page was allocated
 * @stats:	struct page_pool_stats to fill in
 *
 * Retrieve statistics about the page_pool. This API is only available
 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
 * A pointer to a caller-allocated struct page_pool_stats structure
 * is passed to this API, which fills it in. The caller can then report
 * those stats to the user (perhaps via ethtool, debugfs, etc.).
 */
bool page_pool_get_stats(const struct page_pool *pool,
			 struct page_pool_stats *stats)
{
	int cpu = 0;

	if (!stats)
		return false;

	/* The caller is responsible for initializing stats. */
	stats->alloc_stats.fast += pool->alloc_stats.fast;
	stats->alloc_stats.slow += pool->alloc_stats.slow;
	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
	stats->alloc_stats.empty += pool->alloc_stats.empty;
	stats->alloc_stats.refill += pool->alloc_stats.refill;
	stats->alloc_stats.waive += pool->alloc_stats.waive;

	for_each_possible_cpu(cpu) {
		const struct page_pool_recycle_stats *pcpu =
			per_cpu_ptr(pool->recycle_stats, cpu);

		stats->recycle_stats.cached += pcpu->cached;
		stats->recycle_stats.cache_full += pcpu->cache_full;
		stats->recycle_stats.ring += pcpu->ring;
		stats->recycle_stats.ring_full += pcpu->ring_full;
		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
	}

	return true;
}
EXPORT_SYMBOL(page_pool_get_stats);

u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
		data += ETH_GSTRING_LEN;
	}

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);

int page_pool_ethtool_stats_get_count(void)
{
	return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);

u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
{
	struct page_pool_stats *pool_stats = stats;

	*data++ = pool_stats->alloc_stats.fast;
	*data++ = pool_stats->alloc_stats.slow;
	*data++ = pool_stats->alloc_stats.slow_high_order;
	*data++ = pool_stats->alloc_stats.empty;
	*data++ = pool_stats->alloc_stats.refill;
	*data++ = pool_stats->alloc_stats.waive;
	*data++ = pool_stats->recycle_stats.cached;
	*data++ = pool_stats->recycle_stats.cache_full;
	*data++ = pool_stats->recycle_stats.ring;
	*data++ = pool_stats->recycle_stats.ring_full;
	*data++ = pool_stats->recycle_stats.released_refcnt;

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);

#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif
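/* Usage sketch (editor's illustration, not part of the upstream file):
 * assuming CONFIG_PAGE_POOL_STATS=y, a driver with one page_pool per RX
 * queue could wire the helpers above into its ethtool ops roughly like
 * this. "my_priv", "my_priv->rxq[]" and "num_rx_queues" are hypothetical
 * driver-side names.
 *
 *	static int my_get_sset_count(struct net_device *dev, int sset)
 *	{
 *		if (sset != ETH_SS_STATS)
 *			return -EOPNOTSUPP;
 *		return page_pool_ethtool_stats_get_count();
 *	}
 *
 *	static void my_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static void my_get_ethtool_stats(struct net_device *dev,
 *					 struct ethtool_stats *stats, u64 *data)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats pp_stats = { };
 *		int i;
 *
 *		for (i = 0; i < priv->num_rx_queues; i++)
 *			page_pool_get_stats(priv->rxq[i].page_pool, &pp_stats);
 *		data = page_pool_ethtool_stats_get(data, &pp_stats);
 *	}
 */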
static bool page_pool_producer_lock(struct page_pool *pool)
	__acquires(&pool->ring.producer_lock)
{
	bool in_softirq = in_softirq();

	if (in_softirq)
		spin_lock(&pool->ring.producer_lock);
	else
		spin_lock_bh(&pool->ring.producer_lock);

	return in_softirq;
}

static void page_pool_producer_unlock(struct page_pool *pool,
				      bool in_softirq)
	__releases(&pool->ring.producer_lock)
{
	if (in_softirq)
		spin_unlock(&pool->ring.producer_lock);
	else
		spin_unlock_bh(&pool->ring.producer_lock);
}

static void page_pool_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, 4 * sizeof(long));
}

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params,
			  int cpuid)
{
	unsigned int ring_qsize = 1024; /* Default */

	page_pool_struct_check();

	memcpy(&pool->p, &params->fast, sizeof(pool->p));
	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));

	pool->cpuid = cpuid;

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * sending, which is the XDP_TX use-case.
	 */
	if (pool->p.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;
	}

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	pool->has_init_callback = !!pool->slow.init_callback;

#ifdef CONFIG_PAGE_POOL_STATS
	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) {
		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
		if (!pool->recycle_stats)
			return -ENOMEM;
	} else {
		/* For system page pool instances we use a singular stats object
		 * instead of allocating a separate percpu variable for each
		 * (also percpu) page pool instance.
		 */
		pool->recycle_stats = &pp_system_recycle_stats;
	}
#endif

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
		if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
			free_percpu(pool->recycle_stats);
#endif
		return -ENOMEM;
	}

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* A driver calling page_pool_create() must also call page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

static void page_pool_uninit(struct page_pool *pool)
{
	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
		free_percpu(pool->recycle_stats);
#endif
}
/**
 * page_pool_create_percpu() - create a page pool for a given cpu.
 * @params: parameters, see struct page_pool_params
 * @cpuid: cpu identifier
 */
struct page_pool *
page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params, cpuid);
	if (err < 0)
		goto err_free;

	err = page_pool_list(pool);
	if (err)
		goto err_uninit;

	return pool;

err_uninit:
	page_pool_uninit(pool);
err_free:
	pr_warn("%s() gave up with errno %d\n", __func__, err);
	kfree(pool);
	return ERR_PTR(err);
}
EXPORT_SYMBOL(page_pool_create_percpu);

/**
 * page_pool_create() - create a page pool
 * @params: parameters, see struct page_pool_params
 */
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	return page_pool_create_percpu(params, -1);
}
EXPORT_SYMBOL(page_pool_create);
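/* Usage sketch (editor's illustration, not part of the upstream file):
 * typical parameters a driver might pass to page_pool_create() for an RX
 * ring whose buffers are also DMA-mapped and synced by the pool.
 * "my_pdev", "my_rxq" and "ring_size" are hypothetical; offset/max_len
 * depend on how much headroom the driver reserves before the packet data.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= ring_size,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &my_pdev->dev,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.offset		= XDP_PACKET_HEADROOM,
 *		.max_len	= PAGE_SIZE - XDP_PACKET_HEADROOM,
 *	};
 *	struct page_pool *pool;
 *
 *	pool = page_pool_create(&pp_params);
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 *	my_rxq->page_pool = pool;
 *
 * A driver that transmits pool pages via XDP_TX would use DMA_BIDIRECTIONAL
 * instead, as validated in page_pool_init() above.
 */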
static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r)) {
		alloc_stat_inc(pool, empty);
		return NULL;
	}

	/* Softirq guarantees that the CPU, and thus the NUMA node, is stable.
	 * This assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Refill alloc array, but only if NUMA match */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fall through to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			alloc_stat_inc(pool, waive);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, refill);
	}

	return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, fast);
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void page_pool_dma_sync_for_device(struct page_pool *pool,
					  struct page *page,
					  unsigned int dma_sync_size)
{
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}

static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e. 32bit cpu with 64bit DMA caps)
	 * This mapping is kept for the lifetime of the page, until it leaves
	 * the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
						  DMA_ATTR_WEAK_ORDERING);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	if (page_pool_set_dma_addr(page, dma))
		goto unmap_failed;

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

	return true;

unmap_failed:
	WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	return false;
}

static void page_pool_set_pp_info(struct page_pool *pool,
				  struct page *page)
{
	page->pp = pool;
	page->pp_magic |= PP_SIGNATURE;

	/* Ensuring all pages have been split into one fragment initially:
	 * page_pool_set_pp_info() is only called once for every page when it
	 * is allocated from the page allocator and page_pool_fragment_page()
	 * is dirtying the same cache line as the page->pp_magic above, so
	 * the overhead is negligible.
	 */
	page_pool_fragment_page(page, 1);
	if (pool->has_init_callback)
		pool->slow.init_callback(page, pool->slow.init_arg);
}

static void page_pool_clear_pp_info(struct page *page)
{
	page->pp_magic = 0;
	page->pp = NULL;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
	    unlikely(!page_pool_dma_map(pool, page))) {
		put_page(page);
		return NULL;
	}

	alloc_stat_inc(pool, slow_high_order);
	page_pool_set_pp_info(pool, page);

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_flags = pool->p.flags;
	unsigned int pp_order = pool->p.order;
	struct page *page;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return __page_pool_alloc_page_order(pool, gfp);

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
					       pool->alloc.cache);
	if (unlikely(!nr_pages))
		return NULL;

	/* Pages have been filled into the alloc.cache array, but the count is
	 * zero and the page elements have not been (possibly) DMA mapped.
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pool->alloc.cache[i];
		if ((pp_flags & PP_FLAG_DMA_MAP) &&
		    unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}

		page_pool_set_pp_info(pool, page);
		pool->alloc.cache[pool->alloc.count++] = page;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, page,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, slow);
	} else {
		page = NULL;
	}

	/* A page that was just allocated should/must have refcnt 1. */
	return page;
}

/* For using page_pool to replace alloc_pages() API calls, but with a
 * synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
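/* Usage sketch (editor's illustration, not part of the upstream file):
 * refilling an RX descriptor from NAPI/softirq context. "my_rxq", its
 * descriptor layout, "idx" and "headroom" are hypothetical; with
 * PP_FLAG_DMA_MAP the pool has already mapped the page, so the driver only
 * needs to fetch the DMA address.
 *
 *	struct page *page;
 *	dma_addr_t dma;
 *
 *	page = page_pool_alloc_pages(my_rxq->page_pool,
 *				     GFP_ATOMIC | __GFP_NOWARN);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *
 *	dma = page_pool_get_dma_addr(page) + my_rxq->headroom;
 *	my_rxq->desc[idx].addr = cpu_to_le64(dma);
 */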
/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))

s32 page_pool_inflight(const struct page_pool *pool, bool strict)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	if (strict) {
		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
		     inflight);
	} else {
		inflight = max(0, inflight);
	}

	return inflight;
}

static __always_inline
void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		return;

	dma = page_pool_get_dma_addr(page);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	page_pool_set_dma_addr(page, 0);
}

/* Disconnects a page (from a page_pool). API users can have a need
 * to disconnect a page (from a page_pool), to allow it to be used as
 * a regular page (that will eventually be returned to the normal
 * page-allocator via put_page).
 */
void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	int count;

	__page_pool_release_page_dma(pool, page);

	page_pool_clear_pp_info(page);

	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
	int ret;
	/* BH protection not needed if current is softirq */
	if (in_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	if (!ret) {
		recycle_stat_inc(pool, ring);
		return true;
	}

	return false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
		recycle_stat_inc(pool, cache_full);
		return false;
	}

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	recycle_stat_inc(pool, cached);
	return true;
}

static bool __page_pool_page_can_be_recycled(const struct page *page)
{
	return page_ref_count(page) == 1 && !page_is_pfmemalloc(page);
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
		     unsigned int dma_sync_size, bool allow_direct)
{
	lockdep_assert_no_hardirq();

	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns page, and can recycle it.
	 *
	 * The page is NOT reusable when it was allocated while the system
	 * was under memory pressure (page_is_pfmemalloc).
	 */
	if (likely(__page_pool_page_can_be_recycled(page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_softirq() &&
		    page_pool_recycle_in_cache(page, pool))
			return NULL;

		/* Page found as candidate for recycling */
		return page;
	}
	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once. In this elevated refcnt
	 * case, the DMA is unmapped/released, as the driver is likely
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page.
	 */
	recycle_stat_inc(pool, released_refcnt);
	page_pool_return_page(pool, page);

	return NULL;
}

void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
				unsigned int dma_sync_size, bool allow_direct)
{
	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
	if (page && !page_pool_recycle_in_ring(pool, page)) {
		/* Cache full, fallback to free pages */
		recycle_stat_inc(pool, ring_full);
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_put_unrefed_page);
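/* Usage sketch (editor's illustration, not part of the upstream file):
 * returning a buffer from the RX/NAPI path, e.g. after an XDP_DROP verdict.
 * page_pool_put_full_page() is a helpers.h wrapper that, for the last
 * reference, ends up in page_pool_put_unrefed_page() above. Passing
 * allow_direct=true is only safe from the pool's own NAPI/softirq context.
 * "struct my_rxq" is a hypothetical driver type.
 *
 *	static void my_rx_drop(struct my_rxq *rxq, struct page *page)
 *	{
 *		page_pool_put_full_page(rxq->page_pool, page, true);
 *	}
 */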
/**
 * page_pool_put_page_bulk() - release references on multiple pages
 * @pool:	pool from which pages were allocated
 * @data:	array holding page pointers
 * @count:	number of pages in @data
 *
 * Tries to refill a number of pages into the ptr_ring cache holding the
 * ptr_ring producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
 * will release leftover pages to the page allocator.
 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
 * completion loop for the XDP_REDIRECT use case.
 *
 * Please note the caller must not use the data area after running
 * page_pool_put_page_bulk(), as this function overwrites it.
 */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count)
{
	int i, bulk_len = 0;
	bool in_softirq;

	for (i = 0; i < count; i++) {
		struct page *page = virt_to_head_page(data[i]);

		/* It is not the last user for the page frag case */
		if (!page_pool_is_last_ref(page))
			continue;

		page = __page_pool_put_page(pool, page, -1, false);
		/* Approved for bulk recycling in ptr_ring cache */
		if (page)
			data[bulk_len++] = page;
	}

	if (unlikely(!bulk_len))
		return;

	/* Bulk producer into ptr_ring page_pool cache */
	in_softirq = page_pool_producer_lock(pool);
	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, data[i])) {
			/* ring full */
			recycle_stat_inc(pool, ring_full);
			break;
		}
	}
	recycle_stat_add(pool, ring, i);
	page_pool_producer_unlock(pool, in_softirq);

	/* Hopefully all pages were returned into the ptr_ring */
	if (likely(i == bulk_len))
		return;

	/* ptr_ring cache full, free remaining pages outside producer lock
	 * since put_page() with refcnt == 1 can be an expensive operation
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, data[i]);
}
EXPORT_SYMBOL(page_pool_put_page_bulk);
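/* Usage sketch (editor's illustration, not part of the upstream file):
 * batching returned frames in an XDP TX completion loop. Note that @data
 * holds the frames' virtual addresses (virt_to_head_page() is applied
 * above). "my_txq", "my_txq_next_done()" and the batch size of 16 are
 * hypothetical; in-tree callers typically batch through
 * xdp_return_frame_bulk() rather than calling this function directly.
 *
 *	void *bulk[16];
 *	struct xdp_frame *xdpf;
 *	int n = 0;
 *
 *	while (n < ARRAY_SIZE(bulk) && (xdpf = my_txq_next_done(my_txq)))
 *		bulk[n++] = xdpf->data;
 *
 *	if (n)
 *		page_pool_put_page_bulk(my_txq->page_pool, bulk, n);
 */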
static struct page *page_pool_drain_frag(struct page_pool *pool,
					 struct page *page)
{
	long drain_count = BIAS_MAX - pool->frag_users;

	/* Some user is still using the page frag */
	if (likely(page_pool_unref_page(page, drain_count)))
		return NULL;

	if (__page_pool_page_can_be_recycled(page)) {
		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page, -1);

		return page;
	}

	page_pool_return_page(pool, page);
	return NULL;
}

static void page_pool_free_frag(struct page_pool *pool)
{
	long drain_count = BIAS_MAX - pool->frag_users;
	struct page *page = pool->frag_page;

	pool->frag_page = NULL;

	if (!page || page_pool_unref_page(page, drain_count))
		return;

	page_pool_return_page(pool, page);
}

struct page *page_pool_alloc_frag(struct page_pool *pool,
				  unsigned int *offset,
				  unsigned int size, gfp_t gfp)
{
	unsigned int max_size = PAGE_SIZE << pool->p.order;
	struct page *page = pool->frag_page;

	if (WARN_ON(size > max_size))
		return NULL;

	size = ALIGN(size, dma_get_cache_alignment());
	*offset = pool->frag_offset;

	if (page && *offset + size > max_size) {
		page = page_pool_drain_frag(pool, page);
		if (page) {
			alloc_stat_inc(pool, fast);
			goto frag_reset;
		}
	}

	if (!page) {
		page = page_pool_alloc_pages(pool, gfp);
		if (unlikely(!page)) {
			pool->frag_page = NULL;
			return NULL;
		}

		pool->frag_page = page;

frag_reset:
		pool->frag_users = 1;
		*offset = 0;
		pool->frag_offset = size;
		page_pool_fragment_page(page, BIAS_MAX);
		return page;
	}

	pool->frag_users++;
	pool->frag_offset = *offset + size;
	alloc_stat_inc(pool, fast);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_frag);
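/* Usage sketch (editor's illustration, not part of the upstream file):
 * carving several small RX buffers out of one pooled page via the frag
 * API. "my_rxq", its descriptor layout, "idx" and the 2K buffer size are
 * hypothetical; @offset is filled in by page_pool_alloc_frag().
 *
 *	unsigned int offset;
 *	struct page *page;
 *
 *	page = page_pool_alloc_frag(my_rxq->page_pool, &offset, 2048,
 *				    GFP_ATOMIC | __GFP_NOWARN);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *
 *	my_rxq->desc[idx].addr =
 *		cpu_to_le64(page_pool_get_dma_addr(page) + offset);
 */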
static void page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		page_pool_return_page(pool, page);
	}
}

static void __page_pool_destroy(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	page_pool_unlist(pool);
	page_pool_uninit(pool);
	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty alloc cache, assume caller made sure this is
	 * no-longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool, true);
	if (!inflight)
		__page_pool_destroy(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	void *netdev;
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning for page pools the user can't see */
	netdev = READ_ONCE(pool->slow.netdev);
	if (time_after_eq(jiffies, pool->defer_warn) &&
	    (!netdev || netdev == NET_PTR_POISON)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
			__func__, pool->user.id, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
			   struct xdp_mem_info *mem)
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
	pool->xdp_mem_id = mem->id;
}

static void page_pool_disable_direct_recycling(struct page_pool *pool)
{
	/* Disable direct recycling based on pool->cpuid.
	 * Paired with READ_ONCE() in napi_pp_put_page().
	 */
	WRITE_ONCE(pool->cpuid, -1);

	if (!pool->p.napi)
		return;

	/* To avoid races with recycling and additional barriers make sure
	 * pool and NAPI are unlinked when NAPI is disabled.
	 */
	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
		READ_ONCE(pool->p.napi->list_owner) != -1);

	WRITE_ONCE(pool->p.napi, NULL);
}

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	page_pool_disable_direct_recycling(pool);
	page_pool_free_frag(pool);

	if (!page_pool_release(pool))
		return;

	page_pool_detached(pool);
	pool->defer_start = jiffies;
	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);

/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	struct page *page;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);
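/* Usage sketch (editor's illustration, not part of the upstream file):
 * RX queue teardown in a driver. The pool must no longer be used for
 * allocation once this runs; pages still in flight are handled by the
 * deferred release work above. "my_rxq" and my_drain_rx_ring() are
 * hypothetical.
 *
 *	napi_disable(&my_rxq->napi);
 *	my_drain_rx_ring(my_rxq);
 *	page_pool_destroy(my_rxq->page_pool);
 *	my_rxq->page_pool = NULL;
 */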