/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/error-injection.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool/helpers.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>

#include <trace/events/page_pool.h>

#include "page_pool_priv.h"

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

#define BIAS_MAX	(LONG_MAX >> 1)

#ifdef CONFIG_PAGE_POOL_STATS
static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);

/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)					\
	do {								\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_inc(s->__stat);				\
	} while (0)

#define recycle_stat_add(pool, __stat, val)				\
	do {								\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_add(s->__stat, val);				\
	} while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
	"rx_pp_alloc_fast",
	"rx_pp_alloc_slow",
	"rx_pp_alloc_slow_ho",
	"rx_pp_alloc_empty",
	"rx_pp_alloc_refill",
	"rx_pp_alloc_waive",
	"rx_pp_recycle_cached",
	"rx_pp_recycle_cache_full",
	"rx_pp_recycle_ring",
	"rx_pp_recycle_ring_full",
	"rx_pp_recycle_released_ref",
};

/**
 * page_pool_get_stats() - fetch page pool stats
 * @pool:	pool from which page was allocated
 * @stats:	struct page_pool_stats to fill in
 *
 * Retrieve statistics about the page_pool. This API is only available
 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
 * A pointer to a caller-allocated struct page_pool_stats structure
 * is passed to this API which is filled in. The caller can then report
 * those stats to the user (perhaps via ethtool, debugfs, etc.).
 */
bool page_pool_get_stats(const struct page_pool *pool,
			 struct page_pool_stats *stats)
{
	int cpu = 0;

	if (!stats)
		return false;

	/* The caller is responsible for initializing stats. */
	stats->alloc_stats.fast += pool->alloc_stats.fast;
	stats->alloc_stats.slow += pool->alloc_stats.slow;
	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
	stats->alloc_stats.empty += pool->alloc_stats.empty;
	stats->alloc_stats.refill += pool->alloc_stats.refill;
	stats->alloc_stats.waive += pool->alloc_stats.waive;

	for_each_possible_cpu(cpu) {
		const struct page_pool_recycle_stats *pcpu =
			per_cpu_ptr(pool->recycle_stats, cpu);

		stats->recycle_stats.cached += pcpu->cached;
		stats->recycle_stats.cache_full += pcpu->cache_full;
		stats->recycle_stats.ring += pcpu->ring;
		stats->recycle_stats.ring_full += pcpu->ring_full;
		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
	}

	return true;
}
EXPORT_SYMBOL(page_pool_get_stats);

u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
		data += ETH_GSTRING_LEN;
	}

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);

int page_pool_ethtool_stats_get_count(void)
{
	return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);

u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
{
	const struct page_pool_stats *pool_stats = stats;

	*data++ = pool_stats->alloc_stats.fast;
	*data++ = pool_stats->alloc_stats.slow;
	*data++ = pool_stats->alloc_stats.slow_high_order;
	*data++ = pool_stats->alloc_stats.empty;
	*data++ = pool_stats->alloc_stats.refill;
	*data++ = pool_stats->alloc_stats.waive;
	*data++ = pool_stats->recycle_stats.cached;
	*data++ = pool_stats->recycle_stats.cache_full;
	*data++ = pool_stats->recycle_stats.ring;
	*data++ = pool_stats->recycle_stats.ring_full;
	*data++ = pool_stats->recycle_stats.released_refcnt;

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);

#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif
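/* Illustrative sketch (not compiled, not part of the page_pool API): how a
 * driver might wire the helpers above into its ethtool callbacks. The
 * "example_priv" structure and the example_* function names are assumptions
 * made for this sketch only.
 */
#if 0
struct example_priv {
	struct page_pool *page_pool;	/* RX page pool owned by the driver */
};

static void example_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	if (stringset == ETH_SS_STATS)
		data = page_pool_ethtool_stats_get_strings(data);
}

static int example_get_sset_count(struct net_device *dev, int sset)
{
	return sset == ETH_SS_STATS ? page_pool_ethtool_stats_get_count() : 0;
}

static void example_get_ethtool_stats(struct net_device *dev,
				      struct ethtool_stats *stats, u64 *data)
{
	struct example_priv *priv = netdev_priv(dev);
	struct page_pool_stats pp_stats = {};	/* must start zeroed */

	/* Accumulate the pool's counters, then flatten them for ethtool */
	if (page_pool_get_stats(priv->page_pool, &pp_stats))
		data = page_pool_ethtool_stats_get(data, &pp_stats);
}
#endif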
static bool page_pool_producer_lock(struct page_pool *pool)
	__acquires(&pool->ring.producer_lock)
{
	bool in_softirq = in_softirq();

	if (in_softirq)
		spin_lock(&pool->ring.producer_lock);
	else
		spin_lock_bh(&pool->ring.producer_lock);

	return in_softirq;
}

static void page_pool_producer_unlock(struct page_pool *pool,
				      bool in_softirq)
	__releases(&pool->ring.producer_lock)
{
	if (in_softirq)
		spin_unlock(&pool->ring.producer_lock);
	else
		spin_unlock_bh(&pool->ring.producer_lock);
}

static void page_pool_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, 4 * sizeof(long));
}

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params,
			  int cpuid)
{
	unsigned int ring_qsize = 1024; /* Default */

	page_pool_struct_check();

	memcpy(&pool->p, &params->fast, sizeof(pool->p));
	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));

	pool->cpuid = cpuid;

	/* Validate only known flags were used */
	if (pool->slow.flags & ~PP_FLAG_ALL)
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows a page to also be used for DMA sending,
	 * which is the XDP_TX use-case.
	 */
	if (pool->slow.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;

		pool->dma_map = true;
	}

	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		pool->dma_sync = true;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	pool->has_init_callback = !!pool->slow.init_callback;

#ifdef CONFIG_PAGE_POOL_STATS
	if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
		if (!pool->recycle_stats)
			return -ENOMEM;
	} else {
		/* For system page pool instances we use a singular stats object
		 * instead of allocating a separate percpu variable for each
		 * (also percpu) page pool instance.
		 */
		pool->recycle_stats = &pp_system_recycle_stats;
		pool->system = true;
	}
#endif

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
		if (!pool->system)
			free_percpu(pool->recycle_stats);
#endif
		return -ENOMEM;
	}

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* A driver calling page_pool_create() must also call page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->dma_map)
		get_device(pool->p.dev);

	return 0;
}

static void page_pool_uninit(struct page_pool *pool)
{
	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->dma_map)
		put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
	if (!pool->system)
		free_percpu(pool->recycle_stats);
#endif
}
/**
 * page_pool_create_percpu() - create a page pool for a given cpu.
 * @params: parameters, see struct page_pool_params
 * @cpuid: cpu identifier
 */
struct page_pool *
page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params, cpuid);
	if (err < 0)
		goto err_free;

	err = page_pool_list(pool);
	if (err)
		goto err_uninit;

	return pool;

err_uninit:
	page_pool_uninit(pool);
err_free:
	pr_warn("%s() gave up with errno %d\n", __func__, err);
	kfree(pool);
	return ERR_PTR(err);
}
EXPORT_SYMBOL(page_pool_create_percpu);

/**
 * page_pool_create() - create a page pool
 * @params: parameters, see struct page_pool_params
 */
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	return page_pool_create_percpu(params, -1);
}
EXPORT_SYMBOL(page_pool_create);
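/* Illustrative sketch (not compiled): creating an RX page pool that keeps
 * pages DMA-mapped and lets the core perform dma-sync-for-device, matching
 * the PP_FLAG_DMA_MAP / PP_FLAG_DMA_SYNC_DEV checks in page_pool_init()
 * above. The ring size, offset and the example_* name are assumptions made
 * for this sketch only.
 */
#if 0
static struct page_pool *example_create_rx_pool(struct device *dma_dev,
						struct napi_struct *napi)
{
	struct page_pool_params pp_params = {
		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
		.order		= 0,			/* single pages */
		.pool_size	= 1024,			/* ptr_ring entries */
		.nid		= NUMA_NO_NODE,
		.dev		= dma_dev,		/* device doing the DMA */
		.napi		= napi,			/* enables direct recycling */
		.dma_dir	= DMA_FROM_DEVICE,	/* XDP_TX would need DMA_BIDIRECTIONAL */
		.max_len	= PAGE_SIZE,		/* sync length for DMA_SYNC_DEV */
		.offset		= 0,			/* headroom before RX data */
	};
	struct page_pool *pool;

	pool = page_pool_create(&pp_params);
	if (IS_ERR(pool))
		return NULL;	/* or propagate PTR_ERR(pool) */

	return pool;
}
#endif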
static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r)) {
		alloc_stat_inc(pool, empty);
		return NULL;
	}

	/* Softirq guarantees the CPU and thus the NUMA node are stable. This
	 * assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Refill alloc array, but only if NUMA match */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fall through to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			alloc_stat_inc(pool, waive);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, refill);
	}

	return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, fast);
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
					    const struct page *page,
					    u32 dma_sync_size)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
				     dma_sync_size, pool->p.dma_dir);
#endif
}

static __always_inline void
page_pool_dma_sync_for_device(const struct page_pool *pool,
			      const struct page *page,
			      u32 dma_sync_size)
{
	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
		__page_pool_dma_sync_for_device(pool, page, dma_sync_size);
}

static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e. 32bit cpu with 64bit DMA caps).
	 * This mapping is kept for the lifetime of the page, until it leaves
	 * the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
						  DMA_ATTR_WEAK_ORDERING);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	if (page_pool_set_dma_addr(page, dma))
		goto unmap_failed;

	page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

	return true;

unmap_failed:
	WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	return false;
}

static void page_pool_set_pp_info(struct page_pool *pool,
				  struct page *page)
{
	page->pp = pool;
	page->pp_magic |= PP_SIGNATURE;

	/* Ensuring all pages have been split into one fragment initially:
	 * page_pool_set_pp_info() is only called once for every page when it
	 * is allocated from the page allocator and page_pool_fragment_page()
	 * is dirtying the same cache line as the page->pp_magic above, so
	 * the overhead is negligible.
	 */
	page_pool_fragment_page(page, 1);
	if (pool->has_init_callback)
		pool->slow.init_callback(page, pool->slow.init_arg);
}

static void page_pool_clear_pp_info(struct page *page)
{
	page->pp_magic = 0;
	page->pp = NULL;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page))) {
		put_page(page);
		return NULL;
	}

	alloc_stat_inc(pool, slow_high_order);
	page_pool_set_pp_info(pool, page);

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_order = pool->p.order;
	bool dma_map = pool->dma_map;
	struct page *page;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return __page_pool_alloc_page_order(pool, gfp);

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
					       pool->alloc.cache);
	if (unlikely(!nr_pages))
		return NULL;

	/* Pages have been filled into the alloc.cache array, but the count is
	 * zero and the page elements have not been (possibly) DMA mapped.
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pool->alloc.cache[i];
		if (dma_map && unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}

		page_pool_set_pp_info(pool, page);
		pool->alloc.cache[pool->alloc.count++] = page;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, page,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, slow);
	} else {
		page = NULL;
	}

	/* A page just allocated should/must have refcnt 1. */
	return page;
}
561 */ 562 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) 563 { 564 struct page *page; 565 566 /* Fast-path: Get a page from cache */ 567 page = __page_pool_get_cached(pool); 568 if (page) 569 return page; 570 571 /* Slow-path: cache empty, do real allocation */ 572 page = __page_pool_alloc_pages_slow(pool, gfp); 573 return page; 574 } 575 EXPORT_SYMBOL(page_pool_alloc_pages); 576 ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL); 577 578 /* Calculate distance between two u32 values, valid if distance is below 2^(31) 579 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution 580 */ 581 #define _distance(a, b) (s32)((a) - (b)) 582 583 s32 page_pool_inflight(const struct page_pool *pool, bool strict) 584 { 585 u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); 586 u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); 587 s32 inflight; 588 589 inflight = _distance(hold_cnt, release_cnt); 590 591 if (strict) { 592 trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); 593 WARN(inflight < 0, "Negative(%d) inflight packet-pages", 594 inflight); 595 } else { 596 inflight = max(0, inflight); 597 } 598 599 return inflight; 600 } 601 602 static __always_inline 603 void __page_pool_release_page_dma(struct page_pool *pool, struct page *page) 604 { 605 dma_addr_t dma; 606 607 if (!pool->dma_map) 608 /* Always account for inflight pages, even if we didn't 609 * map them 610 */ 611 return; 612 613 dma = page_pool_get_dma_addr(page); 614 615 /* When page is unmapped, it cannot be returned to our pool */ 616 dma_unmap_page_attrs(pool->p.dev, dma, 617 PAGE_SIZE << pool->p.order, pool->p.dma_dir, 618 DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); 619 page_pool_set_dma_addr(page, 0); 620 } 621 622 /* Disconnects a page (from a page_pool). API users can have a need 623 * to disconnect a page (from a page_pool), to allow it to be used as 624 * a regular page (that will eventually be returned to the normal 625 * page-allocator via put_page). 626 */ 627 void page_pool_return_page(struct page_pool *pool, struct page *page) 628 { 629 int count; 630 631 __page_pool_release_page_dma(pool, page); 632 633 page_pool_clear_pp_info(page); 634 635 /* This may be the last page returned, releasing the pool, so 636 * it is not safe to reference pool afterwards. 637 */ 638 count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); 639 trace_page_pool_state_release(pool, page, count); 640 641 put_page(page); 642 /* An optimization would be to call __free_pages(page, pool->p.order) 643 * knowing page is not part of page-cache (thus avoiding a 644 * __page_cache_release() call). 645 */ 646 } 647 648 static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) 649 { 650 int ret; 651 /* BH protection not needed if current is softirq */ 652 if (in_softirq()) 653 ret = ptr_ring_produce(&pool->ring, page); 654 else 655 ret = ptr_ring_produce_bh(&pool->ring, page); 656 657 if (!ret) { 658 recycle_stat_inc(pool, ring); 659 return true; 660 } 661 662 return false; 663 } 664 665 /* Only allow direct recycling in special circumstances, into the 666 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. 667 * 668 * Caller must provide appropriate safe context. 
669 */ 670 static bool page_pool_recycle_in_cache(struct page *page, 671 struct page_pool *pool) 672 { 673 if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { 674 recycle_stat_inc(pool, cache_full); 675 return false; 676 } 677 678 /* Caller MUST have verified/know (page_ref_count(page) == 1) */ 679 pool->alloc.cache[pool->alloc.count++] = page; 680 recycle_stat_inc(pool, cached); 681 return true; 682 } 683 684 static bool __page_pool_page_can_be_recycled(const struct page *page) 685 { 686 return page_ref_count(page) == 1 && !page_is_pfmemalloc(page); 687 } 688 689 /* If the page refcnt == 1, this will try to recycle the page. 690 * If pool->dma_sync is set, we'll try to sync the DMA area for 691 * the configured size min(dma_sync_size, pool->max_len). 692 * If the page refcnt != 1, then the page will be returned to memory 693 * subsystem. 694 */ 695 static __always_inline struct page * 696 __page_pool_put_page(struct page_pool *pool, struct page *page, 697 unsigned int dma_sync_size, bool allow_direct) 698 { 699 lockdep_assert_no_hardirq(); 700 701 /* This allocator is optimized for the XDP mode that uses 702 * one-frame-per-page, but have fallbacks that act like the 703 * regular page allocator APIs. 704 * 705 * refcnt == 1 means page_pool owns page, and can recycle it. 706 * 707 * page is NOT reusable when allocated when system is under 708 * some pressure. (page_is_pfmemalloc) 709 */ 710 if (likely(__page_pool_page_can_be_recycled(page))) { 711 /* Read barrier done in page_ref_count / READ_ONCE */ 712 713 page_pool_dma_sync_for_device(pool, page, dma_sync_size); 714 715 if (allow_direct && page_pool_recycle_in_cache(page, pool)) 716 return NULL; 717 718 /* Page found as candidate for recycling */ 719 return page; 720 } 721 /* Fallback/non-XDP mode: API user have elevated refcnt. 722 * 723 * Many drivers split up the page into fragments, and some 724 * want to keep doing this to save memory and do refcnt based 725 * recycling. Support this use case too, to ease drivers 726 * switching between XDP/non-XDP. 727 * 728 * In-case page_pool maintains the DMA mapping, API user must 729 * call page_pool_put_page once. In this elevated refcnt 730 * case, the DMA is unmapped/released, as driver is likely 731 * doing refcnt based recycle tricks, meaning another process 732 * will be invoking put_page. 733 */ 734 recycle_stat_inc(pool, released_refcnt); 735 page_pool_return_page(pool, page); 736 737 return NULL; 738 } 739 740 static bool page_pool_napi_local(const struct page_pool *pool) 741 { 742 const struct napi_struct *napi; 743 u32 cpuid; 744 745 if (unlikely(!in_softirq())) 746 return false; 747 748 /* Allow direct recycle if we have reasons to believe that we are 749 * in the same context as the consumer would run, so there's 750 * no possible race. 751 * __page_pool_put_page() makes sure we're not in hardirq context 752 * and interrupts are enabled prior to accessing the cache. 
753 */ 754 cpuid = smp_processor_id(); 755 if (READ_ONCE(pool->cpuid) == cpuid) 756 return true; 757 758 napi = READ_ONCE(pool->p.napi); 759 760 return napi && READ_ONCE(napi->list_owner) == cpuid; 761 } 762 763 void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, 764 unsigned int dma_sync_size, bool allow_direct) 765 { 766 if (!allow_direct) 767 allow_direct = page_pool_napi_local(pool); 768 769 page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); 770 if (page && !page_pool_recycle_in_ring(pool, page)) { 771 /* Cache full, fallback to free pages */ 772 recycle_stat_inc(pool, ring_full); 773 page_pool_return_page(pool, page); 774 } 775 } 776 EXPORT_SYMBOL(page_pool_put_unrefed_page); 777 778 /** 779 * page_pool_put_page_bulk() - release references on multiple pages 780 * @pool: pool from which pages were allocated 781 * @data: array holding page pointers 782 * @count: number of pages in @data 783 * 784 * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring 785 * producer lock. If the ptr_ring is full, page_pool_put_page_bulk() 786 * will release leftover pages to the page allocator. 787 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx 788 * completion loop for the XDP_REDIRECT use case. 789 * 790 * Please note the caller must not use data area after running 791 * page_pool_put_page_bulk(), as this function overwrites it. 792 */ 793 void page_pool_put_page_bulk(struct page_pool *pool, void **data, 794 int count) 795 { 796 int i, bulk_len = 0; 797 bool allow_direct; 798 bool in_softirq; 799 800 allow_direct = page_pool_napi_local(pool); 801 802 for (i = 0; i < count; i++) { 803 struct page *page = virt_to_head_page(data[i]); 804 805 /* It is not the last user for the page frag case */ 806 if (!page_pool_is_last_ref(page)) 807 continue; 808 809 page = __page_pool_put_page(pool, page, -1, allow_direct); 810 /* Approved for bulk recycling in ptr_ring cache */ 811 if (page) 812 data[bulk_len++] = page; 813 } 814 815 if (!bulk_len) 816 return; 817 818 /* Bulk producer into ptr_ring page_pool cache */ 819 in_softirq = page_pool_producer_lock(pool); 820 for (i = 0; i < bulk_len; i++) { 821 if (__ptr_ring_produce(&pool->ring, data[i])) { 822 /* ring full */ 823 recycle_stat_inc(pool, ring_full); 824 break; 825 } 826 } 827 recycle_stat_add(pool, ring, i); 828 page_pool_producer_unlock(pool, in_softirq); 829 830 /* Hopefully all pages was return into ptr_ring */ 831 if (likely(i == bulk_len)) 832 return; 833 834 /* ptr_ring cache full, free remaining pages outside producer lock 835 * since put_page() with refcnt == 1 can be an expensive operation 836 */ 837 for (; i < bulk_len; i++) 838 page_pool_return_page(pool, data[i]); 839 } 840 EXPORT_SYMBOL(page_pool_put_page_bulk); 841 842 static struct page *page_pool_drain_frag(struct page_pool *pool, 843 struct page *page) 844 { 845 long drain_count = BIAS_MAX - pool->frag_users; 846 847 /* Some user is still using the page frag */ 848 if (likely(page_pool_unref_page(page, drain_count))) 849 return NULL; 850 851 if (__page_pool_page_can_be_recycled(page)) { 852 page_pool_dma_sync_for_device(pool, page, -1); 853 return page; 854 } 855 856 page_pool_return_page(pool, page); 857 return NULL; 858 } 859 860 static void page_pool_free_frag(struct page_pool *pool) 861 { 862 long drain_count = BIAS_MAX - pool->frag_users; 863 struct page *page = pool->frag_page; 864 865 pool->frag_page = NULL; 866 867 if (!page || page_pool_unref_page(page, drain_count)) 868 return; 869 870 
static struct page *page_pool_drain_frag(struct page_pool *pool,
					 struct page *page)
{
	long drain_count = BIAS_MAX - pool->frag_users;

	/* Some user is still using the page frag */
	if (likely(page_pool_unref_page(page, drain_count)))
		return NULL;

	if (__page_pool_page_can_be_recycled(page)) {
		page_pool_dma_sync_for_device(pool, page, -1);
		return page;
	}

	page_pool_return_page(pool, page);
	return NULL;
}

static void page_pool_free_frag(struct page_pool *pool)
{
	long drain_count = BIAS_MAX - pool->frag_users;
	struct page *page = pool->frag_page;

	pool->frag_page = NULL;

	if (!page || page_pool_unref_page(page, drain_count))
		return;

	page_pool_return_page(pool, page);
}

struct page *page_pool_alloc_frag(struct page_pool *pool,
				  unsigned int *offset,
				  unsigned int size, gfp_t gfp)
{
	unsigned int max_size = PAGE_SIZE << pool->p.order;
	struct page *page = pool->frag_page;

	if (WARN_ON(size > max_size))
		return NULL;

	size = ALIGN(size, dma_get_cache_alignment());
	*offset = pool->frag_offset;

	if (page && *offset + size > max_size) {
		page = page_pool_drain_frag(pool, page);
		if (page) {
			alloc_stat_inc(pool, fast);
			goto frag_reset;
		}
	}

	if (!page) {
		page = page_pool_alloc_pages(pool, gfp);
		if (unlikely(!page)) {
			pool->frag_page = NULL;
			return NULL;
		}

		pool->frag_page = page;

frag_reset:
		pool->frag_users = 1;
		*offset = 0;
		pool->frag_offset = size;
		page_pool_fragment_page(page, BIAS_MAX);
		return page;
	}

	pool->frag_users++;
	pool->frag_offset = *offset + size;
	alloc_stat_inc(pool, fast);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_frag);
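/* Illustrative sketch (not compiled): carving small RX buffers out of shared
 * pages with page_pool_alloc_frag(). The 2KB buffer size and the example_*
 * name are assumptions made for this sketch only; the returned DMA address
 * is only valid when the pool was created with PP_FLAG_DMA_MAP.
 */
#if 0
static dma_addr_t example_alloc_rx_frag(struct page_pool *pool)
{
	unsigned int offset;
	struct page *page;

	page = page_pool_alloc_frag(pool, &offset, 2048, GFP_ATOMIC);
	if (!page)
		return 0;

	/* DMA address of this fragment within the shared page */
	return page_pool_get_dma_addr(page) + offset;
}
#endif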
967 */ 968 page_pool_empty_ring(pool); 969 } 970 971 static int page_pool_release(struct page_pool *pool) 972 { 973 int inflight; 974 975 page_pool_scrub(pool); 976 inflight = page_pool_inflight(pool, true); 977 if (!inflight) 978 __page_pool_destroy(pool); 979 980 return inflight; 981 } 982 983 static void page_pool_release_retry(struct work_struct *wq) 984 { 985 struct delayed_work *dwq = to_delayed_work(wq); 986 struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); 987 void *netdev; 988 int inflight; 989 990 inflight = page_pool_release(pool); 991 if (!inflight) 992 return; 993 994 /* Periodic warning for page pools the user can't see */ 995 netdev = READ_ONCE(pool->slow.netdev); 996 if (time_after_eq(jiffies, pool->defer_warn) && 997 (!netdev || netdev == NET_PTR_POISON)) { 998 int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; 999 1000 pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n", 1001 __func__, pool->user.id, inflight, sec); 1002 pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; 1003 } 1004 1005 /* Still not ready to be disconnected, retry later */ 1006 schedule_delayed_work(&pool->release_dw, DEFER_TIME); 1007 } 1008 1009 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), 1010 const struct xdp_mem_info *mem) 1011 { 1012 refcount_inc(&pool->user_cnt); 1013 pool->disconnect = disconnect; 1014 pool->xdp_mem_id = mem->id; 1015 } 1016 1017 static void page_pool_disable_direct_recycling(struct page_pool *pool) 1018 { 1019 /* Disable direct recycling based on pool->cpuid. 1020 * Paired with READ_ONCE() in page_pool_napi_local(). 1021 */ 1022 WRITE_ONCE(pool->cpuid, -1); 1023 1024 if (!pool->p.napi) 1025 return; 1026 1027 /* To avoid races with recycling and additional barriers make sure 1028 * pool and NAPI are unlinked when NAPI is disabled. 1029 */ 1030 WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) || 1031 READ_ONCE(pool->p.napi->list_owner) != -1); 1032 1033 WRITE_ONCE(pool->p.napi, NULL); 1034 } 1035 1036 void page_pool_destroy(struct page_pool *pool) 1037 { 1038 if (!pool) 1039 return; 1040 1041 if (!page_pool_put(pool)) 1042 return; 1043 1044 page_pool_disable_direct_recycling(pool); 1045 page_pool_free_frag(pool); 1046 1047 if (!page_pool_release(pool)) 1048 return; 1049 1050 page_pool_detached(pool); 1051 pool->defer_start = jiffies; 1052 pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; 1053 1054 INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); 1055 schedule_delayed_work(&pool->release_dw, DEFER_TIME); 1056 } 1057 EXPORT_SYMBOL(page_pool_destroy); 1058 1059 /* Caller must provide appropriate safe context, e.g. NAPI. */ 1060 void page_pool_update_nid(struct page_pool *pool, int new_nid) 1061 { 1062 struct page *page; 1063 1064 trace_page_pool_update_nid(pool, new_nid); 1065 pool->p.nid = new_nid; 1066 1067 /* Flush pool alloc cache, as refill will check NUMA node */ 1068 while (pool->alloc.count) { 1069 page = pool->alloc.cache[--pool->alloc.count]; 1070 page_pool_return_page(pool, page); 1071 } 1072 } 1073 EXPORT_SYMBOL(page_pool_update_nid); 1074