/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool/helpers.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>

#include <trace/events/page_pool.h>

#define DEFER_TIME		(msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL	(60 * HZ)

#define BIAS_MAX		LONG_MAX

#ifdef CONFIG_PAGE_POOL_STATS
/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)						\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_inc(s->__stat);					\
	} while (0)

#define recycle_stat_add(pool, __stat, val)					\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_add(s->__stat, val);					\
	} while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
	"rx_pp_alloc_fast",
	"rx_pp_alloc_slow",
	"rx_pp_alloc_slow_ho",
	"rx_pp_alloc_empty",
	"rx_pp_alloc_refill",
	"rx_pp_alloc_waive",
	"rx_pp_recycle_cached",
	"rx_pp_recycle_cache_full",
	"rx_pp_recycle_ring",
	"rx_pp_recycle_ring_full",
	"rx_pp_recycle_released_ref",
};

/**
 * page_pool_get_stats() - fetch page pool stats
 * @pool: pool from which page was allocated
 * @stats: struct page_pool_stats to fill in
 *
 * Retrieve statistics about the page_pool. This API is only available
 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
 * The caller passes in a pointer to a caller-allocated struct
 * page_pool_stats, which this function fills in. The caller can then
 * report those stats to the user (perhaps via ethtool, debugfs, etc.).
 */
bool page_pool_get_stats(struct page_pool *pool,
			 struct page_pool_stats *stats)
{
	int cpu = 0;

	if (!stats)
		return false;

	/* The caller is responsible for initializing stats. */
	stats->alloc_stats.fast += pool->alloc_stats.fast;
	stats->alloc_stats.slow += pool->alloc_stats.slow;
	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
	stats->alloc_stats.empty += pool->alloc_stats.empty;
	stats->alloc_stats.refill += pool->alloc_stats.refill;
	stats->alloc_stats.waive += pool->alloc_stats.waive;

	for_each_possible_cpu(cpu) {
		const struct page_pool_recycle_stats *pcpu =
			per_cpu_ptr(pool->recycle_stats, cpu);

		stats->recycle_stats.cached += pcpu->cached;
		stats->recycle_stats.cache_full += pcpu->cache_full;
		stats->recycle_stats.ring += pcpu->ring;
		stats->recycle_stats.ring_full += pcpu->ring_full;
		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
	}

	return true;
}
EXPORT_SYMBOL(page_pool_get_stats);
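
/* Illustrative sketch (not part of this file): a driver with a single
 * page_pool might wire these stats/ethtool helpers into its ethtool hooks
 * roughly as follows. The "my_netdev_priv" structure and "pp" member are
 * hypothetical driver-side names, not part of the page_pool API.
 *
 *	static void my_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static int my_get_sset_count(struct net_device *dev, int sset)
 *	{
 *		return sset == ETH_SS_STATS ?
 *		       page_pool_ethtool_stats_get_count() : -EOPNOTSUPP;
 *	}
 *
 *	static void my_get_ethtool_stats(struct net_device *dev,
 *					 struct ethtool_stats *es, u64 *data)
 *	{
 *		struct my_netdev_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats stats = { }; // caller must zero-init
 *
 *		if (page_pool_get_stats(priv->pp, &stats))
 *			data = page_pool_ethtool_stats_get(data, &stats);
 *	}
 */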

u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
		data += ETH_GSTRING_LEN;
	}

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);

int page_pool_ethtool_stats_get_count(void)
{
	return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);

u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
{
	struct page_pool_stats *pool_stats = stats;

	*data++ = pool_stats->alloc_stats.fast;
	*data++ = pool_stats->alloc_stats.slow;
	*data++ = pool_stats->alloc_stats.slow_high_order;
	*data++ = pool_stats->alloc_stats.empty;
	*data++ = pool_stats->alloc_stats.refill;
	*data++ = pool_stats->alloc_stats.waive;
	*data++ = pool_stats->recycle_stats.cached;
	*data++ = pool_stats->recycle_stats.cache_full;
	*data++ = pool_stats->recycle_stats.ring;
	*data++ = pool_stats->recycle_stats.ring_full;
	*data++ = pool_stats->recycle_stats.released_refcnt;

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);

#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif

static bool page_pool_producer_lock(struct page_pool *pool)
	__acquires(&pool->ring.producer_lock)
{
	bool in_softirq = in_softirq();

	if (in_softirq)
		spin_lock(&pool->ring.producer_lock);
	else
		spin_lock_bh(&pool->ring.producer_lock);

	return in_softirq;
}

static void page_pool_producer_unlock(struct page_pool *pool,
				      bool in_softirq)
	__releases(&pool->ring.producer_lock)
{
	if (in_softirq)
		spin_unlock(&pool->ring.producer_lock);
	else
		spin_unlock_bh(&pool->ring.producer_lock);
}

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, &params->fast, sizeof(pool->p));
	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL also allows the pages to be used for DMA
	 * transmit, which is the XDP_TX use-case.
	 */
	if (pool->p.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;
	}

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	pool->has_init_callback = !!pool->slow.init_callback;

#ifdef CONFIG_PAGE_POOL_STATS
	pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
	if (!pool->recycle_stats)
		return -ENOMEM;
#endif

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
		free_percpu(pool->recycle_stats);
#endif
		return -ENOMEM;
	}

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* A driver that calls page_pool_create() must also call
	 * page_pool_destroy().
	 */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

/**
 * page_pool_create() - create a page pool.
 * @params: parameters, see struct page_pool_params
 */
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}

	return pool;
}
EXPORT_SYMBOL(page_pool_create);
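
/* Illustrative sketch (not part of this file): a typical driver creates
 * one pool per RX queue during queue setup, letting the pool handle DMA
 * mapping and sync-for-device. The "my_rxq", "ring_size" and "pdev" names
 * and the headroom value are hypothetical, driver-specific assumptions.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= ring_size,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,
 *		.napi		= &my_rxq->napi,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.max_len	= PAGE_SIZE,
 *		.offset		= XDP_PACKET_HEADROOM,
 *	};
 *	struct page_pool *pp = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pp))
 *		return PTR_ERR(pp);
 *	my_rxq->page_pool = pp;
 */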

static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r)) {
		alloc_stat_inc(pool, empty);
		return NULL;
	}

	/* Softirq guarantees the CPU, and thus the NUMA node, is stable.
	 * This assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Refill alloc array, but only if NUMA match */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fall through to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			alloc_stat_inc(pool, waive);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, refill);
	}

	return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, fast);
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void page_pool_dma_sync_for_device(struct page_pool *pool,
					  struct page *page,
					  unsigned int dma_sync_size)
{
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}

static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always
	 * fit into page private data (i.e. a 32-bit CPU with 64-bit DMA caps).
	 * This mapping is kept for the lifetime of the page, until it leaves
	 * the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
						  DMA_ATTR_WEAK_ORDERING);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	if (page_pool_set_dma_addr(page, dma))
		goto unmap_failed;

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

	return true;

unmap_failed:
	WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	return false;
}

static void page_pool_set_pp_info(struct page_pool *pool,
				  struct page *page)
{
	page->pp = pool;
	page->pp_magic |= PP_SIGNATURE;

	/* Ensuring all pages have been split into one fragment initially:
	 * page_pool_set_pp_info() is only called once for every page when it
	 * is allocated from the page allocator and page_pool_fragment_page()
	 * is dirtying the same cache line as the page->pp_magic above, so
	 * the overhead is negligible.
	 */
	page_pool_fragment_page(page, 1);
	if (pool->has_init_callback)
		pool->slow.init_callback(page, pool->slow.init_arg);
}

static void page_pool_clear_pp_info(struct page *page)
{
	page->pp_magic = 0;
	page->pp = NULL;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
	    unlikely(!page_pool_dma_map(pool, page))) {
		put_page(page);
		return NULL;
	}

	alloc_stat_inc(pool, slow_high_order);
	page_pool_set_pp_info(pool, page);

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_flags = pool->p.flags;
	unsigned int pp_order = pool->p.order;
	struct page *page;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return __page_pool_alloc_page_order(pool, gfp);

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
					       pool->alloc.cache);
	if (unlikely(!nr_pages))
		return NULL;

	/* Pages have been filled into the alloc.cache array, but the count
	 * is zero and the pages have not (possibly) been DMA mapped yet.
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pool->alloc.cache[i];
		if ((pp_flags & PP_FLAG_DMA_MAP) &&
		    unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}

		page_pool_set_pp_info(pool, page);
		pool->alloc.cache[pool->alloc.count++] = page;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, page,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, slow);
	} else {
		page = NULL;
	}

	/* A page that was just allocated should/must have refcnt 1. */
	return page;
}

/* For using page_pool to replace alloc_pages() API calls, but with a
 * synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
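
/* Illustrative sketch (not part of this file): RX refill from a driver's
 * NAPI poll, using the helpers from <net/page_pool/helpers.h>. The
 * "my_rxq" structure and the descriptor setup are hypothetical.
 *
 *	struct page *page = page_pool_dev_alloc_pages(my_rxq->page_pool);
 *
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	// Hand the buffer to hardware at the configured offset.
 *	rx_desc->addr = page_pool_get_dma_addr(page) + my_rxq->rx_offset;
 */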

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))

static s32 page_pool_inflight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

	return inflight;
}

/* Disconnects a page (from a page_pool). API users can have a need
 * to disconnect a page (from a page_pool), to allow it to be used as
 * a regular page (that will eventually be returned to the normal
 * page-allocator via put_page).
 */
static void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;
	int count;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		goto skip_dma_unmap;

	dma = page_pool_get_dma_addr(page);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
	page_pool_clear_pp_info(page);

	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
	int ret;
	/* BH protection not needed if current is softirq */
	if (in_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	if (!ret) {
		recycle_stat_inc(pool, ring);
		return true;
	}

	return false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
		recycle_stat_inc(pool, cache_full);
		return false;
	}

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	recycle_stat_inc(pool, cached);
	return true;
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->p.max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
		     unsigned int dma_sync_size, bool allow_direct)
{
	lockdep_assert_no_hardirq();

	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns page, and can recycle it.
	 *
	 * A page is NOT reusable when it was allocated while the system
	 * was under some pressure (page_is_pfmemalloc).
	 */
	if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_softirq() &&
		    page_pool_recycle_in_cache(page, pool))
			return NULL;

		/* Page found as candidate for recycling */
		return page;
	}
	/* Fallback/non-XDP mode: API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once. In this elevated refcnt case,
	 * the DMA is unmapped/released, as the driver is likely doing
	 * refcnt based recycle tricks, meaning another process will be
	 * invoking put_page.
	 */
	recycle_stat_inc(pool, released_refcnt);
	page_pool_return_page(pool, page);

	return NULL;
}

void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
				  unsigned int dma_sync_size, bool allow_direct)
{
	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
	if (page && !page_pool_recycle_in_ring(pool, page)) {
		/* Cache full, fallback to free pages */
		recycle_stat_inc(pool, ring_full);
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_put_defragged_page);
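
/* Illustrative sketch (not part of this file): how a driver typically
 * hands buffers back. Both helpers below come from
 * <net/page_pool/helpers.h> and normally end up in
 * page_pool_put_defragged_page(); "my_rxq" is a hypothetical driver
 * structure.
 *
 *	// XDP_DROP (or error) while still inside this pool's RX-NAPI:
 *	// direct recycling into the lockless alloc cache is allowed.
 *	page_pool_recycle_direct(my_rxq->page_pool, page);
 *
 *	// Outside the pool's NAPI context (e.g. some error path):
 *	// recycle via the ptr_ring, requesting a full-length DMA sync.
 *	page_pool_put_full_page(my_rxq->page_pool, page, false);
 */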

/**
 * page_pool_put_page_bulk() - release references on multiple pages
 * @pool: pool from which pages were allocated
 * @data: array holding page pointers
 * @count: number of pages in @data
 *
 * Tries to refill a number of pages into the ptr_ring cache holding the
 * ptr_ring producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
 * will release leftover pages to the page allocator.
 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
 * completion loop for the XDP_REDIRECT use case.
 *
 * Please note the caller must not use the data area after running
 * page_pool_put_page_bulk(), as this function overwrites it.
 */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count)
{
	int i, bulk_len = 0;
	bool in_softirq;

	for (i = 0; i < count; i++) {
		struct page *page = virt_to_head_page(data[i]);

		/* It is not the last user for the page frag case */
		if (!page_pool_is_last_frag(page))
			continue;

		page = __page_pool_put_page(pool, page, -1, false);
		/* Approved for bulk recycling in ptr_ring cache */
		if (page)
			data[bulk_len++] = page;
	}

	if (unlikely(!bulk_len))
		return;

	/* Bulk producer into ptr_ring page_pool cache */
	in_softirq = page_pool_producer_lock(pool);
	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, data[i])) {
			/* ring full */
			recycle_stat_inc(pool, ring_full);
			break;
		}
	}
	recycle_stat_add(pool, ring, i);
	page_pool_producer_unlock(pool, in_softirq);

	/* Hopefully all pages were returned into the ptr_ring */
	if (likely(i == bulk_len))
		return;

	/* ptr_ring cache full, free remaining pages outside producer lock
	 * since put_page() with refcnt == 1 can be an expensive operation
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, data[i]);
}
EXPORT_SYMBOL(page_pool_put_page_bulk);

static struct page *page_pool_drain_frag(struct page_pool *pool,
					 struct page *page)
{
	long drain_count = BIAS_MAX - pool->frag_users;

	/* Some user is still using the page frag */
	if (likely(page_pool_defrag_page(page, drain_count)))
		return NULL;

	if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page, -1);

		return page;
	}

	page_pool_return_page(pool, page);
	return NULL;
}

static void page_pool_free_frag(struct page_pool *pool)
{
	long drain_count = BIAS_MAX - pool->frag_users;
	struct page *page = pool->frag_page;

	pool->frag_page = NULL;

	if (!page || page_pool_defrag_page(page, drain_count))
		return;

	page_pool_return_page(pool, page);
}

struct page *page_pool_alloc_frag(struct page_pool *pool,
				  unsigned int *offset,
				  unsigned int size, gfp_t gfp)
{
	unsigned int max_size = PAGE_SIZE << pool->p.order;
	struct page *page = pool->frag_page;

	if (WARN_ON(size > max_size))
		return NULL;

	size = ALIGN(size, dma_get_cache_alignment());
	*offset = pool->frag_offset;

	if (page && *offset + size > max_size) {
		page = page_pool_drain_frag(pool, page);
		if (page) {
			alloc_stat_inc(pool, fast);
			goto frag_reset;
		}
	}

	if (!page) {
		page = page_pool_alloc_pages(pool, gfp);
		if (unlikely(!page)) {
			pool->frag_page = NULL;
			return NULL;
		}

		pool->frag_page = page;

frag_reset:
		pool->frag_users = 1;
		*offset = 0;
		pool->frag_offset = size;
		page_pool_fragment_page(page, BIAS_MAX);
		return page;
	}

	pool->frag_users++;
	pool->frag_offset = *offset + size;
	alloc_stat_inc(pool, fast);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_frag);
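
/* Illustrative sketch (not part of this file): sub-page buffers via the
 * frag API, e.g. for an RX ring that packs several 2KB buffers into one
 * page. The buffer size and the "my_rxq" naming are hypothetical
 * assumptions.
 *
 *	unsigned int offset;
 *	struct page *page;
 *
 *	page = page_pool_dev_alloc_frag(my_rxq->page_pool, &offset, 2048);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *	rx_desc->addr = page_pool_get_dma_addr(page) + offset;
 *
 *	// On completion, release this fragment's reference; the page is
 *	// recycled once the last fragment user does its put.
 *	page_pool_put_page(my_rxq->page_pool, page, 2048, true);
 */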

static void page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		page_pool_return_page(pool, page);
	}
}

static void __page_pool_destroy(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
	free_percpu(pool->recycle_stats);
#endif
	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty alloc cache, assume caller made sure this is
	 * no-longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool);
	if (!inflight)
		__page_pool_destroy(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning */
	if (time_after_eq(jiffies, pool->defer_warn)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
			__func__, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
			   struct xdp_mem_info *mem)
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
	pool->xdp_mem_id = mem->id;
}

void page_pool_unlink_napi(struct page_pool *pool)
{
	if (!pool->p.napi)
		return;

	/* To avoid races with recycling and additional barriers make sure
	 * pool and NAPI are unlinked when NAPI is disabled.
	 */
	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
		READ_ONCE(pool->p.napi->list_owner) != -1);

	WRITE_ONCE(pool->p.napi, NULL);
}
EXPORT_SYMBOL(page_pool_unlink_napi);

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	page_pool_unlink_napi(pool);
	page_pool_free_frag(pool);

	if (!page_pool_release(pool))
		return;

	pool->defer_start = jiffies;
	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);
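
/* Illustrative sketch (not part of this file): teardown order on the
 * driver side. The pool must only be destroyed after NAPI is disabled
 * and no RX/TX path can touch it any more; in-flight pages keep the
 * pool alive until they are returned. "my_rxq" is a hypothetical driver
 * structure.
 *
 *	napi_disable(&my_rxq->napi);
 *	xdp_rxq_info_unreg(&my_rxq->xdp_rxq);	// if registered with this pool
 *	page_pool_destroy(my_rxq->page_pool);	// NULL-safe
 *	my_rxq->page_pool = NULL;
 */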

/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	struct page *page;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);
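
/* Illustrative sketch (not part of this file): a driver can retarget the
 * pool from its NAPI poll loop when the CPU (and thus NUMA node)
 * servicing the queue has moved; this is an assumed driver pattern using
 * the page_pool_nid_changed() helper from <net/page_pool/helpers.h>, not
 * a requirement of the API. "my_rxq" is hypothetical.
 *
 *	// e.g. at the top of the queue's NAPI poll:
 *	page_pool_nid_changed(my_rxq->page_pool, numa_mem_id());
 */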