/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>

#include <trace/events/page_pool.h>

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

#define BIAS_MAX	LONG_MAX

#ifdef CONFIG_PAGE_POOL_STATS
/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)						\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_inc(s->__stat);					\
	} while (0)

#define recycle_stat_add(pool, __stat, val)					\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_add(s->__stat, val);					\
	} while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
	"rx_pp_alloc_fast",
	"rx_pp_alloc_slow",
	"rx_pp_alloc_slow_ho",
	"rx_pp_alloc_empty",
	"rx_pp_alloc_refill",
	"rx_pp_alloc_waive",
	"rx_pp_recycle_cached",
	"rx_pp_recycle_cache_full",
	"rx_pp_recycle_ring",
	"rx_pp_recycle_ring_full",
	"rx_pp_recycle_released_ref",
};

bool page_pool_get_stats(struct page_pool *pool,
			 struct page_pool_stats *stats)
{
	int cpu = 0;

	if (!stats)
		return false;

	/* The caller is responsible for initializing stats. */
	stats->alloc_stats.fast += pool->alloc_stats.fast;
	stats->alloc_stats.slow += pool->alloc_stats.slow;
	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
	stats->alloc_stats.empty += pool->alloc_stats.empty;
	stats->alloc_stats.refill += pool->alloc_stats.refill;
	stats->alloc_stats.waive += pool->alloc_stats.waive;

	for_each_possible_cpu(cpu) {
		const struct page_pool_recycle_stats *pcpu =
			per_cpu_ptr(pool->recycle_stats, cpu);

		stats->recycle_stats.cached += pcpu->cached;
		stats->recycle_stats.cache_full += pcpu->cache_full;
		stats->recycle_stats.ring += pcpu->ring;
		stats->recycle_stats.ring_full += pcpu->ring_full;
		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
	}

	return true;
}
EXPORT_SYMBOL(page_pool_get_stats);

u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
		data += ETH_GSTRING_LEN;
	}

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);

int page_pool_ethtool_stats_get_count(void)
{
	return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);

u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
{
	struct page_pool_stats *pool_stats = stats;

	*data++ = pool_stats->alloc_stats.fast;
	*data++ = pool_stats->alloc_stats.slow;
	*data++ = pool_stats->alloc_stats.slow_high_order;
	*data++ = pool_stats->alloc_stats.empty;
	*data++ = pool_stats->alloc_stats.refill;
	*data++ = pool_stats->alloc_stats.waive;
	*data++ = pool_stats->recycle_stats.cached;
	*data++ = pool_stats->recycle_stats.cache_full;
	*data++ = pool_stats->recycle_stats.ring;
	*data++ = pool_stats->recycle_stats.ring_full;
	*data++ = pool_stats->recycle_stats.released_refcnt;

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);

#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif

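/* Example: wiring these statistics into a driver's ethtool callbacks.
 * A minimal, illustrative sketch; the "my_priv" structure and the
 * per-RX-queue pool layout are hypothetical, not part of this file:
 *
 *	static void my_get_ethtool_stats(struct net_device *dev,
 *					 struct ethtool_stats *es, u64 *data)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats stats = { }; // caller must zero this
 *		int i;
 *
 *		for (i = 0; i < priv->num_rx_queues; i++)
 *			page_pool_get_stats(priv->rxq[i].page_pool, &stats);
 *
 *		data = page_pool_ethtool_stats_get(data, &stats);
 *	}
 *
 * page_pool_ethtool_stats_get_count() would be added to the driver's
 * .get_sset_count and page_pool_ethtool_stats_get_strings() called from
 * .get_strings, keeping the string table above and the u64 values in sync.
 */
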
static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, params, sizeof(pool->p));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * transmit, which is the XDP_TX use-case.
	 */
	if (pool->p.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;
	}

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
	    pool->p.flags & PP_FLAG_PAGE_FRAG)
		return -EINVAL;

#ifdef CONFIG_PAGE_POOL_STATS
	pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
	if (!pool->recycle_stats)
		return -ENOMEM;
#endif

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
		/* Don't leak the percpu stats on the error path */
		free_percpu(pool->recycle_stats);
#endif
		return -ENOMEM;
	}

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* Drivers calling page_pool_create() must also call page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}

	return pool;
}
EXPORT_SYMBOL(page_pool_create);

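/* Example: a typical RX-side setup in a driver. A minimal sketch; the
 * device pointer, headroom and ring size below are illustrative
 * assumptions, not requirements of this API:
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= 256,	// roughly the RX ring size
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= dev->dev.parent,
 *		.dma_dir	= DMA_FROM_DEVICE, // DMA_BIDIRECTIONAL for XDP_TX
 *		.offset		= XDP_PACKET_HEADROOM,
 *		.max_len	= PAGE_SIZE - XDP_PACKET_HEADROOM,
 *	};
 *	struct page_pool *pool = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 *
 * With PP_FLAG_DMA_MAP the pool keeps pages DMA-mapped for their whole
 * lifetime; with PP_FLAG_DMA_SYNC_DEV it syncs up to max_len bytes
 * starting at offset before handing a recycled page back to the device.
 */
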
static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r)) {
		alloc_stat_inc(pool, empty);
		return NULL;
	}

	/* Softirq guarantees the CPU, and thus the NUMA node, is stable.
	 * This assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Refill alloc array, but only if NUMA match */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fall through to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			alloc_stat_inc(pool, waive);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, refill);
	}

	return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, fast);
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void page_pool_dma_sync_for_device(struct page_pool *pool,
					  struct page *page,
					  unsigned int dma_sync_size)
{
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}

static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e. 32-bit CPU with 64-bit DMA caps).
	 * This mapping is kept for the lifetime of the page, until it leaves
	 * the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	page_pool_set_dma_addr(page, dma);

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

	return true;
}

static void page_pool_set_pp_info(struct page_pool *pool,
				  struct page *page)
{
	page->pp = pool;
	page->pp_magic |= PP_SIGNATURE;
	if (pool->p.init_callback)
		pool->p.init_callback(page, pool->p.init_arg);
}

static void page_pool_clear_pp_info(struct page *page)
{
	page->pp_magic = 0;
	page->pp = NULL;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
	    unlikely(!page_pool_dma_map(pool, page))) {
		put_page(page);
		return NULL;
	}

	alloc_stat_inc(pool, slow_high_order);
	page_pool_set_pp_info(pool, page);

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_flags = pool->p.flags;
	unsigned int pp_order = pool->p.order;
	struct page *page;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return __page_pool_alloc_page_order(pool, gfp);

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
					       pool->alloc.cache);
	if (unlikely(!nr_pages))
		return NULL;

	/* Pages have been filled into the alloc.cache array, but the count
	 * is zero and the pages have not yet been DMA mapped (if needed).
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pool->alloc.cache[i];
		if ((pp_flags & PP_FLAG_DMA_MAP) &&
		    unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}

		page_pool_set_pp_info(pool, page);
		pool->alloc.cache[pool->alloc.count++] = page;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, page,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, slow);
	} else {
		page = NULL;
	}

	/* A page that was just allocated must have refcnt == 1. */
	return page;
}

/* For using page_pool to replace alloc_pages() API calls, while providing
 * a synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);

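/* Example: refilling an RX ring from NAPI context. A minimal sketch;
 * "rxq", "desc" and the descriptor layout are hypothetical driver
 * structures used only for illustration:
 *
 *	struct page *page;
 *	dma_addr_t dma;
 *
 *	page = page_pool_alloc_pages(rxq->page_pool,
 *				     GFP_ATOMIC | __GFP_NOWARN);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *
 *	// With PP_FLAG_DMA_MAP the pool already holds the mapping.
 *	dma = page_pool_get_dma_addr(page) + rxq->rx_offset;
 *	desc->addr = cpu_to_le64(dma);
 *
 * The allocation side is not locked; the caller must guarantee a single
 * producer per pool (normally the NAPI instance owning the RX ring).
 */
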
/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))

static s32 page_pool_inflight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

	return inflight;
}

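/* Worked example of the serial-number arithmetic above (illustrative only):
 * if hold_cnt has wrapped around to 0x00000005 while release_cnt is still
 * at 0xfffffffe, then
 *
 *	_distance(0x00000005, 0xfffffffe) == (s32)0x00000007 == 7
 *
 * i.e. 7 pages are still in flight, even though hold_cnt < release_cnt
 * when compared as plain u32 values. The result is only meaningful while
 * the true difference stays below 2^31.
 */
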
/* Disconnects a page (from a page_pool). API users can have a need
 * to disconnect a page (from a page_pool), to allow it to be used as
 * a regular page (that will eventually be returned to the normal
 * page-allocator via put_page).
 */
void page_pool_release_page(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;
	int count;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		goto skip_dma_unmap;

	dma = page_pool_get_dma_addr(page);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
	page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
	page_pool_clear_pp_info(page);

	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);
}
EXPORT_SYMBOL(page_pool_release_page);

/* Return a page to the page allocator, cleaning up our state */
static void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	page_pool_release_page(pool, page);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
	int ret;
	/* BH protection not needed if current context is softirq */
	if (in_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	if (!ret) {
		recycle_stat_inc(pool, ring);
		return true;
	}

	return false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
		recycle_stat_inc(pool, cache_full);
		return false;
	}

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	recycle_stat_inc(pool, cached);
	return true;
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->p.max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
		     unsigned int dma_sync_size, bool allow_direct)
{
	/* This allocator is optimized for the XDP mode that uses
	 * one frame per page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns the page, and can recycle it.
	 *
	 * A page is NOT reusable if it was allocated while the system was
	 * under memory pressure (page_is_pfmemalloc()).
	 */
	if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_softirq() &&
		    page_pool_recycle_in_cache(page, pool))
			return NULL;

		/* Page found as candidate for recycling */
		return page;
	}
	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once. In this elevated refcnt case,
	 * the DMA mapping is unmapped/released here, as the driver is
	 * likely doing refcnt based recycle tricks, meaning another
	 * process will be invoking put_page().
	 */
	recycle_stat_inc(pool, released_refcnt);
	/* Do not replace this with page_pool_return_page() */
	page_pool_release_page(pool, page);
	put_page(page);

	return NULL;
}

void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
				  unsigned int dma_sync_size, bool allow_direct)
{
	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
	if (page && !page_pool_recycle_in_ring(pool, page)) {
		/* ptr_ring cache full, fall back to freeing the page */
		recycle_stat_inc(pool, ring_full);
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_put_defragged_page);

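/* Example: recycling from a driver RX/XDP path. A minimal sketch; "rxq"
 * and the verdict handling are hypothetical. page_pool_put_full_page()
 * is the header helper that ends up here with dma_sync_size == -1:
 *
 *	case XDP_DROP:
 *		// Safe to recycle directly into the alloc cache: we are
 *		// in the NAPI/softirq context that owns this pool.
 *		page_pool_put_full_page(rxq->page_pool, page, true);
 *		break;
 *
 * From a context that is not guaranteed to be the pool's NAPI context
 * (e.g. a TX-completion handler of another queue), pass allow_direct as
 * false so the page goes through the ptr_ring instead.
 */
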
/* Caller must not use the data area after this call, as this function
 * overwrites it.
 */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count)
{
	int i, bulk_len = 0;

	for (i = 0; i < count; i++) {
		struct page *page = virt_to_head_page(data[i]);

		/* It is not the last user for the page frag case */
		if (!page_pool_is_last_frag(pool, page))
			continue;

		page = __page_pool_put_page(pool, page, -1, false);
		/* Approved for bulk recycling in ptr_ring cache */
		if (page)
			data[bulk_len++] = page;
	}

	if (unlikely(!bulk_len))
		return;

	/* Bulk producer into ptr_ring page_pool cache */
	page_pool_ring_lock(pool);
	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, data[i])) {
			/* ring full */
			recycle_stat_inc(pool, ring_full);
			break;
		}
	}
	recycle_stat_add(pool, ring, i);
	page_pool_ring_unlock(pool);

	/* Hopefully all pages were returned into the ptr_ring */
	if (likely(i == bulk_len))
		return;

	/* ptr_ring cache is full, free the remaining pages outside the
	 * producer lock, since put_page() with refcnt == 1 can be an
	 * expensive operation.
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, data[i]);
}
EXPORT_SYMBOL(page_pool_put_page_bulk);

static struct page *page_pool_drain_frag(struct page_pool *pool,
					 struct page *page)
{
	long drain_count = BIAS_MAX - pool->frag_users;

	/* Some user is still using the page frag */
	if (likely(page_pool_defrag_page(page, drain_count)))
		return NULL;

	if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page, -1);

		return page;
	}

	page_pool_return_page(pool, page);
	return NULL;
}

static void page_pool_free_frag(struct page_pool *pool)
{
	long drain_count = BIAS_MAX - pool->frag_users;
	struct page *page = pool->frag_page;

	pool->frag_page = NULL;

	if (!page || page_pool_defrag_page(page, drain_count))
		return;

	page_pool_return_page(pool, page);
}

struct page *page_pool_alloc_frag(struct page_pool *pool,
				  unsigned int *offset,
				  unsigned int size, gfp_t gfp)
{
	unsigned int max_size = PAGE_SIZE << pool->p.order;
	struct page *page = pool->frag_page;

	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
		    size > max_size))
		return NULL;

	size = ALIGN(size, dma_get_cache_alignment());
	*offset = pool->frag_offset;

	if (page && *offset + size > max_size) {
		page = page_pool_drain_frag(pool, page);
		if (page) {
			alloc_stat_inc(pool, fast);
			goto frag_reset;
		}
	}

	if (!page) {
		page = page_pool_alloc_pages(pool, gfp);
		if (unlikely(!page)) {
			pool->frag_page = NULL;
			return NULL;
		}

		pool->frag_page = page;

frag_reset:
		pool->frag_users = 1;
		*offset = 0;
		pool->frag_offset = size;
		page_pool_fragment_page(page, BIAS_MAX);
		return page;
	}

	pool->frag_users++;
	pool->frag_offset = *offset + size;
	alloc_stat_inc(pool, fast);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_frag);

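/* Example: sub-page buffers via the frag API. A minimal sketch; the
 * buffer size and the "rxq" container are illustrative. Requires the
 * pool to have been created with PP_FLAG_PAGE_FRAG set:
 *
 *	unsigned int offset;
 *	struct page *page;
 *	void *va;
 *
 *	page = page_pool_alloc_frag(rxq->page_pool, &offset,
 *				    2048, GFP_ATOMIC | __GFP_NOWARN);
 *	if (unlikely(!page))
 *		return -ENOMEM;
 *
 *	va = page_address(page) + offset;
 *	// With PP_FLAG_DMA_MAP, the fragment's DMA address is
 *	// page_pool_get_dma_addr(page) + offset.
 *
 * Each fragment is released with the usual page_pool_put_page() /
 * page_pool_put_full_page() calls; the page itself is only recycled once
 * the last fragment user is done (see page_pool_defrag_page()).
 */
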
static void page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		page_pool_return_page(pool, page);
	}
}

static void page_pool_free(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
	free_percpu(pool->recycle_stats);
#endif
	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty the alloc cache. We assume the caller has made sure it is
	 * no longer in use, and that page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool);
	if (!inflight)
		page_pool_free(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning */
	if (time_after_eq(jiffies, pool->defer_warn)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
			__func__, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
			   struct xdp_mem_info *mem)
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
	pool->xdp_mem_id = mem->id;
}

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	page_pool_free_frag(pool);

	if (!page_pool_release(pool))
		return;

	pool->defer_start = jiffies;
	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);

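/* Example: tearing a pool down from a driver. A minimal sketch; "rxq"
 * and the xdp_rxq registration are assumptions about the driver, not
 * requirements of this function:
 *
 *	napi_disable(&rxq->napi);
 *	// Drops the XDP memory model's reference, if one was registered
 *	// with xdp_rxq_info_reg_mem_model(MEM_TYPE_PAGE_POOL, pool).
 *	xdp_rxq_info_unreg(&rxq->xdp_rxq);
 *	page_pool_destroy(rxq->page_pool);
 *
 * If packets built from this pool are still queued elsewhere in the
 * stack, page_pool_destroy() does not block; the pool lingers and the
 * delayed work above retries (and warns) until the last in-flight page
 * has been returned.
 */
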
/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	struct page *page;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);

bool page_pool_return_skb_page(struct page *page, bool napi_safe)
{
	struct napi_struct *napi;
	struct page_pool *pp;
	bool allow_direct;

	page = compound_head(page);

	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
	 * in order to preserve any existing bits, such as bit 0 for the
	 * head page of a compound page and bit 1 for a pfmemalloc page, so
	 * mask those bits for the freeing-side check here;
	 * page_is_pfmemalloc() is checked in __page_pool_put_page()
	 * to avoid recycling the pfmemalloc page.
	 */
	if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
		return false;

	pp = page->pp;

	/* Allow direct recycle if we have reasons to believe that we are
	 * in the same context as the consumer would run, so there's
	 * no possible race.
	 */
	napi = pp->p.napi;
	allow_direct = napi_safe && napi &&
		READ_ONCE(napi->list_owner) == smp_processor_id();

	/* Drivers set this to their memory recycling info. Reset it on recycle.
	 * This will *not* work for NICs using a split-page memory model.
	 * The page will be returned to the pool here regardless of the
	 * 'flipped' fragment being in use or not.
	 */
	page_pool_put_full_page(pp, page, allow_direct);

	return true;
}
EXPORT_SYMBOL(page_pool_return_skb_page);

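/* Example: opting an skb into the recycling path above. A minimal sketch
 * of a driver RX routine; sizes, offsets and "rxq" are illustrative:
 *
 *	skb = build_skb(page_address(page), PAGE_SIZE);
 *	if (unlikely(!skb)) {
 *		page_pool_put_full_page(rxq->page_pool, page, true);
 *		return NULL;
 *	}
 *	skb_reserve(skb, rxq->rx_offset);
 *	skb_put(skb, len);
 *	skb_mark_for_recycle(skb);
 *
 * skb_mark_for_recycle() sets skb->pp_recycle, so that when the skb is
 * freed its page_pool-backed pages are handed to
 * page_pool_return_skb_page() instead of going straight back to the
 * page allocator.
 */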