1 /* SPDX-License-Identifier: GPL-2.0
2 *
3 * page_pool.c
4 * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
5 * Copyright (C) 2016 Red Hat, Inc.
6 */
7
8 #include <linux/error-injection.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/slab.h>
12 #include <linux/device.h>
13
14 #include <net/netdev_lock.h>
15 #include <net/netdev_rx_queue.h>
16 #include <net/page_pool/helpers.h>
17 #include <net/page_pool/memory_provider.h>
18 #include <net/xdp.h>
19
20 #include <linux/dma-direction.h>
21 #include <linux/dma-mapping.h>
22 #include <linux/page-flags.h>
23 #include <linux/mm.h> /* for put_page() */
24 #include <linux/poison.h>
25 #include <linux/ethtool.h>
26 #include <linux/netdevice.h>
27
28 #include <trace/events/page_pool.h>
29
30 #include "dev.h"
31 #include "mp_dmabuf_devmem.h"
32 #include "netmem_priv.h"
33 #include "page_pool_priv.h"
34
35 DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
36
37 #define DEFER_TIME (msecs_to_jiffies(1000))
38 #define DEFER_WARN_INTERVAL (60 * HZ)
39
40 #define BIAS_MAX (LONG_MAX >> 1)
41
42 #ifdef CONFIG_PAGE_POOL_STATS
43 static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
44
45 /* alloc_stat_inc is intended to be used in softirq context */
46 #define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++)
47 /* recycle_stat_inc is safe to use when preemption is possible. */
48 #define recycle_stat_inc(pool, __stat) \
49 do { \
50 struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
51 this_cpu_inc(s->__stat); \
52 } while (0)
53
54 #define recycle_stat_add(pool, __stat, val) \
55 do { \
56 struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
57 this_cpu_add(s->__stat, val); \
58 } while (0)
59
60 static const char pp_stats[][ETH_GSTRING_LEN] = {
61 "rx_pp_alloc_fast",
62 "rx_pp_alloc_slow",
63 "rx_pp_alloc_slow_ho",
64 "rx_pp_alloc_empty",
65 "rx_pp_alloc_refill",
66 "rx_pp_alloc_waive",
67 "rx_pp_recycle_cached",
68 "rx_pp_recycle_cache_full",
69 "rx_pp_recycle_ring",
70 "rx_pp_recycle_ring_full",
71 "rx_pp_recycle_released_ref",
72 };
73
74 /**
75 * page_pool_get_stats() - fetch page pool stats
76 * @pool: pool from which page was allocated
77 * @stats: struct page_pool_stats to fill in
78 *
79 * Retrieve statistics about the page_pool. This API is only available
80 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
81  * The caller passes a pointer to a caller-allocated struct page_pool_stats
82  * structure, which this API fills in. The caller can then report
83 * those stats to the user (perhaps via ethtool, debugfs, etc.).
84 */
85 bool page_pool_get_stats(const struct page_pool *pool,
86 struct page_pool_stats *stats)
87 {
88 int cpu = 0;
89
90 if (!stats)
91 return false;
92
93 	/* The caller is responsible for initializing stats. */
94 stats->alloc_stats.fast += pool->alloc_stats.fast;
95 stats->alloc_stats.slow += pool->alloc_stats.slow;
96 stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
97 stats->alloc_stats.empty += pool->alloc_stats.empty;
98 stats->alloc_stats.refill += pool->alloc_stats.refill;
99 stats->alloc_stats.waive += pool->alloc_stats.waive;
100
101 for_each_possible_cpu(cpu) {
102 const struct page_pool_recycle_stats *pcpu =
103 per_cpu_ptr(pool->recycle_stats, cpu);
104
105 stats->recycle_stats.cached += pcpu->cached;
106 stats->recycle_stats.cache_full += pcpu->cache_full;
107 stats->recycle_stats.ring += pcpu->ring;
108 stats->recycle_stats.ring_full += pcpu->ring_full;
109 stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
110 }
111
112 return true;
113 }
114 EXPORT_SYMBOL(page_pool_get_stats);
115
116 u8 *page_pool_ethtool_stats_get_strings(u8 *data)
117 {
118 int i;
119
120 for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
121 memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
122 data += ETH_GSTRING_LEN;
123 }
124
125 return data;
126 }
127 EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
128
129 int page_pool_ethtool_stats_get_count(void)
130 {
131 return ARRAY_SIZE(pp_stats);
132 }
133 EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
134
135 u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
136 {
137 const struct page_pool_stats *pool_stats = stats;
138
139 *data++ = pool_stats->alloc_stats.fast;
140 *data++ = pool_stats->alloc_stats.slow;
141 *data++ = pool_stats->alloc_stats.slow_high_order;
142 *data++ = pool_stats->alloc_stats.empty;
143 *data++ = pool_stats->alloc_stats.refill;
144 *data++ = pool_stats->alloc_stats.waive;
145 *data++ = pool_stats->recycle_stats.cached;
146 *data++ = pool_stats->recycle_stats.cache_full;
147 *data++ = pool_stats->recycle_stats.ring;
148 *data++ = pool_stats->recycle_stats.ring_full;
149 *data++ = pool_stats->recycle_stats.released_refcnt;
150
151 return data;
152 }
153 EXPORT_SYMBOL(page_pool_ethtool_stats_get);
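/* Usage sketch (not part of this file): a driver built with
 * CONFIG_PAGE_POOL_STATS=y could wire the helpers above into its ethtool
 * callbacks roughly as below. "my_netdev_priv"/"my_pool" are placeholder
 * names, not real kernel symbols.
 *
 *	static void my_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static int my_get_sset_count(struct net_device *dev, int sset)
 *	{
 *		return sset == ETH_SS_STATS ?
 *		       page_pool_ethtool_stats_get_count() : -EOPNOTSUPP;
 *	}
 *
 *	static void my_get_ethtool_stats(struct net_device *dev,
 *					 struct ethtool_stats *st, u64 *data)
 *	{
 *		struct my_netdev_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats stats = { };
 *
 *		if (page_pool_get_stats(priv->my_pool, &stats))
 *			data = page_pool_ethtool_stats_get(data, &stats);
 *	}
 */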
154
155 #else
156 #define alloc_stat_inc(pool, __stat)
157 #define recycle_stat_inc(pool, __stat)
158 #define recycle_stat_add(pool, __stat, val)
159 #endif
160
161 static bool page_pool_producer_lock(struct page_pool *pool)
162 __acquires(&pool->ring.producer_lock)
163 {
164 bool in_softirq = in_softirq();
165
166 if (in_softirq)
167 spin_lock(&pool->ring.producer_lock);
168 else
169 spin_lock_bh(&pool->ring.producer_lock);
170
171 return in_softirq;
172 }
173
174 static void page_pool_producer_unlock(struct page_pool *pool,
175 bool in_softirq)
176 __releases(&pool->ring.producer_lock)
177 {
178 if (in_softirq)
179 spin_unlock(&pool->ring.producer_lock);
180 else
181 spin_unlock_bh(&pool->ring.producer_lock);
182 }
183
184 static void page_pool_struct_check(void)
185 {
186 CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
187 CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
188 CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
189 CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
190 PAGE_POOL_FRAG_GROUP_ALIGN);
191 }
192
193 static int page_pool_init(struct page_pool *pool,
194 const struct page_pool_params *params,
195 int cpuid)
196 {
197 unsigned int ring_qsize = 1024; /* Default */
198 struct netdev_rx_queue *rxq;
199 int err;
200
201 page_pool_struct_check();
202
203 	memcpy(&pool->p, &params->fast, sizeof(pool->p));
204 	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
205
206 pool->cpuid = cpuid;
207 pool->dma_sync_for_cpu = true;
208
209 /* Validate only known flags were used */
210 if (pool->slow.flags & ~PP_FLAG_ALL)
211 return -EINVAL;
212
213 if (pool->p.pool_size)
214 ring_qsize = pool->p.pool_size;
215
216 /* Sanity limit mem that can be pinned down */
217 if (ring_qsize > 32768)
218 return -E2BIG;
219
220 /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
221 	 * DMA_BIDIRECTIONAL additionally allows the page to be used for DMA
222 	 * transmit, which is the XDP_TX use-case.
223 */
224 if (pool->slow.flags & PP_FLAG_DMA_MAP) {
225 if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
226 (pool->p.dma_dir != DMA_BIDIRECTIONAL))
227 return -EINVAL;
228
229 pool->dma_map = true;
230 }
231
232 if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
233 /* In order to request DMA-sync-for-device the page
234 * needs to be mapped
235 */
236 if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
237 return -EINVAL;
238
239 if (!pool->p.max_len)
240 return -EINVAL;
241
242 pool->dma_sync = true;
243
244 /* pool->p.offset has to be set according to the address
245 * offset used by the DMA engine to start copying rx data
246 */
247 }
248
249 pool->has_init_callback = !!pool->slow.init_callback;
250
251 #ifdef CONFIG_PAGE_POOL_STATS
252 if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
253 pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
254 if (!pool->recycle_stats)
255 return -ENOMEM;
256 } else {
257 /* For system page pool instance we use a singular stats object
258 * instead of allocating a separate percpu variable for each
259 * (also percpu) page pool instance.
260 */
261 pool->recycle_stats = &pp_system_recycle_stats;
262 pool->system = true;
263 }
264 #endif
265
266 if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
267 #ifdef CONFIG_PAGE_POOL_STATS
268 if (!pool->system)
269 free_percpu(pool->recycle_stats);
270 #endif
271 return -ENOMEM;
272 }
273
274 atomic_set(&pool->pages_state_release_cnt, 0);
275
276 	/* A driver calling page_pool_create() must also call page_pool_destroy() */
277 refcount_set(&pool->user_cnt, 1);
278
279 if (pool->dma_map)
280 get_device(pool->p.dev);
281
282 if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
283 netdev_assert_locked(pool->slow.netdev);
284 rxq = __netif_get_rx_queue(pool->slow.netdev,
285 pool->slow.queue_idx);
286 pool->mp_priv = rxq->mp_params.mp_priv;
287 pool->mp_ops = rxq->mp_params.mp_ops;
288 }
289
290 if (pool->mp_ops) {
291 if (!pool->dma_map || !pool->dma_sync)
292 return -EOPNOTSUPP;
293
294 if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) {
295 err = -EFAULT;
296 goto free_ptr_ring;
297 }
298
299 err = pool->mp_ops->init(pool);
300 if (err) {
301 pr_warn("%s() mem-provider init failed %d\n", __func__,
302 err);
303 goto free_ptr_ring;
304 }
305
306 static_branch_inc(&page_pool_mem_providers);
307 }
308
309 return 0;
310
311 free_ptr_ring:
312 ptr_ring_cleanup(&pool->ring, NULL);
313 #ifdef CONFIG_PAGE_POOL_STATS
314 if (!pool->system)
315 free_percpu(pool->recycle_stats);
316 #endif
317 return err;
318 }
319
320 static void page_pool_uninit(struct page_pool *pool)
321 {
322 ptr_ring_cleanup(&pool->ring, NULL);
323
324 if (pool->dma_map)
325 put_device(pool->p.dev);
326
327 #ifdef CONFIG_PAGE_POOL_STATS
328 if (!pool->system)
329 free_percpu(pool->recycle_stats);
330 #endif
331 }
332
333 /**
334 * page_pool_create_percpu() - create a page pool for a given cpu.
335 * @params: parameters, see struct page_pool_params
336 * @cpuid: cpu identifier
337 */
338 struct page_pool *
339 page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
340 {
341 struct page_pool *pool;
342 int err;
343
344 pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
345 if (!pool)
346 return ERR_PTR(-ENOMEM);
347
348 err = page_pool_init(pool, params, cpuid);
349 if (err < 0)
350 goto err_free;
351
352 err = page_pool_list(pool);
353 if (err)
354 goto err_uninit;
355
356 return pool;
357
358 err_uninit:
359 page_pool_uninit(pool);
360 err_free:
361 pr_warn("%s() gave up with errno %d\n", __func__, err);
362 kfree(pool);
363 return ERR_PTR(err);
364 }
365 EXPORT_SYMBOL(page_pool_create_percpu);
366
367 /**
368 * page_pool_create() - create a page pool
369 * @params: parameters, see struct page_pool_params
370 */
371 struct page_pool *page_pool_create(const struct page_pool_params *params)
372 {
373 return page_pool_create_percpu(params, -1);
374 }
375 EXPORT_SYMBOL(page_pool_create);
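/* Usage sketch (illustrative only, not a definitive driver recipe): a typical
 * per-RX-ring pool that lets page_pool handle DMA mapping and sync could be
 * created like this. "my_dev", "my_rxq" and "ring_size" are hypothetical
 * driver fields.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= ring_size,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= my_dev,
 *		.napi		= &my_rxq->napi,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.max_len	= PAGE_SIZE,
 *		.offset		= 0,
 *	};
 *	struct page_pool *pool = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 */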
376
377 static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
378
379 static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
380 {
381 struct ptr_ring *r = &pool->ring;
382 netmem_ref netmem;
383 int pref_nid; /* preferred NUMA node */
384
385 /* Quicker fallback, avoid locks when ring is empty */
386 if (__ptr_ring_empty(r)) {
387 alloc_stat_inc(pool, empty);
388 return 0;
389 }
390
391 	/* Softirq guarantees the CPU, and thus the NUMA node, is stable. This
392 	 * assumes the CPU refilling the driver RX-ring also runs the RX-NAPI.
393 */
394 #ifdef CONFIG_NUMA
395 pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
396 #else
397 /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
398 pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
399 #endif
400
401 /* Refill alloc array, but only if NUMA match */
402 do {
403 netmem = (__force netmem_ref)__ptr_ring_consume(r);
404 if (unlikely(!netmem))
405 break;
406
407 if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
408 pool->alloc.cache[pool->alloc.count++] = netmem;
409 } else {
410 			/* NUMA mismatch:
411 			 * (1) release 1 page to the page allocator and
412 			 * (2) break out to fall through to alloc_pages_node.
413 			 * This limits stress on the page buddy allocator.
414 */
415 page_pool_return_page(pool, netmem);
416 alloc_stat_inc(pool, waive);
417 netmem = 0;
418 break;
419 }
420 } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
421
422 /* Return last page */
423 if (likely(pool->alloc.count > 0)) {
424 netmem = pool->alloc.cache[--pool->alloc.count];
425 alloc_stat_inc(pool, refill);
426 }
427
428 return netmem;
429 }
430
431 /* fast path */
432 static netmem_ref __page_pool_get_cached(struct page_pool *pool)
433 {
434 netmem_ref netmem;
435
436 /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
437 if (likely(pool->alloc.count)) {
438 /* Fast-path */
439 netmem = pool->alloc.cache[--pool->alloc.count];
440 alloc_stat_inc(pool, fast);
441 } else {
442 netmem = page_pool_refill_alloc_cache(pool);
443 }
444
445 return netmem;
446 }
447
448 static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
449 netmem_ref netmem,
450 u32 dma_sync_size)
451 {
452 #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
453 dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
454
455 dma_sync_size = min(dma_sync_size, pool->p.max_len);
456 __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
457 dma_sync_size, pool->p.dma_dir);
458 #endif
459 }
460
461 static __always_inline void
462 page_pool_dma_sync_for_device(const struct page_pool *pool,
463 netmem_ref netmem,
464 u32 dma_sync_size)
465 {
466 if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
467 __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
468 }
469
470 static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
471 {
472 dma_addr_t dma;
473
474 /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
475 * since dma_addr_t can be either 32 or 64 bits and does not always fit
476 	 * into page private data (e.g. a 32-bit CPU with 64-bit DMA capabilities).
477 	 * This mapping is kept for the lifetime of the page, until it leaves the pool.
478 */
479 dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
480 (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
481 DMA_ATTR_SKIP_CPU_SYNC |
482 DMA_ATTR_WEAK_ORDERING);
483 if (dma_mapping_error(pool->p.dev, dma))
484 return false;
485
486 if (page_pool_set_dma_addr_netmem(netmem, dma))
487 goto unmap_failed;
488
489 page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
490
491 return true;
492
493 unmap_failed:
494 WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
495 dma_unmap_page_attrs(pool->p.dev, dma,
496 PAGE_SIZE << pool->p.order, pool->p.dma_dir,
497 DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
498 return false;
499 }
500
501 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
502 gfp_t gfp)
503 {
504 struct page *page;
505
506 gfp |= __GFP_COMP;
507 page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
508 if (unlikely(!page))
509 return NULL;
510
511 if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
512 put_page(page);
513 return NULL;
514 }
515
516 alloc_stat_inc(pool, slow_high_order);
517 page_pool_set_pp_info(pool, page_to_netmem(page));
518
519 /* Track how many pages are held 'in-flight' */
520 pool->pages_state_hold_cnt++;
521 trace_page_pool_state_hold(pool, page_to_netmem(page),
522 pool->pages_state_hold_cnt);
523 return page;
524 }
525
526 /* slow path */
527 static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
528 gfp_t gfp)
529 {
530 const int bulk = PP_ALLOC_CACHE_REFILL;
531 unsigned int pp_order = pool->p.order;
532 bool dma_map = pool->dma_map;
533 netmem_ref netmem;
534 int i, nr_pages;
535
536 /* Don't support bulk alloc for high-order pages */
537 if (unlikely(pp_order))
538 return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
539
540 /* Unnecessary as alloc cache is empty, but guarantees zero count */
541 if (unlikely(pool->alloc.count > 0))
542 return pool->alloc.cache[--pool->alloc.count];
543
544 /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */
545 memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
546
547 nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk,
548 (struct page **)pool->alloc.cache);
549 if (unlikely(!nr_pages))
550 return 0;
551
552 /* Pages have been filled into alloc.cache array, but count is zero and
553 	 * the page elements have not yet been DMA mapped (when required).
554 */
555 for (i = 0; i < nr_pages; i++) {
556 netmem = pool->alloc.cache[i];
557 if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
558 put_page(netmem_to_page(netmem));
559 continue;
560 }
561
562 page_pool_set_pp_info(pool, netmem);
563 pool->alloc.cache[pool->alloc.count++] = netmem;
564 /* Track how many pages are held 'in-flight' */
565 pool->pages_state_hold_cnt++;
566 trace_page_pool_state_hold(pool, netmem,
567 pool->pages_state_hold_cnt);
568 }
569
570 /* Return last page */
571 if (likely(pool->alloc.count > 0)) {
572 netmem = pool->alloc.cache[--pool->alloc.count];
573 alloc_stat_inc(pool, slow);
574 } else {
575 netmem = 0;
576 }
577
578 	/* A page just alloc'ed should/must have refcnt 1. */
579 return netmem;
580 }
581
582 /* page_pool is used to replace alloc_pages() API calls, but provides a
583  * synchronization guarantee for the allocation side.
584 */
585 netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp)
586 {
587 netmem_ref netmem;
588
589 /* Fast-path: Get a page from cache */
590 netmem = __page_pool_get_cached(pool);
591 if (netmem)
592 return netmem;
593
594 /* Slow-path: cache empty, do real allocation */
595 if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
596 netmem = pool->mp_ops->alloc_netmems(pool, gfp);
597 else
598 netmem = __page_pool_alloc_pages_slow(pool, gfp);
599 return netmem;
600 }
601 EXPORT_SYMBOL(page_pool_alloc_netmems);
602 ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL);
603
604 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
605 {
606 return netmem_to_page(page_pool_alloc_netmems(pool, gfp));
607 }
608 EXPORT_SYMBOL(page_pool_alloc_pages);
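/* Usage sketch (assumptions marked): during RX refill a driver typically uses
 * the page_pool_dev_alloc_pages()/page_pool_get_dma_addr() helpers from
 * <net/page_pool/helpers.h> rather than the netmem API directly.
 * "my_post_rx_buffer" is a placeholder for driver-specific descriptor setup.
 *
 *	struct page *page = page_pool_dev_alloc_pages(pool);
 *
 *	if (!page)
 *		return -ENOMEM;
 *	// DMA address is valid only if the pool was created with PP_FLAG_DMA_MAP
 *	my_post_rx_buffer(rxq, page_pool_get_dma_addr(page));
 */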
609
610 /* Calculate distance between two u32 values, valid if distance is below 2^(31)
611 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
612 */
613 #define _distance(a, b) (s32)((a) - (b))
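/* Worked example of the wrap-around safe distance: if hold_cnt has wrapped to
 * 0x00000003 while release_cnt is still 0xffffffff, then
 * (s32)(0x00000003 - 0xffffffff) == 4, i.e. four pages are still inflight.
 */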
614
615 s32 page_pool_inflight(const struct page_pool *pool, bool strict)
616 {
617 u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
618 u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
619 s32 inflight;
620
621 inflight = _distance(hold_cnt, release_cnt);
622
623 if (strict) {
624 trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
625 WARN(inflight < 0, "Negative(%d) inflight packet-pages",
626 inflight);
627 } else {
628 inflight = max(0, inflight);
629 }
630
631 return inflight;
632 }
633
634 void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
635 {
636 netmem_set_pp(netmem, pool);
637 netmem_or_pp_magic(netmem, PP_SIGNATURE);
638
639 /* Ensuring all pages have been split into one fragment initially:
640 * page_pool_set_pp_info() is only called once for every page when it
641 * is allocated from the page allocator and page_pool_fragment_page()
642 * is dirtying the same cache line as the page->pp_magic above, so
643 * the overhead is negligible.
644 */
645 page_pool_fragment_netmem(netmem, 1);
646 if (pool->has_init_callback)
647 pool->slow.init_callback(netmem, pool->slow.init_arg);
648 }
649
650 void page_pool_clear_pp_info(netmem_ref netmem)
651 {
652 netmem_clear_pp_magic(netmem);
653 netmem_set_pp(netmem, NULL);
654 }
655
656 static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
657 netmem_ref netmem)
658 {
659 dma_addr_t dma;
660
661 if (!pool->dma_map)
662 /* Always account for inflight pages, even if we didn't
663 * map them
664 */
665 return;
666
667 dma = page_pool_get_dma_addr_netmem(netmem);
668
669 /* When page is unmapped, it cannot be returned to our pool */
670 dma_unmap_page_attrs(pool->p.dev, dma,
671 PAGE_SIZE << pool->p.order, pool->p.dma_dir,
672 DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
673 page_pool_set_dma_addr_netmem(netmem, 0);
674 }
675
676 /* Disconnects a page (from a page_pool). API users can have a need
677 * to disconnect a page (from a page_pool), to allow it to be used as
678 * a regular page (that will eventually be returned to the normal
679 * page-allocator via put_page).
680 */
681 void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
682 {
683 int count;
684 bool put;
685
686 put = true;
687 if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
688 put = pool->mp_ops->release_netmem(pool, netmem);
689 else
690 __page_pool_release_page_dma(pool, netmem);
691
692 /* This may be the last page returned, releasing the pool, so
693 * it is not safe to reference pool afterwards.
694 */
695 count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
696 trace_page_pool_state_release(pool, netmem, count);
697
698 if (put) {
699 page_pool_clear_pp_info(netmem);
700 put_page(netmem_to_page(netmem));
701 }
702 /* An optimization would be to call __free_pages(page, pool->p.order)
703 * knowing page is not part of page-cache (thus avoiding a
704 * __page_cache_release() call).
705 */
706 }
707
708 static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
709 {
710 int ret;
711 /* BH protection not needed if current is softirq */
712 if (in_softirq())
713 ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
714 else
715 ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);
716
717 if (!ret) {
718 recycle_stat_inc(pool, ring);
719 return true;
720 }
721
722 return false;
723 }
724
725 /* Only allow direct recycling in special circumstances, into the
726 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
727 *
728 * Caller must provide appropriate safe context.
729 */
730 static bool page_pool_recycle_in_cache(netmem_ref netmem,
731 struct page_pool *pool)
732 {
733 if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
734 recycle_stat_inc(pool, cache_full);
735 return false;
736 }
737
738 /* Caller MUST have verified/know (page_ref_count(page) == 1) */
739 pool->alloc.cache[pool->alloc.count++] = netmem;
740 recycle_stat_inc(pool, cached);
741 return true;
742 }
743
744 static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
745 {
746 return netmem_is_net_iov(netmem) ||
747 (page_ref_count(netmem_to_page(netmem)) == 1 &&
748 !page_is_pfmemalloc(netmem_to_page(netmem)));
749 }
750
751 /* If the page refcnt == 1, this will try to recycle the page.
752 * If pool->dma_sync is set, we'll try to sync the DMA area for
753 * the configured size min(dma_sync_size, pool->max_len).
754 * If the page refcnt != 1, then the page will be returned to memory
755 * subsystem.
756 */
757 static __always_inline netmem_ref
758 __page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
759 unsigned int dma_sync_size, bool allow_direct)
760 {
761 lockdep_assert_no_hardirq();
762
763 /* This allocator is optimized for the XDP mode that uses
764 	 * one-frame-per-page, but has fallbacks that act like the
765 * regular page allocator APIs.
766 *
767 * refcnt == 1 means page_pool owns page, and can recycle it.
768 *
769 	 * A page is NOT reusable when it was allocated while the system was
770 	 * under memory pressure. (page_is_pfmemalloc)
771 */
772 if (likely(__page_pool_page_can_be_recycled(netmem))) {
773 /* Read barrier done in page_ref_count / READ_ONCE */
774
775 page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
776
777 if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
778 return 0;
779
780 /* Page found as candidate for recycling */
781 return netmem;
782 }
783
784 	/* Fallback/non-XDP mode: API user has an elevated refcnt.
785 *
786 * Many drivers split up the page into fragments, and some
787 * want to keep doing this to save memory and do refcnt based
788 * recycling. Support this use case too, to ease drivers
789 * switching between XDP/non-XDP.
790 *
791 	 * In case page_pool maintains the DMA mapping, the API user must
792 	 * call page_pool_put_page() once. In this elevated refcnt
793 * case, the DMA is unmapped/released, as driver is likely
794 * doing refcnt based recycle tricks, meaning another process
795 * will be invoking put_page.
796 */
797 recycle_stat_inc(pool, released_refcnt);
798 page_pool_return_page(pool, netmem);
799
800 return 0;
801 }
802
803 static bool page_pool_napi_local(const struct page_pool *pool)
804 {
805 const struct napi_struct *napi;
806 u32 cpuid;
807
808 if (unlikely(!in_softirq()))
809 return false;
810
811 /* Allow direct recycle if we have reasons to believe that we are
812 * in the same context as the consumer would run, so there's
813 * no possible race.
814 * __page_pool_put_page() makes sure we're not in hardirq context
815 * and interrupts are enabled prior to accessing the cache.
816 */
817 cpuid = smp_processor_id();
818 if (READ_ONCE(pool->cpuid) == cpuid)
819 return true;
820
821 napi = READ_ONCE(pool->p.napi);
822
823 return napi && READ_ONCE(napi->list_owner) == cpuid;
824 }
825
826 void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
827 unsigned int dma_sync_size, bool allow_direct)
828 {
829 if (!allow_direct)
830 allow_direct = page_pool_napi_local(pool);
831
832 netmem =
833 __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
834 if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
835 /* Cache full, fallback to free pages */
836 recycle_stat_inc(pool, ring_full);
837 page_pool_return_page(pool, netmem);
838 }
839 }
840 EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
841
842 void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
843 unsigned int dma_sync_size, bool allow_direct)
844 {
845 page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
846 allow_direct);
847 }
848 EXPORT_SYMBOL(page_pool_put_unrefed_page);
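/* Usage sketch (a minimal example, not taken from a specific driver): a page
 * dropped in the RX path can be handed straight back to the pool via the
 * helpers built on top of the functions above, e.g. from NAPI context:
 *
 *	// in the driver's napi poll, for a frame that is not passed up the stack
 *	page_pool_put_full_page(pool, page, true);
 *
 * Passing "true" permits direct recycling into pool->alloc.cache, because
 * NAPI poll provides the required non-concurrent (softirq) context.
 */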
849
850 static void page_pool_recycle_ring_bulk(struct page_pool *pool,
851 netmem_ref *bulk,
852 u32 bulk_len)
853 {
854 bool in_softirq;
855 u32 i;
856
857 /* Bulk produce into ptr_ring page_pool cache */
858 in_softirq = page_pool_producer_lock(pool);
859
860 for (i = 0; i < bulk_len; i++) {
861 if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) {
862 /* ring full */
863 recycle_stat_inc(pool, ring_full);
864 break;
865 }
866 }
867
868 page_pool_producer_unlock(pool, in_softirq);
869 recycle_stat_add(pool, ring, i);
870
871 /* Hopefully all pages were returned into ptr_ring */
872 if (likely(i == bulk_len))
873 return;
874
875 /*
876 * ptr_ring cache is full, free remaining pages outside producer lock
877 * since put_page() with refcnt == 1 can be an expensive operation.
878 */
879 for (; i < bulk_len; i++)
880 page_pool_return_page(pool, bulk[i]);
881 }
882
883 /**
884 * page_pool_put_netmem_bulk() - release references on multiple netmems
885 * @data: array holding netmem references
886 * @count: number of entries in @data
887 *
888 * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring
889 * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk()
890 * will release leftover netmems to the memory provider.
891 * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx
892 * completion loop for the XDP_REDIRECT use case.
893 *
894  * Please note the caller must not use the data array after running
895 * page_pool_put_netmem_bulk(), as this function overwrites it.
896 */
897 void page_pool_put_netmem_bulk(netmem_ref *data, u32 count)
898 {
899 u32 bulk_len = 0;
900
901 for (u32 i = 0; i < count; i++) {
902 netmem_ref netmem = netmem_compound_head(data[i]);
903
904 if (page_pool_unref_and_test(netmem))
905 data[bulk_len++] = netmem;
906 }
907
908 count = bulk_len;
909 while (count) {
910 netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
911 struct page_pool *pool = NULL;
912 bool allow_direct;
913 u32 foreign = 0;
914
915 bulk_len = 0;
916
917 for (u32 i = 0; i < count; i++) {
918 struct page_pool *netmem_pp;
919 netmem_ref netmem = data[i];
920
921 netmem_pp = netmem_get_pp(netmem);
922 if (unlikely(!pool)) {
923 pool = netmem_pp;
924 allow_direct = page_pool_napi_local(pool);
925 } else if (netmem_pp != pool) {
926 /*
927 * If the netmem belongs to a different
928 * page_pool, save it for another round.
929 */
930 data[foreign++] = netmem;
931 continue;
932 }
933
934 netmem = __page_pool_put_page(pool, netmem, -1,
935 allow_direct);
936 /* Approved for bulk recycling in ptr_ring cache */
937 if (netmem)
938 bulk[bulk_len++] = netmem;
939 }
940
941 if (bulk_len)
942 page_pool_recycle_ring_bulk(pool, bulk, bulk_len);
943
944 count = foreign;
945 }
946 }
947 EXPORT_SYMBOL(page_pool_put_netmem_bulk);
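/* Usage sketch (hypothetical helper name "my_reap_completed_netmem"): an
 * XDP_REDIRECT TX-completion loop can batch returns instead of freeing
 * frames one by one:
 *
 *	netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
 *	u32 n = 0;
 *
 *	while ((netmem = my_reap_completed_netmem(txq))) {
 *		bulk[n++] = netmem;
 *		if (n == XDP_BULK_QUEUE_SIZE) {
 *			page_pool_put_netmem_bulk(bulk, n);
 *			n = 0;
 *		}
 *	}
 *	if (n)
 *		page_pool_put_netmem_bulk(bulk, n);
 *
 * In-tree users normally go through xdp_return_frame_bulk() rather than
 * filling the array by hand.
 */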
948
949 static netmem_ref page_pool_drain_frag(struct page_pool *pool,
950 netmem_ref netmem)
951 {
952 long drain_count = BIAS_MAX - pool->frag_users;
953
954 /* Some user is still using the page frag */
955 if (likely(page_pool_unref_netmem(netmem, drain_count)))
956 return 0;
957
958 if (__page_pool_page_can_be_recycled(netmem)) {
959 page_pool_dma_sync_for_device(pool, netmem, -1);
960 return netmem;
961 }
962
963 page_pool_return_page(pool, netmem);
964 return 0;
965 }
966
967 static void page_pool_free_frag(struct page_pool *pool)
968 {
969 long drain_count = BIAS_MAX - pool->frag_users;
970 netmem_ref netmem = pool->frag_page;
971
972 pool->frag_page = 0;
973
974 if (!netmem || page_pool_unref_netmem(netmem, drain_count))
975 return;
976
977 page_pool_return_page(pool, netmem);
978 }
979
980 netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
981 unsigned int *offset, unsigned int size,
982 gfp_t gfp)
983 {
984 unsigned int max_size = PAGE_SIZE << pool->p.order;
985 netmem_ref netmem = pool->frag_page;
986
987 if (WARN_ON(size > max_size))
988 return 0;
989
990 size = ALIGN(size, dma_get_cache_alignment());
991 *offset = pool->frag_offset;
992
993 if (netmem && *offset + size > max_size) {
994 netmem = page_pool_drain_frag(pool, netmem);
995 if (netmem) {
996 recycle_stat_inc(pool, cached);
997 alloc_stat_inc(pool, fast);
998 goto frag_reset;
999 }
1000 }
1001
1002 if (!netmem) {
1003 netmem = page_pool_alloc_netmems(pool, gfp);
1004 if (unlikely(!netmem)) {
1005 pool->frag_page = 0;
1006 return 0;
1007 }
1008
1009 pool->frag_page = netmem;
1010
1011 frag_reset:
1012 pool->frag_users = 1;
1013 *offset = 0;
1014 pool->frag_offset = size;
1015 page_pool_fragment_netmem(netmem, BIAS_MAX);
1016 return netmem;
1017 }
1018
1019 pool->frag_users++;
1020 pool->frag_offset = *offset + size;
1021 return netmem;
1022 }
1023 EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
1024
1025 struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
1026 unsigned int size, gfp_t gfp)
1027 {
1028 return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
1029 gfp));
1030 }
1031 EXPORT_SYMBOL(page_pool_alloc_frag);
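/* Usage sketch (sizes are illustrative): drivers that pack several small RX
 * buffers into one page use the frag API, e.g. via the dev helper:
 *
 *	unsigned int offset;
 *	struct page *page;
 *
 *	page = page_pool_dev_alloc_frag(pool, &offset, 2048);
 *	if (!page)
 *		return -ENOMEM;
 *	// the buffer spans [offset, offset + 2048) of the (possibly shared) page
 *
 * As enforced above, the requested size must not exceed PAGE_SIZE << order
 * for the pool.
 */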
1032
1033 static void page_pool_empty_ring(struct page_pool *pool)
1034 {
1035 netmem_ref netmem;
1036
1037 /* Empty recycle ring */
1038 while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
1039 /* Verify the refcnt invariant of cached pages */
1040 if (!(netmem_ref_count(netmem) == 1))
1041 pr_crit("%s() page_pool refcnt %d violation\n",
1042 __func__, netmem_ref_count(netmem));
1043
1044 page_pool_return_page(pool, netmem);
1045 }
1046 }
1047
1048 static void __page_pool_destroy(struct page_pool *pool)
1049 {
1050 if (pool->disconnect)
1051 pool->disconnect(pool);
1052
1053 page_pool_unlist(pool);
1054 page_pool_uninit(pool);
1055
1056 if (pool->mp_ops) {
1057 pool->mp_ops->destroy(pool);
1058 static_branch_dec(&page_pool_mem_providers);
1059 }
1060
1061 kfree(pool);
1062 }
1063
1064 static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
1065 {
1066 netmem_ref netmem;
1067
1068 if (pool->destroy_cnt)
1069 return;
1070
1071 /* Empty alloc cache, assume caller made sure this is
1072 	 * no longer in use, and page_pool_alloc_pages() cannot be
1073 	 * called concurrently.
1074 */
1075 while (pool->alloc.count) {
1076 netmem = pool->alloc.cache[--pool->alloc.count];
1077 page_pool_return_page(pool, netmem);
1078 }
1079 }
1080
1081 static void page_pool_scrub(struct page_pool *pool)
1082 {
1083 page_pool_empty_alloc_cache_once(pool);
1084 pool->destroy_cnt++;
1085
1086 /* No more consumers should exist, but producers could still
1087 * be in-flight.
1088 */
1089 page_pool_empty_ring(pool);
1090 }
1091
1092 static int page_pool_release(struct page_pool *pool)
1093 {
1094 int inflight;
1095
1096 page_pool_scrub(pool);
1097 inflight = page_pool_inflight(pool, true);
1098 if (!inflight)
1099 __page_pool_destroy(pool);
1100
1101 return inflight;
1102 }
1103
1104 static void page_pool_release_retry(struct work_struct *wq)
1105 {
1106 struct delayed_work *dwq = to_delayed_work(wq);
1107 struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
1108 void *netdev;
1109 int inflight;
1110
1111 inflight = page_pool_release(pool);
1112 /* In rare cases, a driver bug may cause inflight to go negative.
1113 * Don't reschedule release if inflight is 0 or negative.
1114 	 * - If 0, the page_pool has been destroyed.
1115 	 * - If negative, we will never recover.
1116 	 * In both cases no reschedule is necessary.
1117 */
1118 if (inflight <= 0)
1119 return;
1120
1121 /* Periodic warning for page pools the user can't see */
1122 netdev = READ_ONCE(pool->slow.netdev);
1123 if (time_after_eq(jiffies, pool->defer_warn) &&
1124 (!netdev || netdev == NET_PTR_POISON)) {
1125 int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
1126
1127 pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
1128 __func__, pool->user.id, inflight, sec);
1129 pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
1130 }
1131
1132 /* Still not ready to be disconnected, retry later */
1133 schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1134 }
1135
1136 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
1137 const struct xdp_mem_info *mem)
1138 {
1139 refcount_inc(&pool->user_cnt);
1140 pool->disconnect = disconnect;
1141 pool->xdp_mem_id = mem->id;
1142 }
1143
1144 void page_pool_disable_direct_recycling(struct page_pool *pool)
1145 {
1146 /* Disable direct recycling based on pool->cpuid.
1147 * Paired with READ_ONCE() in page_pool_napi_local().
1148 */
1149 WRITE_ONCE(pool->cpuid, -1);
1150
1151 if (!pool->p.napi)
1152 return;
1153
1154 napi_assert_will_not_race(pool->p.napi);
1155
1156 mutex_lock(&page_pools_lock);
1157 WRITE_ONCE(pool->p.napi, NULL);
1158 mutex_unlock(&page_pools_lock);
1159 }
1160 EXPORT_SYMBOL(page_pool_disable_direct_recycling);
1161
1162 void page_pool_destroy(struct page_pool *pool)
1163 {
1164 if (!pool)
1165 return;
1166
1167 if (!page_pool_put(pool))
1168 return;
1169
1170 page_pool_disable_direct_recycling(pool);
1171 page_pool_free_frag(pool);
1172
1173 if (!page_pool_release(pool))
1174 return;
1175
1176 page_pool_detached(pool);
1177 pool->defer_start = jiffies;
1178 pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
1179
1180 INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
1181 schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1182 }
1183 EXPORT_SYMBOL(page_pool_destroy);
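/* Teardown sketch (ordering only, assuming a driver-private "rxq" struct):
 * the pool should be destroyed only once no more pages can be allocated from
 * it; pages still inflight are picked up by the deferred release work
 * scheduled above.
 *
 *	napi_disable(&rxq->napi);
 *	xdp_rxq_info_unreg(&rxq->xdp_rxq);	// drops the XDP mem model, if any
 *	page_pool_destroy(rxq->page_pool);
 *	rxq->page_pool = NULL;
 */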
1184
1185 /* Caller must provide appropriate safe context, e.g. NAPI. */
1186 void page_pool_update_nid(struct page_pool *pool, int new_nid)
1187 {
1188 netmem_ref netmem;
1189
1190 trace_page_pool_update_nid(pool, new_nid);
1191 pool->p.nid = new_nid;
1192
1193 /* Flush pool alloc cache, as refill will check NUMA node */
1194 while (pool->alloc.count) {
1195 netmem = pool->alloc.cache[--pool->alloc.count];
1196 page_pool_return_page(pool, netmem);
1197 }
1198 }
1199 EXPORT_SYMBOL(page_pool_update_nid);
1200
1201 bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr)
1202 {
1203 return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr);
1204 }
1205
1206 /* Associate a niov with a page pool. Should be followed by a matching
1207 * net_mp_niov_clear_page_pool()
1208 */
1209 void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov)
1210 {
1211 netmem_ref netmem = net_iov_to_netmem(niov);
1212
1213 page_pool_set_pp_info(pool, netmem);
1214
1215 pool->pages_state_hold_cnt++;
1216 trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt);
1217 }
1218
1219 /* Disassociate a niov from a page pool. Should only be used in the
1220 * ->release_netmem() path.
1221 */
1222 void net_mp_niov_clear_page_pool(struct net_iov *niov)
1223 {
1224 netmem_ref netmem = net_iov_to_netmem(niov);
1225
1226 page_pool_clear_pp_info(netmem);
1227 }
1228