xref: /linux/net/core/page_pool.c (revision 26db4dbb747813b5946aff31485873f071a10332)
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * page_pool.c
4  *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
5  *	Copyright (C) 2016 Red Hat, Inc.
6  */
7 
8 #include <linux/error-injection.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/slab.h>
12 #include <linux/device.h>
13 
14 #include <net/netdev_rx_queue.h>
15 #include <net/page_pool/helpers.h>
16 #include <net/page_pool/memory_provider.h>
17 #include <net/xdp.h>
18 
19 #include <linux/dma-direction.h>
20 #include <linux/dma-mapping.h>
21 #include <linux/page-flags.h>
22 #include <linux/mm.h> /* for put_page() */
23 #include <linux/poison.h>
24 #include <linux/ethtool.h>
25 #include <linux/netdevice.h>
26 
27 #include <trace/events/page_pool.h>
28 
29 #include "mp_dmabuf_devmem.h"
30 #include "netmem_priv.h"
31 #include "page_pool_priv.h"
32 
33 DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
34 
35 #define DEFER_TIME (msecs_to_jiffies(1000))
36 #define DEFER_WARN_INTERVAL (60 * HZ)
37 
38 #define BIAS_MAX	(LONG_MAX >> 1)
39 
40 #ifdef CONFIG_PAGE_POOL_STATS
41 static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
42 
43 /* alloc_stat_inc is intended to be used in softirq context */
44 #define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
45 /* recycle_stat_inc is safe to use when preemption is possible. */
46 #define recycle_stat_inc(pool, __stat)							\
47 	do {										\
48 		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
49 		this_cpu_inc(s->__stat);						\
50 	} while (0)
51 
52 #define recycle_stat_add(pool, __stat, val)						\
53 	do {										\
54 		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
55 		this_cpu_add(s->__stat, val);						\
56 	} while (0)
57 
58 static const char pp_stats[][ETH_GSTRING_LEN] = {
59 	"rx_pp_alloc_fast",
60 	"rx_pp_alloc_slow",
61 	"rx_pp_alloc_slow_ho",
62 	"rx_pp_alloc_empty",
63 	"rx_pp_alloc_refill",
64 	"rx_pp_alloc_waive",
65 	"rx_pp_recycle_cached",
66 	"rx_pp_recycle_cache_full",
67 	"rx_pp_recycle_ring",
68 	"rx_pp_recycle_ring_full",
69 	"rx_pp_recycle_released_ref",
70 };
71 
72 /**
73  * page_pool_get_stats() - fetch page pool stats
74  * @pool:	pool to read statistics from
75  * @stats:	struct page_pool_stats to fill in
76  *
77  * Retrieve statistics about the page_pool. This API is only available
78  * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
79  * The caller passes a pointer to a caller-allocated struct
80  * page_pool_stats, which this function fills in. The caller can then
81  * report those stats to the user (e.g. via ethtool or debugfs).
82  */
83 bool page_pool_get_stats(const struct page_pool *pool,
84 			 struct page_pool_stats *stats)
85 {
86 	int cpu = 0;
87 
88 	if (!stats)
89 		return false;
90 
91 	/* The caller is responsible for initializing stats. */
92 	stats->alloc_stats.fast += pool->alloc_stats.fast;
93 	stats->alloc_stats.slow += pool->alloc_stats.slow;
94 	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
95 	stats->alloc_stats.empty += pool->alloc_stats.empty;
96 	stats->alloc_stats.refill += pool->alloc_stats.refill;
97 	stats->alloc_stats.waive += pool->alloc_stats.waive;
98 
99 	for_each_possible_cpu(cpu) {
100 		const struct page_pool_recycle_stats *pcpu =
101 			per_cpu_ptr(pool->recycle_stats, cpu);
102 
103 		stats->recycle_stats.cached += pcpu->cached;
104 		stats->recycle_stats.cache_full += pcpu->cache_full;
105 		stats->recycle_stats.ring += pcpu->ring;
106 		stats->recycle_stats.ring_full += pcpu->ring_full;
107 		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
108 	}
109 
110 	return true;
111 }
112 EXPORT_SYMBOL(page_pool_get_stats);
113 
114 u8 *page_pool_ethtool_stats_get_strings(u8 *data)
115 {
116 	int i;
117 
118 	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
119 		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
120 		data += ETH_GSTRING_LEN;
121 	}
122 
123 	return data;
124 }
125 EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
126 
127 int page_pool_ethtool_stats_get_count(void)
128 {
129 	return ARRAY_SIZE(pp_stats);
130 }
131 EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
132 
133 u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
134 {
135 	const struct page_pool_stats *pool_stats = stats;
136 
137 	*data++ = pool_stats->alloc_stats.fast;
138 	*data++ = pool_stats->alloc_stats.slow;
139 	*data++ = pool_stats->alloc_stats.slow_high_order;
140 	*data++ = pool_stats->alloc_stats.empty;
141 	*data++ = pool_stats->alloc_stats.refill;
142 	*data++ = pool_stats->alloc_stats.waive;
143 	*data++ = pool_stats->recycle_stats.cached;
144 	*data++ = pool_stats->recycle_stats.cache_full;
145 	*data++ = pool_stats->recycle_stats.ring;
146 	*data++ = pool_stats->recycle_stats.ring_full;
147 	*data++ = pool_stats->recycle_stats.released_refcnt;
148 
149 	return data;
150 }
151 EXPORT_SYMBOL(page_pool_ethtool_stats_get);
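
/*
 * Usage sketch: how a driver might wire the helpers above into its
 * ethtool callbacks. This is illustrative only; "my_priv",
 * "priv->rx_ring[]" and "priv->num_rx_rings" are hypothetical
 * driver-side names, not part of the page_pool API.
 *
 *	static void my_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static int my_get_sset_count(struct net_device *dev, int sset)
 *	{
 *		return sset == ETH_SS_STATS ?
 *		       page_pool_ethtool_stats_get_count() : -EOPNOTSUPP;
 *	}
 *
 *	static void my_get_ethtool_stats(struct net_device *dev,
 *					 struct ethtool_stats *es, u64 *data)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats stats = {}; // caller must zero-init
 *		int i;
 *
 *		for (i = 0; i < priv->num_rx_rings; i++)
 *			page_pool_get_stats(priv->rx_ring[i].page_pool, &stats);
 *		data = page_pool_ethtool_stats_get(data, &stats);
 *	}
 */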
152 
153 #else
154 #define alloc_stat_inc(pool, __stat)
155 #define recycle_stat_inc(pool, __stat)
156 #define recycle_stat_add(pool, __stat, val)
157 #endif
158 
159 static bool page_pool_producer_lock(struct page_pool *pool)
160 	__acquires(&pool->ring.producer_lock)
161 {
162 	bool in_softirq = in_softirq();
163 
164 	if (in_softirq)
165 		spin_lock(&pool->ring.producer_lock);
166 	else
167 		spin_lock_bh(&pool->ring.producer_lock);
168 
169 	return in_softirq;
170 }
171 
172 static void page_pool_producer_unlock(struct page_pool *pool,
173 				      bool in_softirq)
174 	__releases(&pool->ring.producer_lock)
175 {
176 	if (in_softirq)
177 		spin_unlock(&pool->ring.producer_lock);
178 	else
179 		spin_unlock_bh(&pool->ring.producer_lock);
180 }
181 
182 static void page_pool_struct_check(void)
183 {
184 	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
185 	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
186 	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
187 	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
188 				    PAGE_POOL_FRAG_GROUP_ALIGN);
189 }
190 
191 static int page_pool_init(struct page_pool *pool,
192 			  const struct page_pool_params *params,
193 			  int cpuid)
194 {
195 	unsigned int ring_qsize = 1024; /* Default */
196 	struct netdev_rx_queue *rxq;
197 	int err;
198 
199 	page_pool_struct_check();
200 
201 	memcpy(&pool->p, &params->fast, sizeof(pool->p));
202 	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
203 
204 	pool->cpuid = cpuid;
205 	pool->dma_sync_for_cpu = true;
206 
207 	/* Validate only known flags were used */
208 	if (pool->slow.flags & ~PP_FLAG_ALL)
209 		return -EINVAL;
210 
211 	if (pool->p.pool_size)
212 		ring_qsize = pool->p.pool_size;
213 
214 	/* Sanity limit mem that can be pinned down */
215 	if (ring_qsize > 32768)
216 		return -E2BIG;
217 
218 	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
219 	 * DMA_BIDIRECTIONAL also allows the page to be used for DMA transmit,
220 	 * which is the XDP_TX use-case.
221 	 */
222 	if (pool->slow.flags & PP_FLAG_DMA_MAP) {
223 		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
224 		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
225 			return -EINVAL;
226 
227 		pool->dma_map = true;
228 	}
229 
230 	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
231 		/* In order to request DMA-sync-for-device the page
232 		 * needs to be mapped
233 		 */
234 		if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
235 			return -EINVAL;
236 
237 		if (!pool->p.max_len)
238 			return -EINVAL;
239 
240 		pool->dma_sync = true;
241 
242 		/* pool->p.offset has to be set according to the address
243 		 * offset used by the DMA engine to start copying rx data
244 		 */
245 	}
246 
247 	pool->has_init_callback = !!pool->slow.init_callback;
248 
249 #ifdef CONFIG_PAGE_POOL_STATS
250 	if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
251 		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
252 		if (!pool->recycle_stats)
253 			return -ENOMEM;
254 	} else {
255 		/* For system page pool instances we share a single percpu stats
256 		 * object instead of allocating a separate percpu variable for
257 		 * each (also percpu) page pool instance.
258 		 */
259 		pool->recycle_stats = &pp_system_recycle_stats;
260 		pool->system = true;
261 	}
262 #endif
263 
264 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
265 #ifdef CONFIG_PAGE_POOL_STATS
266 		if (!pool->system)
267 			free_percpu(pool->recycle_stats);
268 #endif
269 		return -ENOMEM;
270 	}
271 
272 	atomic_set(&pool->pages_state_release_cnt, 0);
273 
274 	/* A driver calling page_pool_create() must also call page_pool_destroy() */
275 	refcount_set(&pool->user_cnt, 1);
276 
277 	if (pool->dma_map)
278 		get_device(pool->p.dev);
279 
280 	if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
281 		/* We rely on rtnl_lock()ing to make sure netdev_rx_queue
282 		 * configuration doesn't change while we're initializing
283 		 * the page_pool.
284 		 */
285 		ASSERT_RTNL();
286 		rxq = __netif_get_rx_queue(pool->slow.netdev,
287 					   pool->slow.queue_idx);
288 		pool->mp_priv = rxq->mp_params.mp_priv;
289 		pool->mp_ops = rxq->mp_params.mp_ops;
290 	}
291 
292 	if (pool->mp_ops) {
293 		if (!pool->dma_map || !pool->dma_sync)
294 			return -EOPNOTSUPP;
295 
296 		if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) {
297 			err = -EFAULT;
298 			goto free_ptr_ring;
299 		}
300 
301 		err = pool->mp_ops->init(pool);
302 		if (err) {
303 			pr_warn("%s() mem-provider init failed %d\n", __func__,
304 				err);
305 			goto free_ptr_ring;
306 		}
307 
308 		static_branch_inc(&page_pool_mem_providers);
309 	}
310 
311 	return 0;
312 
313 free_ptr_ring:
314 	ptr_ring_cleanup(&pool->ring, NULL);
315 #ifdef CONFIG_PAGE_POOL_STATS
316 	if (!pool->system)
317 		free_percpu(pool->recycle_stats);
318 #endif
319 	return err;
320 }
321 
322 static void page_pool_uninit(struct page_pool *pool)
323 {
324 	ptr_ring_cleanup(&pool->ring, NULL);
325 
326 	if (pool->dma_map)
327 		put_device(pool->p.dev);
328 
329 #ifdef CONFIG_PAGE_POOL_STATS
330 	if (!pool->system)
331 		free_percpu(pool->recycle_stats);
332 #endif
333 }
334 
335 /**
336  * page_pool_create_percpu() - create a page pool for a given cpu.
337  * @params: parameters, see struct page_pool_params
338  * @cpuid: cpu identifier
339  */
340 struct page_pool *
341 page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
342 {
343 	struct page_pool *pool;
344 	int err;
345 
346 	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
347 	if (!pool)
348 		return ERR_PTR(-ENOMEM);
349 
350 	err = page_pool_init(pool, params, cpuid);
351 	if (err < 0)
352 		goto err_free;
353 
354 	err = page_pool_list(pool);
355 	if (err)
356 		goto err_uninit;
357 
358 	return pool;
359 
360 err_uninit:
361 	page_pool_uninit(pool);
362 err_free:
363 	pr_warn("%s() gave up with errno %d\n", __func__, err);
364 	kfree(pool);
365 	return ERR_PTR(err);
366 }
367 EXPORT_SYMBOL(page_pool_create_percpu);
368 
369 /**
370  * page_pool_create() - create a page pool
371  * @params: parameters, see struct page_pool_params
372  */
373 struct page_pool *page_pool_create(const struct page_pool_params *params)
374 {
375 	return page_pool_create_percpu(params, -1);
376 }
377 EXPORT_SYMBOL(page_pool_create);
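
/*
 * Usage sketch: creating and tearing down a pool from a driver RX setup
 * path. The field values and the "priv->dev" pointer are illustrative
 * assumptions, not requirements of the API.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= 1024,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= priv->dev,		// device used for DMA mapping
 *		.dma_dir	= DMA_FROM_DEVICE,	// DMA_BIDIRECTIONAL for XDP_TX
 *		.max_len	= PAGE_SIZE,		// sync length for DMA_SYNC_DEV
 *		.offset		= 0,			// RX headroom the HW skips
 *	};
 *	struct page_pool *pool;
 *
 *	pool = page_pool_create(&pp_params);
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 *	...
 *	page_pool_destroy(pool);	// on teardown; release of inflight
 *					// pages is deferred if needed
 */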
378 
379 static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
380 
381 static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
382 {
383 	struct ptr_ring *r = &pool->ring;
384 	netmem_ref netmem;
385 	int pref_nid; /* preferred NUMA node */
386 
387 	/* Quicker fallback, avoid locks when ring is empty */
388 	if (__ptr_ring_empty(r)) {
389 		alloc_stat_inc(pool, empty);
390 		return 0;
391 	}
392 
393 	/* Softirq guarantees the CPU, and thus the NUMA node, is stable. This
394 	 * assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
395 	 */
396 #ifdef CONFIG_NUMA
397 	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
398 #else
399 	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
400 	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
401 #endif
402 
403 	/* Refill alloc array, but only if NUMA match */
404 	do {
405 		netmem = (__force netmem_ref)__ptr_ring_consume(r);
406 		if (unlikely(!netmem))
407 			break;
408 
409 		if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
410 			pool->alloc.cache[pool->alloc.count++] = netmem;
411 		} else {
412 			/* NUMA mismatch;
413 			 * (1) release 1 page to page-allocator and
414 			 * (2) break out and fall through to alloc_pages_node().
415 			 * This limits stress on the page buddy allocator.
416 			 */
417 			page_pool_return_page(pool, netmem);
418 			alloc_stat_inc(pool, waive);
419 			netmem = 0;
420 			break;
421 		}
422 	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
423 
424 	/* Return last page */
425 	if (likely(pool->alloc.count > 0)) {
426 		netmem = pool->alloc.cache[--pool->alloc.count];
427 		alloc_stat_inc(pool, refill);
428 	}
429 
430 	return netmem;
431 }
432 
433 /* fast path */
434 static netmem_ref __page_pool_get_cached(struct page_pool *pool)
435 {
436 	netmem_ref netmem;
437 
438 	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
439 	if (likely(pool->alloc.count)) {
440 		/* Fast-path */
441 		netmem = pool->alloc.cache[--pool->alloc.count];
442 		alloc_stat_inc(pool, fast);
443 	} else {
444 		netmem = page_pool_refill_alloc_cache(pool);
445 	}
446 
447 	return netmem;
448 }
449 
450 static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
451 					    netmem_ref netmem,
452 					    u32 dma_sync_size)
453 {
454 #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
455 	dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
456 
457 	dma_sync_size = min(dma_sync_size, pool->p.max_len);
458 	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
459 				     dma_sync_size, pool->p.dma_dir);
460 #endif
461 }
462 
463 static __always_inline void
464 page_pool_dma_sync_for_device(const struct page_pool *pool,
465 			      netmem_ref netmem,
466 			      u32 dma_sync_size)
467 {
468 	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
469 		__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
470 }
471 
472 static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
473 {
474 	dma_addr_t dma;
475 
476 	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
477 	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
478 	 * into page private data (e.g. a 32-bit CPU with 64-bit DMA caps).
479 	 * This mapping is kept for the lifetime of the page, until it leaves the pool.
480 	 */
481 	dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
482 				 (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
483 				 DMA_ATTR_SKIP_CPU_SYNC |
484 					 DMA_ATTR_WEAK_ORDERING);
485 	if (dma_mapping_error(pool->p.dev, dma))
486 		return false;
487 
488 	if (page_pool_set_dma_addr_netmem(netmem, dma))
489 		goto unmap_failed;
490 
491 	page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
492 
493 	return true;
494 
495 unmap_failed:
496 	WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
497 	dma_unmap_page_attrs(pool->p.dev, dma,
498 			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
499 			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
500 	return false;
501 }
502 
503 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
504 						 gfp_t gfp)
505 {
506 	struct page *page;
507 
508 	gfp |= __GFP_COMP;
509 	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
510 	if (unlikely(!page))
511 		return NULL;
512 
513 	if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
514 		put_page(page);
515 		return NULL;
516 	}
517 
518 	alloc_stat_inc(pool, slow_high_order);
519 	page_pool_set_pp_info(pool, page_to_netmem(page));
520 
521 	/* Track how many pages are held 'in-flight' */
522 	pool->pages_state_hold_cnt++;
523 	trace_page_pool_state_hold(pool, page_to_netmem(page),
524 				   pool->pages_state_hold_cnt);
525 	return page;
526 }
527 
528 /* slow path */
529 static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
530 							gfp_t gfp)
531 {
532 	const int bulk = PP_ALLOC_CACHE_REFILL;
533 	unsigned int pp_order = pool->p.order;
534 	bool dma_map = pool->dma_map;
535 	netmem_ref netmem;
536 	int i, nr_pages;
537 
538 	/* Don't support bulk alloc for high-order pages */
539 	if (unlikely(pp_order))
540 		return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
541 
542 	/* Unnecessary as alloc cache is empty, but guarantees zero count */
543 	if (unlikely(pool->alloc.count > 0))
544 		return pool->alloc.cache[--pool->alloc.count];
545 
546 	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */
547 	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
548 
549 	nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk,
550 					 (struct page **)pool->alloc.cache);
551 	if (unlikely(!nr_pages))
552 		return 0;
553 
554 	/* Pages have been filled into the alloc.cache array, but the count is
555 	 * zero and the pages have not yet been (optionally) DMA mapped.
556 	 */
557 	for (i = 0; i < nr_pages; i++) {
558 		netmem = pool->alloc.cache[i];
559 		if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
560 			put_page(netmem_to_page(netmem));
561 			continue;
562 		}
563 
564 		page_pool_set_pp_info(pool, netmem);
565 		pool->alloc.cache[pool->alloc.count++] = netmem;
566 		/* Track how many pages are held 'in-flight' */
567 		pool->pages_state_hold_cnt++;
568 		trace_page_pool_state_hold(pool, netmem,
569 					   pool->pages_state_hold_cnt);
570 	}
571 
572 	/* Return last page */
573 	if (likely(pool->alloc.count > 0)) {
574 		netmem = pool->alloc.cache[--pool->alloc.count];
575 		alloc_stat_inc(pool, slow);
576 	} else {
577 		netmem = 0;
578 	}
579 
580 	/* A page that was just allocated should/must have refcnt 1. */
581 	return netmem;
582 }
583 
584 /* Use page_pool as a replacement for alloc_pages() API calls; this path
585  * provides a synchronization guarantee for the allocation side.
586  */
587 netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp)
588 {
589 	netmem_ref netmem;
590 
591 	/* Fast-path: Get a page from cache */
592 	netmem = __page_pool_get_cached(pool);
593 	if (netmem)
594 		return netmem;
595 
596 	/* Slow-path: cache empty, do real allocation */
597 	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
598 		netmem = pool->mp_ops->alloc_netmems(pool, gfp);
599 	else
600 		netmem = __page_pool_alloc_pages_slow(pool, gfp);
601 	return netmem;
602 }
603 EXPORT_SYMBOL(page_pool_alloc_netmems);
604 ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL);
605 
606 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
607 {
608 	return netmem_to_page(page_pool_alloc_netmems(pool, gfp));
609 }
610 EXPORT_SYMBOL(page_pool_alloc_pages);
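
/*
 * Usage sketch: an RX buffer refill built on the allocation API above,
 * assuming the pool was created with PP_FLAG_DMA_MAP. "my_rx_ring" and
 * post_rx_buffer() are hypothetical driver helpers.
 *
 *	static int my_refill_rx(struct my_rx_ring *ring)
 *	{
 *		struct page *page;
 *		dma_addr_t dma;
 *
 *		page = page_pool_alloc_pages(ring->pool, GFP_ATOMIC);
 *		if (!page)
 *			return -ENOMEM;
 *
 *		dma = page_pool_get_dma_addr(page) + ring->pool->p.offset;
 *		post_rx_buffer(ring, page, dma);	// hand the buffer to HW
 *		return 0;
 *	}
 */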
611 
612 /* Calculate distance between two u32 values, valid if distance is below 2^(31)
613  *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
614  */
615 #define _distance(a, b)	(s32)((a) - (b))
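/* Worked example: after a u32 wrap-around, hold_cnt == 5 and
 * release_cnt == U32_MAX - 2 give _distance(5, U32_MAX - 2) == 8,
 * i.e. 8 pages are still inflight.
 */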
616 
617 s32 page_pool_inflight(const struct page_pool *pool, bool strict)
618 {
619 	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
620 	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
621 	s32 inflight;
622 
623 	inflight = _distance(hold_cnt, release_cnt);
624 
625 	if (strict) {
626 		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
627 		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
628 		     inflight);
629 	} else {
630 		inflight = max(0, inflight);
631 	}
632 
633 	return inflight;
634 }
635 
636 void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
637 {
638 	netmem_set_pp(netmem, pool);
639 	netmem_or_pp_magic(netmem, PP_SIGNATURE);
640 
641 	/* Ensuring all pages have been split into one fragment initially:
642 	 * page_pool_set_pp_info() is only called once for every page when it
643 	 * is allocated from the page allocator and page_pool_fragment_page()
644 	 * is dirtying the same cache line as the page->pp_magic above, so
645 	 * the overhead is negligible.
646 	 */
647 	page_pool_fragment_netmem(netmem, 1);
648 	if (pool->has_init_callback)
649 		pool->slow.init_callback(netmem, pool->slow.init_arg);
650 }
651 
652 void page_pool_clear_pp_info(netmem_ref netmem)
653 {
654 	netmem_clear_pp_magic(netmem);
655 	netmem_set_pp(netmem, NULL);
656 }
657 
658 static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
659 							 netmem_ref netmem)
660 {
661 	dma_addr_t dma;
662 
663 	if (!pool->dma_map)
664 		/* Always account for inflight pages, even if we didn't
665 		 * map them
666 		 */
667 		return;
668 
669 	dma = page_pool_get_dma_addr_netmem(netmem);
670 
671 	/* When page is unmapped, it cannot be returned to our pool */
672 	dma_unmap_page_attrs(pool->p.dev, dma,
673 			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
674 			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
675 	page_pool_set_dma_addr_netmem(netmem, 0);
676 }
677 
678 /* Disconnects a page (from a page_pool).  API users may need to
679  * disconnect a page from its page_pool, to allow it to be used as
680  * a regular page (that will eventually be returned to the normal
681  * page-allocator via put_page()).
682  */
683 void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
684 {
685 	int count;
686 	bool put;
687 
688 	put = true;
689 	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
690 		put = pool->mp_ops->release_netmem(pool, netmem);
691 	else
692 		__page_pool_release_page_dma(pool, netmem);
693 
694 	/* This may be the last page returned, releasing the pool, so
695 	 * it is not safe to reference pool afterwards.
696 	 */
697 	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
698 	trace_page_pool_state_release(pool, netmem, count);
699 
700 	if (put) {
701 		page_pool_clear_pp_info(netmem);
702 		put_page(netmem_to_page(netmem));
703 	}
704 	/* An optimization would be to call __free_pages(page, pool->p.order)
705 	 * knowing page is not part of page-cache (thus avoiding a
706 	 * __page_cache_release() call).
707 	 */
708 }
709 
710 static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
711 {
712 	int ret;
713 	/* BH protection not needed if the current context is softirq */
714 	if (in_softirq())
715 		ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
716 	else
717 		ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);
718 
719 	if (!ret) {
720 		recycle_stat_inc(pool, ring);
721 		return true;
722 	}
723 
724 	return false;
725 }
726 
727 /* Only allow direct recycling in special circumstances, into the
728  * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
729  *
730  * Caller must provide appropriate safe context.
731  */
732 static bool page_pool_recycle_in_cache(netmem_ref netmem,
733 				       struct page_pool *pool)
734 {
735 	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
736 		recycle_stat_inc(pool, cache_full);
737 		return false;
738 	}
739 
740 	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
741 	pool->alloc.cache[pool->alloc.count++] = netmem;
742 	recycle_stat_inc(pool, cached);
743 	return true;
744 }
745 
746 static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
747 {
748 	return netmem_is_net_iov(netmem) ||
749 	       (page_ref_count(netmem_to_page(netmem)) == 1 &&
750 		!page_is_pfmemalloc(netmem_to_page(netmem)));
751 }
752 
753 /* If the page refcnt == 1, this will try to recycle the page.
754  * If pool->dma_sync is set, we'll try to sync the DMA area for
755  * the configured size min(dma_sync_size, pool->p.max_len).
756  * If the page refcnt != 1, then the page will be returned to the
757  * memory subsystem.
758  */
759 static __always_inline netmem_ref
760 __page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
761 		     unsigned int dma_sync_size, bool allow_direct)
762 {
763 	lockdep_assert_no_hardirq();
764 
765 	/* This allocator is optimized for the XDP mode that uses
766 	 * one-frame-per-page, but has fallbacks that act like the
767 	 * regular page allocator APIs.
768 	 *
769 	 * refcnt == 1 means page_pool owns page, and can recycle it.
770 	 *
771 	 * A page is NOT reusable when it was allocated while the system
772 	 * was under memory pressure (page_is_pfmemalloc()).
773 	 */
774 	if (likely(__page_pool_page_can_be_recycled(netmem))) {
775 		/* Read barrier done in page_ref_count / READ_ONCE */
776 
777 		page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
778 
779 		if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
780 			return 0;
781 
782 		/* Page found as candidate for recycling */
783 		return netmem;
784 	}
785 
786 	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
787 	 *
788 	 * Many drivers split up the page into fragments, and some
789 	 * want to keep doing this to save memory and do refcnt based
790 	 * recycling. Support this use case too, to ease drivers
791 	 * switching between XDP/non-XDP.
792 	 *
793 	 * In case page_pool maintains the DMA mapping, the API user must
794 	 * call page_pool_put_page() once.  In this elevated refcnt
795 	 * case, the DMA mapping is unmapped/released, as the driver is
796 	 * likely doing refcnt based recycle tricks, meaning another
797 	 * process will be invoking put_page().
798 	 */
799 	recycle_stat_inc(pool, released_refcnt);
800 	page_pool_return_page(pool, netmem);
801 
802 	return 0;
803 }
804 
805 static bool page_pool_napi_local(const struct page_pool *pool)
806 {
807 	const struct napi_struct *napi;
808 	u32 cpuid;
809 
810 	if (unlikely(!in_softirq()))
811 		return false;
812 
813 	/* Allow direct recycling if we have reason to believe that we are
814 	 * running in the same context as the consumer would, so there's
815 	 * no possible race.
816 	 * __page_pool_put_page() makes sure we're not in hardirq context
817 	 * and interrupts are enabled prior to accessing the cache.
818 	 */
819 	cpuid = smp_processor_id();
820 	if (READ_ONCE(pool->cpuid) == cpuid)
821 		return true;
822 
823 	napi = READ_ONCE(pool->p.napi);
824 
825 	return napi && READ_ONCE(napi->list_owner) == cpuid;
826 }
827 
828 void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
829 				  unsigned int dma_sync_size, bool allow_direct)
830 {
831 	if (!allow_direct)
832 		allow_direct = page_pool_napi_local(pool);
833 
834 	netmem =
835 		__page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
836 	if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
837 		/* Ring full, fall back to freeing the page */
838 		recycle_stat_inc(pool, ring_full);
839 		page_pool_return_page(pool, netmem);
840 	}
841 }
842 EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
843 
844 void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
845 				unsigned int dma_sync_size, bool allow_direct)
846 {
847 	page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
848 				     allow_direct);
849 }
850 EXPORT_SYMBOL(page_pool_put_unrefed_page);
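
/*
 * Usage sketch: returning a buffer from a driver when the frame is
 * dropped or consumed. page_pool_put_full_page() is a helper built on
 * the functions above that syncs up to p.max_len on recycle; passing
 * allow_direct = true is only safe from the pool's NAPI/softirq context.
 * "my_rx_ring" is a hypothetical driver type.
 *
 *	static void my_drop_rx_buffer(struct my_rx_ring *ring, struct page *page)
 *	{
 *		page_pool_put_full_page(ring->pool, page, true);
 *	}
 */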
851 
852 static void page_pool_recycle_ring_bulk(struct page_pool *pool,
853 					netmem_ref *bulk,
854 					u32 bulk_len)
855 {
856 	bool in_softirq;
857 	u32 i;
858 
859 	/* Bulk produce into ptr_ring page_pool cache */
860 	in_softirq = page_pool_producer_lock(pool);
861 
862 	for (i = 0; i < bulk_len; i++) {
863 		if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) {
864 			/* ring full */
865 			recycle_stat_inc(pool, ring_full);
866 			break;
867 		}
868 	}
869 
870 	page_pool_producer_unlock(pool, in_softirq);
871 	recycle_stat_add(pool, ring, i);
872 
873 	/* Hopefully all pages were returned into ptr_ring */
874 	if (likely(i == bulk_len))
875 		return;
876 
877 	/*
878 	 * ptr_ring cache is full, free remaining pages outside producer lock
879 	 * since put_page() with refcnt == 1 can be an expensive operation.
880 	 */
881 	for (; i < bulk_len; i++)
882 		page_pool_return_page(pool, bulk[i]);
883 }
884 
885 /**
886  * page_pool_put_netmem_bulk() - release references on multiple netmems
887  * @data:	array holding netmem references
888  * @count:	number of entries in @data
889  *
890  * Tries to refill a number of netmems into the ptr_ring cache while holding
891  * the ptr_ring producer lock. If the ptr_ring is full,
892  * page_pool_put_netmem_bulk() will release leftover netmems to the memory
893  * provider. page_pool_put_netmem_bulk() is suitable to be run inside the
894  * driver NAPI tx completion loop for the XDP_REDIRECT use case.
895  *
896  * Please note the caller must not use the data area after running
897  * page_pool_put_netmem_bulk(), as this function overwrites it.
898  */
899 void page_pool_put_netmem_bulk(netmem_ref *data, u32 count)
900 {
901 	u32 bulk_len = 0;
902 
903 	for (u32 i = 0; i < count; i++) {
904 		netmem_ref netmem = netmem_compound_head(data[i]);
905 
906 		if (page_pool_unref_and_test(netmem))
907 			data[bulk_len++] = netmem;
908 	}
909 
910 	count = bulk_len;
911 	while (count) {
912 		netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
913 		struct page_pool *pool = NULL;
914 		bool allow_direct;
915 		u32 foreign = 0;
916 
917 		bulk_len = 0;
918 
919 		for (u32 i = 0; i < count; i++) {
920 			struct page_pool *netmem_pp;
921 			netmem_ref netmem = data[i];
922 
923 			netmem_pp = netmem_get_pp(netmem);
924 			if (unlikely(!pool)) {
925 				pool = netmem_pp;
926 				allow_direct = page_pool_napi_local(pool);
927 			} else if (netmem_pp != pool) {
928 				/*
929 				 * If the netmem belongs to a different
930 				 * page_pool, save it for another round.
931 				 */
932 				data[foreign++] = netmem;
933 				continue;
934 			}
935 
936 			netmem = __page_pool_put_page(pool, netmem, -1,
937 						      allow_direct);
938 			/* Approved for bulk recycling in ptr_ring cache */
939 			if (netmem)
940 				bulk[bulk_len++] = netmem;
941 		}
942 
943 		if (bulk_len)
944 			page_pool_recycle_ring_bulk(pool, bulk, bulk_len);
945 
946 		count = foreign;
947 	}
948 }
949 EXPORT_SYMBOL(page_pool_put_netmem_bulk);
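
/*
 * Usage sketch: batching returns in an XDP_REDIRECT Tx completion loop.
 * The driver-side names here (ring, tx_desc_completed(), my_tx_buf,
 * tx_buf_to_clean()) are hypothetical; the flush threshold simply reuses
 * XDP_BULK_QUEUE_SIZE.
 *
 *	netmem_ref batch[XDP_BULK_QUEUE_SIZE];
 *	u32 n = 0;
 *
 *	while (budget-- && tx_desc_completed(ring)) {
 *		struct my_tx_buf *buf = tx_buf_to_clean(ring);
 *
 *		batch[n++] = page_to_netmem(buf->page);
 *		if (n == XDP_BULK_QUEUE_SIZE) {
 *			page_pool_put_netmem_bulk(batch, n);
 *			n = 0;
 *		}
 *	}
 *	if (n)
 *		page_pool_put_netmem_bulk(batch, n);
 */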
950 
951 static netmem_ref page_pool_drain_frag(struct page_pool *pool,
952 				       netmem_ref netmem)
953 {
954 	long drain_count = BIAS_MAX - pool->frag_users;
955 
956 	/* Some user is still using the page frag */
957 	if (likely(page_pool_unref_netmem(netmem, drain_count)))
958 		return 0;
959 
960 	if (__page_pool_page_can_be_recycled(netmem)) {
961 		page_pool_dma_sync_for_device(pool, netmem, -1);
962 		return netmem;
963 	}
964 
965 	page_pool_return_page(pool, netmem);
966 	return 0;
967 }
968 
969 static void page_pool_free_frag(struct page_pool *pool)
970 {
971 	long drain_count = BIAS_MAX - pool->frag_users;
972 	netmem_ref netmem = pool->frag_page;
973 
974 	pool->frag_page = 0;
975 
976 	if (!netmem || page_pool_unref_netmem(netmem, drain_count))
977 		return;
978 
979 	page_pool_return_page(pool, netmem);
980 }
981 
982 netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
983 				       unsigned int *offset, unsigned int size,
984 				       gfp_t gfp)
985 {
986 	unsigned int max_size = PAGE_SIZE << pool->p.order;
987 	netmem_ref netmem = pool->frag_page;
988 
989 	if (WARN_ON(size > max_size))
990 		return 0;
991 
992 	size = ALIGN(size, dma_get_cache_alignment());
993 	*offset = pool->frag_offset;
994 
995 	if (netmem && *offset + size > max_size) {
996 		netmem = page_pool_drain_frag(pool, netmem);
997 		if (netmem) {
998 			recycle_stat_inc(pool, cached);
999 			alloc_stat_inc(pool, fast);
1000 			goto frag_reset;
1001 		}
1002 	}
1003 
1004 	if (!netmem) {
1005 		netmem = page_pool_alloc_netmems(pool, gfp);
1006 		if (unlikely(!netmem)) {
1007 			pool->frag_page = 0;
1008 			return 0;
1009 		}
1010 
1011 		pool->frag_page = netmem;
1012 
1013 frag_reset:
1014 		pool->frag_users = 1;
1015 		*offset = 0;
1016 		pool->frag_offset = size;
1017 		page_pool_fragment_netmem(netmem, BIAS_MAX);
1018 		return netmem;
1019 	}
1020 
1021 	pool->frag_users++;
1022 	pool->frag_offset = *offset + size;
1023 	return netmem;
1024 }
1025 EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
1026 
1027 struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
1028 				  unsigned int size, gfp_t gfp)
1029 {
1030 	return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
1031 							  gfp));
1032 }
1033 EXPORT_SYMBOL(page_pool_alloc_frag);
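
/*
 * Usage sketch: carving small RX buffers out of pool pages with the frag
 * API. "rx_buf_len" is an assumed driver buffer size; the returned
 * offset locates this fragment within the (possibly high-order) page.
 *
 *	unsigned int offset;
 *	struct page *page;
 *	dma_addr_t dma;
 *
 *	page = page_pool_alloc_frag(pool, &offset, rx_buf_len, GFP_ATOMIC);
 *	if (!page)
 *		return -ENOMEM;
 *
 *	// per-fragment DMA address, valid when the pool maps pages
 *	dma = page_pool_get_dma_addr(page) + offset;
 */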
1034 
1035 static void page_pool_empty_ring(struct page_pool *pool)
1036 {
1037 	netmem_ref netmem;
1038 
1039 	/* Empty recycle ring */
1040 	while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
1041 		/* Verify the refcnt invariant of cached pages */
1042 		if (!(netmem_ref_count(netmem) == 1))
1043 			pr_crit("%s() page_pool refcnt %d violation\n",
1044 				__func__, netmem_ref_count(netmem));
1045 
1046 		page_pool_return_page(pool, netmem);
1047 	}
1048 }
1049 
1050 static void __page_pool_destroy(struct page_pool *pool)
1051 {
1052 	if (pool->disconnect)
1053 		pool->disconnect(pool);
1054 
1055 	page_pool_unlist(pool);
1056 	page_pool_uninit(pool);
1057 
1058 	if (pool->mp_ops) {
1059 		pool->mp_ops->destroy(pool);
1060 		static_branch_dec(&page_pool_mem_providers);
1061 	}
1062 
1063 	kfree(pool);
1064 }
1065 
1066 static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
1067 {
1068 	netmem_ref netmem;
1069 
1070 	if (pool->destroy_cnt)
1071 		return;
1072 
1073 	/* Empty the alloc cache; assume the caller made sure this is
1074 	 * no longer in use, and that page_pool_alloc_pages() cannot be
1075 	 * called concurrently.
1076 	 */
1077 	while (pool->alloc.count) {
1078 		netmem = pool->alloc.cache[--pool->alloc.count];
1079 		page_pool_return_page(pool, netmem);
1080 	}
1081 }
1082 
1083 static void page_pool_scrub(struct page_pool *pool)
1084 {
1085 	page_pool_empty_alloc_cache_once(pool);
1086 	pool->destroy_cnt++;
1087 
1088 	/* No more consumers should exist, but producers could still
1089 	 * be in-flight.
1090 	 */
1091 	page_pool_empty_ring(pool);
1092 }
1093 
1094 static int page_pool_release(struct page_pool *pool)
1095 {
1096 	int inflight;
1097 
1098 	page_pool_scrub(pool);
1099 	inflight = page_pool_inflight(pool, true);
1100 	if (!inflight)
1101 		__page_pool_destroy(pool);
1102 
1103 	return inflight;
1104 }
1105 
1106 static void page_pool_release_retry(struct work_struct *wq)
1107 {
1108 	struct delayed_work *dwq = to_delayed_work(wq);
1109 	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
1110 	void *netdev;
1111 	int inflight;
1112 
1113 	inflight = page_pool_release(pool);
1114 	if (!inflight)
1115 		return;
1116 
1117 	/* Periodic warning for page pools the user can't see */
1118 	netdev = READ_ONCE(pool->slow.netdev);
1119 	if (time_after_eq(jiffies, pool->defer_warn) &&
1120 	    (!netdev || netdev == NET_PTR_POISON)) {
1121 		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
1122 
1123 		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
1124 			__func__, pool->user.id, inflight, sec);
1125 		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
1126 	}
1127 
1128 	/* Still not ready to be disconnected, retry later */
1129 	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1130 }
1131 
1132 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
1133 			   const struct xdp_mem_info *mem)
1134 {
1135 	refcount_inc(&pool->user_cnt);
1136 	pool->disconnect = disconnect;
1137 	pool->xdp_mem_id = mem->id;
1138 }
1139 
1140 void page_pool_disable_direct_recycling(struct page_pool *pool)
1141 {
1142 	/* Disable direct recycling based on pool->cpuid.
1143 	 * Paired with READ_ONCE() in page_pool_napi_local().
1144 	 */
1145 	WRITE_ONCE(pool->cpuid, -1);
1146 
1147 	if (!pool->p.napi)
1148 		return;
1149 
1150 	/* To avoid races with recycling, and to avoid additional barriers,
1151 	 * make sure the pool and NAPI are unlinked while NAPI is disabled.
1152 	 */
1153 	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
1154 	WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);
1155 
1156 	mutex_lock(&page_pools_lock);
1157 	WRITE_ONCE(pool->p.napi, NULL);
1158 	mutex_unlock(&page_pools_lock);
1159 }
1160 EXPORT_SYMBOL(page_pool_disable_direct_recycling);
1161 
1162 void page_pool_destroy(struct page_pool *pool)
1163 {
1164 	if (!pool)
1165 		return;
1166 
1167 	if (!page_pool_put(pool))
1168 		return;
1169 
1170 	page_pool_disable_direct_recycling(pool);
1171 	page_pool_free_frag(pool);
1172 
1173 	if (!page_pool_release(pool))
1174 		return;
1175 
1176 	page_pool_detached(pool);
1177 	pool->defer_start = jiffies;
1178 	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
1179 
1180 	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
1181 	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1182 }
1183 EXPORT_SYMBOL(page_pool_destroy);
1184 
1185 /* Caller must provide appropriate safe context, e.g. NAPI. */
1186 void page_pool_update_nid(struct page_pool *pool, int new_nid)
1187 {
1188 	netmem_ref netmem;
1189 
1190 	trace_page_pool_update_nid(pool, new_nid);
1191 	pool->p.nid = new_nid;
1192 
1193 	/* Flush pool alloc cache, as refill will check NUMA node */
1194 	while (pool->alloc.count) {
1195 		netmem = pool->alloc.cache[--pool->alloc.count];
1196 		page_pool_return_page(pool, netmem);
1197 	}
1198 }
1199 EXPORT_SYMBOL(page_pool_update_nid);
1200 
1201 bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr)
1202 {
1203 	return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr);
1204 }
1205 
1206 /* Associate a niov with a page pool. Should be followed by a matching
1207  * net_mp_niov_clear_page_pool().
1208  */
1209 void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov)
1210 {
1211 	netmem_ref netmem = net_iov_to_netmem(niov);
1212 
1213 	page_pool_set_pp_info(pool, netmem);
1214 
1215 	pool->pages_state_hold_cnt++;
1216 	trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt);
1217 }
1218 
1219 /* Disassociate a niov from a page pool. Should only be used in the
1220  * ->release_netmem() path.
1221  */
1222 void net_mp_niov_clear_page_pool(struct net_iov *niov)
1223 {
1224 	netmem_ref netmem = net_iov_to_netmem(niov);
1225 
1226 	page_pool_clear_pp_info(netmem);
1227 }
1228