xref: /linux/net/core/page_pool.c (revision 95f68e06b41b9e88291796efa3969409d13fdd4c)
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * page_pool.c
4  *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
5  *	Copyright (C) 2016 Red Hat, Inc.
6  */
7 
8 #include <linux/error-injection.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/slab.h>
12 #include <linux/device.h>
13 
14 #include <net/netdev_rx_queue.h>
15 #include <net/page_pool/helpers.h>
16 #include <net/xdp.h>
17 
18 #include <linux/dma-direction.h>
19 #include <linux/dma-mapping.h>
20 #include <linux/page-flags.h>
21 #include <linux/mm.h> /* for put_page() */
22 #include <linux/poison.h>
23 #include <linux/ethtool.h>
24 #include <linux/netdevice.h>
25 
26 #include <trace/events/page_pool.h>
27 
28 #include "mp_dmabuf_devmem.h"
29 #include "netmem_priv.h"
30 #include "page_pool_priv.h"
31 
32 DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
33 
34 #define DEFER_TIME (msecs_to_jiffies(1000))
35 #define DEFER_WARN_INTERVAL (60 * HZ)
36 
37 #define BIAS_MAX	(LONG_MAX >> 1)
38 
39 #ifdef CONFIG_PAGE_POOL_STATS
40 static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
41 
42 /* alloc_stat_inc is intended to be used in softirq context */
43 #define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
44 /* recycle_stat_inc is safe to use when preemption is possible. */
45 #define recycle_stat_inc(pool, __stat)							\
46 	do {										\
47 		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
48 		this_cpu_inc(s->__stat);						\
49 	} while (0)
50 
51 #define recycle_stat_add(pool, __stat, val)						\
52 	do {										\
53 		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
54 		this_cpu_add(s->__stat, val);						\
55 	} while (0)
56 
57 static const char pp_stats[][ETH_GSTRING_LEN] = {
58 	"rx_pp_alloc_fast",
59 	"rx_pp_alloc_slow",
60 	"rx_pp_alloc_slow_ho",
61 	"rx_pp_alloc_empty",
62 	"rx_pp_alloc_refill",
63 	"rx_pp_alloc_waive",
64 	"rx_pp_recycle_cached",
65 	"rx_pp_recycle_cache_full",
66 	"rx_pp_recycle_ring",
67 	"rx_pp_recycle_ring_full",
68 	"rx_pp_recycle_released_ref",
69 };
70 
71 /**
72  * page_pool_get_stats() - fetch page pool stats
73  * @pool:	pool from which page was allocated
74  * @stats:	struct page_pool_stats to fill in
75  *
76  * Retrieve statistics about the page_pool. This API is only available
77  * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
78  * The caller passes a pointer to a caller-allocated struct page_pool_stats
79  * structure, which this API fills in. The caller can then report
80  * those stats to the user (perhaps via ethtool, debugfs, etc.).
81  */
82 bool page_pool_get_stats(const struct page_pool *pool,
83 			 struct page_pool_stats *stats)
84 {
85 	int cpu = 0;
86 
87 	if (!stats)
88 		return false;
89 
90 	/* The caller is responsible for initializing stats. */
91 	stats->alloc_stats.fast += pool->alloc_stats.fast;
92 	stats->alloc_stats.slow += pool->alloc_stats.slow;
93 	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
94 	stats->alloc_stats.empty += pool->alloc_stats.empty;
95 	stats->alloc_stats.refill += pool->alloc_stats.refill;
96 	stats->alloc_stats.waive += pool->alloc_stats.waive;
97 
98 	for_each_possible_cpu(cpu) {
99 		const struct page_pool_recycle_stats *pcpu =
100 			per_cpu_ptr(pool->recycle_stats, cpu);
101 
102 		stats->recycle_stats.cached += pcpu->cached;
103 		stats->recycle_stats.cache_full += pcpu->cache_full;
104 		stats->recycle_stats.ring += pcpu->ring;
105 		stats->recycle_stats.ring_full += pcpu->ring_full;
106 		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
107 	}
108 
109 	return true;
110 }
111 EXPORT_SYMBOL(page_pool_get_stats);
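/* Example usage (illustrative sketch, not part of the kernel API docs):
 * a driver with a hypothetical "struct my_netdev_priv" holding one
 * page_pool per RX ring could aggregate stats like this. Requires
 * CONFIG_PAGE_POOL_STATS=y.
 *
 *	static void my_drv_get_pp_stats(struct my_netdev_priv *priv,
 *					struct page_pool_stats *stats)
 *	{
 *		int i;
 *
 *		memset(stats, 0, sizeof(*stats));
 *		for (i = 0; i < priv->num_rx_rings; i++)
 *			page_pool_get_stats(priv->rx_ring[i].page_pool, stats);
 *	}
 *
 * page_pool_get_stats() accumulates into the caller-provided structure,
 * so summing over several pools only needs one zeroed struct.
 */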
112 
113 u8 *page_pool_ethtool_stats_get_strings(u8 *data)
114 {
115 	int i;
116 
117 	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
118 		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
119 		data += ETH_GSTRING_LEN;
120 	}
121 
122 	return data;
123 }
124 EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
125 
126 int page_pool_ethtool_stats_get_count(void)
127 {
128 	return ARRAY_SIZE(pp_stats);
129 }
130 EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
131 
132 u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
133 {
134 	const struct page_pool_stats *pool_stats = stats;
135 
136 	*data++ = pool_stats->alloc_stats.fast;
137 	*data++ = pool_stats->alloc_stats.slow;
138 	*data++ = pool_stats->alloc_stats.slow_high_order;
139 	*data++ = pool_stats->alloc_stats.empty;
140 	*data++ = pool_stats->alloc_stats.refill;
141 	*data++ = pool_stats->alloc_stats.waive;
142 	*data++ = pool_stats->recycle_stats.cached;
143 	*data++ = pool_stats->recycle_stats.cache_full;
144 	*data++ = pool_stats->recycle_stats.ring;
145 	*data++ = pool_stats->recycle_stats.ring_full;
146 	*data++ = pool_stats->recycle_stats.released_refcnt;
147 
148 	return data;
149 }
150 EXPORT_SYMBOL(page_pool_ethtool_stats_get);
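/* Example usage (illustrative sketch): wiring the three ethtool helpers into
 * a hypothetical driver's ethtool_ops. The my_drv_* names and the priv
 * layout are assumptions for illustration only.
 *
 *	static void my_drv_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static int my_drv_get_sset_count(struct net_device *dev, int sset)
 *	{
 *		return sset == ETH_SS_STATS ?
 *		       page_pool_ethtool_stats_get_count() : -EOPNOTSUPP;
 *	}
 *
 *	static void my_drv_get_ethtool_stats(struct net_device *dev,
 *					     struct ethtool_stats *es, u64 *data)
 *	{
 *		struct my_netdev_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats stats = {};
 *
 *		page_pool_get_stats(priv->rx_pool, &stats);
 *		data = page_pool_ethtool_stats_get(data, &stats);
 *	}
 *
 * Both *_get_strings() and *_get() return the advanced data pointer, so a
 * driver can append its own strings/counters after the page_pool block.
 */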
151 
152 #else
153 #define alloc_stat_inc(pool, __stat)
154 #define recycle_stat_inc(pool, __stat)
155 #define recycle_stat_add(pool, __stat, val)
156 #endif
157 
158 static bool page_pool_producer_lock(struct page_pool *pool)
159 	__acquires(&pool->ring.producer_lock)
160 {
161 	bool in_softirq = in_softirq();
162 
163 	if (in_softirq)
164 		spin_lock(&pool->ring.producer_lock);
165 	else
166 		spin_lock_bh(&pool->ring.producer_lock);
167 
168 	return in_softirq;
169 }
170 
171 static void page_pool_producer_unlock(struct page_pool *pool,
172 				      bool in_softirq)
173 	__releases(&pool->ring.producer_lock)
174 {
175 	if (in_softirq)
176 		spin_unlock(&pool->ring.producer_lock);
177 	else
178 		spin_unlock_bh(&pool->ring.producer_lock);
179 }
180 
181 static void page_pool_struct_check(void)
182 {
183 	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
184 	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
185 	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
186 	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
187 				    PAGE_POOL_FRAG_GROUP_ALIGN);
188 }
189 
190 static int page_pool_init(struct page_pool *pool,
191 			  const struct page_pool_params *params,
192 			  int cpuid)
193 {
194 	unsigned int ring_qsize = 1024; /* Default */
195 	struct netdev_rx_queue *rxq;
196 	int err;
197 
198 	page_pool_struct_check();
199 
200 	memcpy(&pool->p, &params->fast, sizeof(pool->p));
201 	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
202 
203 	pool->cpuid = cpuid;
204 	pool->dma_sync_for_cpu = true;
205 
206 	/* Validate only known flags were used */
207 	if (pool->slow.flags & ~PP_FLAG_ALL)
208 		return -EINVAL;
209 
210 	if (pool->p.pool_size)
211 		ring_qsize = pool->p.pool_size;
212 
213 	/* Sanity limit the amount of memory that can be pinned down */
214 	if (ring_qsize > 32768)
215 		return -E2BIG;
216 
217 	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
218 	 * DMA_BIDIRECTIONAL additionally allows the page to be used for DMA
219 	 * transmit, which is the XDP_TX use-case.
220 	 */
221 	if (pool->slow.flags & PP_FLAG_DMA_MAP) {
222 		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
223 		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
224 			return -EINVAL;
225 
226 		pool->dma_map = true;
227 	}
228 
229 	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
230 		/* In order to request DMA-sync-for-device the page
231 		 * needs to be mapped
232 		 */
233 		if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
234 			return -EINVAL;
235 
236 		if (!pool->p.max_len)
237 			return -EINVAL;
238 
239 		pool->dma_sync = true;
240 
241 		/* pool->p.offset has to be set according to the address
242 		 * offset used by the DMA engine to start copying rx data
243 		 */
244 	}
245 
246 	pool->has_init_callback = !!pool->slow.init_callback;
247 
248 #ifdef CONFIG_PAGE_POOL_STATS
249 	if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
250 		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
251 		if (!pool->recycle_stats)
252 			return -ENOMEM;
253 	} else {
254 		/* For system page pool instance we use a singular stats object
255 		 * instead of allocating a separate percpu variable for each
256 		 * (also percpu) page pool instance.
257 		 */
258 		pool->recycle_stats = &pp_system_recycle_stats;
259 		pool->system = true;
260 	}
261 #endif
262 
263 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
264 #ifdef CONFIG_PAGE_POOL_STATS
265 		if (!pool->system)
266 			free_percpu(pool->recycle_stats);
267 #endif
268 		return -ENOMEM;
269 	}
270 
271 	atomic_set(&pool->pages_state_release_cnt, 0);
272 
273 	/* A driver calling page_pool_create() must also call page_pool_destroy() */
274 	refcount_set(&pool->user_cnt, 1);
275 
276 	if (pool->dma_map)
277 		get_device(pool->p.dev);
278 
279 	if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
280 		/* We rely on rtnl_lock()ing to make sure netdev_rx_queue
281 		 * configuration doesn't change while we're initializing
282 		 * the page_pool.
283 		 */
284 		ASSERT_RTNL();
285 		rxq = __netif_get_rx_queue(pool->slow.netdev,
286 					   pool->slow.queue_idx);
287 		pool->mp_priv = rxq->mp_params.mp_priv;
288 	}
289 
290 	if (pool->mp_priv) {
291 		if (!pool->dma_map || !pool->dma_sync)
292 			return -EOPNOTSUPP;
293 
294 		err = mp_dmabuf_devmem_init(pool);
295 		if (err) {
296 			pr_warn("%s() mem-provider init failed %d\n", __func__,
297 				err);
298 			goto free_ptr_ring;
299 		}
300 
301 		static_branch_inc(&page_pool_mem_providers);
302 	}
303 
304 	return 0;
305 
306 free_ptr_ring:
307 	ptr_ring_cleanup(&pool->ring, NULL);
308 #ifdef CONFIG_PAGE_POOL_STATS
309 	if (!pool->system)
310 		free_percpu(pool->recycle_stats);
311 #endif
312 	return err;
313 }
314 
315 static void page_pool_uninit(struct page_pool *pool)
316 {
317 	ptr_ring_cleanup(&pool->ring, NULL);
318 
319 	if (pool->dma_map)
320 		put_device(pool->p.dev);
321 
322 #ifdef CONFIG_PAGE_POOL_STATS
323 	if (!pool->system)
324 		free_percpu(pool->recycle_stats);
325 #endif
326 }
327 
328 /**
329  * page_pool_create_percpu() - create a page pool for a given cpu.
330  * @params: parameters, see struct page_pool_params
331  * @cpuid: cpu identifier
332  */
333 struct page_pool *
334 page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
335 {
336 	struct page_pool *pool;
337 	int err;
338 
339 	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
340 	if (!pool)
341 		return ERR_PTR(-ENOMEM);
342 
343 	err = page_pool_init(pool, params, cpuid);
344 	if (err < 0)
345 		goto err_free;
346 
347 	err = page_pool_list(pool);
348 	if (err)
349 		goto err_uninit;
350 
351 	return pool;
352 
353 err_uninit:
354 	page_pool_uninit(pool);
355 err_free:
356 	pr_warn("%s() gave up with errno %d\n", __func__, err);
357 	kfree(pool);
358 	return ERR_PTR(err);
359 }
360 EXPORT_SYMBOL(page_pool_create_percpu);
361 
362 /**
363  * page_pool_create() - create a page pool
364  * @params: parameters, see struct page_pool_params
365  */
366 struct page_pool *page_pool_create(const struct page_pool_params *params)
367 {
368 	return page_pool_create_percpu(params, -1);
369 }
370 EXPORT_SYMBOL(page_pool_create);
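/* Example usage (illustrative sketch): creating a DMA-mapping pool for a
 * hypothetical driver RX ring. The field values (ring size, headroom) are
 * assumptions for illustration; see struct page_pool_params for details.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= 256,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.max_len	= PAGE_SIZE,
 *		.offset		= XDP_PACKET_HEADROOM,
 *	};
 *	struct page_pool *pool;
 *
 *	pool = page_pool_create(&pp_params);
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 *
 * With PP_FLAG_DMA_SYNC_DEV set, max_len and offset must describe the area
 * the device may write, as required by page_pool_init() above (a non-zero
 * max_len and DMA mapping enabled).
 */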
371 
372 static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
373 
374 static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
375 {
376 	struct ptr_ring *r = &pool->ring;
377 	netmem_ref netmem;
378 	int pref_nid; /* preferred NUMA node */
379 
380 	/* Quicker fallback, avoid locks when ring is empty */
381 	if (__ptr_ring_empty(r)) {
382 		alloc_stat_inc(pool, empty);
383 		return 0;
384 	}
385 
386 	/* Softirq guarantees the CPU, and thus the NUMA node, is stable. This
387 	 * assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
388 	 */
389 #ifdef CONFIG_NUMA
390 	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
391 #else
392 	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
393 	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
394 #endif
395 
396 	/* Refill the alloc array, but only if the NUMA node matches */
397 	do {
398 		netmem = (__force netmem_ref)__ptr_ring_consume(r);
399 		if (unlikely(!netmem))
400 			break;
401 
402 		if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
403 			pool->alloc.cache[pool->alloc.count++] = netmem;
404 		} else {
405 			/* NUMA mismatch:
406 			 * (1) release 1 page to the page-allocator and
407 			 * (2) break out to fall through to alloc_pages_node.
408 			 * This limits stress on the page buddy allocator.
409 			 */
410 			page_pool_return_page(pool, netmem);
411 			alloc_stat_inc(pool, waive);
412 			netmem = 0;
413 			break;
414 		}
415 	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
416 
417 	/* Return last page */
418 	if (likely(pool->alloc.count > 0)) {
419 		netmem = pool->alloc.cache[--pool->alloc.count];
420 		alloc_stat_inc(pool, refill);
421 	}
422 
423 	return netmem;
424 }
425 
426 /* fast path */
427 static netmem_ref __page_pool_get_cached(struct page_pool *pool)
428 {
429 	netmem_ref netmem;
430 
431 	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
432 	if (likely(pool->alloc.count)) {
433 		/* Fast-path */
434 		netmem = pool->alloc.cache[--pool->alloc.count];
435 		alloc_stat_inc(pool, fast);
436 	} else {
437 		netmem = page_pool_refill_alloc_cache(pool);
438 	}
439 
440 	return netmem;
441 }
442 
443 static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
444 					    netmem_ref netmem,
445 					    u32 dma_sync_size)
446 {
447 #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
448 	dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
449 
450 	dma_sync_size = min(dma_sync_size, pool->p.max_len);
451 	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
452 				     dma_sync_size, pool->p.dma_dir);
453 #endif
454 }
455 
456 static __always_inline void
457 page_pool_dma_sync_for_device(const struct page_pool *pool,
458 			      netmem_ref netmem,
459 			      u32 dma_sync_size)
460 {
461 	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
462 		__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
463 }
464 
465 static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
466 {
467 	dma_addr_t dma;
468 
469 	/* Setup DMA mapping: use the 'struct page' area for storing the DMA
470 	 * address, since dma_addr_t can be either 32 or 64 bits and does not
471 	 * always fit into the page private data (e.g. 32-bit CPU, 64-bit DMA).
472 	 * This mapping is kept for the lifetime of the page, until it leaves the pool.
473 	 */
474 	dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
475 				 (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
476 				 DMA_ATTR_SKIP_CPU_SYNC |
477 					 DMA_ATTR_WEAK_ORDERING);
478 	if (dma_mapping_error(pool->p.dev, dma))
479 		return false;
480 
481 	if (page_pool_set_dma_addr_netmem(netmem, dma))
482 		goto unmap_failed;
483 
484 	page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
485 
486 	return true;
487 
488 unmap_failed:
489 	WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
490 	dma_unmap_page_attrs(pool->p.dev, dma,
491 			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
492 			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
493 	return false;
494 }
495 
496 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
497 						 gfp_t gfp)
498 {
499 	struct page *page;
500 
501 	gfp |= __GFP_COMP;
502 	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
503 	if (unlikely(!page))
504 		return NULL;
505 
506 	if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
507 		put_page(page);
508 		return NULL;
509 	}
510 
511 	alloc_stat_inc(pool, slow_high_order);
512 	page_pool_set_pp_info(pool, page_to_netmem(page));
513 
514 	/* Track how many pages are held 'in-flight' */
515 	pool->pages_state_hold_cnt++;
516 	trace_page_pool_state_hold(pool, page_to_netmem(page),
517 				   pool->pages_state_hold_cnt);
518 	return page;
519 }
520 
521 /* slow path */
522 static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
523 							gfp_t gfp)
524 {
525 	const int bulk = PP_ALLOC_CACHE_REFILL;
526 	unsigned int pp_order = pool->p.order;
527 	bool dma_map = pool->dma_map;
528 	netmem_ref netmem;
529 	int i, nr_pages;
530 
531 	/* Don't support bulk alloc for high-order pages */
532 	if (unlikely(pp_order))
533 		return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
534 
535 	/* Unnecessary as alloc cache is empty, but guarantees zero count */
536 	if (unlikely(pool->alloc.count > 0))
537 		return pool->alloc.cache[--pool->alloc.count];
538 
539 	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
540 	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
541 
542 	nr_pages = alloc_pages_bulk_array_node(gfp,
543 					       pool->p.nid, bulk,
544 					       (struct page **)pool->alloc.cache);
545 	if (unlikely(!nr_pages))
546 		return 0;
547 
548 	/* Pages have been filled into the alloc.cache array, but the count is
549 	 * zero and the page elements have not yet been (possibly) DMA mapped.
550 	 */
551 	for (i = 0; i < nr_pages; i++) {
552 		netmem = pool->alloc.cache[i];
553 		if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
554 			put_page(netmem_to_page(netmem));
555 			continue;
556 		}
557 
558 		page_pool_set_pp_info(pool, netmem);
559 		pool->alloc.cache[pool->alloc.count++] = netmem;
560 		/* Track how many pages are held 'in-flight' */
561 		pool->pages_state_hold_cnt++;
562 		trace_page_pool_state_hold(pool, netmem,
563 					   pool->pages_state_hold_cnt);
564 	}
565 
566 	/* Return last page */
567 	if (likely(pool->alloc.count > 0)) {
568 		netmem = pool->alloc.cache[--pool->alloc.count];
569 		alloc_stat_inc(pool, slow);
570 	} else {
571 		netmem = 0;
572 	}
573 
574 	/* A page that was just allocated should/must have refcnt 1. */
575 	return netmem;
576 }
577 
578 /* page_pool is used as a replacement for alloc_pages() API calls, but the
579  * caller must provide the synchronization guarantee for the allocation side.
580  */
581 netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp)
582 {
583 	netmem_ref netmem;
584 
585 	/* Fast-path: Get a page from cache */
586 	netmem = __page_pool_get_cached(pool);
587 	if (netmem)
588 		return netmem;
589 
590 	/* Slow-path: cache empty, do real allocation */
591 	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
592 		netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp);
593 	else
594 		netmem = __page_pool_alloc_pages_slow(pool, gfp);
595 	return netmem;
596 }
597 EXPORT_SYMBOL(page_pool_alloc_netmems);
598 
599 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
600 {
601 	return netmem_to_page(page_pool_alloc_netmems(pool, gfp));
602 }
603 EXPORT_SYMBOL(page_pool_alloc_pages);
604 ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);
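/* Example usage (illustrative sketch): refilling a hypothetical driver RX
 * ring from its NAPI poll loop. Assumes the pool was created with
 * PP_FLAG_DMA_MAP; the rxq member names are made up for illustration.
 *
 *	static int my_drv_refill_rx(struct my_rx_ring *rxq)
 *	{
 *		struct page *page;
 *		dma_addr_t dma;
 *
 *		page = page_pool_alloc_pages(rxq->page_pool, GFP_ATOMIC);
 *		if (!page)
 *			return -ENOMEM;
 *
 *		dma = page_pool_get_dma_addr(page);
 *		my_drv_post_rx_buffer(rxq, dma + rxq->rx_offset);
 *		return 0;
 *	}
 *
 * page_pool_dev_alloc_pages() in page_pool/helpers.h is a convenience
 * wrapper for this common atomic-allocation case.
 */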
605 
606 /* Calculate distance between two u32 values, valid if distance is below 2^(31)
607  *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
608  */
609 #define _distance(a, b)	(s32)((a) - (b))
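/* Worked example of the wrap-safe arithmetic: with hold_cnt = 0x00000002 and
 * release_cnt = 0xfffffffe (hold_cnt has wrapped past U32_MAX),
 * _distance(0x00000002, 0xfffffffe) = (s32)0x00000004 = 4 pages inflight,
 * which is correct despite the wraparound.
 */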
610 
611 s32 page_pool_inflight(const struct page_pool *pool, bool strict)
612 {
613 	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
614 	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
615 	s32 inflight;
616 
617 	inflight = _distance(hold_cnt, release_cnt);
618 
619 	if (strict) {
620 		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
621 		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
622 		     inflight);
623 	} else {
624 		inflight = max(0, inflight);
625 	}
626 
627 	return inflight;
628 }
629 
630 void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
631 {
632 	netmem_set_pp(netmem, pool);
633 	netmem_or_pp_magic(netmem, PP_SIGNATURE);
634 
635 	/* Ensuring all pages have been split into one fragment initially:
636 	 * page_pool_set_pp_info() is only called once for every page when it
637 	 * is allocated from the page allocator and page_pool_fragment_page()
638 	 * is dirtying the same cache line as the page->pp_magic above, so
639 	 * the overhead is negligible.
640 	 */
641 	page_pool_fragment_netmem(netmem, 1);
642 	if (pool->has_init_callback)
643 		pool->slow.init_callback(netmem, pool->slow.init_arg);
644 }
645 
646 void page_pool_clear_pp_info(netmem_ref netmem)
647 {
648 	netmem_clear_pp_magic(netmem);
649 	netmem_set_pp(netmem, NULL);
650 }
651 
652 static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
653 							 netmem_ref netmem)
654 {
655 	dma_addr_t dma;
656 
657 	if (!pool->dma_map)
658 		/* Always account for inflight pages, even if we didn't
659 		 * map them
660 		 */
661 		return;
662 
663 	dma = page_pool_get_dma_addr_netmem(netmem);
664 
665 	/* When page is unmapped, it cannot be returned to our pool */
666 	dma_unmap_page_attrs(pool->p.dev, dma,
667 			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
668 			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
669 	page_pool_set_dma_addr_netmem(netmem, 0);
670 }
671 
672 /* Disconnects a page from a page_pool.  API users can have a need
673  * to disconnect a page from its page_pool, to allow it to be used as
674  * a regular page (that will eventually be returned to the normal
675  * page-allocator via put_page()).
676  */
677 void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
678 {
679 	int count;
680 	bool put;
681 
682 	put = true;
683 	if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
684 		put = mp_dmabuf_devmem_release_page(pool, netmem);
685 	else
686 		__page_pool_release_page_dma(pool, netmem);
687 
688 	/* This may be the last page returned, releasing the pool, so
689 	 * it is not safe to reference pool afterwards.
690 	 */
691 	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
692 	trace_page_pool_state_release(pool, netmem, count);
693 
694 	if (put) {
695 		page_pool_clear_pp_info(netmem);
696 		put_page(netmem_to_page(netmem));
697 	}
698 	/* An optimization would be to call __free_pages(page, pool->p.order)
699 	 * knowing page is not part of page-cache (thus avoiding a
700 	 * __page_cache_release() call).
701 	 */
702 }
703 
704 static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
705 {
706 	int ret;
707 	/* BH protection not needed if current is softirq */
708 	if (in_softirq())
709 		ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
710 	else
711 		ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);
712 
713 	if (!ret) {
714 		recycle_stat_inc(pool, ring);
715 		return true;
716 	}
717 
718 	return false;
719 }
720 
721 /* Only allow direct recycling in special circumstances, into the
722  * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
723  *
724  * Caller must provide appropriate safe context.
725  */
726 static bool page_pool_recycle_in_cache(netmem_ref netmem,
727 				       struct page_pool *pool)
728 {
729 	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
730 		recycle_stat_inc(pool, cache_full);
731 		return false;
732 	}
733 
734 	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
735 	pool->alloc.cache[pool->alloc.count++] = netmem;
736 	recycle_stat_inc(pool, cached);
737 	return true;
738 }
739 
740 static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
741 {
742 	return netmem_is_net_iov(netmem) ||
743 	       (page_ref_count(netmem_to_page(netmem)) == 1 &&
744 		!page_is_pfmemalloc(netmem_to_page(netmem)));
745 }
746 
747 /* If the page refcnt == 1, this will try to recycle the page.
748  * If pool->dma_sync is set, we'll try to sync the DMA area for
749  * the configured size min(dma_sync_size, pool->max_len).
750  * If the page refcnt != 1, then the page will be returned to memory
751  * subsystem.
752  */
753 static __always_inline netmem_ref
754 __page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
755 		     unsigned int dma_sync_size, bool allow_direct)
756 {
757 	lockdep_assert_no_hardirq();
758 
759 	/* This allocator is optimized for the XDP mode that uses
760 	 * one-frame-per-page, but has fallbacks that act like the
761 	 * regular page allocator APIs.
762 	 *
763 	 * refcnt == 1 means page_pool owns the page and can recycle it.
764 	 *
765 	 * A page is NOT reusable when it was allocated while the system
766 	 * was under memory pressure (page_is_pfmemalloc()).
767 	 */
768 	if (likely(__page_pool_page_can_be_recycled(netmem))) {
769 		/* Read barrier done in page_ref_count / READ_ONCE */
770 
771 		page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
772 
773 		if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
774 			return 0;
775 
776 		/* Page found as candidate for recycling */
777 		return netmem;
778 	}
779 
780 	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
781 	 *
782 	 * Many drivers split up the page into fragments, and some
783 	 * want to keep doing this to save memory and do refcnt based
784 	 * recycling. Support this use case too, to ease drivers
785 	 * switching between XDP/non-XDP.
786 	 *
787 	 * In case page_pool maintains the DMA mapping, the API user must
788 	 * call page_pool_put_page() once.  In this elevated refcnt
789 	 * case, the DMA mapping is unmapped/released, as the driver is
790 	 * likely doing refcnt based recycle tricks, meaning another
791 	 * process will be invoking put_page().
792 	 */
793 	recycle_stat_inc(pool, released_refcnt);
794 	page_pool_return_page(pool, netmem);
795 
796 	return 0;
797 }
798 
799 static bool page_pool_napi_local(const struct page_pool *pool)
800 {
801 	const struct napi_struct *napi;
802 	u32 cpuid;
803 
804 	if (unlikely(!in_softirq()))
805 		return false;
806 
807 	/* Allow direct recycle if we have reasons to believe that we are
808 	 * in the same context in which the consumer would run, so there's
809 	 * no possible race.
810 	 * __page_pool_put_page() makes sure we're not in hardirq context
811 	 * and interrupts are enabled prior to accessing the cache.
812 	 */
813 	cpuid = smp_processor_id();
814 	if (READ_ONCE(pool->cpuid) == cpuid)
815 		return true;
816 
817 	napi = READ_ONCE(pool->p.napi);
818 
819 	return napi && READ_ONCE(napi->list_owner) == cpuid;
820 }
821 
822 void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
823 				  unsigned int dma_sync_size, bool allow_direct)
824 {
825 	if (!allow_direct)
826 		allow_direct = page_pool_napi_local(pool);
827 
828 	netmem =
829 		__page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
830 	if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
831 		/* Cache full, fallback to free pages */
832 		recycle_stat_inc(pool, ring_full);
833 		page_pool_return_page(pool, netmem);
834 	}
835 }
836 EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
837 
838 void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
839 				unsigned int dma_sync_size, bool allow_direct)
840 {
841 	page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
842 				     allow_direct);
843 }
844 EXPORT_SYMBOL(page_pool_put_unrefed_page);
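/* Example usage (illustrative sketch): returning a buffer from a hypothetical
 * XDP_DROP fast path that runs in the pool's own NAPI context, where direct
 * recycling into the lockless alloc cache is permitted.
 *
 *	case XDP_DROP:
 *		page_pool_put_full_page(rxq->page_pool, page, true);
 *		break;
 *
 * page_pool_put_full_page() (see page_pool/helpers.h) syncs the full
 * pool->p.max_len area when DMA sync is enabled. Passing allow_direct=true
 * is only legal from the NAPI/softirq context that owns the pool; other
 * contexts should pass false and let page_pool_napi_local() decide.
 */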
845 
846 static void page_pool_recycle_ring_bulk(struct page_pool *pool,
847 					netmem_ref *bulk,
848 					u32 bulk_len)
849 {
850 	bool in_softirq;
851 	u32 i;
852 
853 	/* Bulk produce into ptr_ring page_pool cache */
854 	in_softirq = page_pool_producer_lock(pool);
855 
856 	for (i = 0; i < bulk_len; i++) {
857 		if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) {
858 			/* ring full */
859 			recycle_stat_inc(pool, ring_full);
860 			break;
861 		}
862 	}
863 
864 	page_pool_producer_unlock(pool, in_softirq);
865 	recycle_stat_add(pool, ring, i);
866 
867 	/* Hopefully all pages were returned into ptr_ring */
868 	if (likely(i == bulk_len))
869 		return;
870 
871 	/*
872 	 * ptr_ring cache is full, free remaining pages outside producer lock
873 	 * since put_page() with refcnt == 1 can be an expensive operation.
874 	 */
875 	for (; i < bulk_len; i++)
876 		page_pool_return_page(pool, bulk[i]);
877 }
878 
879 /**
880  * page_pool_put_netmem_bulk() - release references on multiple netmems
881  * @data:	array holding netmem references
882  * @count:	number of entries in @data
883  *
884  * Tries to refill a number of netmems into the ptr_ring cache while
885  * holding the ptr_ring producer lock. If the ptr_ring is full, the
886  * leftover netmems are released to the memory provider.
887  * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx
888  * completion loop for the XDP_REDIRECT use case.
889  *
890  * Please note the caller must not use the data area after running
891  * page_pool_put_netmem_bulk(), as this function overwrites it.
892  */
893 void page_pool_put_netmem_bulk(netmem_ref *data, u32 count)
894 {
895 	u32 bulk_len = 0;
896 
897 	for (u32 i = 0; i < count; i++) {
898 		netmem_ref netmem = netmem_compound_head(data[i]);
899 
900 		if (page_pool_unref_and_test(netmem))
901 			data[bulk_len++] = netmem;
902 	}
903 
904 	count = bulk_len;
905 	while (count) {
906 		netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
907 		struct page_pool *pool = NULL;
908 		bool allow_direct;
909 		u32 foreign = 0;
910 
911 		bulk_len = 0;
912 
913 		for (u32 i = 0; i < count; i++) {
914 			struct page_pool *netmem_pp;
915 			netmem_ref netmem = data[i];
916 
917 			netmem_pp = netmem_get_pp(netmem);
918 			if (unlikely(!pool)) {
919 				pool = netmem_pp;
920 				allow_direct = page_pool_napi_local(pool);
921 			} else if (netmem_pp != pool) {
922 				/*
923 				 * If the netmem belongs to a different
924 				 * page_pool, save it for another round.
925 				 */
926 				data[foreign++] = netmem;
927 				continue;
928 			}
929 
930 			netmem = __page_pool_put_page(pool, netmem, -1,
931 						      allow_direct);
932 			/* Approved for bulk recycling in ptr_ring cache */
933 			if (netmem)
934 				bulk[bulk_len++] = netmem;
935 		}
936 
937 		if (bulk_len)
938 			page_pool_recycle_ring_bulk(pool, bulk, bulk_len);
939 
940 		count = foreign;
941 	}
942 }
943 EXPORT_SYMBOL(page_pool_put_netmem_bulk);
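/* Example usage (illustrative sketch): a hypothetical XDP_REDIRECT TX
 * completion loop batching returns instead of freeing one page at a time.
 * The txq helpers are assumptions for illustration.
 *
 *	netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
 *	u32 n = 0;
 *
 *	while (my_drv_tx_desc_done(txq)) {
 *		bulk[n++] = page_to_netmem(my_drv_tx_desc_page(txq));
 *		if (n == XDP_BULK_QUEUE_SIZE) {
 *			page_pool_put_netmem_bulk(bulk, n);
 *			n = 0;
 *		}
 *	}
 *	if (n)
 *		page_pool_put_netmem_bulk(bulk, n);
 *
 * In-tree callers typically reach this via xdp_return_frame_bulk(), which
 * builds such a batch internally.
 */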
944 
945 static netmem_ref page_pool_drain_frag(struct page_pool *pool,
946 				       netmem_ref netmem)
947 {
948 	long drain_count = BIAS_MAX - pool->frag_users;
949 
950 	/* Some user is still using the page frag */
951 	if (likely(page_pool_unref_netmem(netmem, drain_count)))
952 		return 0;
953 
954 	if (__page_pool_page_can_be_recycled(netmem)) {
955 		page_pool_dma_sync_for_device(pool, netmem, -1);
956 		return netmem;
957 	}
958 
959 	page_pool_return_page(pool, netmem);
960 	return 0;
961 }
962 
963 static void page_pool_free_frag(struct page_pool *pool)
964 {
965 	long drain_count = BIAS_MAX - pool->frag_users;
966 	netmem_ref netmem = pool->frag_page;
967 
968 	pool->frag_page = 0;
969 
970 	if (!netmem || page_pool_unref_netmem(netmem, drain_count))
971 		return;
972 
973 	page_pool_return_page(pool, netmem);
974 }
975 
976 netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
977 				       unsigned int *offset, unsigned int size,
978 				       gfp_t gfp)
979 {
980 	unsigned int max_size = PAGE_SIZE << pool->p.order;
981 	netmem_ref netmem = pool->frag_page;
982 
983 	if (WARN_ON(size > max_size))
984 		return 0;
985 
986 	size = ALIGN(size, dma_get_cache_alignment());
987 	*offset = pool->frag_offset;
988 
989 	if (netmem && *offset + size > max_size) {
990 		netmem = page_pool_drain_frag(pool, netmem);
991 		if (netmem) {
992 			recycle_stat_inc(pool, cached);
993 			alloc_stat_inc(pool, fast);
994 			goto frag_reset;
995 		}
996 	}
997 
998 	if (!netmem) {
999 		netmem = page_pool_alloc_netmems(pool, gfp);
1000 		if (unlikely(!netmem)) {
1001 			pool->frag_page = 0;
1002 			return 0;
1003 		}
1004 
1005 		pool->frag_page = netmem;
1006 
1007 frag_reset:
1008 		pool->frag_users = 1;
1009 		*offset = 0;
1010 		pool->frag_offset = size;
1011 		page_pool_fragment_netmem(netmem, BIAS_MAX);
1012 		return netmem;
1013 	}
1014 
1015 	pool->frag_users++;
1016 	pool->frag_offset = *offset + size;
1017 	return netmem;
1018 }
1019 EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
1020 
1021 struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
1022 				  unsigned int size, gfp_t gfp)
1023 {
1024 	return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
1025 							  gfp));
1026 }
1027 EXPORT_SYMBOL(page_pool_alloc_frag);
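/* Example usage (illustrative sketch): carving small receive buffers out of
 * shared pool pages for a hypothetical driver whose buffers are much smaller
 * than a page. The buf_len member is an assumption for illustration.
 *
 *	unsigned int offset;
 *	dma_addr_t dma;
 *	struct page *page;
 *
 *	page = page_pool_alloc_frag(rxq->page_pool, &offset,
 *				    rxq->buf_len, GFP_ATOMIC);
 *	if (!page)
 *		return -ENOMEM;
 *	dma = page_pool_get_dma_addr(page) + offset;
 *
 * Each fragment holds a share of the page via the BIAS_MAX fragment count
 * set above, and is released through the usual page_pool_put_*() calls.
 */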
1028 
1029 static void page_pool_empty_ring(struct page_pool *pool)
1030 {
1031 	netmem_ref netmem;
1032 
1033 	/* Empty recycle ring */
1034 	while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
1035 		/* Verify the refcnt invariant of cached pages */
1036 		if (!(netmem_ref_count(netmem) == 1))
1037 			pr_crit("%s() page_pool refcnt %d violation\n",
1038 				__func__, netmem_ref_count(netmem));
1039 
1040 		page_pool_return_page(pool, netmem);
1041 	}
1042 }
1043 
1044 static void __page_pool_destroy(struct page_pool *pool)
1045 {
1046 	if (pool->disconnect)
1047 		pool->disconnect(pool);
1048 
1049 	page_pool_unlist(pool);
1050 	page_pool_uninit(pool);
1051 
1052 	if (pool->mp_priv) {
1053 		mp_dmabuf_devmem_destroy(pool);
1054 		static_branch_dec(&page_pool_mem_providers);
1055 	}
1056 
1057 	kfree(pool);
1058 }
1059 
1060 static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
1061 {
1062 	netmem_ref netmem;
1063 
1064 	if (pool->destroy_cnt)
1065 		return;
1066 
1067 	/* Empty the alloc cache; assume the caller made sure the pool is
1068 	 * no longer in use, and that page_pool_alloc_pages() cannot be
1069 	 * called concurrently.
1070 	 */
1071 	while (pool->alloc.count) {
1072 		netmem = pool->alloc.cache[--pool->alloc.count];
1073 		page_pool_return_page(pool, netmem);
1074 	}
1075 }
1076 
1077 static void page_pool_scrub(struct page_pool *pool)
1078 {
1079 	page_pool_empty_alloc_cache_once(pool);
1080 	pool->destroy_cnt++;
1081 
1082 	/* No more consumers should exist, but producers could still
1083 	 * be in-flight.
1084 	 */
1085 	page_pool_empty_ring(pool);
1086 }
1087 
1088 static int page_pool_release(struct page_pool *pool)
1089 {
1090 	int inflight;
1091 
1092 	page_pool_scrub(pool);
1093 	inflight = page_pool_inflight(pool, true);
1094 	if (!inflight)
1095 		__page_pool_destroy(pool);
1096 
1097 	return inflight;
1098 }
1099 
1100 static void page_pool_release_retry(struct work_struct *wq)
1101 {
1102 	struct delayed_work *dwq = to_delayed_work(wq);
1103 	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
1104 	void *netdev;
1105 	int inflight;
1106 
1107 	inflight = page_pool_release(pool);
1108 	if (!inflight)
1109 		return;
1110 
1111 	/* Periodic warning for page pools the user can't see */
1112 	netdev = READ_ONCE(pool->slow.netdev);
1113 	if (time_after_eq(jiffies, pool->defer_warn) &&
1114 	    (!netdev || netdev == NET_PTR_POISON)) {
1115 		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
1116 
1117 		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
1118 			__func__, pool->user.id, inflight, sec);
1119 		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
1120 	}
1121 
1122 	/* Still not ready to be disconnected, retry later */
1123 	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1124 }
1125 
1126 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
1127 			   const struct xdp_mem_info *mem)
1128 {
1129 	refcount_inc(&pool->user_cnt);
1130 	pool->disconnect = disconnect;
1131 	pool->xdp_mem_id = mem->id;
1132 }
1133 
1134 void page_pool_disable_direct_recycling(struct page_pool *pool)
1135 {
1136 	/* Disable direct recycling based on pool->cpuid.
1137 	 * Paired with READ_ONCE() in page_pool_napi_local().
1138 	 */
1139 	WRITE_ONCE(pool->cpuid, -1);
1140 
1141 	if (!pool->p.napi)
1142 		return;
1143 
1144 	/* To avoid races with recycling, and to avoid extra barriers, make sure
1145 	 * the pool and NAPI are unlinked while NAPI is disabled.
1146 	 */
1147 	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
1148 	WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);
1149 
1150 	WRITE_ONCE(pool->p.napi, NULL);
1151 }
1152 EXPORT_SYMBOL(page_pool_disable_direct_recycling);
1153 
1154 void page_pool_destroy(struct page_pool *pool)
1155 {
1156 	if (!pool)
1157 		return;
1158 
1159 	if (!page_pool_put(pool))
1160 		return;
1161 
1162 	page_pool_disable_direct_recycling(pool);
1163 	page_pool_free_frag(pool);
1164 
1165 	if (!page_pool_release(pool))
1166 		return;
1167 
1168 	page_pool_detached(pool);
1169 	pool->defer_start = jiffies;
1170 	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
1171 
1172 	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
1173 	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1174 }
1175 EXPORT_SYMBOL(page_pool_destroy);
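/* Example teardown ordering (illustrative sketch) for a hypothetical driver
 * shutting down an RX ring that used both XDP and a page_pool:
 *
 *	napi_disable(&rxq->napi);
 *	my_drv_free_rx_buffers(rxq);        (returns each page to the pool)
 *	xdp_rxq_info_unreg(&rxq->xdp_rxq);  (drops the XDP memory model link)
 *	page_pool_destroy(rxq->page_pool);
 *
 * Buffers still held elsewhere (e.g. in socket queues) keep the pool alive:
 * page_pool_destroy() defers the final free to page_pool_release_retry()
 * until all inflight pages have been returned. Passing a NULL pool is safe.
 */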
1176 
1177 /* Caller must provide appropriate safe context, e.g. NAPI. */
1178 void page_pool_update_nid(struct page_pool *pool, int new_nid)
1179 {
1180 	netmem_ref netmem;
1181 
1182 	trace_page_pool_update_nid(pool, new_nid);
1183 	pool->p.nid = new_nid;
1184 
1185 	/* Flush pool alloc cache, as refill will check NUMA node */
1186 	while (pool->alloc.count) {
1187 		netmem = pool->alloc.cache[--pool->alloc.count];
1188 		page_pool_return_page(pool, netmem);
1189 	}
1190 }
1191 EXPORT_SYMBOL(page_pool_update_nid);
1192