xref: /linux/net/core/page_pool.c (revision 8ab79ed50cf10f338465c296012500de1081646f)
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * page_pool.c
4  *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
5  *	Copyright (C) 2016 Red Hat, Inc.
6  */
7 
8 #include <linux/error-injection.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/slab.h>
12 #include <linux/device.h>
13 
14 #include <net/page_pool/helpers.h>
15 #include <net/xdp.h>
16 
17 #include <linux/dma-direction.h>
18 #include <linux/dma-mapping.h>
19 #include <linux/page-flags.h>
20 #include <linux/mm.h> /* for put_page() */
21 #include <linux/poison.h>
22 #include <linux/ethtool.h>
23 #include <linux/netdevice.h>
24 
25 #include <trace/events/page_pool.h>
26 
27 #include "netmem_priv.h"
28 #include "page_pool_priv.h"
29 
30 DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
31 
32 #define DEFER_TIME (msecs_to_jiffies(1000))
33 #define DEFER_WARN_INTERVAL (60 * HZ)
34 
35 #define BIAS_MAX	(LONG_MAX >> 1)
36 
37 #ifdef CONFIG_PAGE_POOL_STATS
38 static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
39 
40 /* alloc_stat_inc is intended to be used in softirq context */
41 #define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
42 /* recycle_stat_inc is safe to use when preemption is possible. */
43 #define recycle_stat_inc(pool, __stat)							\
44 	do {										\
45 		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
46 		this_cpu_inc(s->__stat);						\
47 	} while (0)
48 
49 #define recycle_stat_add(pool, __stat, val)						\
50 	do {										\
51 		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
52 		this_cpu_add(s->__stat, val);						\
53 	} while (0)
54 
55 static const char pp_stats[][ETH_GSTRING_LEN] = {
56 	"rx_pp_alloc_fast",
57 	"rx_pp_alloc_slow",
58 	"rx_pp_alloc_slow_ho",
59 	"rx_pp_alloc_empty",
60 	"rx_pp_alloc_refill",
61 	"rx_pp_alloc_waive",
62 	"rx_pp_recycle_cached",
63 	"rx_pp_recycle_cache_full",
64 	"rx_pp_recycle_ring",
65 	"rx_pp_recycle_ring_full",
66 	"rx_pp_recycle_released_ref",
67 };
68 
69 /**
70  * page_pool_get_stats() - fetch page pool stats
71  * @pool:	pool to read statistics from
72  * @stats:	struct page_pool_stats to fill in
73  *
74  * Retrieve statistics about the page_pool. This API is only available
75  * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
76  * A pointer to a caller-allocated struct page_pool_stats structure
77  * is passed to this API and is filled in. The caller can then report
78  * those stats to the user (perhaps via ethtool, debugfs, etc.).
79  */
80 bool page_pool_get_stats(const struct page_pool *pool,
81 			 struct page_pool_stats *stats)
82 {
83 	int cpu = 0;
84 
85 	if (!stats)
86 		return false;
87 
88 	/* The caller is responsible for initializing stats. */
89 	stats->alloc_stats.fast += pool->alloc_stats.fast;
90 	stats->alloc_stats.slow += pool->alloc_stats.slow;
91 	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
92 	stats->alloc_stats.empty += pool->alloc_stats.empty;
93 	stats->alloc_stats.refill += pool->alloc_stats.refill;
94 	stats->alloc_stats.waive += pool->alloc_stats.waive;
95 
96 	for_each_possible_cpu(cpu) {
97 		const struct page_pool_recycle_stats *pcpu =
98 			per_cpu_ptr(pool->recycle_stats, cpu);
99 
100 		stats->recycle_stats.cached += pcpu->cached;
101 		stats->recycle_stats.cache_full += pcpu->cache_full;
102 		stats->recycle_stats.ring += pcpu->ring;
103 		stats->recycle_stats.ring_full += pcpu->ring_full;
104 		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
105 	}
106 
107 	return true;
108 }
109 EXPORT_SYMBOL(page_pool_get_stats);
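
/* Usage sketch (illustrative only, not part of this file): a driver with one
 * pool per RX queue might aggregate stats like this; "priv", "rxq" and
 * "num_rx_queues" are hypothetical driver-side names.
 *
 *	struct page_pool_stats stats = { };	// caller must zero-initialize
 *	int i;
 *
 *	for (i = 0; i < priv->num_rx_queues; i++)
 *		page_pool_get_stats(priv->rxq[i].page_pool, &stats);
 *
 *	// stats now holds the sum over all queues and can be reported to
 *	// user space, e.g. via the ethtool helpers below.
 */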
110 
111 u8 *page_pool_ethtool_stats_get_strings(u8 *data)
112 {
113 	int i;
114 
115 	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
116 		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
117 		data += ETH_GSTRING_LEN;
118 	}
119 
120 	return data;
121 }
122 EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
123 
124 int page_pool_ethtool_stats_get_count(void)
125 {
126 	return ARRAY_SIZE(pp_stats);
127 }
128 EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
129 
130 u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
131 {
132 	const struct page_pool_stats *pool_stats = stats;
133 
134 	*data++ = pool_stats->alloc_stats.fast;
135 	*data++ = pool_stats->alloc_stats.slow;
136 	*data++ = pool_stats->alloc_stats.slow_high_order;
137 	*data++ = pool_stats->alloc_stats.empty;
138 	*data++ = pool_stats->alloc_stats.refill;
139 	*data++ = pool_stats->alloc_stats.waive;
140 	*data++ = pool_stats->recycle_stats.cached;
141 	*data++ = pool_stats->recycle_stats.cache_full;
142 	*data++ = pool_stats->recycle_stats.ring;
143 	*data++ = pool_stats->recycle_stats.ring_full;
144 	*data++ = pool_stats->recycle_stats.released_refcnt;
145 
146 	return data;
147 }
148 EXPORT_SYMBOL(page_pool_ethtool_stats_get);
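
/* Usage sketch (illustrative only): the three ethtool helpers above are
 * typically wired into a driver's ethtool ops; the "mydrv" names and the
 * per-queue aggregation helper are hypothetical.
 *
 *	static void mydrv_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static int mydrv_get_sset_count(struct net_device *dev, int sset)
 *	{
 *		return sset == ETH_SS_STATS ?
 *		       page_pool_ethtool_stats_get_count() : -EOPNOTSUPP;
 *	}
 *
 *	static void mydrv_get_ethtool_stats(struct net_device *dev,
 *					    struct ethtool_stats *e, u64 *data)
 *	{
 *		struct page_pool_stats stats = { };
 *
 *		mydrv_collect_pp_stats(dev, &stats);	// sums per-queue pools
 *		data = page_pool_ethtool_stats_get(data, &stats);
 *	}
 */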
149 
150 #else
151 #define alloc_stat_inc(pool, __stat)
152 #define recycle_stat_inc(pool, __stat)
153 #define recycle_stat_add(pool, __stat, val)
154 #endif
155 
156 static bool page_pool_producer_lock(struct page_pool *pool)
157 	__acquires(&pool->ring.producer_lock)
158 {
159 	bool in_softirq = in_softirq();
160 
161 	if (in_softirq)
162 		spin_lock(&pool->ring.producer_lock);
163 	else
164 		spin_lock_bh(&pool->ring.producer_lock);
165 
166 	return in_softirq;
167 }
168 
169 static void page_pool_producer_unlock(struct page_pool *pool,
170 				      bool in_softirq)
171 	__releases(&pool->ring.producer_lock)
172 {
173 	if (in_softirq)
174 		spin_unlock(&pool->ring.producer_lock);
175 	else
176 		spin_unlock_bh(&pool->ring.producer_lock);
177 }
178 
179 static void page_pool_struct_check(void)
180 {
181 	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
182 	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
183 	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
184 	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
185 				    PAGE_POOL_FRAG_GROUP_ALIGN);
186 }
187 
188 static int page_pool_init(struct page_pool *pool,
189 			  const struct page_pool_params *params,
190 			  int cpuid)
191 {
192 	unsigned int ring_qsize = 1024; /* Default */
193 
194 	page_pool_struct_check();
195 
196 	memcpy(&pool->p, &params->fast, sizeof(pool->p));
197 	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
198 
199 	pool->cpuid = cpuid;
200 
201 	/* Validate only known flags were used */
202 	if (pool->slow.flags & ~PP_FLAG_ALL)
203 		return -EINVAL;
204 
205 	if (pool->p.pool_size)
206 		ring_qsize = pool->p.pool_size;
207 
208 	/* Sanity limit mem that can be pinned down */
209 	if (ring_qsize > 32768)
210 		return -E2BIG;
211 
212 	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
213 	 * DMA_BIDIRECTIONAL additionally allows the page to be used for DMA
214 	 * transmit, which is the XDP_TX use-case.
215 	 */
216 	if (pool->slow.flags & PP_FLAG_DMA_MAP) {
217 		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
218 		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
219 			return -EINVAL;
220 
221 		pool->dma_map = true;
222 	}
223 
224 	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
225 		/* In order to request DMA-sync-for-device the page
226 		 * needs to be mapped
227 		 */
228 		if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
229 			return -EINVAL;
230 
231 		if (!pool->p.max_len)
232 			return -EINVAL;
233 
234 		pool->dma_sync = true;
235 
236 		/* pool->p.offset has to be set according to the address
237 		 * offset used by the DMA engine to start copying rx data
238 		 */
239 	}
240 
241 	pool->has_init_callback = !!pool->slow.init_callback;
242 
243 #ifdef CONFIG_PAGE_POOL_STATS
244 	if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
245 		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
246 		if (!pool->recycle_stats)
247 			return -ENOMEM;
248 	} else {
249 		/* For the system page pool instance we use a single stats object
250 		 * instead of allocating a separate percpu variable for each
251 		 * (also percpu) page pool instance.
252 		 */
253 		pool->recycle_stats = &pp_system_recycle_stats;
254 		pool->system = true;
255 	}
256 #endif
257 
258 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
259 #ifdef CONFIG_PAGE_POOL_STATS
260 		if (!pool->system)
261 			free_percpu(pool->recycle_stats);
262 #endif
263 		return -ENOMEM;
264 	}
265 
266 	atomic_set(&pool->pages_state_release_cnt, 0);
267 
268 	/* The driver calling page_pool_create() must also call page_pool_destroy() */
269 	refcount_set(&pool->user_cnt, 1);
270 
271 	if (pool->dma_map)
272 		get_device(pool->p.dev);
273 
274 	return 0;
275 }
276 
277 static void page_pool_uninit(struct page_pool *pool)
278 {
279 	ptr_ring_cleanup(&pool->ring, NULL);
280 
281 	if (pool->dma_map)
282 		put_device(pool->p.dev);
283 
284 #ifdef CONFIG_PAGE_POOL_STATS
285 	if (!pool->system)
286 		free_percpu(pool->recycle_stats);
287 #endif
288 }
289 
290 /**
291  * page_pool_create_percpu() - create a page pool for a given cpu.
292  * @params: parameters, see struct page_pool_params
293  * @cpuid: cpu identifier
294  */
295 struct page_pool *
296 page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
297 {
298 	struct page_pool *pool;
299 	int err;
300 
301 	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
302 	if (!pool)
303 		return ERR_PTR(-ENOMEM);
304 
305 	err = page_pool_init(pool, params, cpuid);
306 	if (err < 0)
307 		goto err_free;
308 
309 	err = page_pool_list(pool);
310 	if (err)
311 		goto err_uninit;
312 
313 	return pool;
314 
315 err_uninit:
316 	page_pool_uninit(pool);
317 err_free:
318 	pr_warn("%s() gave up with errno %d\n", __func__, err);
319 	kfree(pool);
320 	return ERR_PTR(err);
321 }
322 EXPORT_SYMBOL(page_pool_create_percpu);
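
/* Usage sketch (illustrative only): roughly how per-CPU pools are created,
 * e.g. the system page pools; the exact parameters used by callers such as
 * net/core/dev.c may differ.
 *
 *	struct page_pool_params pp_params = {
 *		.pool_size	= 256,
 *		.flags		= PP_FLAG_SYSTEM_POOL,
 *		.nid		= cpu_to_mem(cpu),
 *	};
 *	struct page_pool *pp;
 *
 *	pp = page_pool_create_percpu(&pp_params, cpu);
 *	if (IS_ERR(pp))
 *		return PTR_ERR(pp);
 */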
323 
324 /**
325  * page_pool_create() - create a page pool
326  * @params: parameters, see struct page_pool_params
327  */
328 struct page_pool *page_pool_create(const struct page_pool_params *params)
329 {
330 	return page_pool_create_percpu(params, -1);
331 }
332 EXPORT_SYMBOL(page_pool_create);
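
/* Usage sketch (illustrative only): a typical RX-queue setup in a driver;
 * names such as MY_RX_RING_SIZE, MY_RX_HEADROOM, pdev and rxq are
 * hypothetical.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= MY_RX_RING_SIZE,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,
 *		.napi		= &rxq->napi,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.offset		= MY_RX_HEADROOM,
 *		.max_len	= PAGE_SIZE - MY_RX_HEADROOM,
 *	};
 *
 *	rxq->page_pool = page_pool_create(&pp_params);
 *	if (IS_ERR(rxq->page_pool))
 *		return PTR_ERR(rxq->page_pool);
 */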
333 
334 static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem);
335 
336 static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
337 {
338 	struct ptr_ring *r = &pool->ring;
339 	netmem_ref netmem;
340 	int pref_nid; /* preferred NUMA node */
341 
342 	/* Quicker fallback, avoid locks when ring is empty */
343 	if (__ptr_ring_empty(r)) {
344 		alloc_stat_inc(pool, empty);
345 		return 0;
346 	}
347 
348 	/* Softirq guarantees the CPU, and thus the NUMA node, is stable. This
349 	 * assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
350 	 */
351 #ifdef CONFIG_NUMA
352 	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
353 #else
354 	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
355 	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
356 #endif
357 
358 	/* Refill alloc array, but only if NUMA match */
359 	do {
360 		netmem = (__force netmem_ref)__ptr_ring_consume(r);
361 		if (unlikely(!netmem))
362 			break;
363 
364 		if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
365 			pool->alloc.cache[pool->alloc.count++] = netmem;
366 		} else {
367 			/* NUMA mismatch;
368 			 * (1) release 1 page to page-allocator and
369 			 * (2) break out and fall through to alloc_pages_node().
370 			 * This limits stress on the page buddy allocator.
371 			 */
372 			page_pool_return_page(pool, netmem);
373 			alloc_stat_inc(pool, waive);
374 			netmem = 0;
375 			break;
376 		}
377 	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
378 
379 	/* Return last page */
380 	if (likely(pool->alloc.count > 0)) {
381 		netmem = pool->alloc.cache[--pool->alloc.count];
382 		alloc_stat_inc(pool, refill);
383 	}
384 
385 	return netmem;
386 }
387 
388 /* fast path */
389 static netmem_ref __page_pool_get_cached(struct page_pool *pool)
390 {
391 	netmem_ref netmem;
392 
393 	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
394 	if (likely(pool->alloc.count)) {
395 		/* Fast-path */
396 		netmem = pool->alloc.cache[--pool->alloc.count];
397 		alloc_stat_inc(pool, fast);
398 	} else {
399 		netmem = page_pool_refill_alloc_cache(pool);
400 	}
401 
402 	return netmem;
403 }
404 
405 static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
406 					    netmem_ref netmem,
407 					    u32 dma_sync_size)
408 {
409 #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
410 	dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem);
411 
412 	dma_sync_size = min(dma_sync_size, pool->p.max_len);
413 	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
414 				     dma_sync_size, pool->p.dma_dir);
415 #endif
416 }
417 
418 static __always_inline void
419 page_pool_dma_sync_for_device(const struct page_pool *pool,
420 			      netmem_ref netmem,
421 			      u32 dma_sync_size)
422 {
423 	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
424 		__page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
425 }
426 
427 static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
428 {
429 	dma_addr_t dma;
430 
431 	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
432 	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
433 	 * into page private data (e.g. a 32-bit CPU with 64-bit DMA caps).
434 	 * This mapping is kept for the lifetime of the page, until it leaves the pool.
435 	 */
436 	dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
437 				 (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
438 				 DMA_ATTR_SKIP_CPU_SYNC |
439 					 DMA_ATTR_WEAK_ORDERING);
440 	if (dma_mapping_error(pool->p.dev, dma))
441 		return false;
442 
443 	if (page_pool_set_dma_addr_netmem(netmem, dma))
444 		goto unmap_failed;
445 
446 	page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
447 
448 	return true;
449 
450 unmap_failed:
451 	WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
452 	dma_unmap_page_attrs(pool->p.dev, dma,
453 			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
454 			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
455 	return false;
456 }
457 
458 static void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
459 {
460 	netmem_set_pp(netmem, pool);
461 	netmem_or_pp_magic(netmem, PP_SIGNATURE);
462 
463 	/* Ensuring all pages have been split into one fragment initially:
464 	 * page_pool_set_pp_info() is only called once for every page when it
465 	 * is allocated from the page allocator and page_pool_fragment_page()
466 	 * is dirtying the same cache line as the page->pp_magic above, so
467 	 * the overhead is negligible.
468 	 */
469 	page_pool_fragment_netmem(netmem, 1);
470 	if (pool->has_init_callback)
471 		pool->slow.init_callback(netmem, pool->slow.init_arg);
472 }
473 
474 static void page_pool_clear_pp_info(netmem_ref netmem)
475 {
476 	netmem_clear_pp_magic(netmem);
477 	netmem_set_pp(netmem, NULL);
478 }
479 
480 static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
481 						 gfp_t gfp)
482 {
483 	struct page *page;
484 
485 	gfp |= __GFP_COMP;
486 	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
487 	if (unlikely(!page))
488 		return NULL;
489 
490 	if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
491 		put_page(page);
492 		return NULL;
493 	}
494 
495 	alloc_stat_inc(pool, slow_high_order);
496 	page_pool_set_pp_info(pool, page_to_netmem(page));
497 
498 	/* Track how many pages are held 'in-flight' */
499 	pool->pages_state_hold_cnt++;
500 	trace_page_pool_state_hold(pool, page_to_netmem(page),
501 				   pool->pages_state_hold_cnt);
502 	return page;
503 }
504 
505 /* slow path */
506 static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
507 							gfp_t gfp)
508 {
509 	const int bulk = PP_ALLOC_CACHE_REFILL;
510 	unsigned int pp_order = pool->p.order;
511 	bool dma_map = pool->dma_map;
512 	netmem_ref netmem;
513 	int i, nr_pages;
514 
515 	/* Don't support bulk alloc for high-order pages */
516 	if (unlikely(pp_order))
517 		return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
518 
519 	/* Unnecessary as alloc cache is empty, but guarantees zero count */
520 	if (unlikely(pool->alloc.count > 0))
521 		return pool->alloc.cache[--pool->alloc.count];
522 
523 	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
524 	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
525 
526 	nr_pages = alloc_pages_bulk_array_node(gfp,
527 					       pool->p.nid, bulk,
528 					       (struct page **)pool->alloc.cache);
529 	if (unlikely(!nr_pages))
530 		return 0;
531 
532 	/* Pages have been filled into the alloc.cache array, but count is zero
533 	 * and the page elements have not yet been (possibly) DMA mapped.
534 	 */
535 	for (i = 0; i < nr_pages; i++) {
536 		netmem = pool->alloc.cache[i];
537 		if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
538 			put_page(netmem_to_page(netmem));
539 			continue;
540 		}
541 
542 		page_pool_set_pp_info(pool, netmem);
543 		pool->alloc.cache[pool->alloc.count++] = netmem;
544 		/* Track how many pages are held 'in-flight' */
545 		pool->pages_state_hold_cnt++;
546 		trace_page_pool_state_hold(pool, netmem,
547 					   pool->pages_state_hold_cnt);
548 	}
549 
550 	/* Return last page */
551 	if (likely(pool->alloc.count > 0)) {
552 		netmem = pool->alloc.cache[--pool->alloc.count];
553 		alloc_stat_inc(pool, slow);
554 	} else {
555 		netmem = 0;
556 	}
557 
558 	/* A page that was just allocated should/must have refcnt 1. */
559 	return netmem;
560 }
561 
562 /* page_pool is used to replace alloc_pages() API calls, but the caller must
563  * provide the synchronization guarantee for the allocation side.
564  */
565 netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)
566 {
567 	netmem_ref netmem;
568 
569 	/* Fast-path: Get a page from cache */
570 	netmem = __page_pool_get_cached(pool);
571 	if (netmem)
572 		return netmem;
573 
574 	/* Slow-path: cache empty, do real allocation */
575 	netmem = __page_pool_alloc_pages_slow(pool, gfp);
576 	return netmem;
577 }
578 EXPORT_SYMBOL(page_pool_alloc_netmem);
579 
580 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
581 {
582 	return netmem_to_page(page_pool_alloc_netmem(pool, gfp));
583 }
584 EXPORT_SYMBOL(page_pool_alloc_pages);
585 ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);
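
/* Usage sketch (illustrative only): refilling an RX ring from NAPI context;
 * mydrv_rx_refill(), rxq and the descriptor layout are hypothetical.
 *
 *	static void mydrv_rx_refill(struct mydrv_rx_queue *rxq)
 *	{
 *		struct page *page;
 *		dma_addr_t dma;
 *
 *		while (rxq->free_descs) {
 *			page = page_pool_alloc_pages(rxq->page_pool, GFP_ATOMIC);
 *			if (!page)
 *				break;
 *
 *			// With PP_FLAG_DMA_MAP the pool already mapped the page
 *			dma = page_pool_get_dma_addr(page);
 *			mydrv_post_rx_desc(rxq, dma + rxq->headroom);
 *			rxq->free_descs--;
 *		}
 *	}
 */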
586 
587 /* Calculate distance between two u32 values, valid if distance is below 2^(31)
588  *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
589  */
590 #define _distance(a, b)	(s32)((a) - (b))
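
/* Worked example (values chosen for illustration): with release_cnt at
 * 0xfffffffb (U32_MAX - 4) and hold_cnt having wrapped around to 3, the
 * inflight count is _distance(3, 0xfffffffb) = (s32)(3 - 0xfffffffb) = 8,
 * i.e. the computation stays correct across the 2^32 wrap as long as the
 * true distance is below 2^31.
 */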
591 
592 s32 page_pool_inflight(const struct page_pool *pool, bool strict)
593 {
594 	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
595 	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
596 	s32 inflight;
597 
598 	inflight = _distance(hold_cnt, release_cnt);
599 
600 	if (strict) {
601 		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
602 		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
603 		     inflight);
604 	} else {
605 		inflight = max(0, inflight);
606 	}
607 
608 	return inflight;
609 }
610 
611 static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
612 							 netmem_ref netmem)
613 {
614 	dma_addr_t dma;
615 
616 	if (!pool->dma_map)
617 		/* Always account for inflight pages, even if we didn't
618 		 * map them
619 		 */
620 		return;
621 
622 	dma = page_pool_get_dma_addr_netmem(netmem);
623 
624 	/* When page is unmapped, it cannot be returned to our pool */
625 	dma_unmap_page_attrs(pool->p.dev, dma,
626 			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
627 			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
628 	page_pool_set_dma_addr_netmem(netmem, 0);
629 }
630 
631 /* Disconnects a page from a page_pool.  API users may need to do this
632  * to allow the page to be used as
633  * a regular page (that will eventually be returned to the normal
634  * page-allocator via put_page).
635  */
636 void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
637 {
638 	int count;
639 
640 	__page_pool_release_page_dma(pool, netmem);
641 
642 	/* This may be the last page returned, releasing the pool, so
643 	 * it is not safe to reference pool afterwards.
644 	 */
645 	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
646 	trace_page_pool_state_release(pool, netmem, count);
647 
648 	page_pool_clear_pp_info(netmem);
649 	put_page(netmem_to_page(netmem));
650 	/* An optimization would be to call __free_pages(page, pool->p.order)
651 	 * knowing page is not part of page-cache (thus avoiding a
652 	 * __page_cache_release() call).
653 	 */
654 }
655 
656 static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
657 {
658 	int ret;
659 	/* BH protection not needed if current is softirq */
660 	if (in_softirq())
661 		ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
662 	else
663 		ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);
664 
665 	if (!ret) {
666 		recycle_stat_inc(pool, ring);
667 		return true;
668 	}
669 
670 	return false;
671 }
672 
673 /* Only allow direct recycling in special circumstances, into the
674  * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
675  *
676  * Caller must provide appropriate safe context.
677  */
678 static bool page_pool_recycle_in_cache(netmem_ref netmem,
679 				       struct page_pool *pool)
680 {
681 	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
682 		recycle_stat_inc(pool, cache_full);
683 		return false;
684 	}
685 
686 	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
687 	pool->alloc.cache[pool->alloc.count++] = netmem;
688 	recycle_stat_inc(pool, cached);
689 	return true;
690 }
691 
692 static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
693 {
694 	return netmem_is_net_iov(netmem) ||
695 	       (page_ref_count(netmem_to_page(netmem)) == 1 &&
696 		!page_is_pfmemalloc(netmem_to_page(netmem)));
697 }
698 
699 /* If the page refcnt == 1, this will try to recycle the page.
700  * If pool->dma_sync is set, we'll try to sync the DMA area for
701  * the configured size min(dma_sync_size, pool->p.max_len).
702  * If the page refcnt != 1, then the page will be returned to the memory
703  * subsystem.
704  */
705 static __always_inline netmem_ref
706 __page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
707 		     unsigned int dma_sync_size, bool allow_direct)
708 {
709 	lockdep_assert_no_hardirq();
710 
711 	/* This allocator is optimized for the XDP mode that uses
712 	 * one-frame-per-page, but has fallbacks that act like the
713 	 * regular page allocator APIs.
714 	 *
715 	 * refcnt == 1 means page_pool owns page, and can recycle it.
716 	 *
717 	 * A page is NOT reusable when it was allocated while the system
718 	 * was under memory pressure (page_is_pfmemalloc()).
719 	 */
720 	if (likely(__page_pool_page_can_be_recycled(netmem))) {
721 		/* Read barrier done in page_ref_count / READ_ONCE */
722 
723 		page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
724 
725 		if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
726 			return 0;
727 
728 		/* Page found as candidate for recycling */
729 		return netmem;
730 	}
731 
732 	/* Fallback/non-XDP mode: the API user holds an elevated refcnt.
733 	 *
734 	 * Many drivers split up the page into fragments, and some
735 	 * want to keep doing this to save memory and do refcnt based
736 	 * recycling. Support this use case too, to ease drivers
737 	 * switching between XDP/non-XDP.
738 	 *
739 	 * In case page_pool maintains the DMA mapping, the API user must
740 	 * call page_pool_put_page() once.  In this elevated refcnt
741 	 * case, the DMA mapping is unmapped/released, as the driver is likely
742 	 * doing refcnt-based recycle tricks, meaning another caller
743 	 * will be invoking put_page().
744 	 */
745 	recycle_stat_inc(pool, released_refcnt);
746 	page_pool_return_page(pool, netmem);
747 
748 	return 0;
749 }
750 
751 static bool page_pool_napi_local(const struct page_pool *pool)
752 {
753 	const struct napi_struct *napi;
754 	u32 cpuid;
755 
756 	if (unlikely(!in_softirq()))
757 		return false;
758 
759 	/* Allow direct recycle if we have reasons to believe that we are
760 	 * in the same context in which the consumer would run, so there's
761 	 * no possible race.
762 	 * __page_pool_put_page() makes sure we're not in hardirq context
763 	 * and interrupts are enabled prior to accessing the cache.
764 	 */
765 	cpuid = smp_processor_id();
766 	if (READ_ONCE(pool->cpuid) == cpuid)
767 		return true;
768 
769 	napi = READ_ONCE(pool->p.napi);
770 
771 	return napi && READ_ONCE(napi->list_owner) == cpuid;
772 }
773 
774 void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
775 				  unsigned int dma_sync_size, bool allow_direct)
776 {
777 	if (!allow_direct)
778 		allow_direct = page_pool_napi_local(pool);
779 
780 	netmem =
781 		__page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
782 	if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
783 		/* Cache full, fallback to free pages */
784 		recycle_stat_inc(pool, ring_full);
785 		page_pool_return_page(pool, netmem);
786 	}
787 }
788 EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
789 
790 void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
791 				unsigned int dma_sync_size, bool allow_direct)
792 {
793 	page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
794 				     allow_direct);
795 }
796 EXPORT_SYMBOL(page_pool_put_unrefed_page);
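
/* Usage sketch (illustrative only): recycling a full page from the RX/NAPI
 * path, e.g. on XDP_DROP; "rxq" is a hypothetical driver structure.  Passing
 * -1 as dma_sync_size requests a sync of up to pool->p.max_len, and
 * allow_direct = true is only legal from the pool's NAPI/softirq context.
 *
 *	page_pool_put_unrefed_page(rxq->page_pool, page, -1, true);
 *
 * Drivers normally reach this through helpers such as
 * page_pool_put_full_page() or page_pool_recycle_direct() in
 * <net/page_pool/helpers.h>.
 */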
797 
798 /**
799  * page_pool_put_page_bulk() - release references on multiple pages
800  * @pool:	pool from which pages were allocated
801  * @data:	array holding page pointers
802  * @count:	number of pages in @data
803  *
804  * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring
805  * producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
806  * will release leftover pages to the page allocator.
807  * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
808  * completion loop for the XDP_REDIRECT use case.
809  *
810  * Please note the caller must not use the data array after running
811  * page_pool_put_page_bulk(), as this function overwrites it.
812  */
813 void page_pool_put_page_bulk(struct page_pool *pool, void **data,
814 			     int count)
815 {
816 	int i, bulk_len = 0;
817 	bool allow_direct;
818 	bool in_softirq;
819 
820 	allow_direct = page_pool_napi_local(pool);
821 
822 	for (i = 0; i < count; i++) {
823 		netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i]));
824 
825 		/* This is not the last user of the page in the frag case */
826 		if (!page_pool_is_last_ref(netmem))
827 			continue;
828 
829 		netmem = __page_pool_put_page(pool, netmem, -1, allow_direct);
830 		/* Approved for bulk recycling in ptr_ring cache */
831 		if (netmem)
832 			data[bulk_len++] = (__force void *)netmem;
833 	}
834 
835 	if (!bulk_len)
836 		return;
837 
838 	/* Bulk producer into ptr_ring page_pool cache */
839 	in_softirq = page_pool_producer_lock(pool);
840 	for (i = 0; i < bulk_len; i++) {
841 		if (__ptr_ring_produce(&pool->ring, data[i])) {
842 			/* ring full */
843 			recycle_stat_inc(pool, ring_full);
844 			break;
845 		}
846 	}
847 	recycle_stat_add(pool, ring, i);
848 	page_pool_producer_unlock(pool, in_softirq);
849 
850 	/* Hopefully all pages were returned into the ptr_ring */
851 	if (likely(i == bulk_len))
852 		return;
853 
854 	/* ptr_ring cache full, free remaining pages outside producer lock
855 	 * since put_page() with refcnt == 1 can be an expensive operation
856 	 */
857 	for (; i < bulk_len; i++)
858 		page_pool_return_page(pool, (__force netmem_ref)data[i]);
859 }
860 EXPORT_SYMBOL(page_pool_put_page_bulk);
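
/* Usage sketch (illustrative only): page_pool_put_page_bulk() is normally
 * reached via the xdp_frame_bulk machinery in net/core/xdp.c rather than
 * called directly; a driver's XDP TX-completion loop might look roughly like
 * this ("cq" and the mydrv helper are hypothetical).
 *
 *	struct xdp_frame_bulk bq;
 *	struct xdp_frame *xdpf;
 *
 *	xdp_frame_bulk_init(&bq);
 *
 *	rcu_read_lock();
 *	while ((xdpf = mydrv_next_completed_frame(cq)))
 *		xdp_return_frame_bulk(xdpf, &bq);
 *	xdp_flush_frame_bulk(&bq);
 *	rcu_read_unlock();
 */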
861 
862 static netmem_ref page_pool_drain_frag(struct page_pool *pool,
863 				       netmem_ref netmem)
864 {
865 	long drain_count = BIAS_MAX - pool->frag_users;
866 
867 	/* Some user is still using the page frag */
868 	if (likely(page_pool_unref_netmem(netmem, drain_count)))
869 		return 0;
870 
871 	if (__page_pool_page_can_be_recycled(netmem)) {
872 		page_pool_dma_sync_for_device(pool, netmem, -1);
873 		return netmem;
874 	}
875 
876 	page_pool_return_page(pool, netmem);
877 	return 0;
878 }
879 
880 static void page_pool_free_frag(struct page_pool *pool)
881 {
882 	long drain_count = BIAS_MAX - pool->frag_users;
883 	netmem_ref netmem = pool->frag_page;
884 
885 	pool->frag_page = 0;
886 
887 	if (!netmem || page_pool_unref_netmem(netmem, drain_count))
888 		return;
889 
890 	page_pool_return_page(pool, netmem);
891 }
892 
893 netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
894 				       unsigned int *offset, unsigned int size,
895 				       gfp_t gfp)
896 {
897 	unsigned int max_size = PAGE_SIZE << pool->p.order;
898 	netmem_ref netmem = pool->frag_page;
899 
900 	if (WARN_ON(size > max_size))
901 		return 0;
902 
903 	size = ALIGN(size, dma_get_cache_alignment());
904 	*offset = pool->frag_offset;
905 
906 	if (netmem && *offset + size > max_size) {
907 		netmem = page_pool_drain_frag(pool, netmem);
908 		if (netmem) {
909 			alloc_stat_inc(pool, fast);
910 			goto frag_reset;
911 		}
912 	}
913 
914 	if (!netmem) {
915 		netmem = page_pool_alloc_netmem(pool, gfp);
916 		if (unlikely(!netmem)) {
917 			pool->frag_page = 0;
918 			return 0;
919 		}
920 
921 		pool->frag_page = netmem;
922 
923 frag_reset:
924 		pool->frag_users = 1;
925 		*offset = 0;
926 		pool->frag_offset = size;
927 		page_pool_fragment_netmem(netmem, BIAS_MAX);
928 		return netmem;
929 	}
930 
931 	pool->frag_users++;
932 	pool->frag_offset = *offset + size;
933 	alloc_stat_inc(pool, fast);
934 	return netmem;
935 }
936 EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
937 
938 struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
939 				  unsigned int size, gfp_t gfp)
940 {
941 	return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
942 							  gfp));
943 }
944 EXPORT_SYMBOL(page_pool_alloc_frag);
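
/* Usage sketch (illustrative only): sub-page RX buffers via the frag API;
 * MY_RX_BUF_SIZE, rxq and mydrv_post_rx_desc() are hypothetical.  The
 * returned offset points at this caller's fragment within the (possibly
 * shared) page.
 *
 *	unsigned int offset;
 *	struct page *page;
 *	dma_addr_t dma;
 *
 *	page = page_pool_alloc_frag(rxq->page_pool, &offset,
 *				    MY_RX_BUF_SIZE, GFP_ATOMIC);
 *	if (!page)
 *		return -ENOMEM;
 *
 *	dma = page_pool_get_dma_addr(page) + offset;
 *	mydrv_post_rx_desc(rxq, dma);
 */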
945 
946 static void page_pool_empty_ring(struct page_pool *pool)
947 {
948 	netmem_ref netmem;
949 
950 	/* Empty recycle ring */
951 	while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
952 		/* Verify the refcnt invariant of cached pages */
953 		if (!(netmem_ref_count(netmem) == 1))
954 			pr_crit("%s() page_pool refcnt %d violation\n",
955 				__func__, netmem_ref_count(netmem));
956 
957 		page_pool_return_page(pool, netmem);
958 	}
959 }
960 
961 static void __page_pool_destroy(struct page_pool *pool)
962 {
963 	if (pool->disconnect)
964 		pool->disconnect(pool);
965 
966 	page_pool_unlist(pool);
967 	page_pool_uninit(pool);
968 	kfree(pool);
969 }
970 
971 static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
972 {
973 	netmem_ref netmem;
974 
975 	if (pool->destroy_cnt)
976 		return;
977 
978 	/* Empty alloc cache, assume caller made sure this is
979 	 * no longer in use, and page_pool_alloc_pages() cannot be
980 	 * called concurrently.
981 	 */
982 	while (pool->alloc.count) {
983 		netmem = pool->alloc.cache[--pool->alloc.count];
984 		page_pool_return_page(pool, netmem);
985 	}
986 }
987 
988 static void page_pool_scrub(struct page_pool *pool)
989 {
990 	page_pool_empty_alloc_cache_once(pool);
991 	pool->destroy_cnt++;
992 
993 	/* No more consumers should exist, but producers could still
994 	 * be in-flight.
995 	 */
996 	page_pool_empty_ring(pool);
997 }
998 
999 static int page_pool_release(struct page_pool *pool)
1000 {
1001 	int inflight;
1002 
1003 	page_pool_scrub(pool);
1004 	inflight = page_pool_inflight(pool, true);
1005 	if (!inflight)
1006 		__page_pool_destroy(pool);
1007 
1008 	return inflight;
1009 }
1010 
1011 static void page_pool_release_retry(struct work_struct *wq)
1012 {
1013 	struct delayed_work *dwq = to_delayed_work(wq);
1014 	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
1015 	void *netdev;
1016 	int inflight;
1017 
1018 	inflight = page_pool_release(pool);
1019 	if (!inflight)
1020 		return;
1021 
1022 	/* Periodic warning for page pools the user can't see */
1023 	netdev = READ_ONCE(pool->slow.netdev);
1024 	if (time_after_eq(jiffies, pool->defer_warn) &&
1025 	    (!netdev || netdev == NET_PTR_POISON)) {
1026 		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
1027 
1028 		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
1029 			__func__, pool->user.id, inflight, sec);
1030 		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
1031 	}
1032 
1033 	/* Still not ready to be disconnected, retry later */
1034 	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1035 }
1036 
1037 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
1038 			   const struct xdp_mem_info *mem)
1039 {
1040 	refcount_inc(&pool->user_cnt);
1041 	pool->disconnect = disconnect;
1042 	pool->xdp_mem_id = mem->id;
1043 }
1044 
1045 void page_pool_disable_direct_recycling(struct page_pool *pool)
1046 {
1047 	/* Disable direct recycling based on pool->cpuid.
1048 	 * Paired with READ_ONCE() in page_pool_napi_local().
1049 	 */
1050 	WRITE_ONCE(pool->cpuid, -1);
1051 
1052 	if (!pool->p.napi)
1053 		return;
1054 
1055 	/* To avoid races with recycling (and to avoid additional barriers), make
1056 	 * sure pool and NAPI are unlinked while NAPI is disabled.
1057 	 */
1058 	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
1059 	WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);
1060 
1061 	WRITE_ONCE(pool->p.napi, NULL);
1062 }
1063 EXPORT_SYMBOL(page_pool_disable_direct_recycling);
1064 
1065 void page_pool_destroy(struct page_pool *pool)
1066 {
1067 	if (!pool)
1068 		return;
1069 
1070 	if (!page_pool_put(pool))
1071 		return;
1072 
1073 	page_pool_disable_direct_recycling(pool);
1074 	page_pool_free_frag(pool);
1075 
1076 	if (!page_pool_release(pool))
1077 		return;
1078 
1079 	page_pool_detached(pool);
1080 	pool->defer_start = jiffies;
1081 	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
1082 
1083 	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
1084 	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
1085 }
1086 EXPORT_SYMBOL(page_pool_destroy);
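
/* Usage sketch (illustrative only): teardown order in a driver; the pool
 * should only be destroyed after the RX ring is disabled and the pages the
 * driver still holds have been returned ("rxq" and the mydrv helpers are
 * hypothetical).
 *
 *	mydrv_disable_rx_ring(rxq);		// no more alloc/recycle calls
 *	mydrv_free_posted_rx_pages(rxq);	// page_pool_put_full_page() each
 *	page_pool_destroy(rxq->page_pool);	// may defer if pages in-flight
 *	rxq->page_pool = NULL;
 */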
1087 
1088 /* Caller must provide appropriate safe context, e.g. NAPI. */
1089 void page_pool_update_nid(struct page_pool *pool, int new_nid)
1090 {
1091 	netmem_ref netmem;
1092 
1093 	trace_page_pool_update_nid(pool, new_nid);
1094 	pool->p.nid = new_nid;
1095 
1096 	/* Flush pool alloc cache, as refill will check NUMA node */
1097 	while (pool->alloc.count) {
1098 		netmem = pool->alloc.cache[--pool->alloc.count];
1099 		page_pool_return_page(pool, netmem);
1100 	}
1101 }
1102 EXPORT_SYMBOL(page_pool_update_nid);
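
/* Usage sketch (illustrative only): drivers typically call the
 * page_pool_nid_changed() helper from their NAPI poll loop, which only
 * invokes page_pool_update_nid() when the preferred node actually changed
 * ("rxq" is hypothetical).
 *
 *	page_pool_nid_changed(rxq->page_pool, numa_mem_id());
 */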
1103