/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */

#include <trace/events/page_pool.h>

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, params, sizeof(pool->p));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit the memory that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * transmission, which is the XDP_TX use-case.
	 */
	if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
	    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
		return -EINVAL;

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
		return -ENOMEM;

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* A driver calling page_pool_create() must also call page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err = 0;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}

	return pool;
}
EXPORT_SYMBOL(page_pool_create);
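
/* Illustrative usage sketch (added for illustration, not part of the
 * original file): how a driver might create one page_pool per RX queue.
 * The struct, function name and queue size below are hypothetical
 * stand-ins for driver-private code; only page_pool_create() and
 * struct page_pool_params are real APIs used here.
 */
struct example_rxq {
	struct page_pool *page_pool;
};

static int __maybe_unused example_rxq_create_pool(struct example_rxq *rxq,
						  struct device *dev,
						  int numa_node)
{
	struct page_pool_params pp_params = {
		.flags		= PP_FLAG_DMA_MAP,	/* pool keeps pages DMA-mapped */
		.order		= 0,			/* one page per frame */
		.pool_size	= 256,			/* ptr_ring size, must be <= 32768 */
		.nid		= numa_node,
		.dev		= dev,
		.dma_dir	= DMA_BIDIRECTIONAL,	/* needed if XDP_TX is supported */
	};
	struct page_pool *pool;

	pool = page_pool_create(&pp_params);
	if (IS_ERR(pool))
		return PTR_ERR(pool);

	rxq->page_pool = pool;
	return 0;
}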

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r))
		return NULL;

	/* Test for safe-context, caller should provide this guarantee */
	if (likely(in_serving_softirq())) {
		if (likely(pool->alloc.count)) {
			/* Fast-path */
			page = pool->alloc.cache[--pool->alloc.count];
			return page;
		}
		/* Slower-path: Alloc array empty, time to refill
		 *
		 * Open-coded bulk ptr_ring consumer.
		 *
		 * Discussion: the ring consumer lock is not really
		 * needed due to the softirq/NAPI protection, but we
		 * will later need the ability to reclaim pages from
		 * the ring. Thus, keep the locks.
		 */
		spin_lock(&r->consumer_lock);
		while ((page = __ptr_ring_consume(r))) {
			if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
				break;
			pool->alloc.cache[pool->alloc.count++] = page;
		}
		spin_unlock(&r->consumer_lock);
		return page;
	}

	/* Slow-path: Get page from locked ring queue */
	page = ptr_ring_consume(&pool->ring);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t _gfp)
{
	struct page *page;
	gfp_t gfp = _gfp;
	dma_addr_t dma;

	/* We could always set __GFP_COMP, and avoid this branch, as
	 * prep_new_page() can handle order-0 with __GFP_COMP.
	 */
	if (pool->p.order)
		gfp |= __GFP_COMP;

	/* FUTURE development:
	 *
	 * The current slow-path essentially falls back to single page
	 * allocations, which doesn't improve performance.  This code
	 * needs bulk allocation support from the page allocator.
	 */

	/* Cache was empty, do real allocation */
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (!page)
		return NULL;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		goto skip_dma_map;

	/* Setup DMA mapping: use the 'struct page' area for storing the
	 * DMA address, since dma_addr_t can be either 32 or 64 bits and
	 * does not always fit into page private data (i.e. a 32-bit CPU
	 * with 64-bit DMA capabilities).
	 * This mapping is kept for the lifetime of the page, until it
	 * leaves the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(pool->p.dev, dma)) {
		put_page(page);
		return NULL;
	}
	page->dma_addr = dma;

skip_dma_map:
	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;

	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);

	/* A page that has just been allocated should/must have refcnt 1. */
	return page;
}

/* Use page_pool to replace alloc_pages() API calls, while providing a
 * synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
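
/* Illustrative sketch (added for illustration, not part of the original
 * file): refilling an RX ring from NAPI poll context.  GFP_ATOMIC is used
 * because the slow path may hit the page allocator from softirq, while the
 * fast path hits pool->alloc.cache without taking locks.  The descriptor
 * struct and function name are hypothetical.
 */
struct example_rx_desc {
	struct page *page;
};

static int __maybe_unused example_rx_refill(struct page_pool *pool,
					    struct example_rx_desc *desc,
					    int count)
{
	int i;

	for (i = 0; i < count; i++) {
		struct page *page;

		/* Called from softirq, so the per-pool alloc cache applies */
		page = page_pool_alloc_pages(pool, GFP_ATOMIC);
		if (!page)
			return i;	/* partial refill */
		desc[i].page = page;
	}
	return count;
}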

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))

static s32 page_pool_inflight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 distance;

	distance = _distance(hold_cnt, release_cnt);

	trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
	return distance;
}

static bool __page_pool_safe_to_destroy(struct page_pool *pool)
{
	s32 inflight = page_pool_inflight(pool);

	/* The distance should not be able to become negative */
	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

	return (inflight == 0);
}
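
/* Worked example (added for illustration, not in the original file): the
 * serial-number arithmetic above stays correct across u32 wrap-around.
 * Suppose hold_cnt has wrapped to 0x00000003 while release_cnt is still
 * 0xfffffffe; then (u32)(0x00000003 - 0xfffffffe) == 0x00000005, and the
 * cast to s32 yields 5 pages in-flight.  Conversely, a mismatch such as
 * _distance(2, 5) == -3 means more releases than holds were counted,
 * which trips the WARN() above.
 */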

/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
				   struct page *page)
{
	dma_addr_t dma;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		goto skip_dma_unmap;

	dma = page->dma_addr;
	/* DMA unmap */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
	page->dma_addr = 0;
skip_dma_unmap:
	atomic_inc(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page,
				      atomic_read(&pool->pages_state_release_cnt));
}

/* unmap the page and clean our state */
void page_pool_unmap_page(struct page_pool *pool, struct page *page)
{
	/* When a page is unmapped, this implies the page will not be
	 * returned to the page_pool.
	 */
	__page_pool_clean_page(pool, page);
}
EXPORT_SYMBOL(page_pool_unmap_page);

/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
	__page_pool_clean_page(pool, page);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool __page_pool_recycle_into_ring(struct page_pool *pool,
					  struct page *page)
{
	int ret;
	/* BH protection not needed if current is serving softirq */
	if (in_serving_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	return (ret == 0) ? true : false;
}

/* Only allow direct recycling into the alloc side cache in special
 * circumstances, e.g. during RX-NAPI processing for the XDP_DROP use-case.
 *
 * Caller must provide an appropriately safe context.
 */
static bool __page_pool_recycle_direct(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
		return false;

	/* Caller MUST have verified/know that (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	return true;
}

void __page_pool_put_page(struct page_pool *pool,
			  struct page *page, bool allow_direct)
{
	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns the page, and can recycle it.
	 */
	if (likely(page_ref_count(page) == 1)) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (allow_direct && in_serving_softirq())
			if (__page_pool_recycle_direct(page, pool))
				return;

		if (!__page_pool_recycle_into_ring(pool, page)) {
			/* Ring full, fall back to freeing the page */
			__page_pool_return_page(pool, page);
		}
		return;
	}
	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once.  In this elevated refcnt case,
	 * the DMA mapping is unmapped/released, as the driver is likely
	 * doing refcnt based recycle tricks, meaning another process will
	 * be invoking put_page().
	 */
	__page_pool_clean_page(pool, page);
	put_page(page);
}
EXPORT_SYMBOL(__page_pool_put_page);
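
/* Illustrative sketch (added for illustration, not part of the original
 * file): returning a page from a driver's NAPI poll loop after an XDP_DROP
 * verdict.  The function name is hypothetical; only __page_pool_put_page()
 * is a real API used here.
 */
static void __maybe_unused example_handle_rx_drop(struct page_pool *pool,
						  struct page *page)
{
	/* allow_direct=true is only valid from the RX-NAPI (softirq)
	 * context that owns this pool, and only when the caller knows
	 * refcnt == 1; otherwise pass false and the page takes the
	 * ptr_ring producer path instead.
	 */
	__page_pool_put_page(pool, page, true);
}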

static void __page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		__page_pool_return_page(pool, page);
	}
}

static void __warn_in_flight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 distance;

	distance = _distance(hold_cnt, release_cnt);

	/* Drivers should fix this, but it is only problematic when DMA is used */
	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
	     distance, hold_cnt, release_cnt);
}

void __page_pool_free(struct page_pool *pool)
{
	/* Only the last user actually frees/releases resources */
	if (!page_pool_put(pool))
		return;

	WARN(pool->alloc.count, "API usage violation");
	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");

	/* Can happen due to forced shutdown */
	if (!__page_pool_safe_to_destroy(pool))
		__warn_in_flight(pool);

	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

	kfree(pool);
}
EXPORT_SYMBOL(__page_pool_free);

/* Request to shutdown: release pages cached by the page_pool, and check
 * for in-flight pages
 */
bool __page_pool_request_shutdown(struct page_pool *pool)
{
	struct page *page;

	/* Empty the alloc cache; assume the caller made sure it is
	 * no longer in use, and that page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		__page_pool_return_page(pool, page);
	}

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	__page_pool_empty_ring(pool);

	return __page_pool_safe_to_destroy(pool);
}
EXPORT_SYMBOL(__page_pool_request_shutdown);
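
/* Illustrative teardown sketch (added for illustration, not part of the
 * original file): the shutdown ordering implied by the comments above,
 * from a driver's point of view, once RX processing for the queue has
 * stopped.  In the kernel this sequencing is typically driven from the
 * XDP memory-model disconnect path rather than open-coded in a driver;
 * the function name and the caller-retries policy here are hypothetical.
 */
static bool __maybe_unused example_rxq_try_destroy_pool(struct page_pool *pool)
{
	/* Flush the alloc cache and the ptr_ring; returns true only when
	 * no pages remain in-flight.
	 */
	if (!__page_pool_request_shutdown(pool))
		return false;	/* caller should retry later, e.g. from a workqueue */

	/* Drop the user refcount taken in page_pool_init(); the last user
	 * releases the ptr_ring, the device reference and the pool itself.
	 */
	__page_pool_free(pool);
	return true;
}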