1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Routines having to do with the 'struct sk_buff' memory handlers. 4 * 5 * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> 6 * Florian La Roche <rzsfl@rz.uni-sb.de> 7 * 8 * Fixes: 9 * Alan Cox : Fixed the worst of the load 10 * balancer bugs. 11 * Dave Platt : Interrupt stacking fix. 12 * Richard Kooijman : Timestamp fixes. 13 * Alan Cox : Changed buffer format. 14 * Alan Cox : destructor hook for AF_UNIX etc. 15 * Linus Torvalds : Better skb_clone. 16 * Alan Cox : Added skb_copy. 17 * Alan Cox : Added all the changed routines Linus 18 * only put in the headers 19 * Ray VanTassle : Fixed --skb->lock in free 20 * Alan Cox : skb_copy copy arp field 21 * Andi Kleen : slabified it. 22 * Robert Olsson : Removed skb_head_pool 23 * 24 * NOTE: 25 * The __skb_ routines should be called with interrupts 26 * disabled, or you better be *real* sure that the operation is atomic 27 * with respect to whatever list is being frobbed (e.g. via lock_sock() 28 * or via disabling bottom half handlers, etc). 29 */ 30 31 /* 32 * The functions in this file will not compile correctly with gcc 2.4.x 33 */ 34 35 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 36 37 #include <linux/module.h> 38 #include <linux/types.h> 39 #include <linux/kernel.h> 40 #include <linux/mm.h> 41 #include <linux/interrupt.h> 42 #include <linux/in.h> 43 #include <linux/inet.h> 44 #include <linux/slab.h> 45 #include <linux/tcp.h> 46 #include <linux/udp.h> 47 #include <linux/sctp.h> 48 #include <linux/netdevice.h> 49 #ifdef CONFIG_NET_CLS_ACT 50 #include <net/pkt_sched.h> 51 #endif 52 #include <linux/string.h> 53 #include <linux/skbuff.h> 54 #include <linux/splice.h> 55 #include <linux/cache.h> 56 #include <linux/rtnetlink.h> 57 #include <linux/init.h> 58 #include <linux/scatterlist.h> 59 #include <linux/errqueue.h> 60 #include <linux/prefetch.h> 61 #include <linux/if_vlan.h> 62 #include <linux/mpls.h> 63 #include <linux/kcov.h> 64 65 #include <net/protocol.h> 66 #include <net/dst.h> 67 #include <net/sock.h> 68 #include <net/checksum.h> 69 #include <net/ip6_checksum.h> 70 #include <net/xfrm.h> 71 #include <net/mpls.h> 72 #include <net/mptcp.h> 73 #include <net/mctp.h> 74 #include <net/page_pool.h> 75 76 #include <linux/uaccess.h> 77 #include <trace/events/skb.h> 78 #include <linux/highmem.h> 79 #include <linux/capability.h> 80 #include <linux/user_namespace.h> 81 #include <linux/indirect_call_wrapper.h> 82 83 #include "dev.h" 84 #include "sock_destructor.h" 85 86 struct kmem_cache *skbuff_head_cache __ro_after_init; 87 static struct kmem_cache *skbuff_fclone_cache __ro_after_init; 88 #ifdef CONFIG_SKB_EXTENSIONS 89 static struct kmem_cache *skbuff_ext_cache __ro_after_init; 90 #endif 91 int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; 92 EXPORT_SYMBOL(sysctl_max_skb_frags); 93 94 #undef FN 95 #define FN(reason) [SKB_DROP_REASON_##reason] = #reason, 96 const char * const drop_reasons[] = { 97 DEFINE_DROP_REASON(FN, FN) 98 }; 99 EXPORT_SYMBOL(drop_reasons); 100 101 /** 102 * skb_panic - private function for out-of-line support 103 * @skb: buffer 104 * @sz: size 105 * @addr: address 106 * @msg: skb_over_panic or skb_under_panic 107 * 108 * Out-of-line support for skb_put() and skb_push(). 109 * Called via the wrapper skb_over_panic() or skb_under_panic(). 110 * Keep out of line to prevent kernel bloat. 111 * __builtin_return_address is not used because it is not always reliable. 
112 */ 113 static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, 114 const char msg[]) 115 { 116 pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", 117 msg, addr, skb->len, sz, skb->head, skb->data, 118 (unsigned long)skb->tail, (unsigned long)skb->end, 119 skb->dev ? skb->dev->name : "<NULL>"); 120 BUG(); 121 } 122 123 static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) 124 { 125 skb_panic(skb, sz, addr, __func__); 126 } 127 128 static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) 129 { 130 skb_panic(skb, sz, addr, __func__); 131 } 132 133 #define NAPI_SKB_CACHE_SIZE 64 134 #define NAPI_SKB_CACHE_BULK 16 135 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) 136 137 #if PAGE_SIZE == SZ_4K 138 139 #define NAPI_HAS_SMALL_PAGE_FRAG 1 140 #define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) 141 142 /* specialized page frag allocator using a single order 0 page 143 * and slicing it into 1K sized fragment. Constrained to systems 144 * with a very limited amount of 1K fragments fitting a single 145 * page - to avoid excessive truesize underestimation 146 */ 147 148 struct page_frag_1k { 149 void *va; 150 u16 offset; 151 bool pfmemalloc; 152 }; 153 154 static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) 155 { 156 struct page *page; 157 int offset; 158 159 offset = nc->offset - SZ_1K; 160 if (likely(offset >= 0)) 161 goto use_frag; 162 163 page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); 164 if (!page) 165 return NULL; 166 167 nc->va = page_address(page); 168 nc->pfmemalloc = page_is_pfmemalloc(page); 169 offset = PAGE_SIZE - SZ_1K; 170 page_ref_add(page, offset / SZ_1K); 171 172 use_frag: 173 nc->offset = offset; 174 return nc->va + offset; 175 } 176 #else 177 178 /* the small page is actually unused in this build; add dummy helpers 179 * to please the compiler and avoid later preprocessor's conditionals 180 */ 181 #define NAPI_HAS_SMALL_PAGE_FRAG 0 182 #define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false 183 184 struct page_frag_1k { 185 }; 186 187 static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) 188 { 189 return NULL; 190 } 191 192 #endif 193 194 struct napi_alloc_cache { 195 struct page_frag_cache page; 196 struct page_frag_1k page_small; 197 unsigned int skb_count; 198 void *skb_cache[NAPI_SKB_CACHE_SIZE]; 199 }; 200 201 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); 202 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); 203 204 /* Double check that napi_get_frags() allocates skbs with 205 * skb->head being backed by slab, not a page fragment. 206 * This is to make sure bug fixed in 3226b158e67c 207 * ("net: avoid 32 x truesize under-estimation for tiny skbs") 208 * does not accidentally come back. 
209 */ 210 void napi_get_frags_check(struct napi_struct *napi) 211 { 212 struct sk_buff *skb; 213 214 local_bh_disable(); 215 skb = napi_get_frags(napi); 216 WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); 217 napi_free_frags(napi); 218 local_bh_enable(); 219 } 220 221 void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) 222 { 223 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); 224 225 fragsz = SKB_DATA_ALIGN(fragsz); 226 227 return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); 228 } 229 EXPORT_SYMBOL(__napi_alloc_frag_align); 230 231 void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) 232 { 233 void *data; 234 235 fragsz = SKB_DATA_ALIGN(fragsz); 236 if (in_hardirq() || irqs_disabled()) { 237 struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); 238 239 data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); 240 } else { 241 struct napi_alloc_cache *nc; 242 243 local_bh_disable(); 244 nc = this_cpu_ptr(&napi_alloc_cache); 245 data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); 246 local_bh_enable(); 247 } 248 return data; 249 } 250 EXPORT_SYMBOL(__netdev_alloc_frag_align); 251 252 static struct sk_buff *napi_skb_cache_get(void) 253 { 254 struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); 255 struct sk_buff *skb; 256 257 if (unlikely(!nc->skb_count)) { 258 nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache, 259 GFP_ATOMIC, 260 NAPI_SKB_CACHE_BULK, 261 nc->skb_cache); 262 if (unlikely(!nc->skb_count)) 263 return NULL; 264 } 265 266 skb = nc->skb_cache[--nc->skb_count]; 267 kasan_unpoison_object_data(skbuff_head_cache, skb); 268 269 return skb; 270 } 271 272 /* Caller must provide SKB that is memset cleared */ 273 static void __build_skb_around(struct sk_buff *skb, void *data, 274 unsigned int frag_size) 275 { 276 struct skb_shared_info *shinfo; 277 unsigned int size = frag_size ? : ksize(data); 278 279 size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 280 281 /* Assumes caller memset cleared SKB */ 282 skb->truesize = SKB_TRUESIZE(size); 283 refcount_set(&skb->users, 1); 284 skb->head = data; 285 skb->data = data; 286 skb_reset_tail_pointer(skb); 287 skb_set_end_offset(skb, size); 288 skb->mac_header = (typeof(skb->mac_header))~0U; 289 skb->transport_header = (typeof(skb->transport_header))~0U; 290 skb->alloc_cpu = raw_smp_processor_id(); 291 /* make sure we initialize shinfo sequentially */ 292 shinfo = skb_shinfo(skb); 293 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); 294 atomic_set(&shinfo->dataref, 1); 295 296 skb_set_kcov_handle(skb, kcov_common_handle()); 297 } 298 299 /** 300 * __build_skb - build a network buffer 301 * @data: data buffer provided by caller 302 * @frag_size: size of data, or 0 if head was kmalloced 303 * 304 * Allocate a new &sk_buff. Caller provides space holding head and 305 * skb_shared_info. @data must have been allocated by kmalloc() only if 306 * @frag_size is 0, otherwise data should come from the page allocator 307 * or vmalloc() 308 * The return is the new skb buffer. 309 * On a failure the return is %NULL, and @data is not freed. 310 * Notes : 311 * Before IO, driver allocates only data buffer where NIC put incoming frame 312 * Driver should add room at head (NET_SKB_PAD) and 313 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) 314 * After IO, driver calls build_skb(), to allocate sk_buff and populate it 315 * before giving packet to stack. 
316 * RX rings only contains data buffers, not full skbs. 317 */ 318 struct sk_buff *__build_skb(void *data, unsigned int frag_size) 319 { 320 struct sk_buff *skb; 321 322 skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); 323 if (unlikely(!skb)) 324 return NULL; 325 326 memset(skb, 0, offsetof(struct sk_buff, tail)); 327 __build_skb_around(skb, data, frag_size); 328 329 return skb; 330 } 331 332 /* build_skb() is wrapper over __build_skb(), that specifically 333 * takes care of skb->head and skb->pfmemalloc 334 * This means that if @frag_size is not zero, then @data must be backed 335 * by a page fragment, not kmalloc() or vmalloc() 336 */ 337 struct sk_buff *build_skb(void *data, unsigned int frag_size) 338 { 339 struct sk_buff *skb = __build_skb(data, frag_size); 340 341 if (skb && frag_size) { 342 skb->head_frag = 1; 343 if (page_is_pfmemalloc(virt_to_head_page(data))) 344 skb->pfmemalloc = 1; 345 } 346 return skb; 347 } 348 EXPORT_SYMBOL(build_skb); 349 350 /** 351 * build_skb_around - build a network buffer around provided skb 352 * @skb: sk_buff provide by caller, must be memset cleared 353 * @data: data buffer provided by caller 354 * @frag_size: size of data, or 0 if head was kmalloced 355 */ 356 struct sk_buff *build_skb_around(struct sk_buff *skb, 357 void *data, unsigned int frag_size) 358 { 359 if (unlikely(!skb)) 360 return NULL; 361 362 __build_skb_around(skb, data, frag_size); 363 364 if (frag_size) { 365 skb->head_frag = 1; 366 if (page_is_pfmemalloc(virt_to_head_page(data))) 367 skb->pfmemalloc = 1; 368 } 369 return skb; 370 } 371 EXPORT_SYMBOL(build_skb_around); 372 373 /** 374 * __napi_build_skb - build a network buffer 375 * @data: data buffer provided by caller 376 * @frag_size: size of data, or 0 if head was kmalloced 377 * 378 * Version of __build_skb() that uses NAPI percpu caches to obtain 379 * skbuff_head instead of inplace allocation. 380 * 381 * Returns a new &sk_buff on success, %NULL on allocation failure. 382 */ 383 static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) 384 { 385 struct sk_buff *skb; 386 387 skb = napi_skb_cache_get(); 388 if (unlikely(!skb)) 389 return NULL; 390 391 memset(skb, 0, offsetof(struct sk_buff, tail)); 392 __build_skb_around(skb, data, frag_size); 393 394 return skb; 395 } 396 397 /** 398 * napi_build_skb - build a network buffer 399 * @data: data buffer provided by caller 400 * @frag_size: size of data, or 0 if head was kmalloced 401 * 402 * Version of __napi_build_skb() that takes care of skb->head_frag 403 * and skb->pfmemalloc when the data is a page or page fragment. 404 * 405 * Returns a new &sk_buff on success, %NULL on allocation failure. 406 */ 407 struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) 408 { 409 struct sk_buff *skb = __napi_build_skb(data, frag_size); 410 411 if (likely(skb) && frag_size) { 412 skb->head_frag = 1; 413 skb_propagate_pfmemalloc(virt_to_head_page(data), skb); 414 } 415 416 return skb; 417 } 418 EXPORT_SYMBOL(napi_build_skb); 419 420 /* 421 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells 422 * the caller if emergency pfmemalloc reserves are being used. If it is and 423 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves 424 * may be used. 
Otherwise, the packet data may be discarded until enough 425 * memory is free 426 */ 427 static void *kmalloc_reserve(size_t size, gfp_t flags, int node, 428 bool *pfmemalloc) 429 { 430 void *obj; 431 bool ret_pfmemalloc = false; 432 433 /* 434 * Try a regular allocation, when that fails and we're not entitled 435 * to the reserves, fail. 436 */ 437 obj = kmalloc_node_track_caller(size, 438 flags | __GFP_NOMEMALLOC | __GFP_NOWARN, 439 node); 440 if (obj || !(gfp_pfmemalloc_allowed(flags))) 441 goto out; 442 443 /* Try again but now we are using pfmemalloc reserves */ 444 ret_pfmemalloc = true; 445 obj = kmalloc_node_track_caller(size, flags, node); 446 447 out: 448 if (pfmemalloc) 449 *pfmemalloc = ret_pfmemalloc; 450 451 return obj; 452 } 453 454 /* Allocate a new skbuff. We do this ourselves so we can fill in a few 455 * 'private' fields and also do memory statistics to find all the 456 * [BEEP] leaks. 457 * 458 */ 459 460 /** 461 * __alloc_skb - allocate a network buffer 462 * @size: size to allocate 463 * @gfp_mask: allocation mask 464 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache 465 * instead of head cache and allocate a cloned (child) skb. 466 * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for 467 * allocations in case the data is required for writeback 468 * @node: numa node to allocate memory on 469 * 470 * Allocate a new &sk_buff. The returned buffer has no headroom and a 471 * tail room of at least size bytes. The object has a reference count 472 * of one. The return is the buffer. On a failure the return is %NULL. 473 * 474 * Buffers may only be allocated from interrupts using a @gfp_mask of 475 * %GFP_ATOMIC. 476 */ 477 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, 478 int flags, int node) 479 { 480 struct kmem_cache *cache; 481 struct sk_buff *skb; 482 unsigned int osize; 483 bool pfmemalloc; 484 u8 *data; 485 486 cache = (flags & SKB_ALLOC_FCLONE) 487 ? skbuff_fclone_cache : skbuff_head_cache; 488 489 if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) 490 gfp_mask |= __GFP_MEMALLOC; 491 492 /* Get the HEAD */ 493 if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && 494 likely(node == NUMA_NO_NODE || node == numa_mem_id())) 495 skb = napi_skb_cache_get(); 496 else 497 skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); 498 if (unlikely(!skb)) 499 return NULL; 500 prefetchw(skb); 501 502 /* We do our best to align skb_shared_info on a separate cache 503 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives 504 * aligned memory blocks, unless SLUB/SLAB debug is enabled. 505 * Both skb->head and skb_shared_info are cache line aligned. 506 */ 507 size = SKB_DATA_ALIGN(size); 508 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 509 osize = kmalloc_size_roundup(size); 510 data = kmalloc_reserve(osize, gfp_mask, node, &pfmemalloc); 511 if (unlikely(!data)) 512 goto nodata; 513 /* kmalloc_size_roundup() might give us more room than requested. 514 * Put skb_shared_info exactly at the end of allocated zone, 515 * to allow max possible filling before reallocation. 516 */ 517 size = SKB_WITH_OVERHEAD(osize); 518 prefetchw(data + size); 519 520 /* 521 * Only clear those fields we need to clear, not those that we will 522 * actually initialise below. Hence, don't put any more fields after 523 * the tail pointer in struct sk_buff! 
524 */ 525 memset(skb, 0, offsetof(struct sk_buff, tail)); 526 __build_skb_around(skb, data, osize); 527 skb->pfmemalloc = pfmemalloc; 528 529 if (flags & SKB_ALLOC_FCLONE) { 530 struct sk_buff_fclones *fclones; 531 532 fclones = container_of(skb, struct sk_buff_fclones, skb1); 533 534 skb->fclone = SKB_FCLONE_ORIG; 535 refcount_set(&fclones->fclone_ref, 1); 536 } 537 538 return skb; 539 540 nodata: 541 kmem_cache_free(cache, skb); 542 return NULL; 543 } 544 EXPORT_SYMBOL(__alloc_skb); 545 546 /** 547 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device 548 * @dev: network device to receive on 549 * @len: length to allocate 550 * @gfp_mask: get_free_pages mask, passed to alloc_skb 551 * 552 * Allocate a new &sk_buff and assign it a usage count of one. The 553 * buffer has NET_SKB_PAD headroom built in. Users should allocate 554 * the headroom they think they need without accounting for the 555 * built in space. The built in space is used for optimisations. 556 * 557 * %NULL is returned if there is no free memory. 558 */ 559 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, 560 gfp_t gfp_mask) 561 { 562 struct page_frag_cache *nc; 563 struct sk_buff *skb; 564 bool pfmemalloc; 565 void *data; 566 567 len += NET_SKB_PAD; 568 569 /* If requested length is either too small or too big, 570 * we use kmalloc() for skb->head allocation. 571 */ 572 if (len <= SKB_WITH_OVERHEAD(1024) || 573 len > SKB_WITH_OVERHEAD(PAGE_SIZE) || 574 (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { 575 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); 576 if (!skb) 577 goto skb_fail; 578 goto skb_success; 579 } 580 581 len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 582 len = SKB_DATA_ALIGN(len); 583 584 if (sk_memalloc_socks()) 585 gfp_mask |= __GFP_MEMALLOC; 586 587 if (in_hardirq() || irqs_disabled()) { 588 nc = this_cpu_ptr(&netdev_alloc_cache); 589 data = page_frag_alloc(nc, len, gfp_mask); 590 pfmemalloc = nc->pfmemalloc; 591 } else { 592 local_bh_disable(); 593 nc = this_cpu_ptr(&napi_alloc_cache.page); 594 data = page_frag_alloc(nc, len, gfp_mask); 595 pfmemalloc = nc->pfmemalloc; 596 local_bh_enable(); 597 } 598 599 if (unlikely(!data)) 600 return NULL; 601 602 skb = __build_skb(data, len); 603 if (unlikely(!skb)) { 604 skb_free_frag(data); 605 return NULL; 606 } 607 608 if (pfmemalloc) 609 skb->pfmemalloc = 1; 610 skb->head_frag = 1; 611 612 skb_success: 613 skb_reserve(skb, NET_SKB_PAD); 614 skb->dev = dev; 615 616 skb_fail: 617 return skb; 618 } 619 EXPORT_SYMBOL(__netdev_alloc_skb); 620 621 /** 622 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance 623 * @napi: napi instance this buffer was allocated for 624 * @len: length to allocate 625 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages 626 * 627 * Allocate a new sk_buff for use in NAPI receive. This buffer will 628 * attempt to allocate the head from a special reserved region used 629 * only for NAPI Rx allocation. By doing this we can save several 630 * CPU cycles by avoiding having to disable and re-enable IRQs. 631 * 632 * %NULL is returned if there is no free memory. 
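 *
 * Illustrative sketch only (a hypothetical NAPI poll() fragment, not taken
 * from an in-tree driver); callers normally use the napi_alloc_skb()
 * wrapper, which passes %GFP_ATOMIC:
 *
 *	skb = napi_alloc_skb(napi, frame_len);
 *	if (unlikely(!skb))
 *		break;
 *	skb_put_data(skb, rx_buf, frame_len);
 *	skb->protocol = eth_type_trans(skb, napi->dev);
 *	napi_gro_receive(napi, skb);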
633 */ 634 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, 635 gfp_t gfp_mask) 636 { 637 struct napi_alloc_cache *nc; 638 struct sk_buff *skb; 639 bool pfmemalloc; 640 void *data; 641 642 DEBUG_NET_WARN_ON_ONCE(!in_softirq()); 643 len += NET_SKB_PAD + NET_IP_ALIGN; 644 645 /* If requested length is either too small or too big, 646 * we use kmalloc() for skb->head allocation. 647 * When the small frag allocator is available, prefer it over kmalloc 648 * for small fragments 649 */ 650 if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || 651 len > SKB_WITH_OVERHEAD(PAGE_SIZE) || 652 (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { 653 skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, 654 NUMA_NO_NODE); 655 if (!skb) 656 goto skb_fail; 657 goto skb_success; 658 } 659 660 nc = this_cpu_ptr(&napi_alloc_cache); 661 662 if (sk_memalloc_socks()) 663 gfp_mask |= __GFP_MEMALLOC; 664 665 if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { 666 /* we are artificially inflating the allocation size, but 667 * that is not as bad as it may look like, as: 668 * - 'len' less than GRO_MAX_HEAD makes little sense 669 * - On most systems, larger 'len' values lead to fragment 670 * size above 512 bytes 671 * - kmalloc would use the kmalloc-1k slab for such values 672 * - Builds with smaller GRO_MAX_HEAD will very likely do 673 * little networking, as that implies no WiFi and no 674 * tunnels support, and 32 bits arches. 675 */ 676 len = SZ_1K; 677 678 data = page_frag_alloc_1k(&nc->page_small, gfp_mask); 679 pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); 680 } else { 681 len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 682 len = SKB_DATA_ALIGN(len); 683 684 data = page_frag_alloc(&nc->page, len, gfp_mask); 685 pfmemalloc = nc->page.pfmemalloc; 686 } 687 688 if (unlikely(!data)) 689 return NULL; 690 691 skb = __napi_build_skb(data, len); 692 if (unlikely(!skb)) { 693 skb_free_frag(data); 694 return NULL; 695 } 696 697 if (pfmemalloc) 698 skb->pfmemalloc = 1; 699 skb->head_frag = 1; 700 701 skb_success: 702 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); 703 skb->dev = napi->dev; 704 705 skb_fail: 706 return skb; 707 } 708 EXPORT_SYMBOL(__napi_alloc_skb); 709 710 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, 711 int size, unsigned int truesize) 712 { 713 skb_fill_page_desc(skb, i, page, off, size); 714 skb->len += size; 715 skb->data_len += size; 716 skb->truesize += truesize; 717 } 718 EXPORT_SYMBOL(skb_add_rx_frag); 719 720 void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, 721 unsigned int truesize) 722 { 723 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 724 725 skb_frag_size_add(frag, size); 726 skb->len += size; 727 skb->data_len += size; 728 skb->truesize += truesize; 729 } 730 EXPORT_SYMBOL(skb_coalesce_rx_frag); 731 732 static void skb_drop_list(struct sk_buff **listp) 733 { 734 kfree_skb_list(*listp); 735 *listp = NULL; 736 } 737 738 static inline void skb_drop_fraglist(struct sk_buff *skb) 739 { 740 skb_drop_list(&skb_shinfo(skb)->frag_list); 741 } 742 743 static void skb_clone_fraglist(struct sk_buff *skb) 744 { 745 struct sk_buff *list; 746 747 skb_walk_frags(skb, list) 748 skb_get(list); 749 } 750 751 static bool skb_pp_recycle(struct sk_buff *skb, void *data) 752 { 753 if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) 754 return false; 755 return page_pool_return_skb_page(virt_to_page(data)); 756 } 757 758 static void skb_free_head(struct sk_buff *skb) 759 { 760 
	unsigned char *head = skb->head;

	if (skb->head_frag) {
		if (skb_pp_recycle(skb, head))
			return;
		skb_free_frag(head);
	} else {
		kfree(head);
	}
}

static void skb_release_data(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (skb->cloned &&
	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			      &shinfo->dataref))
		goto exit;

	if (skb_zcopy(skb)) {
		bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

		skb_zcopy_clear(skb, true);
		if (skip_unref)
			goto free_head;
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

free_head:
	if (shinfo->frag_list)
		kfree_skb_list(shinfo->frag_list);

	skb_free_head(skb);
exit:
	/* When we clone an SKB we copy the recycling bit. The pp_recycle
	 * bit is only set on the head though, so in order to avoid races
	 * while trying to recycle fragments on __skb_frag_unref() we need
	 * to make one SKB responsible for triggering the recycle path.
	 * So disable the recycling bit if an SKB is cloned and we have
	 * additional references to the fragmented part of the SKB.
	 * Eventually the last SKB will have the recycling bit set and its
	 * dataref set to 0, which will trigger the recycling.
	 */
	skb->pp_recycle = 0;
}

/*
 * Free the sk_buff memory itself, without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_head_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before the
		 * original skb. This test would have no chance to be true
		 * for the clone, while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb_nfct(skb));
#endif
	skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb);
}

/**
 * __kfree_skb - private function
 * @skb: buffer
 *
 * Free an sk_buff. Release anything attached to the buffer.
 * Clean the state. This is an internal helper function. Users should
 * always call kfree_skb().
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

/**
 * kfree_skb_reason - free an sk_buff with special reason
 * @skb: buffer to free
 * @reason: reason why this skb is dropped
 *
 * Drop a reference to the buffer and free it if the usage count has
 * hit zero. Meanwhile, pass the drop reason to the 'kfree_skb'
 * tracepoint.
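 *
 * A minimal sketch (the lookup helper is hypothetical; the drop reason is
 * one of the real SKB_DROP_REASON_* values):
 *
 *	sk = hypothetical_socket_lookup(skb);
 *	if (!sk) {
 *		kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET);
 *		return 0;
 *	}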
888 */ 889 void __fix_address 890 kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) 891 { 892 if (unlikely(!skb_unref(skb))) 893 return; 894 895 DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); 896 897 trace_kfree_skb(skb, __builtin_return_address(0), reason); 898 __kfree_skb(skb); 899 } 900 EXPORT_SYMBOL(kfree_skb_reason); 901 902 void kfree_skb_list_reason(struct sk_buff *segs, 903 enum skb_drop_reason reason) 904 { 905 while (segs) { 906 struct sk_buff *next = segs->next; 907 908 kfree_skb_reason(segs, reason); 909 segs = next; 910 } 911 } 912 EXPORT_SYMBOL(kfree_skb_list_reason); 913 914 /* Dump skb information and contents. 915 * 916 * Must only be called from net_ratelimit()-ed paths. 917 * 918 * Dumps whole packets if full_pkt, only headers otherwise. 919 */ 920 void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) 921 { 922 struct skb_shared_info *sh = skb_shinfo(skb); 923 struct net_device *dev = skb->dev; 924 struct sock *sk = skb->sk; 925 struct sk_buff *list_skb; 926 bool has_mac, has_trans; 927 int headroom, tailroom; 928 int i, len, seg_len; 929 930 if (full_pkt) 931 len = skb->len; 932 else 933 len = min_t(int, skb->len, MAX_HEADER + 128); 934 935 headroom = skb_headroom(skb); 936 tailroom = skb_tailroom(skb); 937 938 has_mac = skb_mac_header_was_set(skb); 939 has_trans = skb_transport_header_was_set(skb); 940 941 printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" 942 "mac=(%d,%d) net=(%d,%d) trans=%d\n" 943 "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" 944 "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" 945 "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", 946 level, skb->len, headroom, skb_headlen(skb), tailroom, 947 has_mac ? skb->mac_header : -1, 948 has_mac ? skb_mac_header_len(skb) : -1, 949 skb->network_header, 950 has_trans ? skb_network_header_len(skb) : -1, 951 has_trans ? 
skb->transport_header : -1, 952 sh->tx_flags, sh->nr_frags, 953 sh->gso_size, sh->gso_type, sh->gso_segs, 954 skb->csum, skb->ip_summed, skb->csum_complete_sw, 955 skb->csum_valid, skb->csum_level, 956 skb->hash, skb->sw_hash, skb->l4_hash, 957 ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); 958 959 if (dev) 960 printk("%sdev name=%s feat=%pNF\n", 961 level, dev->name, &dev->features); 962 if (sk) 963 printk("%ssk family=%hu type=%u proto=%u\n", 964 level, sk->sk_family, sk->sk_type, sk->sk_protocol); 965 966 if (full_pkt && headroom) 967 print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET, 968 16, 1, skb->head, headroom, false); 969 970 seg_len = min_t(int, skb_headlen(skb), len); 971 if (seg_len) 972 print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET, 973 16, 1, skb->data, seg_len, false); 974 len -= seg_len; 975 976 if (full_pkt && tailroom) 977 print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET, 978 16, 1, skb_tail_pointer(skb), tailroom, false); 979 980 for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { 981 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 982 u32 p_off, p_len, copied; 983 struct page *p; 984 u8 *vaddr; 985 986 skb_frag_foreach_page(frag, skb_frag_off(frag), 987 skb_frag_size(frag), p, p_off, p_len, 988 copied) { 989 seg_len = min_t(int, p_len, len); 990 vaddr = kmap_atomic(p); 991 print_hex_dump(level, "skb frag: ", 992 DUMP_PREFIX_OFFSET, 993 16, 1, vaddr + p_off, seg_len, false); 994 kunmap_atomic(vaddr); 995 len -= seg_len; 996 if (!len) 997 break; 998 } 999 } 1000 1001 if (full_pkt && skb_has_frag_list(skb)) { 1002 printk("skb fraglist:\n"); 1003 skb_walk_frags(skb, list_skb) 1004 skb_dump(level, list_skb, true); 1005 } 1006 } 1007 EXPORT_SYMBOL(skb_dump); 1008 1009 /** 1010 * skb_tx_error - report an sk_buff xmit error 1011 * @skb: buffer that triggered an error 1012 * 1013 * Report xmit error if a device callback is tracking this skb. 1014 * skb must be freed afterwards. 
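 *
 * Sketch of a hypothetical driver TX path (the DMA variables are made up
 * for illustration):
 *
 *	if (dma_mapping_error(dma_dev, mapping)) {
 *		skb_tx_error(skb);
 *		dev_kfree_skb_any(skb);
 *		return NETDEV_TX_OK;
 *	}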
 */
void skb_tx_error(struct sk_buff *skb)
{
	if (skb) {
		skb_zcopy_downgrade_managed(skb);
		skb_zcopy_clear(skb, true);
	}
}
EXPORT_SYMBOL(skb_tx_error);

#ifdef CONFIG_TRACEPOINTS
/**
 * consume_skb - free an skbuff
 * @skb: buffer to free
 *
 * Drop a ref to the buffer and free it if the usage count has hit zero.
 * Functions identically to kfree_skb(), but kfree_skb() assumes that the
 * frame is being dropped after a failure and notes that.
 */
void consume_skb(struct sk_buff *skb)
{
	if (!skb_unref(skb))
		return;

	trace_consume_skb(skb);
	__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
#endif

/**
 * __consume_stateless_skb - free an skbuff, assuming it is stateless
 * @skb: buffer to free
 *
 * Like consume_skb(), but this variant assumes that this is the last
 * skb reference and that all the head states have already been dropped.
 */
void __consume_stateless_skb(struct sk_buff *skb)
{
	trace_consume_skb(skb);
	skb_release_data(skb);
	kfree_skbmem(skb);
}

static void napi_skb_cache_put(struct sk_buff *skb)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	u32 i;

	kasan_poison_object_data(skbuff_head_cache, skb);
	nc->skb_cache[nc->skb_count++] = skb;

	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
		for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
			kasan_unpoison_object_data(skbuff_head_cache,
						   nc->skb_cache[i]);

		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF,
				     nc->skb_cache + NAPI_SKB_CACHE_HALF);
		nc->skb_count = NAPI_SKB_CACHE_HALF;
	}
}

void __kfree_skb_defer(struct sk_buff *skb)
{
	skb_release_all(skb);
	napi_skb_cache_put(skb);
}

void napi_skb_free_stolen_head(struct sk_buff *skb)
{
	if (unlikely(skb->slow_gro)) {
		nf_reset_ct(skb);
		skb_dst_drop(skb);
		skb_ext_put(skb);
		skb_orphan(skb);
		skb->slow_gro = 0;
	}
	napi_skb_cache_put(skb);
}

void napi_consume_skb(struct sk_buff *skb, int budget)
{
	/* A zero budget indicates that a non-NAPI context called us, like netpoll */
	if (unlikely(!budget)) {
		dev_consume_skb_any(skb);
		return;
	}

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());

	if (!skb_unref(skb))
		return;

	/* if reaching here SKB is ready to free */
	trace_consume_skb(skb);

	/* if SKB is a clone, don't handle this case */
	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
		__kfree_skb(skb);
		return;
	}

	skb_release_all(skb);
	napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);

/* Make sure a field is contained by the headers group */
#define CHECK_SKB_FIELD(field) \
	BUILD_BUG_ON(offsetof(struct sk_buff, field) !=		\
		     offsetof(struct sk_buff, headers.field));	\

static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	new->tstamp = old->tstamp;
	/* We do not copy old->sk */
	new->dev = old->dev;
	memcpy(new->cb, old->cb, sizeof(old->cb));
	skb_dst_copy(new, old);
	__skb_ext_copy(new, old);
	__nf_copy(new, old, false);

	/* Note : this field could be in the headers group.
1139 * It is not yet because we do not want to have a 16 bit hole 1140 */ 1141 new->queue_mapping = old->queue_mapping; 1142 1143 memcpy(&new->headers, &old->headers, sizeof(new->headers)); 1144 CHECK_SKB_FIELD(protocol); 1145 CHECK_SKB_FIELD(csum); 1146 CHECK_SKB_FIELD(hash); 1147 CHECK_SKB_FIELD(priority); 1148 CHECK_SKB_FIELD(skb_iif); 1149 CHECK_SKB_FIELD(vlan_proto); 1150 CHECK_SKB_FIELD(vlan_tci); 1151 CHECK_SKB_FIELD(transport_header); 1152 CHECK_SKB_FIELD(network_header); 1153 CHECK_SKB_FIELD(mac_header); 1154 CHECK_SKB_FIELD(inner_protocol); 1155 CHECK_SKB_FIELD(inner_transport_header); 1156 CHECK_SKB_FIELD(inner_network_header); 1157 CHECK_SKB_FIELD(inner_mac_header); 1158 CHECK_SKB_FIELD(mark); 1159 #ifdef CONFIG_NETWORK_SECMARK 1160 CHECK_SKB_FIELD(secmark); 1161 #endif 1162 #ifdef CONFIG_NET_RX_BUSY_POLL 1163 CHECK_SKB_FIELD(napi_id); 1164 #endif 1165 CHECK_SKB_FIELD(alloc_cpu); 1166 #ifdef CONFIG_XPS 1167 CHECK_SKB_FIELD(sender_cpu); 1168 #endif 1169 #ifdef CONFIG_NET_SCHED 1170 CHECK_SKB_FIELD(tc_index); 1171 #endif 1172 1173 } 1174 1175 /* 1176 * You should not add any new code to this function. Add it to 1177 * __copy_skb_header above instead. 1178 */ 1179 static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) 1180 { 1181 #define C(x) n->x = skb->x 1182 1183 n->next = n->prev = NULL; 1184 n->sk = NULL; 1185 __copy_skb_header(n, skb); 1186 1187 C(len); 1188 C(data_len); 1189 C(mac_len); 1190 n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; 1191 n->cloned = 1; 1192 n->nohdr = 0; 1193 n->peeked = 0; 1194 C(pfmemalloc); 1195 C(pp_recycle); 1196 n->destructor = NULL; 1197 C(tail); 1198 C(end); 1199 C(head); 1200 C(head_frag); 1201 C(data); 1202 C(truesize); 1203 refcount_set(&n->users, 1); 1204 1205 atomic_inc(&(skb_shinfo(skb)->dataref)); 1206 skb->cloned = 1; 1207 1208 return n; 1209 #undef C 1210 } 1211 1212 /** 1213 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg 1214 * @first: first sk_buff of the msg 1215 */ 1216 struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) 1217 { 1218 struct sk_buff *n; 1219 1220 n = alloc_skb(0, GFP_ATOMIC); 1221 if (!n) 1222 return NULL; 1223 1224 n->len = first->len; 1225 n->data_len = first->len; 1226 n->truesize = first->truesize; 1227 1228 skb_shinfo(n)->frag_list = first; 1229 1230 __copy_skb_header(n, first); 1231 n->destructor = NULL; 1232 1233 return n; 1234 } 1235 EXPORT_SYMBOL_GPL(alloc_skb_for_msg); 1236 1237 /** 1238 * skb_morph - morph one skb into another 1239 * @dst: the skb to receive the contents 1240 * @src: the skb to supply the contents 1241 * 1242 * This is identical to skb_clone except that the target skb is 1243 * supplied by the user. 1244 * 1245 * The target skb is returned upon exit. 1246 */ 1247 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) 1248 { 1249 skb_release_all(dst); 1250 return __skb_clone(dst, src); 1251 } 1252 EXPORT_SYMBOL_GPL(skb_morph); 1253 1254 int mm_account_pinned_pages(struct mmpin *mmp, size_t size) 1255 { 1256 unsigned long max_pg, num_pg, new_pg, old_pg; 1257 struct user_struct *user; 1258 1259 if (capable(CAP_IPC_LOCK) || !size) 1260 return 0; 1261 1262 num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ 1263 max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 1264 user = mmp->user ? 
: current_user(); 1265 1266 do { 1267 old_pg = atomic_long_read(&user->locked_vm); 1268 new_pg = old_pg + num_pg; 1269 if (new_pg > max_pg) 1270 return -ENOBUFS; 1271 } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) != 1272 old_pg); 1273 1274 if (!mmp->user) { 1275 mmp->user = get_uid(user); 1276 mmp->num_pg = num_pg; 1277 } else { 1278 mmp->num_pg += num_pg; 1279 } 1280 1281 return 0; 1282 } 1283 EXPORT_SYMBOL_GPL(mm_account_pinned_pages); 1284 1285 void mm_unaccount_pinned_pages(struct mmpin *mmp) 1286 { 1287 if (mmp->user) { 1288 atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); 1289 free_uid(mmp->user); 1290 } 1291 } 1292 EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); 1293 1294 static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) 1295 { 1296 struct ubuf_info_msgzc *uarg; 1297 struct sk_buff *skb; 1298 1299 WARN_ON_ONCE(!in_task()); 1300 1301 skb = sock_omalloc(sk, 0, GFP_KERNEL); 1302 if (!skb) 1303 return NULL; 1304 1305 BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); 1306 uarg = (void *)skb->cb; 1307 uarg->mmp.user = NULL; 1308 1309 if (mm_account_pinned_pages(&uarg->mmp, size)) { 1310 kfree_skb(skb); 1311 return NULL; 1312 } 1313 1314 uarg->ubuf.callback = msg_zerocopy_callback; 1315 uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; 1316 uarg->len = 1; 1317 uarg->bytelen = size; 1318 uarg->zerocopy = 1; 1319 uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; 1320 refcount_set(&uarg->ubuf.refcnt, 1); 1321 sock_hold(sk); 1322 1323 return &uarg->ubuf; 1324 } 1325 1326 static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) 1327 { 1328 return container_of((void *)uarg, struct sk_buff, cb); 1329 } 1330 1331 struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, 1332 struct ubuf_info *uarg) 1333 { 1334 if (uarg) { 1335 struct ubuf_info_msgzc *uarg_zc; 1336 const u32 byte_limit = 1 << 19; /* limit to a few TSO */ 1337 u32 bytelen, next; 1338 1339 /* there might be non MSG_ZEROCOPY users */ 1340 if (uarg->callback != msg_zerocopy_callback) 1341 return NULL; 1342 1343 /* realloc only when socket is locked (TCP, UDP cork), 1344 * so uarg->len and sk_zckey access is serialized 1345 */ 1346 if (!sock_owned_by_user(sk)) { 1347 WARN_ON_ONCE(1); 1348 return NULL; 1349 } 1350 1351 uarg_zc = uarg_to_msgzc(uarg); 1352 bytelen = uarg_zc->bytelen + size; 1353 if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) { 1354 /* TCP can create new skb to attach new uarg */ 1355 if (sk->sk_type == SOCK_STREAM) 1356 goto new_alloc; 1357 return NULL; 1358 } 1359 1360 next = (u32)atomic_read(&sk->sk_zckey); 1361 if ((u32)(uarg_zc->id + uarg_zc->len) == next) { 1362 if (mm_account_pinned_pages(&uarg_zc->mmp, size)) 1363 return NULL; 1364 uarg_zc->len++; 1365 uarg_zc->bytelen = bytelen; 1366 atomic_set(&sk->sk_zckey, ++next); 1367 1368 /* no extra ref when appending to datagram (MSG_MORE) */ 1369 if (sk->sk_type == SOCK_STREAM) 1370 net_zcopy_get(uarg); 1371 1372 return uarg; 1373 } 1374 } 1375 1376 new_alloc: 1377 return msg_zerocopy_alloc(sk, size); 1378 } 1379 EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); 1380 1381 static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) 1382 { 1383 struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); 1384 u32 old_lo, old_hi; 1385 u64 sum_len; 1386 1387 old_lo = serr->ee.ee_info; 1388 old_hi = serr->ee.ee_data; 1389 sum_len = old_hi - old_lo + 1ULL + len; 1390 1391 if (sum_len >= (1ULL << 32)) 1392 return false; 1393 1394 if (lo != old_hi + 1) 1395 return false; 1396 1397 
serr->ee.ee_data += len; 1398 return true; 1399 } 1400 1401 static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg) 1402 { 1403 struct sk_buff *tail, *skb = skb_from_uarg(uarg); 1404 struct sock_exterr_skb *serr; 1405 struct sock *sk = skb->sk; 1406 struct sk_buff_head *q; 1407 unsigned long flags; 1408 bool is_zerocopy; 1409 u32 lo, hi; 1410 u16 len; 1411 1412 mm_unaccount_pinned_pages(&uarg->mmp); 1413 1414 /* if !len, there was only 1 call, and it was aborted 1415 * so do not queue a completion notification 1416 */ 1417 if (!uarg->len || sock_flag(sk, SOCK_DEAD)) 1418 goto release; 1419 1420 len = uarg->len; 1421 lo = uarg->id; 1422 hi = uarg->id + len - 1; 1423 is_zerocopy = uarg->zerocopy; 1424 1425 serr = SKB_EXT_ERR(skb); 1426 memset(serr, 0, sizeof(*serr)); 1427 serr->ee.ee_errno = 0; 1428 serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; 1429 serr->ee.ee_data = hi; 1430 serr->ee.ee_info = lo; 1431 if (!is_zerocopy) 1432 serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; 1433 1434 q = &sk->sk_error_queue; 1435 spin_lock_irqsave(&q->lock, flags); 1436 tail = skb_peek_tail(q); 1437 if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || 1438 !skb_zerocopy_notify_extend(tail, lo, len)) { 1439 __skb_queue_tail(q, skb); 1440 skb = NULL; 1441 } 1442 spin_unlock_irqrestore(&q->lock, flags); 1443 1444 sk_error_report(sk); 1445 1446 release: 1447 consume_skb(skb); 1448 sock_put(sk); 1449 } 1450 1451 void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, 1452 bool success) 1453 { 1454 struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); 1455 1456 uarg_zc->zerocopy = uarg_zc->zerocopy & success; 1457 1458 if (refcount_dec_and_test(&uarg->refcnt)) 1459 __msg_zerocopy_callback(uarg_zc); 1460 } 1461 EXPORT_SYMBOL_GPL(msg_zerocopy_callback); 1462 1463 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) 1464 { 1465 struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk; 1466 1467 atomic_dec(&sk->sk_zckey); 1468 uarg_to_msgzc(uarg)->len--; 1469 1470 if (have_uref) 1471 msg_zerocopy_callback(NULL, uarg, true); 1472 } 1473 EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); 1474 1475 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, 1476 struct msghdr *msg, int len, 1477 struct ubuf_info *uarg) 1478 { 1479 struct ubuf_info *orig_uarg = skb_zcopy(skb); 1480 int err, orig_len = skb->len; 1481 1482 /* An skb can only point to one uarg. This edge case happens when 1483 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. 1484 */ 1485 if (orig_uarg && uarg != orig_uarg) 1486 return -EEXIST; 1487 1488 err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); 1489 if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { 1490 struct sock *save_sk = skb->sk; 1491 1492 /* Streams do not free skb on error. Reset to prev state. 
*/ 1493 iov_iter_revert(&msg->msg_iter, skb->len - orig_len); 1494 skb->sk = sk; 1495 ___pskb_trim(skb, orig_len); 1496 skb->sk = save_sk; 1497 return err; 1498 } 1499 1500 skb_zcopy_set(skb, uarg, NULL); 1501 return skb->len - orig_len; 1502 } 1503 EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); 1504 1505 void __skb_zcopy_downgrade_managed(struct sk_buff *skb) 1506 { 1507 int i; 1508 1509 skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; 1510 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 1511 skb_frag_ref(skb, i); 1512 } 1513 EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); 1514 1515 static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, 1516 gfp_t gfp_mask) 1517 { 1518 if (skb_zcopy(orig)) { 1519 if (skb_zcopy(nskb)) { 1520 /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ 1521 if (!gfp_mask) { 1522 WARN_ON_ONCE(1); 1523 return -ENOMEM; 1524 } 1525 if (skb_uarg(nskb) == skb_uarg(orig)) 1526 return 0; 1527 if (skb_copy_ubufs(nskb, GFP_ATOMIC)) 1528 return -EIO; 1529 } 1530 skb_zcopy_set(nskb, skb_uarg(orig), NULL); 1531 } 1532 return 0; 1533 } 1534 1535 /** 1536 * skb_copy_ubufs - copy userspace skb frags buffers to kernel 1537 * @skb: the skb to modify 1538 * @gfp_mask: allocation priority 1539 * 1540 * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. 1541 * It will copy all frags into kernel and drop the reference 1542 * to userspace pages. 1543 * 1544 * If this function is called from an interrupt gfp_mask() must be 1545 * %GFP_ATOMIC. 1546 * 1547 * Returns 0 on success or a negative error code on failure 1548 * to allocate kernel memory to copy to. 1549 */ 1550 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) 1551 { 1552 int num_frags = skb_shinfo(skb)->nr_frags; 1553 struct page *page, *head = NULL; 1554 int i, new_frags; 1555 u32 d_off; 1556 1557 if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) 1558 return -EINVAL; 1559 1560 if (!num_frags) 1561 goto release; 1562 1563 new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; 1564 for (i = 0; i < new_frags; i++) { 1565 page = alloc_page(gfp_mask); 1566 if (!page) { 1567 while (head) { 1568 struct page *next = (struct page *)page_private(head); 1569 put_page(head); 1570 head = next; 1571 } 1572 return -ENOMEM; 1573 } 1574 set_page_private(page, (unsigned long)head); 1575 head = page; 1576 } 1577 1578 page = head; 1579 d_off = 0; 1580 for (i = 0; i < num_frags; i++) { 1581 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 1582 u32 p_off, p_len, copied; 1583 struct page *p; 1584 u8 *vaddr; 1585 1586 skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), 1587 p, p_off, p_len, copied) { 1588 u32 copy, done = 0; 1589 vaddr = kmap_atomic(p); 1590 1591 while (done < p_len) { 1592 if (d_off == PAGE_SIZE) { 1593 d_off = 0; 1594 page = (struct page *)page_private(page); 1595 } 1596 copy = min_t(u32, PAGE_SIZE - d_off, p_len - done); 1597 memcpy(page_address(page) + d_off, 1598 vaddr + p_off + done, copy); 1599 done += copy; 1600 d_off += copy; 1601 } 1602 kunmap_atomic(vaddr); 1603 } 1604 } 1605 1606 /* skb frags release userspace buffers */ 1607 for (i = 0; i < num_frags; i++) 1608 skb_frag_unref(skb, i); 1609 1610 /* skb frags point to kernel buffers */ 1611 for (i = 0; i < new_frags - 1; i++) { 1612 __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); 1613 head = (struct page *)page_private(head); 1614 } 1615 __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); 1616 skb_shinfo(skb)->nr_frags = new_frags; 1617 1618 release: 1619 skb_zcopy_clear(skb, false); 1620 return 0; 1621 } 1622 
EXPORT_SYMBOL_GPL(skb_copy_ubufs); 1623 1624 /** 1625 * skb_clone - duplicate an sk_buff 1626 * @skb: buffer to clone 1627 * @gfp_mask: allocation priority 1628 * 1629 * Duplicate an &sk_buff. The new one is not owned by a socket. Both 1630 * copies share the same packet data but not structure. The new 1631 * buffer has a reference count of 1. If the allocation fails the 1632 * function returns %NULL otherwise the new buffer is returned. 1633 * 1634 * If this function is called from an interrupt gfp_mask() must be 1635 * %GFP_ATOMIC. 1636 */ 1637 1638 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) 1639 { 1640 struct sk_buff_fclones *fclones = container_of(skb, 1641 struct sk_buff_fclones, 1642 skb1); 1643 struct sk_buff *n; 1644 1645 if (skb_orphan_frags(skb, gfp_mask)) 1646 return NULL; 1647 1648 if (skb->fclone == SKB_FCLONE_ORIG && 1649 refcount_read(&fclones->fclone_ref) == 1) { 1650 n = &fclones->skb2; 1651 refcount_set(&fclones->fclone_ref, 2); 1652 n->fclone = SKB_FCLONE_CLONE; 1653 } else { 1654 if (skb_pfmemalloc(skb)) 1655 gfp_mask |= __GFP_MEMALLOC; 1656 1657 n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); 1658 if (!n) 1659 return NULL; 1660 1661 n->fclone = SKB_FCLONE_UNAVAILABLE; 1662 } 1663 1664 return __skb_clone(n, skb); 1665 } 1666 EXPORT_SYMBOL(skb_clone); 1667 1668 void skb_headers_offset_update(struct sk_buff *skb, int off) 1669 { 1670 /* Only adjust this if it actually is csum_start rather than csum */ 1671 if (skb->ip_summed == CHECKSUM_PARTIAL) 1672 skb->csum_start += off; 1673 /* {transport,network,mac}_header and tail are relative to skb->head */ 1674 skb->transport_header += off; 1675 skb->network_header += off; 1676 if (skb_mac_header_was_set(skb)) 1677 skb->mac_header += off; 1678 skb->inner_transport_header += off; 1679 skb->inner_network_header += off; 1680 skb->inner_mac_header += off; 1681 } 1682 EXPORT_SYMBOL(skb_headers_offset_update); 1683 1684 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) 1685 { 1686 __copy_skb_header(new, old); 1687 1688 skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; 1689 skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; 1690 skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; 1691 } 1692 EXPORT_SYMBOL(skb_copy_header); 1693 1694 static inline int skb_alloc_rx_flag(const struct sk_buff *skb) 1695 { 1696 if (skb_pfmemalloc(skb)) 1697 return SKB_ALLOC_RX; 1698 return 0; 1699 } 1700 1701 /** 1702 * skb_copy - create private copy of an sk_buff 1703 * @skb: buffer to copy 1704 * @gfp_mask: allocation priority 1705 * 1706 * Make a copy of both an &sk_buff and its data. This is used when the 1707 * caller wishes to modify the data and needs a private copy of the 1708 * data to alter. Returns %NULL on failure or the pointer to the buffer 1709 * on success. The returned buffer has a reference count of 1. 1710 * 1711 * As by-product this function converts non-linear &sk_buff to linear 1712 * one, so that &sk_buff becomes completely private and caller is allowed 1713 * to modify all the data of returned buffer. This means that this 1714 * function is not recommended for use in circumstances when only 1715 * header is going to be modified. Use pskb_copy() instead. 
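 *
 * Illustrative sketch only (error handling reduced to the bare minimum):
 *
 *	nskb = skb_copy(skb, GFP_ATOMIC);
 *	if (!nskb)
 *		return -ENOMEM;
 *
 * Since nskb is linear and private, its payload may then be rewritten
 * without affecting the original skb or any of its clones.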
1716 */ 1717 1718 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) 1719 { 1720 int headerlen = skb_headroom(skb); 1721 unsigned int size = skb_end_offset(skb) + skb->data_len; 1722 struct sk_buff *n = __alloc_skb(size, gfp_mask, 1723 skb_alloc_rx_flag(skb), NUMA_NO_NODE); 1724 1725 if (!n) 1726 return NULL; 1727 1728 /* Set the data pointer */ 1729 skb_reserve(n, headerlen); 1730 /* Set the tail pointer and length */ 1731 skb_put(n, skb->len); 1732 1733 BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); 1734 1735 skb_copy_header(n, skb); 1736 return n; 1737 } 1738 EXPORT_SYMBOL(skb_copy); 1739 1740 /** 1741 * __pskb_copy_fclone - create copy of an sk_buff with private head. 1742 * @skb: buffer to copy 1743 * @headroom: headroom of new skb 1744 * @gfp_mask: allocation priority 1745 * @fclone: if true allocate the copy of the skb from the fclone 1746 * cache instead of the head cache; it is recommended to set this 1747 * to true for the cases where the copy will likely be cloned 1748 * 1749 * Make a copy of both an &sk_buff and part of its data, located 1750 * in header. Fragmented data remain shared. This is used when 1751 * the caller wishes to modify only header of &sk_buff and needs 1752 * private copy of the header to alter. Returns %NULL on failure 1753 * or the pointer to the buffer on success. 1754 * The returned buffer has a reference count of 1. 1755 */ 1756 1757 struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, 1758 gfp_t gfp_mask, bool fclone) 1759 { 1760 unsigned int size = skb_headlen(skb) + headroom; 1761 int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); 1762 struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); 1763 1764 if (!n) 1765 goto out; 1766 1767 /* Set the data pointer */ 1768 skb_reserve(n, headroom); 1769 /* Set the tail pointer and length */ 1770 skb_put(n, skb_headlen(skb)); 1771 /* Copy the bytes */ 1772 skb_copy_from_linear_data(skb, n->data, n->len); 1773 1774 n->truesize += skb->data_len; 1775 n->data_len = skb->data_len; 1776 n->len = skb->len; 1777 1778 if (skb_shinfo(skb)->nr_frags) { 1779 int i; 1780 1781 if (skb_orphan_frags(skb, gfp_mask) || 1782 skb_zerocopy_clone(n, skb, gfp_mask)) { 1783 kfree_skb(n); 1784 n = NULL; 1785 goto out; 1786 } 1787 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 1788 skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; 1789 skb_frag_ref(skb, i); 1790 } 1791 skb_shinfo(n)->nr_frags = i; 1792 } 1793 1794 if (skb_has_frag_list(skb)) { 1795 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; 1796 skb_clone_fraglist(n); 1797 } 1798 1799 skb_copy_header(n, skb); 1800 out: 1801 return n; 1802 } 1803 EXPORT_SYMBOL(__pskb_copy_fclone); 1804 1805 /** 1806 * pskb_expand_head - reallocate header of &sk_buff 1807 * @skb: buffer to reallocate 1808 * @nhead: room to add at head 1809 * @ntail: room to add at tail 1810 * @gfp_mask: allocation priority 1811 * 1812 * Expands (or creates identical copy, if @nhead and @ntail are zero) 1813 * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have 1814 * reference count of 1. Returns zero in the case of success or error, 1815 * if expansion failed. In the last case, &sk_buff is not changed. 1816 * 1817 * All the pointers pointing into skb header may change and must be 1818 * reloaded after call to this function. 
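 *
 * A sketch of typical use ('needed' and the 'drop' label are hypothetical):
 *
 *	if (pskb_expand_head(skb, SKB_DATA_ALIGN(needed), 0, GFP_ATOMIC))
 *		goto drop;
 *	iph = ip_hdr(skb);
 *
 * Any header pointer cached before the call (such as an earlier ip_hdr()
 * result) is stale afterwards and must be re-read, as done above.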
1819 */ 1820 1821 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, 1822 gfp_t gfp_mask) 1823 { 1824 unsigned int osize = skb_end_offset(skb); 1825 unsigned int size = osize + nhead + ntail; 1826 long off; 1827 u8 *data; 1828 int i; 1829 1830 BUG_ON(nhead < 0); 1831 1832 BUG_ON(skb_shared(skb)); 1833 1834 skb_zcopy_downgrade_managed(skb); 1835 1836 if (skb_pfmemalloc(skb)) 1837 gfp_mask |= __GFP_MEMALLOC; 1838 1839 size = SKB_DATA_ALIGN(size); 1840 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 1841 size = kmalloc_size_roundup(size); 1842 data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); 1843 if (!data) 1844 goto nodata; 1845 size = SKB_WITH_OVERHEAD(size); 1846 1847 /* Copy only real data... and, alas, header. This should be 1848 * optimized for the cases when header is void. 1849 */ 1850 memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); 1851 1852 memcpy((struct skb_shared_info *)(data + size), 1853 skb_shinfo(skb), 1854 offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); 1855 1856 /* 1857 * if shinfo is shared we must drop the old head gracefully, but if it 1858 * is not we can just drop the old head and let the existing refcount 1859 * be since all we did is relocate the values 1860 */ 1861 if (skb_cloned(skb)) { 1862 if (skb_orphan_frags(skb, gfp_mask)) 1863 goto nofrags; 1864 if (skb_zcopy(skb)) 1865 refcount_inc(&skb_uarg(skb)->refcnt); 1866 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 1867 skb_frag_ref(skb, i); 1868 1869 if (skb_has_frag_list(skb)) 1870 skb_clone_fraglist(skb); 1871 1872 skb_release_data(skb); 1873 } else { 1874 skb_free_head(skb); 1875 } 1876 off = (data + nhead) - skb->head; 1877 1878 skb->head = data; 1879 skb->head_frag = 0; 1880 skb->data += off; 1881 1882 skb_set_end_offset(skb, size); 1883 #ifdef NET_SKBUFF_DATA_USES_OFFSET 1884 off = nhead; 1885 #endif 1886 skb->tail += off; 1887 skb_headers_offset_update(skb, nhead); 1888 skb->cloned = 0; 1889 skb->hdr_len = 0; 1890 skb->nohdr = 0; 1891 atomic_set(&skb_shinfo(skb)->dataref, 1); 1892 1893 skb_metadata_clear(skb); 1894 1895 /* It is not generally safe to change skb->truesize. 1896 * For the moment, we really care of rx path, or 1897 * when skb is orphaned (not attached to a socket). 
	 */
	if (!skb->sk || skb->destructor == sock_edemux)
		skb->truesize += size - osize;

	return 0;

nofrags:
	kfree(data);
nodata:
	return -ENOMEM;
}
EXPORT_SYMBOL(pskb_expand_head);

/* Make a private copy of skb with a writable head and some headroom */

struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
	struct sk_buff *skb2;
	int delta = headroom - skb_headroom(skb);

	if (delta <= 0)
		skb2 = pskb_copy(skb, GFP_ATOMIC);
	else {
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
					     GFP_ATOMIC)) {
			kfree_skb(skb2);
			skb2 = NULL;
		}
	}
	return skb2;
}
EXPORT_SYMBOL(skb_realloc_headroom);

int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
{
	unsigned int saved_end_offset, saved_truesize;
	struct skb_shared_info *shinfo;
	int res;

	saved_end_offset = skb_end_offset(skb);
	saved_truesize = skb->truesize;

	res = pskb_expand_head(skb, 0, 0, pri);
	if (res)
		return res;

	skb->truesize = saved_truesize;

	if (likely(skb_end_offset(skb) == saved_end_offset))
		return 0;

	shinfo = skb_shinfo(skb);

	/* We are about to change back skb->end,
	 * we need to move skb_shinfo() to its new location.
	 */
	memmove(skb->head + saved_end_offset,
		shinfo,
		offsetof(struct skb_shared_info, frags[shinfo->nr_frags]));

	skb_set_end_offset(skb, saved_end_offset);

	return 0;
}

/**
 * skb_expand_head - reallocate header of &sk_buff
 * @skb: buffer to reallocate
 * @headroom: needed headroom
 *
 * Unlike skb_realloc_headroom(), this one avoids allocating a new skb
 * when possible; it copies skb->sk to the new skb as needed
 * and frees the original skb in case of failure.
 *
 * It expects an increased headroom and generates a warning otherwise.
 */

struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
{
	int delta = headroom - skb_headroom(skb);
	int osize = skb_end_offset(skb);
	struct sock *sk = skb->sk;

	if (WARN_ONCE(delta <= 0,
		      "%s is expecting an increase in the headroom", __func__))
		return skb;

	delta = SKB_DATA_ALIGN(delta);
	/* pskb_expand_head() might crash, if skb is shared. */
	if (skb_shared(skb) || !is_skb_wmem(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

		if (unlikely(!nskb))
			goto fail;

		if (sk)
			skb_set_owner_w(nskb, sk);
		consume_skb(skb);
		skb = nskb;
	}
	if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC))
		goto fail;

	if (sk && is_skb_wmem(skb)) {
		delta = skb_end_offset(skb) - osize;
		refcount_add(delta, &sk->sk_wmem_alloc);
		skb->truesize += delta;
	}
	return skb;

fail:
	kfree_skb(skb);
	return NULL;
}
EXPORT_SYMBOL(skb_expand_head);

/**
 * skb_copy_expand - copy and expand sk_buff
 * @skb: buffer to copy
 * @newheadroom: new free bytes at head
 * @newtailroom: new free bytes at tail
 * @gfp_mask: allocation priority
 *
 * Make a copy of both an &sk_buff and its data and while doing so
 * allocate additional space.
2024 * 2025 * This is used when the caller wishes to modify the data and needs a 2026 * private copy of the data to alter as well as more space for new fields. 2027 * Returns %NULL on failure or the pointer to the buffer 2028 * on success. The returned buffer has a reference count of 1. 2029 * 2030 * You must pass %GFP_ATOMIC as the allocation priority if this function 2031 * is called from an interrupt. 2032 */ 2033 struct sk_buff *skb_copy_expand(const struct sk_buff *skb, 2034 int newheadroom, int newtailroom, 2035 gfp_t gfp_mask) 2036 { 2037 /* 2038 * Allocate the copy buffer 2039 */ 2040 struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, 2041 gfp_mask, skb_alloc_rx_flag(skb), 2042 NUMA_NO_NODE); 2043 int oldheadroom = skb_headroom(skb); 2044 int head_copy_len, head_copy_off; 2045 2046 if (!n) 2047 return NULL; 2048 2049 skb_reserve(n, newheadroom); 2050 2051 /* Set the tail pointer and length */ 2052 skb_put(n, skb->len); 2053 2054 head_copy_len = oldheadroom; 2055 head_copy_off = 0; 2056 if (newheadroom <= head_copy_len) 2057 head_copy_len = newheadroom; 2058 else 2059 head_copy_off = newheadroom - head_copy_len; 2060 2061 /* Copy the linear header and data. */ 2062 BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, 2063 skb->len + head_copy_len)); 2064 2065 skb_copy_header(n, skb); 2066 2067 skb_headers_offset_update(n, newheadroom - oldheadroom); 2068 2069 return n; 2070 } 2071 EXPORT_SYMBOL(skb_copy_expand); 2072 2073 /** 2074 * __skb_pad - zero pad the tail of an skb 2075 * @skb: buffer to pad 2076 * @pad: space to pad 2077 * @free_on_error: free buffer on error 2078 * 2079 * Ensure that a buffer is followed by a padding area that is zero 2080 * filled. Used by network drivers which may DMA or transfer data 2081 * beyond the buffer end onto the wire. 2082 * 2083 * May return error in out of memory cases. The skb is freed on error 2084 * if @free_on_error is true. 2085 */ 2086 2087 int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) 2088 { 2089 int err; 2090 int ntail; 2091 2092 /* If the skbuff is non linear tailroom is always zero.. */ 2093 if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { 2094 memset(skb->data+skb->len, 0, pad); 2095 return 0; 2096 } 2097 2098 ntail = skb->data_len + pad - (skb->end - skb->tail); 2099 if (likely(skb_cloned(skb) || ntail > 0)) { 2100 err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); 2101 if (unlikely(err)) 2102 goto free_skb; 2103 } 2104 2105 /* FIXME: The use of this function with non-linear skb's really needs 2106 * to be audited. 2107 */ 2108 err = skb_linearize(skb); 2109 if (unlikely(err)) 2110 goto free_skb; 2111 2112 memset(skb->data + skb->len, 0, pad); 2113 return 0; 2114 2115 free_skb: 2116 if (free_on_error) 2117 kfree_skb(skb); 2118 return err; 2119 } 2120 EXPORT_SYMBOL(__skb_pad); 2121 2122 /** 2123 * pskb_put - add data to the tail of a potentially fragmented buffer 2124 * @skb: start of the buffer to use 2125 * @tail: tail fragment of the buffer to use 2126 * @len: amount of data to add 2127 * 2128 * This function extends the used data area of the potentially 2129 * fragmented buffer. @tail must be the last fragment of @skb -- or 2130 * @skb itself. If this would exceed the total buffer size the kernel 2131 * will panic. A pointer to the first byte of the extra data is 2132 * returned. 
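 *
 * Usage sketch (hypothetical caller; assumes @tail is the last skb on
 * @skb's frag_list and has enough tailroom):
 *
 *	struct my_trailer *t = pskb_put(skb, tail, sizeof(*t));
 *
 *	t->magic = MY_TRAILER_MAGIC;
 *
 * Both @skb's and @tail's lengths are updated, so the composite length
 * stays consistent.  struct my_trailer and MY_TRAILER_MAGIC are
 * illustrative names only.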
2133 */ 2134 2135 void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) 2136 { 2137 if (tail != skb) { 2138 skb->data_len += len; 2139 skb->len += len; 2140 } 2141 return skb_put(tail, len); 2142 } 2143 EXPORT_SYMBOL_GPL(pskb_put); 2144 2145 /** 2146 * skb_put - add data to a buffer 2147 * @skb: buffer to use 2148 * @len: amount of data to add 2149 * 2150 * This function extends the used data area of the buffer. If this would 2151 * exceed the total buffer size the kernel will panic. A pointer to the 2152 * first byte of the extra data is returned. 2153 */ 2154 void *skb_put(struct sk_buff *skb, unsigned int len) 2155 { 2156 void *tmp = skb_tail_pointer(skb); 2157 SKB_LINEAR_ASSERT(skb); 2158 skb->tail += len; 2159 skb->len += len; 2160 if (unlikely(skb->tail > skb->end)) 2161 skb_over_panic(skb, len, __builtin_return_address(0)); 2162 return tmp; 2163 } 2164 EXPORT_SYMBOL(skb_put); 2165 2166 /** 2167 * skb_push - add data to the start of a buffer 2168 * @skb: buffer to use 2169 * @len: amount of data to add 2170 * 2171 * This function extends the used data area of the buffer at the buffer 2172 * start. If this would exceed the total buffer headroom the kernel will 2173 * panic. A pointer to the first byte of the extra data is returned. 2174 */ 2175 void *skb_push(struct sk_buff *skb, unsigned int len) 2176 { 2177 skb->data -= len; 2178 skb->len += len; 2179 if (unlikely(skb->data < skb->head)) 2180 skb_under_panic(skb, len, __builtin_return_address(0)); 2181 return skb->data; 2182 } 2183 EXPORT_SYMBOL(skb_push); 2184 2185 /** 2186 * skb_pull - remove data from the start of a buffer 2187 * @skb: buffer to use 2188 * @len: amount of data to remove 2189 * 2190 * This function removes data from the start of a buffer, returning 2191 * the memory to the headroom. A pointer to the next data in the buffer 2192 * is returned. Once the data has been pulled future pushes will overwrite 2193 * the old data. 2194 */ 2195 void *skb_pull(struct sk_buff *skb, unsigned int len) 2196 { 2197 return skb_pull_inline(skb, len); 2198 } 2199 EXPORT_SYMBOL(skb_pull); 2200 2201 /** 2202 * skb_pull_data - remove data from the start of a buffer returning its 2203 * original position. 2204 * @skb: buffer to use 2205 * @len: amount of data to remove 2206 * 2207 * This function removes data from the start of a buffer, returning 2208 * the memory to the headroom. A pointer to the original data in the buffer 2209 * is returned after checking if there is enough data to pull. Once the 2210 * data has been pulled future pushes will overwrite the old data. 2211 */ 2212 void *skb_pull_data(struct sk_buff *skb, size_t len) 2213 { 2214 void *data = skb->data; 2215 2216 if (skb->len < len) 2217 return NULL; 2218 2219 skb_pull(skb, len); 2220 2221 return data; 2222 } 2223 EXPORT_SYMBOL(skb_pull_data); 2224 2225 /** 2226 * skb_trim - remove end from a buffer 2227 * @skb: buffer to alter 2228 * @len: new length 2229 * 2230 * Cut the length of a buffer down by removing data from the tail. If 2231 * the buffer is already under the length specified it is not modified. 2232 * The skb must be linear. 2233 */ 2234 void skb_trim(struct sk_buff *skb, unsigned int len) 2235 { 2236 if (skb->len > len) 2237 __skb_trim(skb, len); 2238 } 2239 EXPORT_SYMBOL(skb_trim); 2240 2241 /* Trims skb to length len. It can change skb pointers. 
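 *
 * Callers normally go through the pskb_trim() wrapper rather than
 * calling this directly.  A hypothetical receive path stripping a
 * hardware-appended frame checksum would do:
 *
 *	if (pskb_trim(skb, skb->len - ETH_FCS_LEN))
 *		goto drop;
 *
 * where a non-zero return means the buffer could not be unshared
 * (allocation failure).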
2242 */ 2243 2244 int ___pskb_trim(struct sk_buff *skb, unsigned int len) 2245 { 2246 struct sk_buff **fragp; 2247 struct sk_buff *frag; 2248 int offset = skb_headlen(skb); 2249 int nfrags = skb_shinfo(skb)->nr_frags; 2250 int i; 2251 int err; 2252 2253 if (skb_cloned(skb) && 2254 unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) 2255 return err; 2256 2257 i = 0; 2258 if (offset >= len) 2259 goto drop_pages; 2260 2261 for (; i < nfrags; i++) { 2262 int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); 2263 2264 if (end < len) { 2265 offset = end; 2266 continue; 2267 } 2268 2269 skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); 2270 2271 drop_pages: 2272 skb_shinfo(skb)->nr_frags = i; 2273 2274 for (; i < nfrags; i++) 2275 skb_frag_unref(skb, i); 2276 2277 if (skb_has_frag_list(skb)) 2278 skb_drop_fraglist(skb); 2279 goto done; 2280 } 2281 2282 for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); 2283 fragp = &frag->next) { 2284 int end = offset + frag->len; 2285 2286 if (skb_shared(frag)) { 2287 struct sk_buff *nfrag; 2288 2289 nfrag = skb_clone(frag, GFP_ATOMIC); 2290 if (unlikely(!nfrag)) 2291 return -ENOMEM; 2292 2293 nfrag->next = frag->next; 2294 consume_skb(frag); 2295 frag = nfrag; 2296 *fragp = frag; 2297 } 2298 2299 if (end < len) { 2300 offset = end; 2301 continue; 2302 } 2303 2304 if (end > len && 2305 unlikely((err = pskb_trim(frag, len - offset)))) 2306 return err; 2307 2308 if (frag->next) 2309 skb_drop_list(&frag->next); 2310 break; 2311 } 2312 2313 done: 2314 if (len > skb_headlen(skb)) { 2315 skb->data_len -= skb->len - len; 2316 skb->len = len; 2317 } else { 2318 skb->len = len; 2319 skb->data_len = 0; 2320 skb_set_tail_pointer(skb, len); 2321 } 2322 2323 if (!skb->sk || skb->destructor == sock_edemux) 2324 skb_condense(skb); 2325 return 0; 2326 } 2327 EXPORT_SYMBOL(___pskb_trim); 2328 2329 /* Note : use pskb_trim_rcsum() instead of calling this directly 2330 */ 2331 int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) 2332 { 2333 if (skb->ip_summed == CHECKSUM_COMPLETE) { 2334 int delta = skb->len - len; 2335 2336 skb->csum = csum_block_sub(skb->csum, 2337 skb_checksum(skb, len, delta, 0), 2338 len); 2339 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { 2340 int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; 2341 int offset = skb_checksum_start_offset(skb) + skb->csum_offset; 2342 2343 if (offset + sizeof(__sum16) > hdlen) 2344 return -EINVAL; 2345 } 2346 return __pskb_trim(skb, len); 2347 } 2348 EXPORT_SYMBOL(pskb_trim_rcsum_slow); 2349 2350 /** 2351 * __pskb_pull_tail - advance tail of skb header 2352 * @skb: buffer to reallocate 2353 * @delta: number of bytes to advance tail 2354 * 2355 * The function makes a sense only on a fragmented &sk_buff, 2356 * it expands header moving its tail forward and copying necessary 2357 * data from fragmented part. 2358 * 2359 * &sk_buff MUST have reference count of 1. 2360 * 2361 * Returns %NULL (and &sk_buff does not change) if pull failed 2362 * or value of new tail of skb in the case of success. 2363 * 2364 * All the pointers pointing into skb header may change and must be 2365 * reloaded after call to this function. 2366 */ 2367 2368 /* Moves tail of skb head forward, copying data from fragmented part, 2369 * when it is necessary. 2370 * 1. It may fail due to malloc failure. 2371 * 2. It may change skb pointers. 2372 * 2373 * It is pretty complicated. Luckily, it is called only in exceptional cases. 
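 *
 * Most code reaches it indirectly, through helpers such as
 * pskb_may_pull(), roughly:
 *
 *	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 *		goto drop;
 *	iph = ip_hdr(skb);
 *
 * which only falls back to this function when the requested bytes are
 * not already in the linear area.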
2374 */ 2375 void *__pskb_pull_tail(struct sk_buff *skb, int delta) 2376 { 2377 /* If skb has not enough free space at tail, get new one 2378 * plus 128 bytes for future expansions. If we have enough 2379 * room at tail, reallocate without expansion only if skb is cloned. 2380 */ 2381 int i, k, eat = (skb->tail + delta) - skb->end; 2382 2383 if (eat > 0 || skb_cloned(skb)) { 2384 if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, 2385 GFP_ATOMIC)) 2386 return NULL; 2387 } 2388 2389 BUG_ON(skb_copy_bits(skb, skb_headlen(skb), 2390 skb_tail_pointer(skb), delta)); 2391 2392 /* Optimization: no fragments, no reasons to preestimate 2393 * size of pulled pages. Superb. 2394 */ 2395 if (!skb_has_frag_list(skb)) 2396 goto pull_pages; 2397 2398 /* Estimate size of pulled pages. */ 2399 eat = delta; 2400 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2401 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2402 2403 if (size >= eat) 2404 goto pull_pages; 2405 eat -= size; 2406 } 2407 2408 /* If we need update frag list, we are in troubles. 2409 * Certainly, it is possible to add an offset to skb data, 2410 * but taking into account that pulling is expected to 2411 * be very rare operation, it is worth to fight against 2412 * further bloating skb head and crucify ourselves here instead. 2413 * Pure masohism, indeed. 8)8) 2414 */ 2415 if (eat) { 2416 struct sk_buff *list = skb_shinfo(skb)->frag_list; 2417 struct sk_buff *clone = NULL; 2418 struct sk_buff *insp = NULL; 2419 2420 do { 2421 if (list->len <= eat) { 2422 /* Eaten as whole. */ 2423 eat -= list->len; 2424 list = list->next; 2425 insp = list; 2426 } else { 2427 /* Eaten partially. */ 2428 2429 if (skb_shared(list)) { 2430 /* Sucks! We need to fork list. :-( */ 2431 clone = skb_clone(list, GFP_ATOMIC); 2432 if (!clone) 2433 return NULL; 2434 insp = list->next; 2435 list = clone; 2436 } else { 2437 /* This may be pulled without 2438 * problems. */ 2439 insp = list; 2440 } 2441 if (!pskb_pull(list, eat)) { 2442 kfree_skb(clone); 2443 return NULL; 2444 } 2445 break; 2446 } 2447 } while (eat); 2448 2449 /* Free pulled out fragments. */ 2450 while ((list = skb_shinfo(skb)->frag_list) != insp) { 2451 skb_shinfo(skb)->frag_list = list->next; 2452 consume_skb(list); 2453 } 2454 /* And insert new clone at head. */ 2455 if (clone) { 2456 clone->next = list; 2457 skb_shinfo(skb)->frag_list = clone; 2458 } 2459 } 2460 /* Success! Now we may commit changes to skb data. */ 2461 2462 pull_pages: 2463 eat = delta; 2464 k = 0; 2465 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2466 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2467 2468 if (size <= eat) { 2469 skb_frag_unref(skb, i); 2470 eat -= size; 2471 } else { 2472 skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; 2473 2474 *frag = skb_shinfo(skb)->frags[i]; 2475 if (eat) { 2476 skb_frag_off_add(frag, eat); 2477 skb_frag_size_sub(frag, eat); 2478 if (!i) 2479 goto end; 2480 eat = 0; 2481 } 2482 k++; 2483 } 2484 } 2485 skb_shinfo(skb)->nr_frags = k; 2486 2487 end: 2488 skb->tail += delta; 2489 skb->data_len -= delta; 2490 2491 if (!skb->data_len) 2492 skb_zcopy_clear(skb, false); 2493 2494 return skb_tail_pointer(skb); 2495 } 2496 EXPORT_SYMBOL(__pskb_pull_tail); 2497 2498 /** 2499 * skb_copy_bits - copy bits from skb to kernel buffer 2500 * @skb: source skb 2501 * @offset: offset in source 2502 * @to: destination buffer 2503 * @len: number of bytes to copy 2504 * 2505 * Copy the specified number of bytes from the source skb to the 2506 * destination buffer. 2507 * 2508 * CAUTION ! 
: 2509 * If its prototype is ever changed, 2510 * check arch/{*}/net/{*}.S files, 2511 * since it is called from BPF assembly code. 2512 */ 2513 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) 2514 { 2515 int start = skb_headlen(skb); 2516 struct sk_buff *frag_iter; 2517 int i, copy; 2518 2519 if (offset > (int)skb->len - len) 2520 goto fault; 2521 2522 /* Copy header. */ 2523 if ((copy = start - offset) > 0) { 2524 if (copy > len) 2525 copy = len; 2526 skb_copy_from_linear_data_offset(skb, offset, to, copy); 2527 if ((len -= copy) == 0) 2528 return 0; 2529 offset += copy; 2530 to += copy; 2531 } 2532 2533 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2534 int end; 2535 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 2536 2537 WARN_ON(start > offset + len); 2538 2539 end = start + skb_frag_size(f); 2540 if ((copy = end - offset) > 0) { 2541 u32 p_off, p_len, copied; 2542 struct page *p; 2543 u8 *vaddr; 2544 2545 if (copy > len) 2546 copy = len; 2547 2548 skb_frag_foreach_page(f, 2549 skb_frag_off(f) + offset - start, 2550 copy, p, p_off, p_len, copied) { 2551 vaddr = kmap_atomic(p); 2552 memcpy(to + copied, vaddr + p_off, p_len); 2553 kunmap_atomic(vaddr); 2554 } 2555 2556 if ((len -= copy) == 0) 2557 return 0; 2558 offset += copy; 2559 to += copy; 2560 } 2561 start = end; 2562 } 2563 2564 skb_walk_frags(skb, frag_iter) { 2565 int end; 2566 2567 WARN_ON(start > offset + len); 2568 2569 end = start + frag_iter->len; 2570 if ((copy = end - offset) > 0) { 2571 if (copy > len) 2572 copy = len; 2573 if (skb_copy_bits(frag_iter, offset - start, to, copy)) 2574 goto fault; 2575 if ((len -= copy) == 0) 2576 return 0; 2577 offset += copy; 2578 to += copy; 2579 } 2580 start = end; 2581 } 2582 2583 if (!len) 2584 return 0; 2585 2586 fault: 2587 return -EFAULT; 2588 } 2589 EXPORT_SYMBOL(skb_copy_bits); 2590 2591 /* 2592 * Callback from splice_to_pipe(), if we need to release some pages 2593 * at the end of the spd in case we error'ed out in filling the pipe. 2594 */ 2595 static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) 2596 { 2597 put_page(spd->pages[i]); 2598 } 2599 2600 static struct page *linear_to_page(struct page *page, unsigned int *len, 2601 unsigned int *offset, 2602 struct sock *sk) 2603 { 2604 struct page_frag *pfrag = sk_page_frag(sk); 2605 2606 if (!sk_page_frag_refill(sk, pfrag)) 2607 return NULL; 2608 2609 *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); 2610 2611 memcpy(page_address(pfrag->page) + pfrag->offset, 2612 page_address(page) + *offset, *len); 2613 *offset = pfrag->offset; 2614 pfrag->offset += *len; 2615 2616 return pfrag->page; 2617 } 2618 2619 static bool spd_can_coalesce(const struct splice_pipe_desc *spd, 2620 struct page *page, 2621 unsigned int offset) 2622 { 2623 return spd->nr_pages && 2624 spd->pages[spd->nr_pages - 1] == page && 2625 (spd->partial[spd->nr_pages - 1].offset + 2626 spd->partial[spd->nr_pages - 1].len == offset); 2627 } 2628 2629 /* 2630 * Fill page/offset/length into spd, if it can hold more pages. 
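 *
 * Returns true when no more data can be accepted, either because the
 * descriptor already holds MAX_SKB_FRAGS pages or because the bounce
 * copy of linear data failed; returns false when the page was recorded
 * or merged into the previous partial entry, so the callers' loops know
 * whether to keep walking the skb.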
2631 */ 2632 static bool spd_fill_page(struct splice_pipe_desc *spd, 2633 struct pipe_inode_info *pipe, struct page *page, 2634 unsigned int *len, unsigned int offset, 2635 bool linear, 2636 struct sock *sk) 2637 { 2638 if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) 2639 return true; 2640 2641 if (linear) { 2642 page = linear_to_page(page, len, &offset, sk); 2643 if (!page) 2644 return true; 2645 } 2646 if (spd_can_coalesce(spd, page, offset)) { 2647 spd->partial[spd->nr_pages - 1].len += *len; 2648 return false; 2649 } 2650 get_page(page); 2651 spd->pages[spd->nr_pages] = page; 2652 spd->partial[spd->nr_pages].len = *len; 2653 spd->partial[spd->nr_pages].offset = offset; 2654 spd->nr_pages++; 2655 2656 return false; 2657 } 2658 2659 static bool __splice_segment(struct page *page, unsigned int poff, 2660 unsigned int plen, unsigned int *off, 2661 unsigned int *len, 2662 struct splice_pipe_desc *spd, bool linear, 2663 struct sock *sk, 2664 struct pipe_inode_info *pipe) 2665 { 2666 if (!*len) 2667 return true; 2668 2669 /* skip this segment if already processed */ 2670 if (*off >= plen) { 2671 *off -= plen; 2672 return false; 2673 } 2674 2675 /* ignore any bits we already processed */ 2676 poff += *off; 2677 plen -= *off; 2678 *off = 0; 2679 2680 do { 2681 unsigned int flen = min(*len, plen); 2682 2683 if (spd_fill_page(spd, pipe, page, &flen, poff, 2684 linear, sk)) 2685 return true; 2686 poff += flen; 2687 plen -= flen; 2688 *len -= flen; 2689 } while (*len && plen); 2690 2691 return false; 2692 } 2693 2694 /* 2695 * Map linear and fragment data from the skb to spd. It reports true if the 2696 * pipe is full or if we already spliced the requested length. 2697 */ 2698 static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, 2699 unsigned int *offset, unsigned int *len, 2700 struct splice_pipe_desc *spd, struct sock *sk) 2701 { 2702 int seg; 2703 struct sk_buff *iter; 2704 2705 /* map the linear part : 2706 * If skb->head_frag is set, this 'linear' part is backed by a 2707 * fragment, and if the head is not shared with any clones then 2708 * we can avoid a copy since we own the head portion of this page. 2709 */ 2710 if (__splice_segment(virt_to_page(skb->data), 2711 (unsigned long) skb->data & (PAGE_SIZE - 1), 2712 skb_headlen(skb), 2713 offset, len, spd, 2714 skb_head_is_locked(skb), 2715 sk, pipe)) 2716 return true; 2717 2718 /* 2719 * then map the fragments 2720 */ 2721 for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { 2722 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; 2723 2724 if (__splice_segment(skb_frag_page(f), 2725 skb_frag_off(f), skb_frag_size(f), 2726 offset, len, spd, false, sk, pipe)) 2727 return true; 2728 } 2729 2730 skb_walk_frags(skb, iter) { 2731 if (*offset >= iter->len) { 2732 *offset -= iter->len; 2733 continue; 2734 } 2735 /* __skb_splice_bits() only fails if the output has no room 2736 * left, so no point in going over the frag_list for the error 2737 * case. 2738 */ 2739 if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) 2740 return true; 2741 } 2742 2743 return false; 2744 } 2745 2746 /* 2747 * Map data from the skb to a pipe. Should handle both the linear part, 2748 * the fragments, and the frag list. 
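 *
 * Note that the pipe descriptor built below holds at most MAX_SKB_FRAGS
 * pages, so a single call moves a bounded amount of data and the caller
 * is expected to deal with anything left over.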
2749 */ 2750 int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, 2751 struct pipe_inode_info *pipe, unsigned int tlen, 2752 unsigned int flags) 2753 { 2754 struct partial_page partial[MAX_SKB_FRAGS]; 2755 struct page *pages[MAX_SKB_FRAGS]; 2756 struct splice_pipe_desc spd = { 2757 .pages = pages, 2758 .partial = partial, 2759 .nr_pages_max = MAX_SKB_FRAGS, 2760 .ops = &nosteal_pipe_buf_ops, 2761 .spd_release = sock_spd_release, 2762 }; 2763 int ret = 0; 2764 2765 __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); 2766 2767 if (spd.nr_pages) 2768 ret = splice_to_pipe(pipe, &spd); 2769 2770 return ret; 2771 } 2772 EXPORT_SYMBOL_GPL(skb_splice_bits); 2773 2774 static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, 2775 struct kvec *vec, size_t num, size_t size) 2776 { 2777 struct socket *sock = sk->sk_socket; 2778 2779 if (!sock) 2780 return -EINVAL; 2781 return kernel_sendmsg(sock, msg, vec, num, size); 2782 } 2783 2784 static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, 2785 size_t size, int flags) 2786 { 2787 struct socket *sock = sk->sk_socket; 2788 2789 if (!sock) 2790 return -EINVAL; 2791 return kernel_sendpage(sock, page, offset, size, flags); 2792 } 2793 2794 typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, 2795 struct kvec *vec, size_t num, size_t size); 2796 typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, 2797 size_t size, int flags); 2798 static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, 2799 int len, sendmsg_func sendmsg, sendpage_func sendpage) 2800 { 2801 unsigned int orig_len = len; 2802 struct sk_buff *head = skb; 2803 unsigned short fragidx; 2804 int slen, ret; 2805 2806 do_frag_list: 2807 2808 /* Deal with head data */ 2809 while (offset < skb_headlen(skb) && len) { 2810 struct kvec kv; 2811 struct msghdr msg; 2812 2813 slen = min_t(int, len, skb_headlen(skb) - offset); 2814 kv.iov_base = skb->data + offset; 2815 kv.iov_len = slen; 2816 memset(&msg, 0, sizeof(msg)); 2817 msg.msg_flags = MSG_DONTWAIT; 2818 2819 ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, 2820 sendmsg_unlocked, sk, &msg, &kv, 1, slen); 2821 if (ret <= 0) 2822 goto error; 2823 2824 offset += ret; 2825 len -= ret; 2826 } 2827 2828 /* All the data was skb head? 
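 * If so, len has already dropped to zero and we can return the number
 * of bytes sent so far.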
*/ 2829 if (!len) 2830 goto out; 2831 2832 /* Make offset relative to start of frags */ 2833 offset -= skb_headlen(skb); 2834 2835 /* Find where we are in frag list */ 2836 for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 2837 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 2838 2839 if (offset < skb_frag_size(frag)) 2840 break; 2841 2842 offset -= skb_frag_size(frag); 2843 } 2844 2845 for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 2846 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 2847 2848 slen = min_t(size_t, len, skb_frag_size(frag) - offset); 2849 2850 while (slen) { 2851 ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, 2852 sendpage_unlocked, sk, 2853 skb_frag_page(frag), 2854 skb_frag_off(frag) + offset, 2855 slen, MSG_DONTWAIT); 2856 if (ret <= 0) 2857 goto error; 2858 2859 len -= ret; 2860 offset += ret; 2861 slen -= ret; 2862 } 2863 2864 offset = 0; 2865 } 2866 2867 if (len) { 2868 /* Process any frag lists */ 2869 2870 if (skb == head) { 2871 if (skb_has_frag_list(skb)) { 2872 skb = skb_shinfo(skb)->frag_list; 2873 goto do_frag_list; 2874 } 2875 } else if (skb->next) { 2876 skb = skb->next; 2877 goto do_frag_list; 2878 } 2879 } 2880 2881 out: 2882 return orig_len - len; 2883 2884 error: 2885 return orig_len == len ? ret : orig_len - len; 2886 } 2887 2888 /* Send skb data on a socket. Socket must be locked. */ 2889 int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, 2890 int len) 2891 { 2892 return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, 2893 kernel_sendpage_locked); 2894 } 2895 EXPORT_SYMBOL_GPL(skb_send_sock_locked); 2896 2897 /* Send skb data on a socket. Socket must be unlocked. */ 2898 int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) 2899 { 2900 return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 2901 sendpage_unlocked); 2902 } 2903 2904 /** 2905 * skb_store_bits - store bits from kernel buffer to skb 2906 * @skb: destination buffer 2907 * @offset: offset in destination 2908 * @from: source buffer 2909 * @len: number of bytes to copy 2910 * 2911 * Copy the specified number of bytes from the source buffer to the 2912 * destination skb. This function handles all the messy bits of 2913 * traversing fragment lists and such. 
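 *
 * The destination bytes must be writable by the caller (for instance
 * after skb_ensure_writable()), since this helper writes straight into
 * page fragments.  A hypothetical user patching a field at a known
 * offset might do:
 *
 *	__be16 v = htons(new_id);
 *
 *	if (skb_store_bits(skb, id_offset, &v, sizeof(v)))
 *		goto drop;
 *
 * where a non-zero return means the range fell outside the skb;
 * new_id and id_offset are illustrative only.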
2914 */ 2915 2916 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) 2917 { 2918 int start = skb_headlen(skb); 2919 struct sk_buff *frag_iter; 2920 int i, copy; 2921 2922 if (offset > (int)skb->len - len) 2923 goto fault; 2924 2925 if ((copy = start - offset) > 0) { 2926 if (copy > len) 2927 copy = len; 2928 skb_copy_to_linear_data_offset(skb, offset, from, copy); 2929 if ((len -= copy) == 0) 2930 return 0; 2931 offset += copy; 2932 from += copy; 2933 } 2934 2935 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2936 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2937 int end; 2938 2939 WARN_ON(start > offset + len); 2940 2941 end = start + skb_frag_size(frag); 2942 if ((copy = end - offset) > 0) { 2943 u32 p_off, p_len, copied; 2944 struct page *p; 2945 u8 *vaddr; 2946 2947 if (copy > len) 2948 copy = len; 2949 2950 skb_frag_foreach_page(frag, 2951 skb_frag_off(frag) + offset - start, 2952 copy, p, p_off, p_len, copied) { 2953 vaddr = kmap_atomic(p); 2954 memcpy(vaddr + p_off, from + copied, p_len); 2955 kunmap_atomic(vaddr); 2956 } 2957 2958 if ((len -= copy) == 0) 2959 return 0; 2960 offset += copy; 2961 from += copy; 2962 } 2963 start = end; 2964 } 2965 2966 skb_walk_frags(skb, frag_iter) { 2967 int end; 2968 2969 WARN_ON(start > offset + len); 2970 2971 end = start + frag_iter->len; 2972 if ((copy = end - offset) > 0) { 2973 if (copy > len) 2974 copy = len; 2975 if (skb_store_bits(frag_iter, offset - start, 2976 from, copy)) 2977 goto fault; 2978 if ((len -= copy) == 0) 2979 return 0; 2980 offset += copy; 2981 from += copy; 2982 } 2983 start = end; 2984 } 2985 if (!len) 2986 return 0; 2987 2988 fault: 2989 return -EFAULT; 2990 } 2991 EXPORT_SYMBOL(skb_store_bits); 2992 2993 /* Checksum skb data. */ 2994 __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, 2995 __wsum csum, const struct skb_checksum_ops *ops) 2996 { 2997 int start = skb_headlen(skb); 2998 int i, copy = start - offset; 2999 struct sk_buff *frag_iter; 3000 int pos = 0; 3001 3002 /* Checksum header. 
*/ 3003 if (copy > 0) { 3004 if (copy > len) 3005 copy = len; 3006 csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, 3007 skb->data + offset, copy, csum); 3008 if ((len -= copy) == 0) 3009 return csum; 3010 offset += copy; 3011 pos = copy; 3012 } 3013 3014 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3015 int end; 3016 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3017 3018 WARN_ON(start > offset + len); 3019 3020 end = start + skb_frag_size(frag); 3021 if ((copy = end - offset) > 0) { 3022 u32 p_off, p_len, copied; 3023 struct page *p; 3024 __wsum csum2; 3025 u8 *vaddr; 3026 3027 if (copy > len) 3028 copy = len; 3029 3030 skb_frag_foreach_page(frag, 3031 skb_frag_off(frag) + offset - start, 3032 copy, p, p_off, p_len, copied) { 3033 vaddr = kmap_atomic(p); 3034 csum2 = INDIRECT_CALL_1(ops->update, 3035 csum_partial_ext, 3036 vaddr + p_off, p_len, 0); 3037 kunmap_atomic(vaddr); 3038 csum = INDIRECT_CALL_1(ops->combine, 3039 csum_block_add_ext, csum, 3040 csum2, pos, p_len); 3041 pos += p_len; 3042 } 3043 3044 if (!(len -= copy)) 3045 return csum; 3046 offset += copy; 3047 } 3048 start = end; 3049 } 3050 3051 skb_walk_frags(skb, frag_iter) { 3052 int end; 3053 3054 WARN_ON(start > offset + len); 3055 3056 end = start + frag_iter->len; 3057 if ((copy = end - offset) > 0) { 3058 __wsum csum2; 3059 if (copy > len) 3060 copy = len; 3061 csum2 = __skb_checksum(frag_iter, offset - start, 3062 copy, 0, ops); 3063 csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, 3064 csum, csum2, pos, copy); 3065 if ((len -= copy) == 0) 3066 return csum; 3067 offset += copy; 3068 pos += copy; 3069 } 3070 start = end; 3071 } 3072 BUG_ON(len); 3073 3074 return csum; 3075 } 3076 EXPORT_SYMBOL(__skb_checksum); 3077 3078 __wsum skb_checksum(const struct sk_buff *skb, int offset, 3079 int len, __wsum csum) 3080 { 3081 const struct skb_checksum_ops ops = { 3082 .update = csum_partial_ext, 3083 .combine = csum_block_add_ext, 3084 }; 3085 3086 return __skb_checksum(skb, offset, len, csum, &ops); 3087 } 3088 EXPORT_SYMBOL(skb_checksum); 3089 3090 /* Both of above in one bottle. */ 3091 3092 __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, 3093 u8 *to, int len) 3094 { 3095 int start = skb_headlen(skb); 3096 int i, copy = start - offset; 3097 struct sk_buff *frag_iter; 3098 int pos = 0; 3099 __wsum csum = 0; 3100 3101 /* Copy header. 
*/ 3102 if (copy > 0) { 3103 if (copy > len) 3104 copy = len; 3105 csum = csum_partial_copy_nocheck(skb->data + offset, to, 3106 copy); 3107 if ((len -= copy) == 0) 3108 return csum; 3109 offset += copy; 3110 to += copy; 3111 pos = copy; 3112 } 3113 3114 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3115 int end; 3116 3117 WARN_ON(start > offset + len); 3118 3119 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 3120 if ((copy = end - offset) > 0) { 3121 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3122 u32 p_off, p_len, copied; 3123 struct page *p; 3124 __wsum csum2; 3125 u8 *vaddr; 3126 3127 if (copy > len) 3128 copy = len; 3129 3130 skb_frag_foreach_page(frag, 3131 skb_frag_off(frag) + offset - start, 3132 copy, p, p_off, p_len, copied) { 3133 vaddr = kmap_atomic(p); 3134 csum2 = csum_partial_copy_nocheck(vaddr + p_off, 3135 to + copied, 3136 p_len); 3137 kunmap_atomic(vaddr); 3138 csum = csum_block_add(csum, csum2, pos); 3139 pos += p_len; 3140 } 3141 3142 if (!(len -= copy)) 3143 return csum; 3144 offset += copy; 3145 to += copy; 3146 } 3147 start = end; 3148 } 3149 3150 skb_walk_frags(skb, frag_iter) { 3151 __wsum csum2; 3152 int end; 3153 3154 WARN_ON(start > offset + len); 3155 3156 end = start + frag_iter->len; 3157 if ((copy = end - offset) > 0) { 3158 if (copy > len) 3159 copy = len; 3160 csum2 = skb_copy_and_csum_bits(frag_iter, 3161 offset - start, 3162 to, copy); 3163 csum = csum_block_add(csum, csum2, pos); 3164 if ((len -= copy) == 0) 3165 return csum; 3166 offset += copy; 3167 to += copy; 3168 pos += copy; 3169 } 3170 start = end; 3171 } 3172 BUG_ON(len); 3173 return csum; 3174 } 3175 EXPORT_SYMBOL(skb_copy_and_csum_bits); 3176 3177 __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) 3178 { 3179 __sum16 sum; 3180 3181 sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); 3182 /* See comments in __skb_checksum_complete(). */ 3183 if (likely(!sum)) { 3184 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 3185 !skb->csum_complete_sw) 3186 netdev_rx_csum_fault(skb->dev, skb); 3187 } 3188 if (!skb_shared(skb)) 3189 skb->csum_valid = !sum; 3190 return sum; 3191 } 3192 EXPORT_SYMBOL(__skb_checksum_complete_head); 3193 3194 /* This function assumes skb->csum already holds pseudo header's checksum, 3195 * which has been changed from the hardware checksum, for example, by 3196 * __skb_checksum_validate_complete(). And, the original skb->csum must 3197 * have been validated unsuccessfully for CHECKSUM_COMPLETE case. 3198 * 3199 * It returns non-zero if the recomputed checksum is still invalid, otherwise 3200 * zero. The new checksum is stored back into skb->csum unless the skb is 3201 * shared. 3202 */ 3203 __sum16 __skb_checksum_complete(struct sk_buff *skb) 3204 { 3205 __wsum csum; 3206 __sum16 sum; 3207 3208 csum = skb_checksum(skb, 0, skb->len, 0); 3209 3210 sum = csum_fold(csum_add(skb->csum, csum)); 3211 /* This check is inverted, because we already knew the hardware 3212 * checksum is invalid before calling this function. So, if the 3213 * re-computed checksum is valid instead, then we have a mismatch 3214 * between the original skb->csum and skb_checksum(). This means either 3215 * the original hardware checksum is incorrect or we screw up skb->csum 3216 * when moving skb->data around. 
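 *
 * Put differently, a non-zero sum below is the expected result: the
 * packet really is corrupt.  A zero sum means our software computation
 * disagrees with what the hardware reported, which is exactly what
 * netdev_rx_csum_fault() is there to flag.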
3217 */ 3218 if (likely(!sum)) { 3219 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 3220 !skb->csum_complete_sw) 3221 netdev_rx_csum_fault(skb->dev, skb); 3222 } 3223 3224 if (!skb_shared(skb)) { 3225 /* Save full packet checksum */ 3226 skb->csum = csum; 3227 skb->ip_summed = CHECKSUM_COMPLETE; 3228 skb->csum_complete_sw = 1; 3229 skb->csum_valid = !sum; 3230 } 3231 3232 return sum; 3233 } 3234 EXPORT_SYMBOL(__skb_checksum_complete); 3235 3236 static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) 3237 { 3238 net_warn_ratelimited( 3239 "%s: attempt to compute crc32c without libcrc32c.ko\n", 3240 __func__); 3241 return 0; 3242 } 3243 3244 static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, 3245 int offset, int len) 3246 { 3247 net_warn_ratelimited( 3248 "%s: attempt to compute crc32c without libcrc32c.ko\n", 3249 __func__); 3250 return 0; 3251 } 3252 3253 static const struct skb_checksum_ops default_crc32c_ops = { 3254 .update = warn_crc32c_csum_update, 3255 .combine = warn_crc32c_csum_combine, 3256 }; 3257 3258 const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = 3259 &default_crc32c_ops; 3260 EXPORT_SYMBOL(crc32c_csum_stub); 3261 3262 /** 3263 * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() 3264 * @from: source buffer 3265 * 3266 * Calculates the amount of linear headroom needed in the 'to' skb passed 3267 * into skb_zerocopy(). 3268 */ 3269 unsigned int 3270 skb_zerocopy_headlen(const struct sk_buff *from) 3271 { 3272 unsigned int hlen = 0; 3273 3274 if (!from->head_frag || 3275 skb_headlen(from) < L1_CACHE_BYTES || 3276 skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { 3277 hlen = skb_headlen(from); 3278 if (!hlen) 3279 hlen = from->len; 3280 } 3281 3282 if (skb_has_frag_list(from)) 3283 hlen = from->len; 3284 3285 return hlen; 3286 } 3287 EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); 3288 3289 /** 3290 * skb_zerocopy - Zero copy skb to skb 3291 * @to: destination buffer 3292 * @from: source buffer 3293 * @len: number of bytes to copy from source buffer 3294 * @hlen: size of linear headroom in destination buffer 3295 * 3296 * Copies up to `len` bytes from `from` to `to` by creating references 3297 * to the frags in the source buffer. 3298 * 3299 * The `hlen` as calculated by skb_zerocopy_headlen() specifies the 3300 * headroom in the `to` buffer. 
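 *
 * A caller typically pairs the two helpers along these lines
 * (hypothetical sketch, error handling trimmed):
 *
 *	hlen = skb_zerocopy_headlen(from);
 *	to = alloc_skb(hlen, GFP_ATOMIC);
 *	if (to)
 *		err = skb_zerocopy(to, from, from->len, hlen);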
3301 * 3302 * Return value: 3303 * 0: everything is OK 3304 * -ENOMEM: couldn't orphan frags of @from due to lack of memory 3305 * -EFAULT: skb_copy_bits() found some problem with skb geometry 3306 */ 3307 int 3308 skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) 3309 { 3310 int i, j = 0; 3311 int plen = 0; /* length of skb->head fragment */ 3312 int ret; 3313 struct page *page; 3314 unsigned int offset; 3315 3316 BUG_ON(!from->head_frag && !hlen); 3317 3318 /* dont bother with small payloads */ 3319 if (len <= skb_tailroom(to)) 3320 return skb_copy_bits(from, 0, skb_put(to, len), len); 3321 3322 if (hlen) { 3323 ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); 3324 if (unlikely(ret)) 3325 return ret; 3326 len -= hlen; 3327 } else { 3328 plen = min_t(int, skb_headlen(from), len); 3329 if (plen) { 3330 page = virt_to_head_page(from->head); 3331 offset = from->data - (unsigned char *)page_address(page); 3332 __skb_fill_page_desc(to, 0, page, offset, plen); 3333 get_page(page); 3334 j = 1; 3335 len -= plen; 3336 } 3337 } 3338 3339 skb_len_add(to, len + plen); 3340 3341 if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { 3342 skb_tx_error(from); 3343 return -ENOMEM; 3344 } 3345 skb_zerocopy_clone(to, from, GFP_ATOMIC); 3346 3347 for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { 3348 int size; 3349 3350 if (!len) 3351 break; 3352 skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; 3353 size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), 3354 len); 3355 skb_frag_size_set(&skb_shinfo(to)->frags[j], size); 3356 len -= size; 3357 skb_frag_ref(to, j); 3358 j++; 3359 } 3360 skb_shinfo(to)->nr_frags = j; 3361 3362 return 0; 3363 } 3364 EXPORT_SYMBOL_GPL(skb_zerocopy); 3365 3366 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) 3367 { 3368 __wsum csum; 3369 long csstart; 3370 3371 if (skb->ip_summed == CHECKSUM_PARTIAL) 3372 csstart = skb_checksum_start_offset(skb); 3373 else 3374 csstart = skb_headlen(skb); 3375 3376 BUG_ON(csstart > skb_headlen(skb)); 3377 3378 skb_copy_from_linear_data(skb, to, csstart); 3379 3380 csum = 0; 3381 if (csstart != skb->len) 3382 csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, 3383 skb->len - csstart); 3384 3385 if (skb->ip_summed == CHECKSUM_PARTIAL) { 3386 long csstuff = csstart + skb->csum_offset; 3387 3388 *((__sum16 *)(to + csstuff)) = csum_fold(csum); 3389 } 3390 } 3391 EXPORT_SYMBOL(skb_copy_and_csum_dev); 3392 3393 /** 3394 * skb_dequeue - remove from the head of the queue 3395 * @list: list to dequeue from 3396 * 3397 * Remove the head of the list. The list lock is taken so the function 3398 * may be used safely with other locking list functions. The head item is 3399 * returned or %NULL if the list is empty. 3400 */ 3401 3402 struct sk_buff *skb_dequeue(struct sk_buff_head *list) 3403 { 3404 unsigned long flags; 3405 struct sk_buff *result; 3406 3407 spin_lock_irqsave(&list->lock, flags); 3408 result = __skb_dequeue(list); 3409 spin_unlock_irqrestore(&list->lock, flags); 3410 return result; 3411 } 3412 EXPORT_SYMBOL(skb_dequeue); 3413 3414 /** 3415 * skb_dequeue_tail - remove from the tail of the queue 3416 * @list: list to dequeue from 3417 * 3418 * Remove the tail of the list. The list lock is taken so the function 3419 * may be used safely with other locking list functions. The tail item is 3420 * returned or %NULL if the list is empty. 
3421 */ 3422 struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) 3423 { 3424 unsigned long flags; 3425 struct sk_buff *result; 3426 3427 spin_lock_irqsave(&list->lock, flags); 3428 result = __skb_dequeue_tail(list); 3429 spin_unlock_irqrestore(&list->lock, flags); 3430 return result; 3431 } 3432 EXPORT_SYMBOL(skb_dequeue_tail); 3433 3434 /** 3435 * skb_queue_purge - empty a list 3436 * @list: list to empty 3437 * 3438 * Delete all buffers on an &sk_buff list. Each buffer is removed from 3439 * the list and one reference dropped. This function takes the list 3440 * lock and is atomic with respect to other list locking functions. 3441 */ 3442 void skb_queue_purge(struct sk_buff_head *list) 3443 { 3444 struct sk_buff *skb; 3445 while ((skb = skb_dequeue(list)) != NULL) 3446 kfree_skb(skb); 3447 } 3448 EXPORT_SYMBOL(skb_queue_purge); 3449 3450 /** 3451 * skb_rbtree_purge - empty a skb rbtree 3452 * @root: root of the rbtree to empty 3453 * Return value: the sum of truesizes of all purged skbs. 3454 * 3455 * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from 3456 * the list and one reference dropped. This function does not take 3457 * any lock. Synchronization should be handled by the caller (e.g., TCP 3458 * out-of-order queue is protected by the socket lock). 3459 */ 3460 unsigned int skb_rbtree_purge(struct rb_root *root) 3461 { 3462 struct rb_node *p = rb_first(root); 3463 unsigned int sum = 0; 3464 3465 while (p) { 3466 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); 3467 3468 p = rb_next(p); 3469 rb_erase(&skb->rbnode, root); 3470 sum += skb->truesize; 3471 kfree_skb(skb); 3472 } 3473 return sum; 3474 } 3475 3476 /** 3477 * skb_queue_head - queue a buffer at the list head 3478 * @list: list to use 3479 * @newsk: buffer to queue 3480 * 3481 * Queue a buffer at the start of the list. This function takes the 3482 * list lock and can be used safely with other locking &sk_buff functions 3483 * safely. 3484 * 3485 * A buffer cannot be placed on two lists at the same time. 3486 */ 3487 void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) 3488 { 3489 unsigned long flags; 3490 3491 spin_lock_irqsave(&list->lock, flags); 3492 __skb_queue_head(list, newsk); 3493 spin_unlock_irqrestore(&list->lock, flags); 3494 } 3495 EXPORT_SYMBOL(skb_queue_head); 3496 3497 /** 3498 * skb_queue_tail - queue a buffer at the list tail 3499 * @list: list to use 3500 * @newsk: buffer to queue 3501 * 3502 * Queue a buffer at the tail of the list. This function takes the 3503 * list lock and can be used safely with other locking &sk_buff functions 3504 * safely. 3505 * 3506 * A buffer cannot be placed on two lists at the same time. 3507 */ 3508 void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) 3509 { 3510 unsigned long flags; 3511 3512 spin_lock_irqsave(&list->lock, flags); 3513 __skb_queue_tail(list, newsk); 3514 spin_unlock_irqrestore(&list->lock, flags); 3515 } 3516 EXPORT_SYMBOL(skb_queue_tail); 3517 3518 /** 3519 * skb_unlink - remove a buffer from a list 3520 * @skb: buffer to remove 3521 * @list: list to use 3522 * 3523 * Remove a packet from a list. The list locks are taken and this 3524 * function is atomic with respect to other list locked calls 3525 * 3526 * You must know what list the SKB is on. 
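 *
 * Passing the wrong @list here is a bug: the removal would run under
 * the wrong spinlock and corrupt the queue length accounting of the
 * list the buffer actually sits on.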
3527 */ 3528 void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) 3529 { 3530 unsigned long flags; 3531 3532 spin_lock_irqsave(&list->lock, flags); 3533 __skb_unlink(skb, list); 3534 spin_unlock_irqrestore(&list->lock, flags); 3535 } 3536 EXPORT_SYMBOL(skb_unlink); 3537 3538 /** 3539 * skb_append - append a buffer 3540 * @old: buffer to insert after 3541 * @newsk: buffer to insert 3542 * @list: list to use 3543 * 3544 * Place a packet after a given packet in a list. The list locks are taken 3545 * and this function is atomic with respect to other list locked calls. 3546 * A buffer cannot be placed on two lists at the same time. 3547 */ 3548 void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) 3549 { 3550 unsigned long flags; 3551 3552 spin_lock_irqsave(&list->lock, flags); 3553 __skb_queue_after(list, old, newsk); 3554 spin_unlock_irqrestore(&list->lock, flags); 3555 } 3556 EXPORT_SYMBOL(skb_append); 3557 3558 static inline void skb_split_inside_header(struct sk_buff *skb, 3559 struct sk_buff* skb1, 3560 const u32 len, const int pos) 3561 { 3562 int i; 3563 3564 skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), 3565 pos - len); 3566 /* And move data appendix as is. */ 3567 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 3568 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; 3569 3570 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; 3571 skb_shinfo(skb)->nr_frags = 0; 3572 skb1->data_len = skb->data_len; 3573 skb1->len += skb1->data_len; 3574 skb->data_len = 0; 3575 skb->len = len; 3576 skb_set_tail_pointer(skb, len); 3577 } 3578 3579 static inline void skb_split_no_header(struct sk_buff *skb, 3580 struct sk_buff* skb1, 3581 const u32 len, int pos) 3582 { 3583 int i, k = 0; 3584 const int nfrags = skb_shinfo(skb)->nr_frags; 3585 3586 skb_shinfo(skb)->nr_frags = 0; 3587 skb1->len = skb1->data_len = skb->len - len; 3588 skb->len = len; 3589 skb->data_len = len - pos; 3590 3591 for (i = 0; i < nfrags; i++) { 3592 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 3593 3594 if (pos + size > len) { 3595 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; 3596 3597 if (pos < len) { 3598 /* Split frag. 3599 * We have two variants in this case: 3600 * 1. Move all the frag to the second 3601 * part, if it is possible. F.e. 3602 * this approach is mandatory for TUX, 3603 * where splitting is expensive. 3604 * 2. Split is accurately. We make this. 3605 */ 3606 skb_frag_ref(skb, i); 3607 skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); 3608 skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); 3609 skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); 3610 skb_shinfo(skb)->nr_frags++; 3611 } 3612 k++; 3613 } else 3614 skb_shinfo(skb)->nr_frags++; 3615 pos += size; 3616 } 3617 skb_shinfo(skb1)->nr_frags = k; 3618 } 3619 3620 /** 3621 * skb_split - Split fragmented skb to two parts at length len. 3622 * @skb: the buffer to split 3623 * @skb1: the buffer to receive the second part 3624 * @len: new length for skb 3625 */ 3626 void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) 3627 { 3628 int pos = skb_headlen(skb); 3629 const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; 3630 3631 skb_zcopy_downgrade_managed(skb); 3632 3633 skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; 3634 skb_zerocopy_clone(skb1, skb, 0); 3635 if (len < pos) /* Split line is inside header. 
*/ 3636 skb_split_inside_header(skb, skb1, len, pos); 3637 else /* Second chunk has no header, nothing to copy. */ 3638 skb_split_no_header(skb, skb1, len, pos); 3639 } 3640 EXPORT_SYMBOL(skb_split); 3641 3642 /* Shifting from/to a cloned skb is a no-go. 3643 * 3644 * Caller cannot keep skb_shinfo related pointers past calling here! 3645 */ 3646 static int skb_prepare_for_shift(struct sk_buff *skb) 3647 { 3648 return skb_unclone_keeptruesize(skb, GFP_ATOMIC); 3649 } 3650 3651 /** 3652 * skb_shift - Shifts paged data partially from skb to another 3653 * @tgt: buffer into which tail data gets added 3654 * @skb: buffer from which the paged data comes from 3655 * @shiftlen: shift up to this many bytes 3656 * 3657 * Attempts to shift up to shiftlen worth of bytes, which may be less than 3658 * the length of the skb, from skb to tgt. Returns number bytes shifted. 3659 * It's up to caller to free skb if everything was shifted. 3660 * 3661 * If @tgt runs out of frags, the whole operation is aborted. 3662 * 3663 * Skb cannot include anything else but paged data while tgt is allowed 3664 * to have non-paged data as well. 3665 * 3666 * TODO: full sized shift could be optimized but that would need 3667 * specialized skb free'er to handle frags without up-to-date nr_frags. 3668 */ 3669 int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) 3670 { 3671 int from, to, merge, todo; 3672 skb_frag_t *fragfrom, *fragto; 3673 3674 BUG_ON(shiftlen > skb->len); 3675 3676 if (skb_headlen(skb)) 3677 return 0; 3678 if (skb_zcopy(tgt) || skb_zcopy(skb)) 3679 return 0; 3680 3681 todo = shiftlen; 3682 from = 0; 3683 to = skb_shinfo(tgt)->nr_frags; 3684 fragfrom = &skb_shinfo(skb)->frags[from]; 3685 3686 /* Actual merge is delayed until the point when we know we can 3687 * commit all, so that we don't have to undo partial changes 3688 */ 3689 if (!to || 3690 !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), 3691 skb_frag_off(fragfrom))) { 3692 merge = -1; 3693 } else { 3694 merge = to - 1; 3695 3696 todo -= skb_frag_size(fragfrom); 3697 if (todo < 0) { 3698 if (skb_prepare_for_shift(skb) || 3699 skb_prepare_for_shift(tgt)) 3700 return 0; 3701 3702 /* All previous frag pointers might be stale! 
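 * skb_prepare_for_shift() may have reallocated the shared info, which
 * is why fragfrom and fragto are refetched from skb_shinfo() right
 * below before being dereferenced again.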
*/ 3703 fragfrom = &skb_shinfo(skb)->frags[from]; 3704 fragto = &skb_shinfo(tgt)->frags[merge]; 3705 3706 skb_frag_size_add(fragto, shiftlen); 3707 skb_frag_size_sub(fragfrom, shiftlen); 3708 skb_frag_off_add(fragfrom, shiftlen); 3709 3710 goto onlymerged; 3711 } 3712 3713 from++; 3714 } 3715 3716 /* Skip full, not-fitting skb to avoid expensive operations */ 3717 if ((shiftlen == skb->len) && 3718 (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) 3719 return 0; 3720 3721 if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) 3722 return 0; 3723 3724 while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { 3725 if (to == MAX_SKB_FRAGS) 3726 return 0; 3727 3728 fragfrom = &skb_shinfo(skb)->frags[from]; 3729 fragto = &skb_shinfo(tgt)->frags[to]; 3730 3731 if (todo >= skb_frag_size(fragfrom)) { 3732 *fragto = *fragfrom; 3733 todo -= skb_frag_size(fragfrom); 3734 from++; 3735 to++; 3736 3737 } else { 3738 __skb_frag_ref(fragfrom); 3739 skb_frag_page_copy(fragto, fragfrom); 3740 skb_frag_off_copy(fragto, fragfrom); 3741 skb_frag_size_set(fragto, todo); 3742 3743 skb_frag_off_add(fragfrom, todo); 3744 skb_frag_size_sub(fragfrom, todo); 3745 todo = 0; 3746 3747 to++; 3748 break; 3749 } 3750 } 3751 3752 /* Ready to "commit" this state change to tgt */ 3753 skb_shinfo(tgt)->nr_frags = to; 3754 3755 if (merge >= 0) { 3756 fragfrom = &skb_shinfo(skb)->frags[0]; 3757 fragto = &skb_shinfo(tgt)->frags[merge]; 3758 3759 skb_frag_size_add(fragto, skb_frag_size(fragfrom)); 3760 __skb_frag_unref(fragfrom, skb->pp_recycle); 3761 } 3762 3763 /* Reposition in the original skb */ 3764 to = 0; 3765 while (from < skb_shinfo(skb)->nr_frags) 3766 skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; 3767 skb_shinfo(skb)->nr_frags = to; 3768 3769 BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); 3770 3771 onlymerged: 3772 /* Most likely the tgt won't ever need its checksum anymore, skb on 3773 * the other hand might need it if it needs to be resent 3774 */ 3775 tgt->ip_summed = CHECKSUM_PARTIAL; 3776 skb->ip_summed = CHECKSUM_PARTIAL; 3777 3778 skb_len_add(skb, -shiftlen); 3779 skb_len_add(tgt, shiftlen); 3780 3781 return shiftlen; 3782 } 3783 3784 /** 3785 * skb_prepare_seq_read - Prepare a sequential read of skb data 3786 * @skb: the buffer to read 3787 * @from: lower offset of data to be read 3788 * @to: upper offset of data to be read 3789 * @st: state variable 3790 * 3791 * Initializes the specified state variable. Must be called before 3792 * invoking skb_seq_read() for the first time. 3793 */ 3794 void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, 3795 unsigned int to, struct skb_seq_state *st) 3796 { 3797 st->lower_offset = from; 3798 st->upper_offset = to; 3799 st->root_skb = st->cur_skb = skb; 3800 st->frag_idx = st->stepped_offset = 0; 3801 st->frag_data = NULL; 3802 st->frag_off = 0; 3803 } 3804 EXPORT_SYMBOL(skb_prepare_seq_read); 3805 3806 /** 3807 * skb_seq_read - Sequentially read skb data 3808 * @consumed: number of bytes consumed by the caller so far 3809 * @data: destination pointer for data to be returned 3810 * @st: state variable 3811 * 3812 * Reads a block of skb data at @consumed relative to the 3813 * lower offset specified to skb_prepare_seq_read(). Assigns 3814 * the head of the data block to @data and returns the length 3815 * of the block or 0 if the end of the skb data or the upper 3816 * offset has been reached. 3817 * 3818 * The caller is not required to consume all of the data 3819 * returned, i.e. 
@consumed is typically set to the number 3820 * of bytes already consumed and the next call to 3821 * skb_seq_read() will return the remaining part of the block. 3822 * 3823 * Note 1: The size of each block of data returned can be arbitrary, 3824 * this limitation is the cost for zerocopy sequential 3825 * reads of potentially non linear data. 3826 * 3827 * Note 2: Fragment lists within fragments are not implemented 3828 * at the moment, state->root_skb could be replaced with 3829 * a stack for this purpose. 3830 */ 3831 unsigned int skb_seq_read(unsigned int consumed, const u8 **data, 3832 struct skb_seq_state *st) 3833 { 3834 unsigned int block_limit, abs_offset = consumed + st->lower_offset; 3835 skb_frag_t *frag; 3836 3837 if (unlikely(abs_offset >= st->upper_offset)) { 3838 if (st->frag_data) { 3839 kunmap_atomic(st->frag_data); 3840 st->frag_data = NULL; 3841 } 3842 return 0; 3843 } 3844 3845 next_skb: 3846 block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; 3847 3848 if (abs_offset < block_limit && !st->frag_data) { 3849 *data = st->cur_skb->data + (abs_offset - st->stepped_offset); 3850 return block_limit - abs_offset; 3851 } 3852 3853 if (st->frag_idx == 0 && !st->frag_data) 3854 st->stepped_offset += skb_headlen(st->cur_skb); 3855 3856 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { 3857 unsigned int pg_idx, pg_off, pg_sz; 3858 3859 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; 3860 3861 pg_idx = 0; 3862 pg_off = skb_frag_off(frag); 3863 pg_sz = skb_frag_size(frag); 3864 3865 if (skb_frag_must_loop(skb_frag_page(frag))) { 3866 pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; 3867 pg_off = offset_in_page(pg_off + st->frag_off); 3868 pg_sz = min_t(unsigned int, pg_sz - st->frag_off, 3869 PAGE_SIZE - pg_off); 3870 } 3871 3872 block_limit = pg_sz + st->stepped_offset; 3873 if (abs_offset < block_limit) { 3874 if (!st->frag_data) 3875 st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); 3876 3877 *data = (u8 *)st->frag_data + pg_off + 3878 (abs_offset - st->stepped_offset); 3879 3880 return block_limit - abs_offset; 3881 } 3882 3883 if (st->frag_data) { 3884 kunmap_atomic(st->frag_data); 3885 st->frag_data = NULL; 3886 } 3887 3888 st->stepped_offset += pg_sz; 3889 st->frag_off += pg_sz; 3890 if (st->frag_off == skb_frag_size(frag)) { 3891 st->frag_off = 0; 3892 st->frag_idx++; 3893 } 3894 } 3895 3896 if (st->frag_data) { 3897 kunmap_atomic(st->frag_data); 3898 st->frag_data = NULL; 3899 } 3900 3901 if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { 3902 st->cur_skb = skb_shinfo(st->root_skb)->frag_list; 3903 st->frag_idx = 0; 3904 goto next_skb; 3905 } else if (st->cur_skb->next) { 3906 st->cur_skb = st->cur_skb->next; 3907 st->frag_idx = 0; 3908 goto next_skb; 3909 } 3910 3911 return 0; 3912 } 3913 EXPORT_SYMBOL(skb_seq_read); 3914 3915 /** 3916 * skb_abort_seq_read - Abort a sequential read of skb data 3917 * @st: state variable 3918 * 3919 * Must be called if skb_seq_read() was not called until it 3920 * returned 0. 
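 *
 * A sketch of the overall pattern (hypothetical caller that may stop
 * before consuming everything; interesting() is a stand-in):
 *
 *	skb_prepare_seq_read(skb, 0, skb->len, &st);
 *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
 *		if (interesting(data, len))
 *			break;
 *		consumed += len;
 *	}
 *	skb_abort_seq_read(&st);
 *
 * The final call releases any page mapping skb_seq_read() still holds.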
3921 */ 3922 void skb_abort_seq_read(struct skb_seq_state *st) 3923 { 3924 if (st->frag_data) 3925 kunmap_atomic(st->frag_data); 3926 } 3927 EXPORT_SYMBOL(skb_abort_seq_read); 3928 3929 #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) 3930 3931 static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, 3932 struct ts_config *conf, 3933 struct ts_state *state) 3934 { 3935 return skb_seq_read(offset, text, TS_SKB_CB(state)); 3936 } 3937 3938 static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) 3939 { 3940 skb_abort_seq_read(TS_SKB_CB(state)); 3941 } 3942 3943 /** 3944 * skb_find_text - Find a text pattern in skb data 3945 * @skb: the buffer to look in 3946 * @from: search offset 3947 * @to: search limit 3948 * @config: textsearch configuration 3949 * 3950 * Finds a pattern in the skb data according to the specified 3951 * textsearch configuration. Use textsearch_next() to retrieve 3952 * subsequent occurrences of the pattern. Returns the offset 3953 * to the first occurrence or UINT_MAX if no match was found. 3954 */ 3955 unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, 3956 unsigned int to, struct ts_config *config) 3957 { 3958 struct ts_state state; 3959 unsigned int ret; 3960 3961 BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); 3962 3963 config->get_next_block = skb_ts_get_next_block; 3964 config->finish = skb_ts_finish; 3965 3966 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); 3967 3968 ret = textsearch_find(config, &state); 3969 return (ret <= to - from ? ret : UINT_MAX); 3970 } 3971 EXPORT_SYMBOL(skb_find_text); 3972 3973 int skb_append_pagefrags(struct sk_buff *skb, struct page *page, 3974 int offset, size_t size) 3975 { 3976 int i = skb_shinfo(skb)->nr_frags; 3977 3978 if (skb_can_coalesce(skb, i, page, offset)) { 3979 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); 3980 } else if (i < MAX_SKB_FRAGS) { 3981 skb_zcopy_downgrade_managed(skb); 3982 get_page(page); 3983 skb_fill_page_desc_noacc(skb, i, page, offset, size); 3984 } else { 3985 return -EMSGSIZE; 3986 } 3987 3988 return 0; 3989 } 3990 EXPORT_SYMBOL_GPL(skb_append_pagefrags); 3991 3992 /** 3993 * skb_pull_rcsum - pull skb and update receive checksum 3994 * @skb: buffer to update 3995 * @len: length of data pulled 3996 * 3997 * This function performs an skb_pull on the packet and updates 3998 * the CHECKSUM_COMPLETE checksum. It should be used on 3999 * receive path processing instead of skb_pull unless you know 4000 * that the checksum difference is zero (e.g., a valid IP header) 4001 * or you are setting ip_summed to CHECKSUM_NONE. 
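 *
 * A hypothetical decapsulation path would therefore use
 *
 *	skb_pull_rcsum(skb, sizeof(struct my_tunnel_hdr));
 *
 * rather than skb_pull(), so that a CHECKSUM_COMPLETE value keeps
 * matching the remaining payload (my_tunnel_hdr is illustrative only).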
4002 */ 4003 void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) 4004 { 4005 unsigned char *data = skb->data; 4006 4007 BUG_ON(len > skb->len); 4008 __skb_pull(skb, len); 4009 skb_postpull_rcsum(skb, data, len); 4010 return skb->data; 4011 } 4012 EXPORT_SYMBOL_GPL(skb_pull_rcsum); 4013 4014 static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) 4015 { 4016 skb_frag_t head_frag; 4017 struct page *page; 4018 4019 page = virt_to_head_page(frag_skb->head); 4020 __skb_frag_set_page(&head_frag, page); 4021 skb_frag_off_set(&head_frag, frag_skb->data - 4022 (unsigned char *)page_address(page)); 4023 skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); 4024 return head_frag; 4025 } 4026 4027 struct sk_buff *skb_segment_list(struct sk_buff *skb, 4028 netdev_features_t features, 4029 unsigned int offset) 4030 { 4031 struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; 4032 unsigned int tnl_hlen = skb_tnl_header_len(skb); 4033 unsigned int delta_truesize = 0; 4034 unsigned int delta_len = 0; 4035 struct sk_buff *tail = NULL; 4036 struct sk_buff *nskb, *tmp; 4037 int len_diff, err; 4038 4039 skb_push(skb, -skb_network_offset(skb) + offset); 4040 4041 skb_shinfo(skb)->frag_list = NULL; 4042 4043 do { 4044 nskb = list_skb; 4045 list_skb = list_skb->next; 4046 4047 err = 0; 4048 delta_truesize += nskb->truesize; 4049 if (skb_shared(nskb)) { 4050 tmp = skb_clone(nskb, GFP_ATOMIC); 4051 if (tmp) { 4052 consume_skb(nskb); 4053 nskb = tmp; 4054 err = skb_unclone(nskb, GFP_ATOMIC); 4055 } else { 4056 err = -ENOMEM; 4057 } 4058 } 4059 4060 if (!tail) 4061 skb->next = nskb; 4062 else 4063 tail->next = nskb; 4064 4065 if (unlikely(err)) { 4066 nskb->next = list_skb; 4067 goto err_linearize; 4068 } 4069 4070 tail = nskb; 4071 4072 delta_len += nskb->len; 4073 4074 skb_push(nskb, -skb_network_offset(nskb) + offset); 4075 4076 skb_release_head_state(nskb); 4077 len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); 4078 __copy_skb_header(nskb, skb); 4079 4080 skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); 4081 nskb->transport_header += len_diff; 4082 skb_copy_from_linear_data_offset(skb, -tnl_hlen, 4083 nskb->data - tnl_hlen, 4084 offset + tnl_hlen); 4085 4086 if (skb_needs_linearize(nskb, features) && 4087 __skb_linearize(nskb)) 4088 goto err_linearize; 4089 4090 } while (list_skb); 4091 4092 skb->truesize = skb->truesize - delta_truesize; 4093 skb->data_len = skb->data_len - delta_len; 4094 skb->len = skb->len - delta_len; 4095 4096 skb_gso_reset(skb); 4097 4098 skb->prev = tail; 4099 4100 if (skb_needs_linearize(skb, features) && 4101 __skb_linearize(skb)) 4102 goto err_linearize; 4103 4104 skb_get(skb); 4105 4106 return skb; 4107 4108 err_linearize: 4109 kfree_skb_list(skb->next); 4110 skb->next = NULL; 4111 return ERR_PTR(-ENOMEM); 4112 } 4113 EXPORT_SYMBOL_GPL(skb_segment_list); 4114 4115 /** 4116 * skb_segment - Perform protocol segmentation on skb. 4117 * @head_skb: buffer to segment 4118 * @features: features for the output path (see dev->features) 4119 * 4120 * This function performs segmentation on the given skb. It returns 4121 * a pointer to the first in a list of new skbs for the segments. 4122 * In case of error it returns ERR_PTR(err). 
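 *
 * Callers, typically *_gso_segment() implementations, check the result
 * with the usual pattern:
 *
 *	segs = skb_segment(skb, features);
 *	if (IS_ERR(segs))
 *		return segs;
 *
 * and then walk the returned segments via their ->next pointers.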
4123 */ 4124 struct sk_buff *skb_segment(struct sk_buff *head_skb, 4125 netdev_features_t features) 4126 { 4127 struct sk_buff *segs = NULL; 4128 struct sk_buff *tail = NULL; 4129 struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; 4130 skb_frag_t *frag = skb_shinfo(head_skb)->frags; 4131 unsigned int mss = skb_shinfo(head_skb)->gso_size; 4132 unsigned int doffset = head_skb->data - skb_mac_header(head_skb); 4133 struct sk_buff *frag_skb = head_skb; 4134 unsigned int offset = doffset; 4135 unsigned int tnl_hlen = skb_tnl_header_len(head_skb); 4136 unsigned int partial_segs = 0; 4137 unsigned int headroom; 4138 unsigned int len = head_skb->len; 4139 __be16 proto; 4140 bool csum, sg; 4141 int nfrags = skb_shinfo(head_skb)->nr_frags; 4142 int err = -ENOMEM; 4143 int i = 0; 4144 int pos; 4145 4146 if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) && 4147 (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) { 4148 /* gso_size is untrusted, and we have a frag_list with a linear 4149 * non head_frag head. 4150 * 4151 * (we assume checking the first list_skb member suffices; 4152 * i.e if either of the list_skb members have non head_frag 4153 * head, then the first one has too). 4154 * 4155 * If head_skb's headlen does not fit requested gso_size, it 4156 * means that the frag_list members do NOT terminate on exact 4157 * gso_size boundaries. Hence we cannot perform skb_frag_t page 4158 * sharing. Therefore we must fallback to copying the frag_list 4159 * skbs; we do so by disabling SG. 4160 */ 4161 if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) 4162 features &= ~NETIF_F_SG; 4163 } 4164 4165 __skb_push(head_skb, doffset); 4166 proto = skb_network_protocol(head_skb, NULL); 4167 if (unlikely(!proto)) 4168 return ERR_PTR(-EINVAL); 4169 4170 sg = !!(features & NETIF_F_SG); 4171 csum = !!can_checksum_protocol(features, proto); 4172 4173 if (sg && csum && (mss != GSO_BY_FRAGS)) { 4174 if (!(features & NETIF_F_GSO_PARTIAL)) { 4175 struct sk_buff *iter; 4176 unsigned int frag_len; 4177 4178 if (!list_skb || 4179 !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) 4180 goto normal; 4181 4182 /* If we get here then all the required 4183 * GSO features except frag_list are supported. 4184 * Try to split the SKB to multiple GSO SKBs 4185 * with no frag_list. 4186 * Currently we can do that only when the buffers don't 4187 * have a linear part and all the buffers except 4188 * the last are of the same length. 4189 */ 4190 frag_len = list_skb->len; 4191 skb_walk_frags(head_skb, iter) { 4192 if (frag_len != iter->len && iter->next) 4193 goto normal; 4194 if (skb_headlen(iter) && !iter->head_frag) 4195 goto normal; 4196 4197 len -= iter->len; 4198 } 4199 4200 if (len != frag_len) 4201 goto normal; 4202 } 4203 4204 /* GSO partial only requires that we trim off any excess that 4205 * doesn't fit into an MSS sized block, so take care of that 4206 * now. 
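		 * For example (illustrative numbers only): with len = 65000
		 * and mss = 1448, partial_segs = 44, mss becomes 63712, and
		 * the remaining 1288 bytes (65000 % 1448) end up in a final,
		 * shorter segment.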
4207 */ 4208 partial_segs = len / mss; 4209 if (partial_segs > 1) 4210 mss *= partial_segs; 4211 else 4212 partial_segs = 0; 4213 } 4214 4215 normal: 4216 headroom = skb_headroom(head_skb); 4217 pos = skb_headlen(head_skb); 4218 4219 do { 4220 struct sk_buff *nskb; 4221 skb_frag_t *nskb_frag; 4222 int hsize; 4223 int size; 4224 4225 if (unlikely(mss == GSO_BY_FRAGS)) { 4226 len = list_skb->len; 4227 } else { 4228 len = head_skb->len - offset; 4229 if (len > mss) 4230 len = mss; 4231 } 4232 4233 hsize = skb_headlen(head_skb) - offset; 4234 4235 if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && 4236 (skb_headlen(list_skb) == len || sg)) { 4237 BUG_ON(skb_headlen(list_skb) > len); 4238 4239 i = 0; 4240 nfrags = skb_shinfo(list_skb)->nr_frags; 4241 frag = skb_shinfo(list_skb)->frags; 4242 frag_skb = list_skb; 4243 pos += skb_headlen(list_skb); 4244 4245 while (pos < offset + len) { 4246 BUG_ON(i >= nfrags); 4247 4248 size = skb_frag_size(frag); 4249 if (pos + size > offset + len) 4250 break; 4251 4252 i++; 4253 pos += size; 4254 frag++; 4255 } 4256 4257 nskb = skb_clone(list_skb, GFP_ATOMIC); 4258 list_skb = list_skb->next; 4259 4260 if (unlikely(!nskb)) 4261 goto err; 4262 4263 if (unlikely(pskb_trim(nskb, len))) { 4264 kfree_skb(nskb); 4265 goto err; 4266 } 4267 4268 hsize = skb_end_offset(nskb); 4269 if (skb_cow_head(nskb, doffset + headroom)) { 4270 kfree_skb(nskb); 4271 goto err; 4272 } 4273 4274 nskb->truesize += skb_end_offset(nskb) - hsize; 4275 skb_release_head_state(nskb); 4276 __skb_push(nskb, doffset); 4277 } else { 4278 if (hsize < 0) 4279 hsize = 0; 4280 if (hsize > len || !sg) 4281 hsize = len; 4282 4283 nskb = __alloc_skb(hsize + doffset + headroom, 4284 GFP_ATOMIC, skb_alloc_rx_flag(head_skb), 4285 NUMA_NO_NODE); 4286 4287 if (unlikely(!nskb)) 4288 goto err; 4289 4290 skb_reserve(nskb, headroom); 4291 __skb_put(nskb, doffset); 4292 } 4293 4294 if (segs) 4295 tail->next = nskb; 4296 else 4297 segs = nskb; 4298 tail = nskb; 4299 4300 __copy_skb_header(nskb, head_skb); 4301 4302 skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); 4303 skb_reset_mac_len(nskb); 4304 4305 skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, 4306 nskb->data - tnl_hlen, 4307 doffset + tnl_hlen); 4308 4309 if (nskb->len == len + doffset) 4310 goto perform_csum_check; 4311 4312 if (!sg) { 4313 if (!csum) { 4314 if (!nskb->remcsum_offload) 4315 nskb->ip_summed = CHECKSUM_NONE; 4316 SKB_GSO_CB(nskb)->csum = 4317 skb_copy_and_csum_bits(head_skb, offset, 4318 skb_put(nskb, 4319 len), 4320 len); 4321 SKB_GSO_CB(nskb)->csum_start = 4322 skb_headroom(nskb) + doffset; 4323 } else { 4324 if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) 4325 goto err; 4326 } 4327 continue; 4328 } 4329 4330 nskb_frag = skb_shinfo(nskb)->frags; 4331 4332 skb_copy_from_linear_data_offset(head_skb, offset, 4333 skb_put(nskb, hsize), hsize); 4334 4335 skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & 4336 SKBFL_SHARED_FRAG; 4337 4338 if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || 4339 skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) 4340 goto err; 4341 4342 while (pos < offset + len) { 4343 if (i >= nfrags) { 4344 i = 0; 4345 nfrags = skb_shinfo(list_skb)->nr_frags; 4346 frag = skb_shinfo(list_skb)->frags; 4347 frag_skb = list_skb; 4348 if (!skb_headlen(list_skb)) { 4349 BUG_ON(!nfrags); 4350 } else { 4351 BUG_ON(!list_skb->head_frag); 4352 4353 /* to make room for head_frag. 
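				 * (Stepping i and frag back one slot below makes
				 *  slot -1 stand for list_skb's linear head, which
				 *  the "i < 0" case further down converts via
				 *  skb_head_frag_to_page_desc().)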
*/ 4354 i--; 4355 frag--; 4356 } 4357 if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || 4358 skb_zerocopy_clone(nskb, frag_skb, 4359 GFP_ATOMIC)) 4360 goto err; 4361 4362 list_skb = list_skb->next; 4363 } 4364 4365 if (unlikely(skb_shinfo(nskb)->nr_frags >= 4366 MAX_SKB_FRAGS)) { 4367 net_warn_ratelimited( 4368 "skb_segment: too many frags: %u %u\n", 4369 pos, mss); 4370 err = -EINVAL; 4371 goto err; 4372 } 4373 4374 *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; 4375 __skb_frag_ref(nskb_frag); 4376 size = skb_frag_size(nskb_frag); 4377 4378 if (pos < offset) { 4379 skb_frag_off_add(nskb_frag, offset - pos); 4380 skb_frag_size_sub(nskb_frag, offset - pos); 4381 } 4382 4383 skb_shinfo(nskb)->nr_frags++; 4384 4385 if (pos + size <= offset + len) { 4386 i++; 4387 frag++; 4388 pos += size; 4389 } else { 4390 skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); 4391 goto skip_fraglist; 4392 } 4393 4394 nskb_frag++; 4395 } 4396 4397 skip_fraglist: 4398 nskb->data_len = len - hsize; 4399 nskb->len += nskb->data_len; 4400 nskb->truesize += nskb->data_len; 4401 4402 perform_csum_check: 4403 if (!csum) { 4404 if (skb_has_shared_frag(nskb) && 4405 __skb_linearize(nskb)) 4406 goto err; 4407 4408 if (!nskb->remcsum_offload) 4409 nskb->ip_summed = CHECKSUM_NONE; 4410 SKB_GSO_CB(nskb)->csum = 4411 skb_checksum(nskb, doffset, 4412 nskb->len - doffset, 0); 4413 SKB_GSO_CB(nskb)->csum_start = 4414 skb_headroom(nskb) + doffset; 4415 } 4416 } while ((offset += len) < head_skb->len); 4417 4418 /* Some callers want to get the end of the list. 4419 * Put it in segs->prev to avoid walking the list. 4420 * (see validate_xmit_skb_list() for example) 4421 */ 4422 segs->prev = tail; 4423 4424 if (partial_segs) { 4425 struct sk_buff *iter; 4426 int type = skb_shinfo(head_skb)->gso_type; 4427 unsigned short gso_size = skb_shinfo(head_skb)->gso_size; 4428 4429 /* Update type to add partial and then remove dodgy if set */ 4430 type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; 4431 type &= ~SKB_GSO_DODGY; 4432 4433 /* Update GSO info and prepare to start updating headers on 4434 * our way back down the stack of protocols. 4435 */ 4436 for (iter = segs; iter; iter = iter->next) { 4437 skb_shinfo(iter)->gso_size = gso_size; 4438 skb_shinfo(iter)->gso_segs = partial_segs; 4439 skb_shinfo(iter)->gso_type = type; 4440 SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; 4441 } 4442 4443 if (tail->len - doffset <= gso_size) 4444 skb_shinfo(tail)->gso_size = 0; 4445 else if (tail != segs) 4446 skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); 4447 } 4448 4449 /* Following permits correct backpressure, for protocols 4450 * using skb_set_owner_w(). 4451 * Idea is to tranfert ownership from head_skb to last segment. 
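	 * That way the sk_wmem_alloc charge taken by skb_set_owner_w() is
	 * released only when the last segment is freed, rather than as soon
	 * as head_skb itself is consumed.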
4452 */ 4453 if (head_skb->destructor == sock_wfree) { 4454 swap(tail->truesize, head_skb->truesize); 4455 swap(tail->destructor, head_skb->destructor); 4456 swap(tail->sk, head_skb->sk); 4457 } 4458 return segs; 4459 4460 err: 4461 kfree_skb_list(segs); 4462 return ERR_PTR(err); 4463 } 4464 EXPORT_SYMBOL_GPL(skb_segment); 4465 4466 #ifdef CONFIG_SKB_EXTENSIONS 4467 #define SKB_EXT_ALIGN_VALUE 8 4468 #define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) 4469 4470 static const u8 skb_ext_type_len[] = { 4471 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 4472 [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), 4473 #endif 4474 #ifdef CONFIG_XFRM 4475 [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), 4476 #endif 4477 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) 4478 [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), 4479 #endif 4480 #if IS_ENABLED(CONFIG_MPTCP) 4481 [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), 4482 #endif 4483 #if IS_ENABLED(CONFIG_MCTP_FLOWS) 4484 [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), 4485 #endif 4486 }; 4487 4488 static __always_inline unsigned int skb_ext_total_length(void) 4489 { 4490 return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + 4491 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 4492 skb_ext_type_len[SKB_EXT_BRIDGE_NF] + 4493 #endif 4494 #ifdef CONFIG_XFRM 4495 skb_ext_type_len[SKB_EXT_SEC_PATH] + 4496 #endif 4497 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) 4498 skb_ext_type_len[TC_SKB_EXT] + 4499 #endif 4500 #if IS_ENABLED(CONFIG_MPTCP) 4501 skb_ext_type_len[SKB_EXT_MPTCP] + 4502 #endif 4503 #if IS_ENABLED(CONFIG_MCTP_FLOWS) 4504 skb_ext_type_len[SKB_EXT_MCTP] + 4505 #endif 4506 0; 4507 } 4508 4509 static void skb_extensions_init(void) 4510 { 4511 BUILD_BUG_ON(SKB_EXT_NUM >= 8); 4512 BUILD_BUG_ON(skb_ext_total_length() > 255); 4513 4514 skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", 4515 SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), 4516 0, 4517 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4518 NULL); 4519 } 4520 #else 4521 static void skb_extensions_init(void) {} 4522 #endif 4523 4524 void __init skb_init(void) 4525 { 4526 skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", 4527 sizeof(struct sk_buff), 4528 0, 4529 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4530 offsetof(struct sk_buff, cb), 4531 sizeof_field(struct sk_buff, cb), 4532 NULL); 4533 skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", 4534 sizeof(struct sk_buff_fclones), 4535 0, 4536 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4537 NULL); 4538 skb_extensions_init(); 4539 } 4540 4541 static int 4542 __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, 4543 unsigned int recursion_level) 4544 { 4545 int start = skb_headlen(skb); 4546 int i, copy = start - offset; 4547 struct sk_buff *frag_iter; 4548 int elt = 0; 4549 4550 if (unlikely(recursion_level >= 24)) 4551 return -EMSGSIZE; 4552 4553 if (copy > 0) { 4554 if (copy > len) 4555 copy = len; 4556 sg_set_buf(sg, skb->data + offset, copy); 4557 elt++; 4558 if ((len -= copy) == 0) 4559 return elt; 4560 offset += copy; 4561 } 4562 4563 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 4564 int end; 4565 4566 WARN_ON(start > offset + len); 4567 4568 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 4569 if ((copy = end - offset) > 0) { 4570 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 4571 if (unlikely(elt && sg_is_last(&sg[elt - 1]))) 4572 return -EMSGSIZE; 4573 4574 if (copy > len) 4575 copy = len; 4576 sg_set_page(&sg[elt], skb_frag_page(frag), copy, 4577 
			       skb_frag_off(frag) + offset - start);
			elt++;
			if (!(len -= copy))
				return elt;
			offset += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end, ret;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (unlikely(elt && sg_is_last(&sg[elt - 1])))
				return -EMSGSIZE;

			if (copy > len)
				copy = len;
			ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
					     copy, recursion_level + 1);
			if (unlikely(ret < 0))
				return ret;
			elt += ret;
			if ((len -= copy) == 0)
				return elt;
			offset += copy;
		}
		start = end;
	}
	BUG_ON(len);
	return elt;
}

/**
 * skb_to_sgvec - Fill a scatter-gather list from a socket buffer
 * @skb: Socket buffer containing the buffers to be mapped
 * @sg: The scatter-gather list to map into
 * @offset: The offset into the buffer's contents to start mapping
 * @len: Length of buffer space to be mapped
 *
 * Fill the specified scatter-gather list with mappings/pointers into a
 * region of the buffer space attached to a socket buffer. Returns either
 * the number of scatterlist items used, or -EMSGSIZE if the contents
 * could not fit.
 */
int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
{
	int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);

	if (nsg <= 0)
		return nsg;

	sg_mark_end(&sg[nsg - 1]);

	return nsg;
}
EXPORT_SYMBOL_GPL(skb_to_sgvec);

/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the
 * given sglist without marking the sg which contains the last skb data as the
 * end. So the caller can manipulate the sg list at will when padding new data
 * after the first call, without calling sg_unmark_end to extend the sg list.
 *
 * Scenario to use skb_to_sgvec_nomark:
 * 1. sg_init_table
 * 2. skb_to_sgvec_nomark(payload1)
 * 3. skb_to_sgvec_nomark(payload2)
 *
 * This is equivalent to:
 * 1. sg_init_table
 * 2. skb_to_sgvec(payload1)
 * 3. sg_unmark_end
 * 4. skb_to_sgvec(payload2)
 *
 * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
 * is preferable.
 */
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
			int offset, int len)
{
	return __skb_to_sgvec(skb, sg, offset, len, 0);
}
EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);


/**
 * skb_cow_data - Check that a socket buffer's data buffers are writable
 * @skb: The socket buffer to check.
 * @tailbits: Amount of trailing space to be added
 * @trailer: Returned pointer to the skb where the @tailbits space begins
 *
 * Make sure that the data buffers attached to a socket buffer are
 * writable. If they are not, private copies are made of the data buffers
 * and the socket buffer is set to use these instead.
 *
 * If @tailbits is given, make sure that there is space to write @tailbits
 * bytes of data beyond the current end of the socket buffer. @trailer will
 * be set to point to the skb in which this space begins.
 *
 * The number of scatterlist elements required to completely map the
 * COW'd and extended socket buffer will be returned.
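 *
 * Typical calling pattern (sketch only, loosely modelled on the IPsec users;
 * "tailroom_needed", "sg" and "nsg" are the caller's own, and error handling
 * is trimmed):
 *
 *	err = skb_cow_data(skb, tailroom_needed, &trailer);
 *	if (err < 0)
 *		return err;
 *	nsg = err;
 *	sg_init_table(sg, nsg);
 *	err = skb_to_sgvec(skb, sg, 0, skb->len);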
4682 */ 4683 int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) 4684 { 4685 int copyflag; 4686 int elt; 4687 struct sk_buff *skb1, **skb_p; 4688 4689 /* If skb is cloned or its head is paged, reallocate 4690 * head pulling out all the pages (pages are considered not writable 4691 * at the moment even if they are anonymous). 4692 */ 4693 if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && 4694 !__pskb_pull_tail(skb, __skb_pagelen(skb))) 4695 return -ENOMEM; 4696 4697 /* Easy case. Most of packets will go this way. */ 4698 if (!skb_has_frag_list(skb)) { 4699 /* A little of trouble, not enough of space for trailer. 4700 * This should not happen, when stack is tuned to generate 4701 * good frames. OK, on miss we reallocate and reserve even more 4702 * space, 128 bytes is fair. */ 4703 4704 if (skb_tailroom(skb) < tailbits && 4705 pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) 4706 return -ENOMEM; 4707 4708 /* Voila! */ 4709 *trailer = skb; 4710 return 1; 4711 } 4712 4713 /* Misery. We are in troubles, going to mincer fragments... */ 4714 4715 elt = 1; 4716 skb_p = &skb_shinfo(skb)->frag_list; 4717 copyflag = 0; 4718 4719 while ((skb1 = *skb_p) != NULL) { 4720 int ntail = 0; 4721 4722 /* The fragment is partially pulled by someone, 4723 * this can happen on input. Copy it and everything 4724 * after it. */ 4725 4726 if (skb_shared(skb1)) 4727 copyflag = 1; 4728 4729 /* If the skb is the last, worry about trailer. */ 4730 4731 if (skb1->next == NULL && tailbits) { 4732 if (skb_shinfo(skb1)->nr_frags || 4733 skb_has_frag_list(skb1) || 4734 skb_tailroom(skb1) < tailbits) 4735 ntail = tailbits + 128; 4736 } 4737 4738 if (copyflag || 4739 skb_cloned(skb1) || 4740 ntail || 4741 skb_shinfo(skb1)->nr_frags || 4742 skb_has_frag_list(skb1)) { 4743 struct sk_buff *skb2; 4744 4745 /* Fuck, we are miserable poor guys... */ 4746 if (ntail == 0) 4747 skb2 = skb_copy(skb1, GFP_ATOMIC); 4748 else 4749 skb2 = skb_copy_expand(skb1, 4750 skb_headroom(skb1), 4751 ntail, 4752 GFP_ATOMIC); 4753 if (unlikely(skb2 == NULL)) 4754 return -ENOMEM; 4755 4756 if (skb1->sk) 4757 skb_set_owner_w(skb2, skb1->sk); 4758 4759 /* Looking around. Are we still alive? 4760 * OK, link new skb, drop old one */ 4761 4762 skb2->next = skb1->next; 4763 *skb_p = skb2; 4764 kfree_skb(skb1); 4765 skb1 = skb2; 4766 } 4767 elt++; 4768 *trailer = skb1; 4769 skb_p = &skb1->next; 4770 } 4771 4772 return elt; 4773 } 4774 EXPORT_SYMBOL_GPL(skb_cow_data); 4775 4776 static void sock_rmem_free(struct sk_buff *skb) 4777 { 4778 struct sock *sk = skb->sk; 4779 4780 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 4781 } 4782 4783 static void skb_set_err_queue(struct sk_buff *skb) 4784 { 4785 /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. 4786 * So, it is safe to (mis)use it to mark skbs on the error queue. 
4787 */ 4788 skb->pkt_type = PACKET_OUTGOING; 4789 BUILD_BUG_ON(PACKET_OUTGOING == 0); 4790 } 4791 4792 /* 4793 * Note: We dont mem charge error packets (no sk_forward_alloc changes) 4794 */ 4795 int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) 4796 { 4797 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= 4798 (unsigned int)READ_ONCE(sk->sk_rcvbuf)) 4799 return -ENOMEM; 4800 4801 skb_orphan(skb); 4802 skb->sk = sk; 4803 skb->destructor = sock_rmem_free; 4804 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 4805 skb_set_err_queue(skb); 4806 4807 /* before exiting rcu section, make sure dst is refcounted */ 4808 skb_dst_force(skb); 4809 4810 skb_queue_tail(&sk->sk_error_queue, skb); 4811 if (!sock_flag(sk, SOCK_DEAD)) 4812 sk_error_report(sk); 4813 return 0; 4814 } 4815 EXPORT_SYMBOL(sock_queue_err_skb); 4816 4817 static bool is_icmp_err_skb(const struct sk_buff *skb) 4818 { 4819 return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || 4820 SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); 4821 } 4822 4823 struct sk_buff *sock_dequeue_err_skb(struct sock *sk) 4824 { 4825 struct sk_buff_head *q = &sk->sk_error_queue; 4826 struct sk_buff *skb, *skb_next = NULL; 4827 bool icmp_next = false; 4828 unsigned long flags; 4829 4830 spin_lock_irqsave(&q->lock, flags); 4831 skb = __skb_dequeue(q); 4832 if (skb && (skb_next = skb_peek(q))) { 4833 icmp_next = is_icmp_err_skb(skb_next); 4834 if (icmp_next) 4835 sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; 4836 } 4837 spin_unlock_irqrestore(&q->lock, flags); 4838 4839 if (is_icmp_err_skb(skb) && !icmp_next) 4840 sk->sk_err = 0; 4841 4842 if (skb_next) 4843 sk_error_report(sk); 4844 4845 return skb; 4846 } 4847 EXPORT_SYMBOL(sock_dequeue_err_skb); 4848 4849 /** 4850 * skb_clone_sk - create clone of skb, and take reference to socket 4851 * @skb: the skb to clone 4852 * 4853 * This function creates a clone of a buffer that holds a reference on 4854 * sk_refcnt. Buffers created via this function are meant to be 4855 * returned using sock_queue_err_skb, or free via kfree_skb. 4856 * 4857 * When passing buffers allocated with this function to sock_queue_err_skb 4858 * it is necessary to wrap the call with sock_hold/sock_put in order to 4859 * prevent the socket from being released prior to being enqueued on 4860 * the sk_error_queue. 4861 */ 4862 struct sk_buff *skb_clone_sk(struct sk_buff *skb) 4863 { 4864 struct sock *sk = skb->sk; 4865 struct sk_buff *clone; 4866 4867 if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) 4868 return NULL; 4869 4870 clone = skb_clone(skb, GFP_ATOMIC); 4871 if (!clone) { 4872 sock_put(sk); 4873 return NULL; 4874 } 4875 4876 clone->sk = sk; 4877 clone->destructor = sock_efree; 4878 4879 return clone; 4880 } 4881 EXPORT_SYMBOL(skb_clone_sk); 4882 4883 static void __skb_complete_tx_timestamp(struct sk_buff *skb, 4884 struct sock *sk, 4885 int tstype, 4886 bool opt_stats) 4887 { 4888 struct sock_exterr_skb *serr; 4889 int err; 4890 4891 BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); 4892 4893 serr = SKB_EXT_ERR(skb); 4894 memset(serr, 0, sizeof(*serr)); 4895 serr->ee.ee_errno = ENOMSG; 4896 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; 4897 serr->ee.ee_info = tstype; 4898 serr->opt_stats = opt_stats; 4899 serr->header.h4.iif = skb->dev ? 
skb->dev->ifindex : 0; 4900 if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { 4901 serr->ee.ee_data = skb_shinfo(skb)->tskey; 4902 if (sk_is_tcp(sk)) 4903 serr->ee.ee_data -= atomic_read(&sk->sk_tskey); 4904 } 4905 4906 err = sock_queue_err_skb(sk, skb); 4907 4908 if (err) 4909 kfree_skb(skb); 4910 } 4911 4912 static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) 4913 { 4914 bool ret; 4915 4916 if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) 4917 return true; 4918 4919 read_lock_bh(&sk->sk_callback_lock); 4920 ret = sk->sk_socket && sk->sk_socket->file && 4921 file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); 4922 read_unlock_bh(&sk->sk_callback_lock); 4923 return ret; 4924 } 4925 4926 void skb_complete_tx_timestamp(struct sk_buff *skb, 4927 struct skb_shared_hwtstamps *hwtstamps) 4928 { 4929 struct sock *sk = skb->sk; 4930 4931 if (!skb_may_tx_timestamp(sk, false)) 4932 goto err; 4933 4934 /* Take a reference to prevent skb_orphan() from freeing the socket, 4935 * but only if the socket refcount is not zero. 4936 */ 4937 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 4938 *skb_hwtstamps(skb) = *hwtstamps; 4939 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); 4940 sock_put(sk); 4941 return; 4942 } 4943 4944 err: 4945 kfree_skb(skb); 4946 } 4947 EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); 4948 4949 void __skb_tstamp_tx(struct sk_buff *orig_skb, 4950 const struct sk_buff *ack_skb, 4951 struct skb_shared_hwtstamps *hwtstamps, 4952 struct sock *sk, int tstype) 4953 { 4954 struct sk_buff *skb; 4955 bool tsonly, opt_stats = false; 4956 4957 if (!sk) 4958 return; 4959 4960 if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && 4961 skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) 4962 return; 4963 4964 tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; 4965 if (!skb_may_tx_timestamp(sk, tsonly)) 4966 return; 4967 4968 if (tsonly) { 4969 #ifdef CONFIG_INET 4970 if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && 4971 sk_is_tcp(sk)) { 4972 skb = tcp_get_timestamping_opt_stats(sk, orig_skb, 4973 ack_skb); 4974 opt_stats = true; 4975 } else 4976 #endif 4977 skb = alloc_skb(0, GFP_ATOMIC); 4978 } else { 4979 skb = skb_clone(orig_skb, GFP_ATOMIC); 4980 } 4981 if (!skb) 4982 return; 4983 4984 if (tsonly) { 4985 skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & 4986 SKBTX_ANY_TSTAMP; 4987 skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; 4988 } 4989 4990 if (hwtstamps) 4991 *skb_hwtstamps(skb) = *hwtstamps; 4992 else 4993 __net_timestamp(skb); 4994 4995 __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); 4996 } 4997 EXPORT_SYMBOL_GPL(__skb_tstamp_tx); 4998 4999 void skb_tstamp_tx(struct sk_buff *orig_skb, 5000 struct skb_shared_hwtstamps *hwtstamps) 5001 { 5002 return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, 5003 SCM_TSTAMP_SND); 5004 } 5005 EXPORT_SYMBOL_GPL(skb_tstamp_tx); 5006 5007 void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) 5008 { 5009 struct sock *sk = skb->sk; 5010 struct sock_exterr_skb *serr; 5011 int err = 1; 5012 5013 skb->wifi_acked_valid = 1; 5014 skb->wifi_acked = acked; 5015 5016 serr = SKB_EXT_ERR(skb); 5017 memset(serr, 0, sizeof(*serr)); 5018 serr->ee.ee_errno = ENOMSG; 5019 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; 5020 5021 /* Take a reference to prevent skb_orphan() from freeing the socket, 5022 * but only if the socket refcount is not zero. 
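	 * (The refcount can legitimately already be zero here: the wifi status
	 * report may arrive after the socket has been closed and every other
	 * reference dropped, in which case the skb is simply freed below.)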
5023 */ 5024 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 5025 err = sock_queue_err_skb(sk, skb); 5026 sock_put(sk); 5027 } 5028 if (err) 5029 kfree_skb(skb); 5030 } 5031 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); 5032 5033 /** 5034 * skb_partial_csum_set - set up and verify partial csum values for packet 5035 * @skb: the skb to set 5036 * @start: the number of bytes after skb->data to start checksumming. 5037 * @off: the offset from start to place the checksum. 5038 * 5039 * For untrusted partially-checksummed packets, we need to make sure the values 5040 * for skb->csum_start and skb->csum_offset are valid so we don't oops. 5041 * 5042 * This function checks and sets those values and skb->ip_summed: if this 5043 * returns false you should drop the packet. 5044 */ 5045 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) 5046 { 5047 u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); 5048 u32 csum_start = skb_headroom(skb) + (u32)start; 5049 5050 if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { 5051 net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", 5052 start, off, skb_headroom(skb), skb_headlen(skb)); 5053 return false; 5054 } 5055 skb->ip_summed = CHECKSUM_PARTIAL; 5056 skb->csum_start = csum_start; 5057 skb->csum_offset = off; 5058 skb_set_transport_header(skb, start); 5059 return true; 5060 } 5061 EXPORT_SYMBOL_GPL(skb_partial_csum_set); 5062 5063 static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, 5064 unsigned int max) 5065 { 5066 if (skb_headlen(skb) >= len) 5067 return 0; 5068 5069 /* If we need to pullup then pullup to the max, so we 5070 * won't need to do it again. 5071 */ 5072 if (max > skb->len) 5073 max = skb->len; 5074 5075 if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) 5076 return -ENOMEM; 5077 5078 if (skb_headlen(skb) < len) 5079 return -EPROTO; 5080 5081 return 0; 5082 } 5083 5084 #define MAX_TCP_HDR_LEN (15 * 4) 5085 5086 static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, 5087 typeof(IPPROTO_IP) proto, 5088 unsigned int off) 5089 { 5090 int err; 5091 5092 switch (proto) { 5093 case IPPROTO_TCP: 5094 err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), 5095 off + MAX_TCP_HDR_LEN); 5096 if (!err && !skb_partial_csum_set(skb, off, 5097 offsetof(struct tcphdr, 5098 check))) 5099 err = -EPROTO; 5100 return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; 5101 5102 case IPPROTO_UDP: 5103 err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), 5104 off + sizeof(struct udphdr)); 5105 if (!err && !skb_partial_csum_set(skb, off, 5106 offsetof(struct udphdr, 5107 check))) 5108 err = -EPROTO; 5109 return err ? ERR_PTR(err) : &udp_hdr(skb)->check; 5110 } 5111 5112 return ERR_PTR(-EPROTO); 5113 } 5114 5115 /* This value should be large enough to cover a tagged ethernet header plus 5116 * maximally sized IP and TCP or UDP headers. 
5117 */ 5118 #define MAX_IP_HDR_LEN 128 5119 5120 static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) 5121 { 5122 unsigned int off; 5123 bool fragment; 5124 __sum16 *csum; 5125 int err; 5126 5127 fragment = false; 5128 5129 err = skb_maybe_pull_tail(skb, 5130 sizeof(struct iphdr), 5131 MAX_IP_HDR_LEN); 5132 if (err < 0) 5133 goto out; 5134 5135 if (ip_is_fragment(ip_hdr(skb))) 5136 fragment = true; 5137 5138 off = ip_hdrlen(skb); 5139 5140 err = -EPROTO; 5141 5142 if (fragment) 5143 goto out; 5144 5145 csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); 5146 if (IS_ERR(csum)) 5147 return PTR_ERR(csum); 5148 5149 if (recalculate) 5150 *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, 5151 ip_hdr(skb)->daddr, 5152 skb->len - off, 5153 ip_hdr(skb)->protocol, 0); 5154 err = 0; 5155 5156 out: 5157 return err; 5158 } 5159 5160 /* This value should be large enough to cover a tagged ethernet header plus 5161 * an IPv6 header, all options, and a maximal TCP or UDP header. 5162 */ 5163 #define MAX_IPV6_HDR_LEN 256 5164 5165 #define OPT_HDR(type, skb, off) \ 5166 (type *)(skb_network_header(skb) + (off)) 5167 5168 static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) 5169 { 5170 int err; 5171 u8 nexthdr; 5172 unsigned int off; 5173 unsigned int len; 5174 bool fragment; 5175 bool done; 5176 __sum16 *csum; 5177 5178 fragment = false; 5179 done = false; 5180 5181 off = sizeof(struct ipv6hdr); 5182 5183 err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); 5184 if (err < 0) 5185 goto out; 5186 5187 nexthdr = ipv6_hdr(skb)->nexthdr; 5188 5189 len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); 5190 while (off <= len && !done) { 5191 switch (nexthdr) { 5192 case IPPROTO_DSTOPTS: 5193 case IPPROTO_HOPOPTS: 5194 case IPPROTO_ROUTING: { 5195 struct ipv6_opt_hdr *hp; 5196 5197 err = skb_maybe_pull_tail(skb, 5198 off + 5199 sizeof(struct ipv6_opt_hdr), 5200 MAX_IPV6_HDR_LEN); 5201 if (err < 0) 5202 goto out; 5203 5204 hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); 5205 nexthdr = hp->nexthdr; 5206 off += ipv6_optlen(hp); 5207 break; 5208 } 5209 case IPPROTO_AH: { 5210 struct ip_auth_hdr *hp; 5211 5212 err = skb_maybe_pull_tail(skb, 5213 off + 5214 sizeof(struct ip_auth_hdr), 5215 MAX_IPV6_HDR_LEN); 5216 if (err < 0) 5217 goto out; 5218 5219 hp = OPT_HDR(struct ip_auth_hdr, skb, off); 5220 nexthdr = hp->nexthdr; 5221 off += ipv6_authlen(hp); 5222 break; 5223 } 5224 case IPPROTO_FRAGMENT: { 5225 struct frag_hdr *hp; 5226 5227 err = skb_maybe_pull_tail(skb, 5228 off + 5229 sizeof(struct frag_hdr), 5230 MAX_IPV6_HDR_LEN); 5231 if (err < 0) 5232 goto out; 5233 5234 hp = OPT_HDR(struct frag_hdr, skb, off); 5235 5236 if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) 5237 fragment = true; 5238 5239 nexthdr = hp->nexthdr; 5240 off += sizeof(struct frag_hdr); 5241 break; 5242 } 5243 default: 5244 done = true; 5245 break; 5246 } 5247 } 5248 5249 err = -EPROTO; 5250 5251 if (!done || fragment) 5252 goto out; 5253 5254 csum = skb_checksum_setup_ip(skb, nexthdr, off); 5255 if (IS_ERR(csum)) 5256 return PTR_ERR(csum); 5257 5258 if (recalculate) 5259 *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, 5260 &ipv6_hdr(skb)->daddr, 5261 skb->len - off, nexthdr, 0); 5262 err = 0; 5263 5264 out: 5265 return err; 5266 } 5267 5268 /** 5269 * skb_checksum_setup - set up partial checksum offset 5270 * @skb: the skb to set up 5271 * @recalculate: if true the pseudo-header checksum will be recalculated 5272 */ 5273 int skb_checksum_setup(struct sk_buff *skb, bool recalculate) 5274 { 
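	/* Illustrative use only (not a quote of any particular driver): a
	 * backend that receives partially-checksummed packets from an
	 * untrusted peer would typically do
	 *
	 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	 *	    skb_checksum_setup(skb, true))
	 *		goto drop;
	 *
	 * before handing the skb to the rest of the stack.
	 */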
5275 int err; 5276 5277 switch (skb->protocol) { 5278 case htons(ETH_P_IP): 5279 err = skb_checksum_setup_ipv4(skb, recalculate); 5280 break; 5281 5282 case htons(ETH_P_IPV6): 5283 err = skb_checksum_setup_ipv6(skb, recalculate); 5284 break; 5285 5286 default: 5287 err = -EPROTO; 5288 break; 5289 } 5290 5291 return err; 5292 } 5293 EXPORT_SYMBOL(skb_checksum_setup); 5294 5295 /** 5296 * skb_checksum_maybe_trim - maybe trims the given skb 5297 * @skb: the skb to check 5298 * @transport_len: the data length beyond the network header 5299 * 5300 * Checks whether the given skb has data beyond the given transport length. 5301 * If so, returns a cloned skb trimmed to this transport length. 5302 * Otherwise returns the provided skb. Returns NULL in error cases 5303 * (e.g. transport_len exceeds skb length or out-of-memory). 5304 * 5305 * Caller needs to set the skb transport header and free any returned skb if it 5306 * differs from the provided skb. 5307 */ 5308 static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, 5309 unsigned int transport_len) 5310 { 5311 struct sk_buff *skb_chk; 5312 unsigned int len = skb_transport_offset(skb) + transport_len; 5313 int ret; 5314 5315 if (skb->len < len) 5316 return NULL; 5317 else if (skb->len == len) 5318 return skb; 5319 5320 skb_chk = skb_clone(skb, GFP_ATOMIC); 5321 if (!skb_chk) 5322 return NULL; 5323 5324 ret = pskb_trim_rcsum(skb_chk, len); 5325 if (ret) { 5326 kfree_skb(skb_chk); 5327 return NULL; 5328 } 5329 5330 return skb_chk; 5331 } 5332 5333 /** 5334 * skb_checksum_trimmed - validate checksum of an skb 5335 * @skb: the skb to check 5336 * @transport_len: the data length beyond the network header 5337 * @skb_chkf: checksum function to use 5338 * 5339 * Applies the given checksum function skb_chkf to the provided skb. 5340 * Returns a checked and maybe trimmed skb. Returns NULL on error. 5341 * 5342 * If the skb has data beyond the given transport length, then a 5343 * trimmed & cloned skb is checked and returned. 5344 * 5345 * Caller needs to set the skb transport header and free any returned skb if it 5346 * differs from the provided skb. 
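 *
 * Sketch of the expected calling pattern ("my_chkf" and "transport_offset"
 * are placeholders for the caller's checksum routine and offset; error
 * handling is trimmed):
 *
 *	skb_set_transport_header(skb, transport_offset);
 *	skb_chk = skb_checksum_trimmed(skb, transport_len, my_chkf);
 *	if (!skb_chk)
 *		goto err;
 *	...
 *	if (skb_chk != skb)
 *		kfree_skb(skb_chk);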
5347 */ 5348 struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, 5349 unsigned int transport_len, 5350 __sum16(*skb_chkf)(struct sk_buff *skb)) 5351 { 5352 struct sk_buff *skb_chk; 5353 unsigned int offset = skb_transport_offset(skb); 5354 __sum16 ret; 5355 5356 skb_chk = skb_checksum_maybe_trim(skb, transport_len); 5357 if (!skb_chk) 5358 goto err; 5359 5360 if (!pskb_may_pull(skb_chk, offset)) 5361 goto err; 5362 5363 skb_pull_rcsum(skb_chk, offset); 5364 ret = skb_chkf(skb_chk); 5365 skb_push_rcsum(skb_chk, offset); 5366 5367 if (ret) 5368 goto err; 5369 5370 return skb_chk; 5371 5372 err: 5373 if (skb_chk && skb_chk != skb) 5374 kfree_skb(skb_chk); 5375 5376 return NULL; 5377 5378 } 5379 EXPORT_SYMBOL(skb_checksum_trimmed); 5380 5381 void __skb_warn_lro_forwarding(const struct sk_buff *skb) 5382 { 5383 net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", 5384 skb->dev->name); 5385 } 5386 EXPORT_SYMBOL(__skb_warn_lro_forwarding); 5387 5388 void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) 5389 { 5390 if (head_stolen) { 5391 skb_release_head_state(skb); 5392 kmem_cache_free(skbuff_head_cache, skb); 5393 } else { 5394 __kfree_skb(skb); 5395 } 5396 } 5397 EXPORT_SYMBOL(kfree_skb_partial); 5398 5399 /** 5400 * skb_try_coalesce - try to merge skb to prior one 5401 * @to: prior buffer 5402 * @from: buffer to add 5403 * @fragstolen: pointer to boolean 5404 * @delta_truesize: how much more was allocated than was requested 5405 */ 5406 bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, 5407 bool *fragstolen, int *delta_truesize) 5408 { 5409 struct skb_shared_info *to_shinfo, *from_shinfo; 5410 int i, delta, len = from->len; 5411 5412 *fragstolen = false; 5413 5414 if (skb_cloned(to)) 5415 return false; 5416 5417 /* In general, avoid mixing slab allocated and page_pool allocated 5418 * pages within the same SKB. However when @to is not pp_recycle and 5419 * @from is cloned, we can transition frag pages from page_pool to 5420 * reference counted. 5421 * 5422 * On the other hand, don't allow coalescing two pp_recycle SKBs if 5423 * @from is cloned, in case the SKB is using page_pool fragment 5424 * references (PP_FLAG_PAGE_FRAG). Since we only take full page 5425 * references for cloned SKBs at the moment that would result in 5426 * inconsistent reference counts. 
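	 *
	 * Summarizing the check below (1 = coalescing may proceed):
	 *
	 *	@to pp_recycle | @from pp_recycle | @from cloned | allowed
	 *	       0       |        0         |     any      |    1
	 *	       0       |        1         |     yes      |    1
	 *	       0       |        1         |     no       |    0
	 *	       1       |        1         |     no       |    1
	 *	       1       |        1         |     yes      |    0
	 *	       1       |        0         |     any      |    0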
5427 */ 5428 if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from))) 5429 return false; 5430 5431 if (len <= skb_tailroom(to)) { 5432 if (len) 5433 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); 5434 *delta_truesize = 0; 5435 return true; 5436 } 5437 5438 to_shinfo = skb_shinfo(to); 5439 from_shinfo = skb_shinfo(from); 5440 if (to_shinfo->frag_list || from_shinfo->frag_list) 5441 return false; 5442 if (skb_zcopy(to) || skb_zcopy(from)) 5443 return false; 5444 5445 if (skb_headlen(from) != 0) { 5446 struct page *page; 5447 unsigned int offset; 5448 5449 if (to_shinfo->nr_frags + 5450 from_shinfo->nr_frags >= MAX_SKB_FRAGS) 5451 return false; 5452 5453 if (skb_head_is_locked(from)) 5454 return false; 5455 5456 delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 5457 5458 page = virt_to_head_page(from->head); 5459 offset = from->data - (unsigned char *)page_address(page); 5460 5461 skb_fill_page_desc(to, to_shinfo->nr_frags, 5462 page, offset, skb_headlen(from)); 5463 *fragstolen = true; 5464 } else { 5465 if (to_shinfo->nr_frags + 5466 from_shinfo->nr_frags > MAX_SKB_FRAGS) 5467 return false; 5468 5469 delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); 5470 } 5471 5472 WARN_ON_ONCE(delta < len); 5473 5474 memcpy(to_shinfo->frags + to_shinfo->nr_frags, 5475 from_shinfo->frags, 5476 from_shinfo->nr_frags * sizeof(skb_frag_t)); 5477 to_shinfo->nr_frags += from_shinfo->nr_frags; 5478 5479 if (!skb_cloned(from)) 5480 from_shinfo->nr_frags = 0; 5481 5482 /* if the skb is not cloned this does nothing 5483 * since we set nr_frags to 0. 5484 */ 5485 for (i = 0; i < from_shinfo->nr_frags; i++) 5486 __skb_frag_ref(&from_shinfo->frags[i]); 5487 5488 to->truesize += delta; 5489 to->len += len; 5490 to->data_len += len; 5491 5492 *delta_truesize = delta; 5493 return true; 5494 } 5495 EXPORT_SYMBOL(skb_try_coalesce); 5496 5497 /** 5498 * skb_scrub_packet - scrub an skb 5499 * 5500 * @skb: buffer to clean 5501 * @xnet: packet is crossing netns 5502 * 5503 * skb_scrub_packet can be used after encapsulating or decapsulting a packet 5504 * into/from a tunnel. Some information have to be cleared during these 5505 * operations. 5506 * skb_scrub_packet can also be used to clean a skb before injecting it in 5507 * another namespace (@xnet == true). We have to clear all information in the 5508 * skb that could impact namespace isolation. 5509 */ 5510 void skb_scrub_packet(struct sk_buff *skb, bool xnet) 5511 { 5512 skb->pkt_type = PACKET_HOST; 5513 skb->skb_iif = 0; 5514 skb->ignore_df = 0; 5515 skb_dst_drop(skb); 5516 skb_ext_reset(skb); 5517 nf_reset_ct(skb); 5518 nf_reset_trace(skb); 5519 5520 #ifdef CONFIG_NET_SWITCHDEV 5521 skb->offload_fwd_mark = 0; 5522 skb->offload_l3_fwd_mark = 0; 5523 #endif 5524 5525 if (!xnet) 5526 return; 5527 5528 ipvs_reset(skb); 5529 skb->mark = 0; 5530 skb_clear_tstamp(skb); 5531 } 5532 EXPORT_SYMBOL_GPL(skb_scrub_packet); 5533 5534 /** 5535 * skb_gso_transport_seglen - Return length of individual segments of a gso packet 5536 * 5537 * @skb: GSO skb 5538 * 5539 * skb_gso_transport_seglen is used to determine the real size of the 5540 * individual segments, including Layer4 headers (TCP/UDP). 5541 * 5542 * The MAC/L2 or network (IP, IPv6) headers are not accounted for. 
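 *
 * For example (illustrative numbers): a TCPv4 GSO skb with gso_size = 1448
 * and a 32 byte TCP header (timestamps enabled) gives a transport seglen of
 * 1480; the network and MAC headers are added on top by the _network_ and
 * _mac_ variants below.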
5543 */ 5544 static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) 5545 { 5546 const struct skb_shared_info *shinfo = skb_shinfo(skb); 5547 unsigned int thlen = 0; 5548 5549 if (skb->encapsulation) { 5550 thlen = skb_inner_transport_header(skb) - 5551 skb_transport_header(skb); 5552 5553 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 5554 thlen += inner_tcp_hdrlen(skb); 5555 } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { 5556 thlen = tcp_hdrlen(skb); 5557 } else if (unlikely(skb_is_gso_sctp(skb))) { 5558 thlen = sizeof(struct sctphdr); 5559 } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { 5560 thlen = sizeof(struct udphdr); 5561 } 5562 /* UFO sets gso_size to the size of the fragmentation 5563 * payload, i.e. the size of the L4 (UDP) header is already 5564 * accounted for. 5565 */ 5566 return thlen + shinfo->gso_size; 5567 } 5568 5569 /** 5570 * skb_gso_network_seglen - Return length of individual segments of a gso packet 5571 * 5572 * @skb: GSO skb 5573 * 5574 * skb_gso_network_seglen is used to determine the real size of the 5575 * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). 5576 * 5577 * The MAC/L2 header is not accounted for. 5578 */ 5579 static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) 5580 { 5581 unsigned int hdr_len = skb_transport_header(skb) - 5582 skb_network_header(skb); 5583 5584 return hdr_len + skb_gso_transport_seglen(skb); 5585 } 5586 5587 /** 5588 * skb_gso_mac_seglen - Return length of individual segments of a gso packet 5589 * 5590 * @skb: GSO skb 5591 * 5592 * skb_gso_mac_seglen is used to determine the real size of the 5593 * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 5594 * headers (TCP/UDP). 5595 */ 5596 static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) 5597 { 5598 unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 5599 5600 return hdr_len + skb_gso_transport_seglen(skb); 5601 } 5602 5603 /** 5604 * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS 5605 * 5606 * There are a couple of instances where we have a GSO skb, and we 5607 * want to determine what size it would be after it is segmented. 5608 * 5609 * We might want to check: 5610 * - L3+L4+payload size (e.g. IP forwarding) 5611 * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) 5612 * 5613 * This is a helper to do that correctly considering GSO_BY_FRAGS. 5614 * 5615 * @skb: GSO skb 5616 * 5617 * @seg_len: The segmented length (from skb_gso_*_seglen). In the 5618 * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. 5619 * 5620 * @max_len: The maximum permissible length. 5621 * 5622 * Returns true if the segmented length <= max length. 5623 */ 5624 static inline bool skb_gso_size_check(const struct sk_buff *skb, 5625 unsigned int seg_len, 5626 unsigned int max_len) { 5627 const struct skb_shared_info *shinfo = skb_shinfo(skb); 5628 const struct sk_buff *iter; 5629 5630 if (shinfo->gso_size != GSO_BY_FRAGS) 5631 return seg_len <= max_len; 5632 5633 /* Undo this so we can re-use header sizes */ 5634 seg_len -= GSO_BY_FRAGS; 5635 5636 skb_walk_frags(skb, iter) { 5637 if (seg_len + skb_headlen(iter) > max_len) 5638 return false; 5639 } 5640 5641 return true; 5642 } 5643 5644 /** 5645 * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? 
5646 * 5647 * @skb: GSO skb 5648 * @mtu: MTU to validate against 5649 * 5650 * skb_gso_validate_network_len validates if a given skb will fit a 5651 * wanted MTU once split. It considers L3 headers, L4 headers, and the 5652 * payload. 5653 */ 5654 bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) 5655 { 5656 return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); 5657 } 5658 EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); 5659 5660 /** 5661 * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? 5662 * 5663 * @skb: GSO skb 5664 * @len: length to validate against 5665 * 5666 * skb_gso_validate_mac_len validates if a given skb will fit a wanted 5667 * length once split, including L2, L3 and L4 headers and the payload. 5668 */ 5669 bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) 5670 { 5671 return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); 5672 } 5673 EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); 5674 5675 static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) 5676 { 5677 int mac_len, meta_len; 5678 void *meta; 5679 5680 if (skb_cow(skb, skb_headroom(skb)) < 0) { 5681 kfree_skb(skb); 5682 return NULL; 5683 } 5684 5685 mac_len = skb->data - skb_mac_header(skb); 5686 if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { 5687 memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), 5688 mac_len - VLAN_HLEN - ETH_TLEN); 5689 } 5690 5691 meta_len = skb_metadata_len(skb); 5692 if (meta_len) { 5693 meta = skb_metadata_end(skb) - meta_len; 5694 memmove(meta + VLAN_HLEN, meta, meta_len); 5695 } 5696 5697 skb->mac_header += VLAN_HLEN; 5698 return skb; 5699 } 5700 5701 struct sk_buff *skb_vlan_untag(struct sk_buff *skb) 5702 { 5703 struct vlan_hdr *vhdr; 5704 u16 vlan_tci; 5705 5706 if (unlikely(skb_vlan_tag_present(skb))) { 5707 /* vlan_tci is already set-up so leave this for another time */ 5708 return skb; 5709 } 5710 5711 skb = skb_share_check(skb, GFP_ATOMIC); 5712 if (unlikely(!skb)) 5713 goto err_free; 5714 /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ 5715 if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) 5716 goto err_free; 5717 5718 vhdr = (struct vlan_hdr *)skb->data; 5719 vlan_tci = ntohs(vhdr->h_vlan_TCI); 5720 __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); 5721 5722 skb_pull_rcsum(skb, VLAN_HLEN); 5723 vlan_set_encap_proto(skb, vhdr); 5724 5725 skb = skb_reorder_vlan_header(skb); 5726 if (unlikely(!skb)) 5727 goto err_free; 5728 5729 skb_reset_network_header(skb); 5730 if (!skb_transport_header_was_set(skb)) 5731 skb_reset_transport_header(skb); 5732 skb_reset_mac_len(skb); 5733 5734 return skb; 5735 5736 err_free: 5737 kfree_skb(skb); 5738 return NULL; 5739 } 5740 EXPORT_SYMBOL(skb_vlan_untag); 5741 5742 int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) 5743 { 5744 if (!pskb_may_pull(skb, write_len)) 5745 return -ENOMEM; 5746 5747 if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) 5748 return 0; 5749 5750 return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 5751 } 5752 EXPORT_SYMBOL(skb_ensure_writable); 5753 5754 /* remove VLAN header from packet and update csum accordingly. 
5755 * expects a non skb_vlan_tag_present skb with a vlan tag payload 5756 */ 5757 int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) 5758 { 5759 struct vlan_hdr *vhdr; 5760 int offset = skb->data - skb_mac_header(skb); 5761 int err; 5762 5763 if (WARN_ONCE(offset, 5764 "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", 5765 offset)) { 5766 return -EINVAL; 5767 } 5768 5769 err = skb_ensure_writable(skb, VLAN_ETH_HLEN); 5770 if (unlikely(err)) 5771 return err; 5772 5773 skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 5774 5775 vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); 5776 *vlan_tci = ntohs(vhdr->h_vlan_TCI); 5777 5778 memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); 5779 __skb_pull(skb, VLAN_HLEN); 5780 5781 vlan_set_encap_proto(skb, vhdr); 5782 skb->mac_header += VLAN_HLEN; 5783 5784 if (skb_network_offset(skb) < ETH_HLEN) 5785 skb_set_network_header(skb, ETH_HLEN); 5786 5787 skb_reset_mac_len(skb); 5788 5789 return err; 5790 } 5791 EXPORT_SYMBOL(__skb_vlan_pop); 5792 5793 /* Pop a vlan tag either from hwaccel or from payload. 5794 * Expects skb->data at mac header. 5795 */ 5796 int skb_vlan_pop(struct sk_buff *skb) 5797 { 5798 u16 vlan_tci; 5799 __be16 vlan_proto; 5800 int err; 5801 5802 if (likely(skb_vlan_tag_present(skb))) { 5803 __vlan_hwaccel_clear_tag(skb); 5804 } else { 5805 if (unlikely(!eth_type_vlan(skb->protocol))) 5806 return 0; 5807 5808 err = __skb_vlan_pop(skb, &vlan_tci); 5809 if (err) 5810 return err; 5811 } 5812 /* move next vlan tag to hw accel tag */ 5813 if (likely(!eth_type_vlan(skb->protocol))) 5814 return 0; 5815 5816 vlan_proto = skb->protocol; 5817 err = __skb_vlan_pop(skb, &vlan_tci); 5818 if (unlikely(err)) 5819 return err; 5820 5821 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 5822 return 0; 5823 } 5824 EXPORT_SYMBOL(skb_vlan_pop); 5825 5826 /* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). 5827 * Expects skb->data at mac header. 5828 */ 5829 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) 5830 { 5831 if (skb_vlan_tag_present(skb)) { 5832 int offset = skb->data - skb_mac_header(skb); 5833 int err; 5834 5835 if (WARN_ONCE(offset, 5836 "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", 5837 offset)) { 5838 return -EINVAL; 5839 } 5840 5841 err = __vlan_insert_tag(skb, skb->vlan_proto, 5842 skb_vlan_tag_get(skb)); 5843 if (err) 5844 return err; 5845 5846 skb->protocol = skb->vlan_proto; 5847 skb->mac_len += VLAN_HLEN; 5848 5849 skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 5850 } 5851 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 5852 return 0; 5853 } 5854 EXPORT_SYMBOL(skb_vlan_push); 5855 5856 /** 5857 * skb_eth_pop() - Drop the Ethernet header at the head of a packet 5858 * 5859 * @skb: Socket buffer to modify 5860 * 5861 * Drop the Ethernet header of @skb. 5862 * 5863 * Expects that skb->data points to the mac header and that no VLAN tags are 5864 * present. 5865 * 5866 * Returns 0 on success, -errno otherwise. 
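 *
 * Minimal usage sketch (assumes skb->data has already been reset to the mac
 * header and that the caller drops the packet on failure):
 *
 *	if (skb_eth_pop(skb))
 *		goto drop;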
5867 */ 5868 int skb_eth_pop(struct sk_buff *skb) 5869 { 5870 if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || 5871 skb_network_offset(skb) < ETH_HLEN) 5872 return -EPROTO; 5873 5874 skb_pull_rcsum(skb, ETH_HLEN); 5875 skb_reset_mac_header(skb); 5876 skb_reset_mac_len(skb); 5877 5878 return 0; 5879 } 5880 EXPORT_SYMBOL(skb_eth_pop); 5881 5882 /** 5883 * skb_eth_push() - Add a new Ethernet header at the head of a packet 5884 * 5885 * @skb: Socket buffer to modify 5886 * @dst: Destination MAC address of the new header 5887 * @src: Source MAC address of the new header 5888 * 5889 * Prepend @skb with a new Ethernet header. 5890 * 5891 * Expects that skb->data points to the mac header, which must be empty. 5892 * 5893 * Returns 0 on success, -errno otherwise. 5894 */ 5895 int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, 5896 const unsigned char *src) 5897 { 5898 struct ethhdr *eth; 5899 int err; 5900 5901 if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) 5902 return -EPROTO; 5903 5904 err = skb_cow_head(skb, sizeof(*eth)); 5905 if (err < 0) 5906 return err; 5907 5908 skb_push(skb, sizeof(*eth)); 5909 skb_reset_mac_header(skb); 5910 skb_reset_mac_len(skb); 5911 5912 eth = eth_hdr(skb); 5913 ether_addr_copy(eth->h_dest, dst); 5914 ether_addr_copy(eth->h_source, src); 5915 eth->h_proto = skb->protocol; 5916 5917 skb_postpush_rcsum(skb, eth, sizeof(*eth)); 5918 5919 return 0; 5920 } 5921 EXPORT_SYMBOL(skb_eth_push); 5922 5923 /* Update the ethertype of hdr and the skb csum value if required. */ 5924 static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, 5925 __be16 ethertype) 5926 { 5927 if (skb->ip_summed == CHECKSUM_COMPLETE) { 5928 __be16 diff[] = { ~hdr->h_proto, ethertype }; 5929 5930 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 5931 } 5932 5933 hdr->h_proto = ethertype; 5934 } 5935 5936 /** 5937 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of 5938 * the packet 5939 * 5940 * @skb: buffer 5941 * @mpls_lse: MPLS label stack entry to push 5942 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) 5943 * @mac_len: length of the MAC header 5944 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is 5945 * ethernet 5946 * 5947 * Expects skb->data at mac header. 5948 * 5949 * Returns 0 on success, -errno otherwise. 5950 */ 5951 int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, 5952 int mac_len, bool ethernet) 5953 { 5954 struct mpls_shim_hdr *lse; 5955 int err; 5956 5957 if (unlikely(!eth_p_mpls(mpls_proto))) 5958 return -EINVAL; 5959 5960 /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. 
*/ 5961 if (skb->encapsulation) 5962 return -EINVAL; 5963 5964 err = skb_cow_head(skb, MPLS_HLEN); 5965 if (unlikely(err)) 5966 return err; 5967 5968 if (!skb->inner_protocol) { 5969 skb_set_inner_network_header(skb, skb_network_offset(skb)); 5970 skb_set_inner_protocol(skb, skb->protocol); 5971 } 5972 5973 skb_push(skb, MPLS_HLEN); 5974 memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), 5975 mac_len); 5976 skb_reset_mac_header(skb); 5977 skb_set_network_header(skb, mac_len); 5978 skb_reset_mac_len(skb); 5979 5980 lse = mpls_hdr(skb); 5981 lse->label_stack_entry = mpls_lse; 5982 skb_postpush_rcsum(skb, lse, MPLS_HLEN); 5983 5984 if (ethernet && mac_len >= ETH_HLEN) 5985 skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); 5986 skb->protocol = mpls_proto; 5987 5988 return 0; 5989 } 5990 EXPORT_SYMBOL_GPL(skb_mpls_push); 5991 5992 /** 5993 * skb_mpls_pop() - pop the outermost MPLS header 5994 * 5995 * @skb: buffer 5996 * @next_proto: ethertype of header after popped MPLS header 5997 * @mac_len: length of the MAC header 5998 * @ethernet: flag to indicate if the packet is ethernet 5999 * 6000 * Expects skb->data at mac header. 6001 * 6002 * Returns 0 on success, -errno otherwise. 6003 */ 6004 int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, 6005 bool ethernet) 6006 { 6007 int err; 6008 6009 if (unlikely(!eth_p_mpls(skb->protocol))) 6010 return 0; 6011 6012 err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); 6013 if (unlikely(err)) 6014 return err; 6015 6016 skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); 6017 memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), 6018 mac_len); 6019 6020 __skb_pull(skb, MPLS_HLEN); 6021 skb_reset_mac_header(skb); 6022 skb_set_network_header(skb, mac_len); 6023 6024 if (ethernet && mac_len >= ETH_HLEN) { 6025 struct ethhdr *hdr; 6026 6027 /* use mpls_hdr() to get ethertype to account for VLANs. */ 6028 hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); 6029 skb_mod_eth_type(skb, hdr, next_proto); 6030 } 6031 skb->protocol = next_proto; 6032 6033 return 0; 6034 } 6035 EXPORT_SYMBOL_GPL(skb_mpls_pop); 6036 6037 /** 6038 * skb_mpls_update_lse() - modify outermost MPLS header and update csum 6039 * 6040 * @skb: buffer 6041 * @mpls_lse: new MPLS label stack entry to update to 6042 * 6043 * Expects skb->data at mac header. 6044 * 6045 * Returns 0 on success, -errno otherwise. 6046 */ 6047 int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) 6048 { 6049 int err; 6050 6051 if (unlikely(!eth_p_mpls(skb->protocol))) 6052 return -EINVAL; 6053 6054 err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); 6055 if (unlikely(err)) 6056 return err; 6057 6058 if (skb->ip_summed == CHECKSUM_COMPLETE) { 6059 __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; 6060 6061 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 6062 } 6063 6064 mpls_hdr(skb)->label_stack_entry = mpls_lse; 6065 6066 return 0; 6067 } 6068 EXPORT_SYMBOL_GPL(skb_mpls_update_lse); 6069 6070 /** 6071 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header 6072 * 6073 * @skb: buffer 6074 * 6075 * Expects skb->data at mac header. 6076 * 6077 * Returns 0 on success, -errno otherwise. 
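 *
 * Sketch of a typical "decrement MPLS TTL" datapath action (illustrative
 * only):
 *
 *	if (skb_mpls_dec_ttl(skb))
 *		goto drop;
 *
 * where a non-zero return means the TTL reached zero or the header could not
 * be made accessible.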
6078 */ 6079 int skb_mpls_dec_ttl(struct sk_buff *skb) 6080 { 6081 u32 lse; 6082 u8 ttl; 6083 6084 if (unlikely(!eth_p_mpls(skb->protocol))) 6085 return -EINVAL; 6086 6087 if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) 6088 return -ENOMEM; 6089 6090 lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); 6091 ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; 6092 if (!--ttl) 6093 return -EINVAL; 6094 6095 lse &= ~MPLS_LS_TTL_MASK; 6096 lse |= ttl << MPLS_LS_TTL_SHIFT; 6097 6098 return skb_mpls_update_lse(skb, cpu_to_be32(lse)); 6099 } 6100 EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); 6101 6102 /** 6103 * alloc_skb_with_frags - allocate skb with page frags 6104 * 6105 * @header_len: size of linear part 6106 * @data_len: needed length in frags 6107 * @max_page_order: max page order desired. 6108 * @errcode: pointer to error code if any 6109 * @gfp_mask: allocation mask 6110 * 6111 * This can be used to allocate a paged skb, given a maximal order for frags. 6112 */ 6113 struct sk_buff *alloc_skb_with_frags(unsigned long header_len, 6114 unsigned long data_len, 6115 int max_page_order, 6116 int *errcode, 6117 gfp_t gfp_mask) 6118 { 6119 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; 6120 unsigned long chunk; 6121 struct sk_buff *skb; 6122 struct page *page; 6123 int i; 6124 6125 *errcode = -EMSGSIZE; 6126 /* Note this test could be relaxed, if we succeed to allocate 6127 * high order pages... 6128 */ 6129 if (npages > MAX_SKB_FRAGS) 6130 return NULL; 6131 6132 *errcode = -ENOBUFS; 6133 skb = alloc_skb(header_len, gfp_mask); 6134 if (!skb) 6135 return NULL; 6136 6137 skb->truesize += npages << PAGE_SHIFT; 6138 6139 for (i = 0; npages > 0; i++) { 6140 int order = max_page_order; 6141 6142 while (order) { 6143 if (npages >= 1 << order) { 6144 page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | 6145 __GFP_COMP | 6146 __GFP_NOWARN, 6147 order); 6148 if (page) 6149 goto fill_page; 6150 /* Do not retry other high order allocations */ 6151 order = 1; 6152 max_page_order = 0; 6153 } 6154 order--; 6155 } 6156 page = alloc_page(gfp_mask); 6157 if (!page) 6158 goto failure; 6159 fill_page: 6160 chunk = min_t(unsigned long, data_len, 6161 PAGE_SIZE << order); 6162 skb_fill_page_desc(skb, i, page, 0, chunk); 6163 data_len -= chunk; 6164 npages -= 1 << order; 6165 } 6166 return skb; 6167 6168 failure: 6169 kfree_skb(skb); 6170 return NULL; 6171 } 6172 EXPORT_SYMBOL(alloc_skb_with_frags); 6173 6174 /* carve out the first off bytes from skb when off < headlen */ 6175 static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, 6176 const int headlen, gfp_t gfp_mask) 6177 { 6178 int i; 6179 unsigned int size = skb_end_offset(skb); 6180 int new_hlen = headlen - off; 6181 u8 *data; 6182 6183 if (skb_pfmemalloc(skb)) 6184 gfp_mask |= __GFP_MEMALLOC; 6185 6186 size = SKB_DATA_ALIGN(size); 6187 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 6188 size = kmalloc_size_roundup(size); 6189 data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); 6190 if (!data) 6191 return -ENOMEM; 6192 size = SKB_WITH_OVERHEAD(size); 6193 6194 /* Copy real data, and all frags */ 6195 skb_copy_from_linear_data_offset(skb, off, data, new_hlen); 6196 skb->len -= off; 6197 6198 memcpy((struct skb_shared_info *)(data + size), 6199 skb_shinfo(skb), 6200 offsetof(struct skb_shared_info, 6201 frags[skb_shinfo(skb)->nr_frags])); 6202 if (skb_cloned(skb)) { 6203 /* drop the old head gracefully */ 6204 if (skb_orphan_frags(skb, gfp_mask)) { 6205 kfree(data); 6206 return -ENOMEM; 6207 } 6208 for (i = 0; i 
/* carve out the first off bytes from skb when off < headlen */
static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
				    const int headlen, gfp_t gfp_mask)
{
	int i;
	unsigned int size = skb_end_offset(skb);
	int new_hlen = headlen - off;
	u8 *data;

	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;

	size = SKB_DATA_ALIGN(size);
	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	size = kmalloc_size_roundup(size);
	data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		return -ENOMEM;
	size = SKB_WITH_OVERHEAD(size);

	/* Copy real data, and all frags */
	skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
	skb->len -= off;

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb),
	       offsetof(struct skb_shared_info,
			frags[skb_shinfo(skb)->nr_frags]));
	if (skb_cloned(skb)) {
		/* drop the old head gracefully */
		if (skb_orphan_frags(skb, gfp_mask)) {
			kfree(data);
			return -ENOMEM;
		}
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_frag_ref(skb, i);
		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);
		skb_release_data(skb);
	} else {
		/* we can reuse the existing refcount - all we did was
		 * relocate values
		 */
		skb_free_head(skb);
	}

	skb->head = data;
	skb->data = data;
	skb->head_frag = 0;
	skb_set_end_offset(skb, size);
	skb_set_tail_pointer(skb, skb_headlen(skb));
	skb_headers_offset_update(skb, 0);
	skb->cloned = 0;
	skb->hdr_len = 0;
	skb->nohdr = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);

	return 0;
}

static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);

/* carve out the first eat bytes from skb's frag_list. May recurse into
 * pskb_carve()
 */
static int pskb_carve_frag_list(struct sk_buff *skb,
				struct skb_shared_info *shinfo, int eat,
				gfp_t gfp_mask)
{
	struct sk_buff *list = shinfo->frag_list;
	struct sk_buff *clone = NULL;
	struct sk_buff *insp = NULL;

	do {
		if (!list) {
			pr_err("Not enough bytes to eat. Want %d\n", eat);
			return -EFAULT;
		}
		if (list->len <= eat) {
			/* Eaten as a whole. */
			eat -= list->len;
			list = list->next;
			insp = list;
		} else {
			/* Eaten partially. */
			if (skb_shared(list)) {
				clone = skb_clone(list, gfp_mask);
				if (!clone)
					return -ENOMEM;
				insp = list->next;
				list = clone;
			} else {
				/* This may be pulled without problems. */
				insp = list;
			}
			if (pskb_carve(list, eat, gfp_mask) < 0) {
				kfree_skb(clone);
				return -ENOMEM;
			}
			break;
		}
	} while (eat);

	/* Free pulled out fragments. */
	while ((list = shinfo->frag_list) != insp) {
		shinfo->frag_list = list->next;
		consume_skb(list);
	}
	/* And insert new clone at head. */
	if (clone) {
		clone->next = list;
		shinfo->frag_list = clone;
	}
	return 0;
}

/* carve off first len bytes from skb. Split line (off) is in the
 * non-linear part of skb
 */
static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
				       int pos, gfp_t gfp_mask)
{
	int i, k = 0;
	unsigned int size = skb_end_offset(skb);
	u8 *data;
	const int nfrags = skb_shinfo(skb)->nr_frags;
	struct skb_shared_info *shinfo;

	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;

	size = SKB_DATA_ALIGN(size);
	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	size = kmalloc_size_roundup(size);
	data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		return -ENOMEM;
	size = SKB_WITH_OVERHEAD(size);

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
	if (skb_orphan_frags(skb, gfp_mask)) {
		kfree(data);
		return -ENOMEM;
	}
	shinfo = (struct skb_shared_info *)(data + size);
	for (i = 0; i < nfrags; i++) {
		int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (pos + fsize > off) {
			shinfo->frags[k] = skb_shinfo(skb)->frags[i];

			if (pos < off) {
				/* Split frag.
				 * We have two variants in this case:
				 * 1. Move the whole frag to the second
				 *    part, if it is possible. E.g.
				 *    this approach is mandatory for TUX,
				 *    where splitting is expensive.
				 * 2. Split accurately. This is what we do here.
				 */
				skb_frag_off_add(&shinfo->frags[0], off - pos);
				skb_frag_size_sub(&shinfo->frags[0], off - pos);
			}
			skb_frag_ref(skb, i);
			k++;
		}
		pos += fsize;
	}
	shinfo->nr_frags = k;
	if (skb_has_frag_list(skb))
		skb_clone_fraglist(skb);

	/* split line is in frag list */
	if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
		/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
		if (skb_has_frag_list(skb))
			kfree_skb_list(skb_shinfo(skb)->frag_list);
		kfree(data);
		return -ENOMEM;
	}
	skb_release_data(skb);

	skb->head = data;
	skb->head_frag = 0;
	skb->data = data;
	skb_set_end_offset(skb, size);
	skb_reset_tail_pointer(skb);
	skb_headers_offset_update(skb, 0);
	skb->cloned = 0;
	skb->hdr_len = 0;
	skb->nohdr = 0;
	skb->len -= off;
	skb->data_len = skb->len;
	atomic_set(&skb_shinfo(skb)->dataref, 1);
	return 0;
}

/* remove len bytes from the beginning of the skb */
static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
{
	int headlen = skb_headlen(skb);

	if (len < headlen)
		return pskb_carve_inside_header(skb, len, headlen, gfp);
	else
		return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
}

/* Extract to_copy bytes starting at off from skb, and return this in
 * a new skb
 */
struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
			     int to_copy, gfp_t gfp)
{
	struct sk_buff *clone = skb_clone(skb, gfp);

	if (!clone)
		return NULL;

	if (pskb_carve(clone, off, gfp) < 0 ||
	    pskb_trim(clone, to_copy)) {
		kfree_skb(clone);
		return NULL;
	}
	return clone;
}
EXPORT_SYMBOL(pskb_extract);
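
/* Illustrative sketch (not part of the original file): extracting a byte
 * range into its own skb while leaving the original untouched. The bounds
 * check and the use of GFP_ATOMIC are assumptions made by this hypothetical
 * helper, not requirements of pskb_extract().
 */
static __maybe_unused struct sk_buff *example_extract_range(struct sk_buff *skb,
							    int off, int len)
{
	if (off < 0 || len < 0 || off + len > skb->len)
		return NULL;

	/* The original skb is only cloned; the result starts at @off within
	 * it and is trimmed to @len bytes.
	 */
	return pskb_extract(skb, off, len, GFP_ATOMIC);
}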
/**
 * skb_condense - try to get rid of fragments/frag_list if possible
 * @skb: buffer
 *
 * Can be used to save memory before skb is added to a busy queue.
 * If the packet has bytes in frags and enough tail room in skb->head,
 * pull all of them, so that we can free the frags right now and adjust
 * truesize.
 * Notes:
 *	We do not reallocate skb->head, thus this cannot fail.
 *	Caller must re-evaluate skb->truesize if needed.
 */
void skb_condense(struct sk_buff *skb)
{
	if (skb->data_len) {
		if (skb->data_len > skb->end - skb->tail ||
		    skb_cloned(skb))
			return;

		/* Nice, we can free page frag(s) right now */
		__pskb_pull_tail(skb, skb->data_len);
	}
	/* At this point, skb->truesize might be overestimated,
	 * because skb had a fragment, and fragments do not tell
	 * their truesize.
	 * When we pulled its content into skb->head, the fragment
	 * was freed, but __pskb_pull_tail() could not possibly
	 * adjust skb->truesize, not knowing the frag truesize.
	 */
	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
}
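
/* Illustrative sketch (not part of the original file): condense right before
 * parking the skb on a long-lived queue, then do the memory accounting with
 * the (possibly reduced) truesize, as the kernel-doc above asks. The @sk and
 * @queue parameters and any locking are assumed to be handled by the caller.
 */
static __maybe_unused void example_enqueue_condensed(struct sock *sk,
						     struct sk_buff_head *queue,
						     struct sk_buff *skb)
{
	/* Shrink first, then charge: skb_condense() may lower skb->truesize,
	 * and skb_set_owner_r() charges whatever value is current.
	 */
	skb_condense(skb);
	skb_set_owner_r(skb, sk);
	__skb_queue_tail(queue, skb);
}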
#ifdef CONFIG_SKB_EXTENSIONS
static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
{
	return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
}

/**
 * __skb_ext_alloc - allocate a new skb extensions storage
 *
 * @flags: See kmalloc().
 *
 * Returns the newly allocated pointer. The pointer can later be attached
 * to a skb via __skb_ext_set().
 * Note: caller must handle the skb_ext as opaque data.
 */
struct skb_ext *__skb_ext_alloc(gfp_t flags)
{
	struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);

	if (new) {
		memset(new->offset, 0, sizeof(new->offset));
		refcount_set(&new->refcnt, 1);
	}

	return new;
}

static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
					 unsigned int old_active)
{
	struct skb_ext *new;

	if (refcount_read(&old->refcnt) == 1)
		return old;

	new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
	if (!new)
		return NULL;

	memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
	refcount_set(&new->refcnt, 1);

#ifdef CONFIG_XFRM
	if (old_active & (1 << SKB_EXT_SEC_PATH)) {
		struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
		unsigned int i;

		for (i = 0; i < sp->len; i++)
			xfrm_state_hold(sp->xvec[i]);
	}
#endif
	__skb_ext_put(old);
	return new;
}

/**
 * __skb_ext_set - attach the specified extension storage to this skb
 * @skb: buffer
 * @id: extension id
 * @ext: extension storage previously allocated via __skb_ext_alloc()
 *
 * Existing extensions, if any, are cleared.
 *
 * Returns the pointer to the extension.
 */
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
		    struct skb_ext *ext)
{
	unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext);

	skb_ext_put(skb);
	newlen = newoff + skb_ext_type_len[id];
	ext->chunks = newlen;
	ext->offset[id] = newoff;
	skb->extensions = ext;
	skb->active_extensions = 1 << id;
	return skb_ext_get_ptr(ext, id);
}

/**
 * skb_ext_add - allocate space for given extension, COW if needed
 * @skb: buffer
 * @id: extension to allocate space for
 *
 * Allocates enough space for the given extension.
 * If the extension is already present, a pointer to that extension
 * is returned.
 *
 * If the skb was cloned, COW applies and the returned memory can be
 * modified without changing the extension space of cloned buffers.
 *
 * Returns pointer to the extension or NULL on allocation failure.
 */
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
{
	struct skb_ext *new, *old = NULL;
	unsigned int newlen, newoff;

	if (skb->active_extensions) {
		old = skb->extensions;

		new = skb_ext_maybe_cow(old, skb->active_extensions);
		if (!new)
			return NULL;

		if (__skb_ext_exist(new, id))
			goto set_active;

		newoff = new->chunks;
	} else {
		newoff = SKB_EXT_CHUNKSIZEOF(*new);

		new = __skb_ext_alloc(GFP_ATOMIC);
		if (!new)
			return NULL;
	}

	newlen = newoff + skb_ext_type_len[id];
	new->chunks = newlen;
	new->offset[id] = newoff;
set_active:
	skb->slow_gro = 1;
	skb->extensions = new;
	skb->active_extensions |= 1 << id;
	return skb_ext_get_ptr(new, id);
}
EXPORT_SYMBOL(skb_ext_add);
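
/* Illustrative sketch (not part of the original file): a typical user adds an
 * extension and then initialises the returned area itself, since skb_ext_add()
 * does not zero it. The secpath case is only an example, written here as a
 * simplified variant of the in-tree secpath_set() helper from net/xfrm.h,
 * hence the extra CONFIG_XFRM guard; the helper name is made up.
 */
#ifdef CONFIG_XFRM
static __maybe_unused struct sec_path *example_add_secpath(struct sk_buff *skb)
{
	struct sec_path *sp, *tmp = skb_ext_find(skb, SKB_EXT_SEC_PATH);

	sp = skb_ext_add(skb, SKB_EXT_SEC_PATH);
	if (!sp)
		return NULL;	/* allocation failure, skb left as it was */

	if (!tmp) {
		/* freshly added storage is not zeroed: initialise it */
		sp->len = 0;
		sp->olen = 0;
	}
	return sp;
}
#endif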
#ifdef CONFIG_XFRM
static void skb_ext_put_sp(struct sec_path *sp)
{
	unsigned int i;

	for (i = 0; i < sp->len; i++)
		xfrm_state_put(sp->xvec[i]);
}
#endif

#ifdef CONFIG_MCTP_FLOWS
static void skb_ext_put_mctp(struct mctp_flow *flow)
{
	if (flow->key)
		mctp_key_unref(flow->key);
}
#endif

void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
	struct skb_ext *ext = skb->extensions;

	skb->active_extensions &= ~(1 << id);
	if (skb->active_extensions == 0) {
		skb->extensions = NULL;
		__skb_ext_put(ext);
#ifdef CONFIG_XFRM
	} else if (id == SKB_EXT_SEC_PATH &&
		   refcount_read(&ext->refcnt) == 1) {
		struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);

		skb_ext_put_sp(sp);
		sp->len = 0;
#endif
	}
}
EXPORT_SYMBOL(__skb_ext_del);

void __skb_ext_put(struct skb_ext *ext)
{
	/* If this is last clone, nothing can increment
	 * it after check passes. Avoids one atomic op.
	 */
	if (refcount_read(&ext->refcnt) == 1)
		goto free_now;

	if (!refcount_dec_and_test(&ext->refcnt))
		return;
free_now:
#ifdef CONFIG_XFRM
	if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
		skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
#endif
#ifdef CONFIG_MCTP_FLOWS
	if (__skb_ext_exist(ext, SKB_EXT_MCTP))
		skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP));
#endif

	kmem_cache_free(skbuff_ext_cache, ext);
}
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */
/**
 * skb_attempt_defer_free - queue skb for remote freeing
 * @skb: buffer
 *
 * Put @skb in a per-cpu list, using the cpu which
 * allocated the skb/pages to reduce false sharing
 * and memory zone spinlock contention.
 */
void skb_attempt_defer_free(struct sk_buff *skb)
{
	int cpu = skb->alloc_cpu;
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int defer_max;
	bool kick;

	if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
	    !cpu_online(cpu) ||
	    cpu == raw_smp_processor_id()) {
nodefer:	__kfree_skb(skb);
		return;
	}

	sd = &per_cpu(softnet_data, cpu);
	defer_max = READ_ONCE(sysctl_skb_defer_max);
	if (READ_ONCE(sd->defer_count) >= defer_max)
		goto nodefer;

	spin_lock_irqsave(&sd->defer_lock, flags);
	/* Send an IPI every time queue reaches half capacity. */
	kick = sd->defer_count == (defer_max >> 1);
	/* Paired with the READ_ONCE() few lines above */
	WRITE_ONCE(sd->defer_count, sd->defer_count + 1);

	skb->next = sd->defer_list;
	/* Paired with READ_ONCE() in skb_defer_free_flush() */
	WRITE_ONCE(sd->defer_list, skb);
	spin_unlock_irqrestore(&sd->defer_lock, flags);

	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
	 * if we are unlucky enough (this seems very unlikely).
	 */
	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
		smp_call_function_single_async(cpu, &sd->defer_csd);
}
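
/* Illustrative sketch (not part of the original file): how a receive-path
 * consumer might hand a finished skb back to its allocating CPU. Dropping the
 * socket charge first (as TCP's receive path does) keeps the socket's memory
 * accounting accurate even if the remote CPU frees the skb much later; the
 * helper name and the fallback policy here are assumptions for the example.
 */
static __maybe_unused void example_free_on_alloc_cpu(struct sk_buff *skb)
{
	if (skb->destructor == sock_rfree) {
		/* release the receive-queue charge now, then defer the free */
		sock_rfree(skb);
		skb->destructor = NULL;
		skb->sk = NULL;
		skb_attempt_defer_free(skb);
		return;
	}

	/* unknown destructor: free locally rather than defer */
	__kfree_skb(skb);
}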