// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *	Richard Kooijman	:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>

#include "dev.h"
#include "sock_destructor.h"

struct kmem_cache *skbuff_head_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);

#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
const char * const drop_reasons[] = {
	DEFINE_DROP_REASON(FN, FN)
};
EXPORT_SYMBOL(drop_reasons);

/**
 * skb_panic - private function for out-of-line support
 * @skb: buffer
 * @sz: size
 * @addr: address
 * @msg: skb_over_panic or skb_under_panic
 *
 * Out-of-line support for skb_put() and skb_push().
 * Called via the wrapper skb_over_panic() or skb_under_panic().
 * Keep out of line to prevent kernel bloat.
 * __builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

#define NAPI_SKB_CACHE_SIZE	64
#define NAPI_SKB_CACHE_BULK	16
#define NAPI_SKB_CACHE_HALF	(NAPI_SKB_CACHE_SIZE / 2)

#if PAGE_SIZE == SZ_4K

#define NAPI_HAS_SMALL_PAGE_FRAG	1
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	((nc).pfmemalloc)

/* specialized page frag allocator using a single order 0 page
 * and slicing it into 1K sized fragments. Constrained to systems
 * with a very limited amount of 1K fragments fitting a single
 * page - to avoid excessive truesize underestimation
 */

struct page_frag_1k {
	void *va;
	u16 offset;
	bool pfmemalloc;
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
{
	struct page *page;
	int offset;

	offset = nc->offset - SZ_1K;
	if (likely(offset >= 0))
		goto use_frag;

	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
	if (!page)
		return NULL;

	nc->va = page_address(page);
	nc->pfmemalloc = page_is_pfmemalloc(page);
	offset = PAGE_SIZE - SZ_1K;
	page_ref_add(page, offset / SZ_1K);

use_frag:
	nc->offset = offset;
	return nc->va + offset;
}
#else

/* the small page is actually unused in this build; add dummy helpers
 * to please the compiler and avoid later preprocessor's conditionals
 */
#define NAPI_HAS_SMALL_PAGE_FRAG	0
#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false

struct page_frag_1k {
};

static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
{
	return NULL;
}

#endif

struct napi_alloc_cache {
	struct page_frag_cache page;
	struct page_frag_1k page_small;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);

/* Double check that napi_get_frags() allocates skbs with
 * skb->head being backed by slab, not a page fragment.
 * This is to make sure bug fixed in 3226b158e67c
 * ("net: avoid 32 x truesize under-estimation for tiny skbs")
 * does not accidentally come back.
 */
void napi_get_frags_check(struct napi_struct *napi)
{
	struct sk_buff *skb;

	local_bh_disable();
	skb = napi_get_frags(napi);
	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
	napi_free_frags(napi);
	local_bh_enable();
}

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	fragsz = SKB_DATA_ALIGN(fragsz);

	return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);
	if (in_hardirq() || irqs_disabled()) {
		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);

		data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
	} else {
		struct napi_alloc_cache *nc;

		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache);
		data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);

static struct sk_buff *napi_skb_cache_get(void)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;

	if (unlikely(!nc->skb_count)) {
		nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
						      GFP_ATOMIC,
						      NAPI_SKB_CACHE_BULK,
						      nc->skb_cache);
		if (unlikely(!nc->skb_count))
			return NULL;
	}

	skb = nc->skb_cache[--nc->skb_count];
	kasan_unpoison_object_data(skbuff_head_cache, skb);

	return skb;
}

/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
			       unsigned int frag_size)
{
	struct skb_shared_info *shinfo;
	unsigned int size = frag_size ? : ksize(data);

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb_set_end_offset(skb, size);
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;
	skb->alloc_cpu = raw_smp_processor_id();
	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	skb_set_kcov_handle(skb, kcov_common_handle());
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated by kmalloc() only if
 * @frag_size is 0, otherwise data should come from the page allocator
 * or vmalloc()
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 * Before IO, driver allocates only data buffer where NIC put incoming frame
 * Driver should add room at head (NET_SKB_PAD) and
 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 * After IO, driver calls build_skb(), to allocate sk_buff and populate it
 * before giving packet to stack.
 * RX rings only contains data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/* build_skb() is a wrapper over __build_skb(), that specifically
 * takes care of skb->head and skb->pfmemalloc
 * This means that if @frag_size is not zero, then @data must be backed
 * by a page fragment, not kmalloc() or vmalloc()
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (skb && frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	__build_skb_around(skb, data, frag_size);

	if (frag_size) {
		skb->head_frag = 1;
		if (page_is_pfmemalloc(virt_to_head_page(data)))
			skb->pfmemalloc = 1;
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Version of __build_skb() that uses NAPI percpu caches to obtain
 * skbuff_head instead of inplace allocation.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = napi_skb_cache_get();
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data, or 0 if head was kmalloced
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __napi_build_skb(data, frag_size);

	if (likely(skb) && frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}

	return skb;
}
EXPORT_SYMBOL(napi_build_skb);

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used.
 * Otherwise, the packet data may be discarded until enough
 * memory is free
 */
static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
			     bool *pfmemalloc)
{
	void *obj;
	bool ret_pfmemalloc = false;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 *
 */

/**
 * __alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @gfp_mask: allocation mask
 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *	instead of head cache and allocate a cloned (child) skb.
 *	If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *	allocations in case the data is required for writeback
 * @node: numa node to allocate memory on
 *
 * Allocate a new &sk_buff. The returned buffer has no headroom and a
 * tail room of at least size bytes. The object has a reference count
 * of one. The return is the buffer. On a failure the return is %NULL.
 *
 * Buffers may only be allocated from interrupts using a @gfp_mask of
 * %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct kmem_cache *cache;
	struct sk_buff *skb;
	unsigned int osize;
	bool pfmemalloc;
	u8 *data;

	cache = (flags & SKB_ALLOC_FCLONE)
		? skbuff_fclone_cache : skbuff_head_cache;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	/* Get the HEAD */
	if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
	    likely(node == NUMA_NO_NODE || node == numa_mem_id()))
		skb = napi_skb_cache_get();
	else
		skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
	if (unlikely(!skb))
		return NULL;
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	size = SKB_DATA_ALIGN(size);
	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
	if (unlikely(!data))
		goto nodata;
	/* kmalloc(size) might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	osize = ksize(data);
	size = SKB_WITH_OVERHEAD(osize);
	prefetchw(data + size);

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, osize);
	skb->pfmemalloc = pfmemalloc;

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);
	}

	return skb;

nodata:
	kmem_cache_free(cache, skb);
	return NULL;
}
EXPORT_SYMBOL(__alloc_skb);

/**
 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 * @dev: network device to receive on
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 * Allocate a new &sk_buff and assign it a usage count of one. The
 * buffer has NET_SKB_PAD headroom built in. Users should allocate
 * the headroom they think they need without accounting for the
 * built in space. The built in space is used for optimisations.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(1024) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	len = SKB_DATA_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
	} else {
		local_bh_disable();
		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = nc->pfmemalloc;
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

/**
 * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
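 *
 * A minimal usage sketch in a NAPI poll handler (illustrative only; the
 * buffer, length and GRO hand-off below are hypothetical, not taken from
 * a real driver):
 *
 *	skb = napi_alloc_skb(napi, pkt_len);
 *	if (unlikely(!skb))
 *		break;			(out of memory: stop polling this round)
 *	skb_put_data(skb, rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, napi->dev);
 *	napi_gro_receive(napi, skb);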
 */
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
				 gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
	len += NET_SKB_PAD + NET_IP_ALIGN;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 * When the small frag allocator is available, prefer it over kmalloc
	 * for small fragments
	 */
	if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
				  NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	nc = this_cpu_ptr(&napi_alloc_cache);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
		/* we are artificially inflating the allocation size, but
		 * that is not as bad as it may look like, as:
		 * - 'len' less than GRO_MAX_HEAD makes little sense
		 * - On most systems, larger 'len' values lead to fragment
		 *   size above 512 bytes
		 * - kmalloc would use the kmalloc-1k slab for such values
		 * - Builds with smaller GRO_MAX_HEAD will very likely do
		 *   little networking, as that implies no WiFi and no
		 *   tunnels support, and 32 bits arches.
		 */
		len = SZ_1K;

		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
	} else {
		len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		len = SKB_DATA_ALIGN(len);

		data = page_frag_alloc(&nc->page, len, gfp_mask);
		pfmemalloc = nc->page.pfmemalloc;
	}

	if (unlikely(!data))
		return NULL;

	skb = __napi_build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__napi_alloc_skb);

void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
		     int size, unsigned int truesize)
{
	skb_fill_page_desc(skb, i, page, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
		return false;
	return page_pool_return_skb_page(virt_to_page(data));
}

static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;

	if (skb->head_frag) {
		if (skb_pp_recycle(skb, head))
			return;
		skb_free_frag(head);
	} else {
		kfree(head);
	}
}

static void skb_release_data(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (skb->cloned &&
	    atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
			      &shinfo->dataref))
		goto exit;

	if (skb_zcopy(skb)) {
		bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

		skb_zcopy_clear(skb, true);
		if (skip_unref)
			goto free_head;
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

free_head:
	if (shinfo->frag_list)
		kfree_skb_list(shinfo->frag_list);

	skb_free_head(skb);
exit:
	/* When we clone an SKB we copy the recycling bit. The pp_recycle
	 * bit is only set on the head though, so in order to avoid races
	 * while trying to recycle fragments on __skb_frag_unref() we need
	 * to make one SKB responsible for triggering the recycle path.
	 * So disable the recycling bit if an SKB is cloned and we have
	 * additional references to the fragmented part of the SKB.
	 * Eventually the last SKB will have the recycling bit set and its
	 * dataref set to 0, which will trigger the recycling
	 */
	skb->pp_recycle = 0;
}

/*
 * Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(skbuff_head_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
		skb->destructor(skb);
	}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	nf_conntrack_put(skb_nfct(skb));
#endif
	skb_ext_put(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb);
}

/**
 * __kfree_skb - private function
 * @skb: buffer
 *
 * Free an sk_buff. Release anything attached to the buffer.
 * Clean the state. This is an internal helper function. Users should
 * always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

/**
 * kfree_skb_reason - free an sk_buff with special reason
 * @skb: buffer to free
 * @reason: reason why this skb is dropped
 *
 * Drop a reference to the buffer and free it if the usage count has
 * hit zero. Meanwhile, pass the drop reason to 'kfree_skb'
 * tracepoint.
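 *
 * A typical call site looks roughly like this (illustrative sketch; the
 * surrounding check and the reason code are just examples of the
 * SKB_DROP_REASON_* values):
 *
 *	if (!sk) {
 *		kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET);
 *		return 0;
 *	}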
 */
void __fix_address
kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
	if (unlikely(!skb_unref(skb)))
		return;

	DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);

	trace_kfree_skb(skb, __builtin_return_address(0), reason);
	__kfree_skb(skb);
}
EXPORT_SYMBOL(kfree_skb_reason);

void kfree_skb_list_reason(struct sk_buff *segs,
			   enum skb_drop_reason reason)
{
	while (segs) {
		struct sk_buff *next = segs->next;

		kfree_skb_reason(segs, reason);
		segs = next;
	}
}
EXPORT_SYMBOL(kfree_skb_list_reason);

/* Dump skb information and contents.
 *
 * Must only be called from net_ratelimit()-ed paths.
 *
 * Dumps whole packets if full_pkt, only headers otherwise.
 */
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
{
	struct skb_shared_info *sh = skb_shinfo(skb);
	struct net_device *dev = skb->dev;
	struct sock *sk = skb->sk;
	struct sk_buff *list_skb;
	bool has_mac, has_trans;
	int headroom, tailroom;
	int i, len, seg_len;

	if (full_pkt)
		len = skb->len;
	else
		len = min_t(int, skb->len, MAX_HEADER + 128);

	headroom = skb_headroom(skb);
	tailroom = skb_tailroom(skb);

	has_mac = skb_mac_header_was_set(skb);
	has_trans = skb_transport_header_was_set(skb);

	printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
	       "mac=(%d,%d) net=(%d,%d) trans=%d\n"
	       "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
	       "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
	       level, skb->len, headroom, skb_headlen(skb), tailroom,
	       has_mac ? skb->mac_header : -1,
	       has_mac ? skb_mac_header_len(skb) : -1,
	       skb->network_header,
	       has_trans ? skb_network_header_len(skb) : -1,
	       has_trans ?
			   skb->transport_header : -1,
	       sh->tx_flags, sh->nr_frags,
	       sh->gso_size, sh->gso_type, sh->gso_segs,
	       skb->csum, skb->ip_summed, skb->csum_complete_sw,
	       skb->csum_valid, skb->csum_level,
	       skb->hash, skb->sw_hash, skb->l4_hash,
	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);

	if (dev)
		printk("%sdev name=%s feat=%pNF\n",
		       level, dev->name, &dev->features);
	if (sk)
		printk("%ssk family=%hu type=%u proto=%u\n",
		       level, sk->sk_family, sk->sk_type, sk->sk_protocol);

	if (full_pkt && headroom)
		print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb->head, headroom, false);

	seg_len = min_t(int, skb_headlen(skb), len);
	if (seg_len)
		print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb->data, seg_len, false);
	len -= seg_len;

	if (full_pkt && tailroom)
		print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb_tail_pointer(skb), tailroom, false);

	for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		u32 p_off, p_len, copied;
		struct page *p;
		u8 *vaddr;

		skb_frag_foreach_page(frag, skb_frag_off(frag),
				      skb_frag_size(frag), p, p_off, p_len,
				      copied) {
			seg_len = min_t(int, p_len, len);
			vaddr = kmap_atomic(p);
			print_hex_dump(level, "skb frag: ",
				       DUMP_PREFIX_OFFSET,
				       16, 1, vaddr + p_off, seg_len, false);
			kunmap_atomic(vaddr);
			len -= seg_len;
			if (!len)
				break;
		}
	}

	if (full_pkt && skb_has_frag_list(skb)) {
		printk("skb fraglist:\n");
		skb_walk_frags(skb, list_skb)
			skb_dump(level, list_skb, true);
	}
}
EXPORT_SYMBOL(skb_dump);

/**
 * skb_tx_error - report an sk_buff xmit error
 * @skb: buffer that triggered an error
 *
 * Report xmit error if a device callback is tracking this skb.
 * skb must be freed afterwards.
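 *
 * A hedged usage sketch (illustrative; the DMA mapping failure path shown
 * here is generic, not lifted from a particular driver):
 *
 *	if (dma_mapping_error(dev, dma_addr)) {
 *		skb_tx_error(skb);
 *		dev_kfree_skb_any(skb);
 *		return NETDEV_TX_OK;
 *	}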
 */
void skb_tx_error(struct sk_buff *skb)
{
	if (skb) {
		skb_zcopy_downgrade_managed(skb);
		skb_zcopy_clear(skb, true);
	}
}
EXPORT_SYMBOL(skb_tx_error);

#ifdef CONFIG_TRACEPOINTS
/**
 * consume_skb - free an skbuff
 * @skb: buffer to free
 *
 * Drop a ref to the buffer and free it if the usage count has hit zero
 * Functions identically to kfree_skb, but kfree_skb assumes that the frame
 * is being dropped after a failure and notes that
 */
void consume_skb(struct sk_buff *skb)
{
	if (!skb_unref(skb))
		return;

	trace_consume_skb(skb);
	__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
#endif

/**
 * __consume_stateless_skb - free an skbuff, assuming it is stateless
 * @skb: buffer to free
 *
 * Alike consume_skb(), but this variant assumes that this is the last
 * skb reference and all the head states have been already dropped
 */
void __consume_stateless_skb(struct sk_buff *skb)
{
	trace_consume_skb(skb);
	skb_release_data(skb);
	kfree_skbmem(skb);
}

static void napi_skb_cache_put(struct sk_buff *skb)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	u32 i;

	kasan_poison_object_data(skbuff_head_cache, skb);
	nc->skb_cache[nc->skb_count++] = skb;

	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
		for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
			kasan_unpoison_object_data(skbuff_head_cache,
						   nc->skb_cache[i]);

		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF,
				     nc->skb_cache + NAPI_SKB_CACHE_HALF);
		nc->skb_count = NAPI_SKB_CACHE_HALF;
	}
}

void __kfree_skb_defer(struct sk_buff *skb)
{
	skb_release_all(skb);
	napi_skb_cache_put(skb);
}

void napi_skb_free_stolen_head(struct sk_buff *skb)
{
	if (unlikely(skb->slow_gro)) {
		nf_reset_ct(skb);
		skb_dst_drop(skb);
		skb_ext_put(skb);
		skb_orphan(skb);
		skb->slow_gro = 0;
	}
	napi_skb_cache_put(skb);
}

void napi_consume_skb(struct sk_buff *skb, int budget)
{
	/* Zero budget indicates a non-NAPI context called us, like netpoll */
	if (unlikely(!budget)) {
		dev_consume_skb_any(skb);
		return;
	}

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());

	if (!skb_unref(skb))
		return;

	/* if reaching here SKB is ready to free */
	trace_consume_skb(skb);

	/* if SKB is a clone, don't handle this case */
	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
		__kfree_skb(skb);
		return;
	}

	skb_release_all(skb);
	napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);

/* Make sure a field is contained by headers group */
#define CHECK_SKB_FIELD(field) \
	BUILD_BUG_ON(offsetof(struct sk_buff, field) !=		\
		     offsetof(struct sk_buff, headers.field));	\

static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	new->tstamp = old->tstamp;
	/* We do not copy old->sk */
	new->dev = old->dev;
	memcpy(new->cb, old->cb, sizeof(old->cb));
	skb_dst_copy(new, old);
	__skb_ext_copy(new, old);
	__nf_copy(new, old, false);

	/* Note : this field could be in the headers group.
	 * It is not yet because we do not want to have a 16 bit hole
	 */
	new->queue_mapping = old->queue_mapping;

	memcpy(&new->headers, &old->headers, sizeof(new->headers));
	CHECK_SKB_FIELD(protocol);
	CHECK_SKB_FIELD(csum);
	CHECK_SKB_FIELD(hash);
	CHECK_SKB_FIELD(priority);
	CHECK_SKB_FIELD(skb_iif);
	CHECK_SKB_FIELD(vlan_proto);
	CHECK_SKB_FIELD(vlan_tci);
	CHECK_SKB_FIELD(transport_header);
	CHECK_SKB_FIELD(network_header);
	CHECK_SKB_FIELD(mac_header);
	CHECK_SKB_FIELD(inner_protocol);
	CHECK_SKB_FIELD(inner_transport_header);
	CHECK_SKB_FIELD(inner_network_header);
	CHECK_SKB_FIELD(inner_mac_header);
	CHECK_SKB_FIELD(mark);
#ifdef CONFIG_NETWORK_SECMARK
	CHECK_SKB_FIELD(secmark);
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
	CHECK_SKB_FIELD(napi_id);
#endif
	CHECK_SKB_FIELD(alloc_cpu);
#ifdef CONFIG_XPS
	CHECK_SKB_FIELD(sender_cpu);
#endif
#ifdef CONFIG_NET_SCHED
	CHECK_SKB_FIELD(tc_index);
#endif

}

/*
 * You should not add any new code to this function.  Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x

	n->next = n->prev = NULL;
	n->sk = NULL;
	__copy_skb_header(n, skb);

	C(len);
	C(data_len);
	C(mac_len);
	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
	n->cloned = 1;
	n->nohdr = 0;
	n->peeked = 0;
	C(pfmemalloc);
	C(pp_recycle);
	n->destructor = NULL;
	C(tail);
	C(end);
	C(head);
	C(head_frag);
	C(data);
	C(truesize);
	refcount_set(&n->users, 1);

	atomic_inc(&(skb_shinfo(skb)->dataref));
	skb->cloned = 1;

	return n;
#undef C
}

/**
 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
 * @first: first sk_buff of the msg
 */
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
{
	struct sk_buff *n;

	n = alloc_skb(0, GFP_ATOMIC);
	if (!n)
		return NULL;

	n->len = first->len;
	n->data_len = first->len;
	n->truesize = first->truesize;

	skb_shinfo(n)->frag_list = first;

	__copy_skb_header(n, first);
	n->destructor = NULL;

	return n;
}
EXPORT_SYMBOL_GPL(alloc_skb_for_msg);

/**
 * skb_morph - morph one skb into another
 * @dst: the skb to receive the contents
 * @src: the skb to supply the contents
 *
 * This is identical to skb_clone except that the target skb is
 * supplied by the user.
 *
 * The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	skb_release_all(dst);
	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);

int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
	unsigned long max_pg, num_pg, new_pg, old_pg;
	struct user_struct *user;

	if (capable(CAP_IPC_LOCK) || !size)
		return 0;

	num_pg = (size >> PAGE_SHIFT) + 2;	/* worst case */
	max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	user = mmp->user ?
		: current_user();

	do {
		old_pg = atomic_long_read(&user->locked_vm);
		new_pg = old_pg + num_pg;
		if (new_pg > max_pg)
			return -ENOBUFS;
	} while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) !=
		 old_pg);

	if (!mmp->user) {
		mmp->user = get_uid(user);
		mmp->num_pg = num_pg;
	} else {
		mmp->num_pg += num_pg;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);

void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
	if (mmp->user) {
		atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
		free_uid(mmp->user);
	}
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);

static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
{
	struct ubuf_info_msgzc *uarg;
	struct sk_buff *skb;

	WARN_ON_ONCE(!in_task());

	skb = sock_omalloc(sk, 0, GFP_KERNEL);
	if (!skb)
		return NULL;

	BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
	uarg = (void *)skb->cb;
	uarg->mmp.user = NULL;

	if (mm_account_pinned_pages(&uarg->mmp, size)) {
		kfree_skb(skb);
		return NULL;
	}

	uarg->ubuf.callback = msg_zerocopy_callback;
	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
	uarg->len = 1;
	uarg->bytelen = size;
	uarg->zerocopy = 1;
	uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
	refcount_set(&uarg->ubuf.refcnt, 1);
	sock_hold(sk);

	return &uarg->ubuf;
}

static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
{
	return container_of((void *)uarg, struct sk_buff, cb);
}

struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
				       struct ubuf_info *uarg)
{
	if (uarg) {
		struct ubuf_info_msgzc *uarg_zc;
		const u32 byte_limit = 1 << 19;		/* limit to a few TSO */
		u32 bytelen, next;

		/* there might be non MSG_ZEROCOPY users */
		if (uarg->callback != msg_zerocopy_callback)
			return NULL;

		/* realloc only when socket is locked (TCP, UDP cork),
		 * so uarg->len and sk_zckey access is serialized
		 */
		if (!sock_owned_by_user(sk)) {
			WARN_ON_ONCE(1);
			return NULL;
		}

		uarg_zc = uarg_to_msgzc(uarg);
		bytelen = uarg_zc->bytelen + size;
		if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) {
			/* TCP can create new skb to attach new uarg */
			if (sk->sk_type == SOCK_STREAM)
				goto new_alloc;
			return NULL;
		}

		next = (u32)atomic_read(&sk->sk_zckey);
		if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
			if (mm_account_pinned_pages(&uarg_zc->mmp, size))
				return NULL;
			uarg_zc->len++;
			uarg_zc->bytelen = bytelen;
			atomic_set(&sk->sk_zckey, ++next);

			/* no extra ref when appending to datagram (MSG_MORE) */
			if (sk->sk_type == SOCK_STREAM)
				net_zcopy_get(uarg);

			return uarg;
		}
	}

new_alloc:
	return msg_zerocopy_alloc(sk, size);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);

static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
	u32 old_lo, old_hi;
	u64 sum_len;

	old_lo = serr->ee.ee_info;
	old_hi = serr->ee.ee_data;
	sum_len = old_hi - old_lo + 1ULL + len;

	if (sum_len >= (1ULL << 32))
		return false;

	if (lo != old_hi + 1)
		return false;

	serr->ee.ee_data += len;
	return true;
}

static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
{
	struct sk_buff *tail, *skb = skb_from_uarg(uarg);
	struct sock_exterr_skb *serr;
	struct sock *sk = skb->sk;
	struct sk_buff_head *q;
	unsigned long flags;
	bool is_zerocopy;
	u32 lo, hi;
	u16 len;

	mm_unaccount_pinned_pages(&uarg->mmp);

	/* if !len, there was only 1 call, and it was aborted
	 * so do not queue a completion notification
	 */
	if (!uarg->len || sock_flag(sk, SOCK_DEAD))
		goto release;

	len = uarg->len;
	lo = uarg->id;
	hi = uarg->id + len - 1;
	is_zerocopy = uarg->zerocopy;

	serr = SKB_EXT_ERR(skb);
	memset(serr, 0, sizeof(*serr));
	serr->ee.ee_errno = 0;
	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
	serr->ee.ee_data = hi;
	serr->ee.ee_info = lo;
	if (!is_zerocopy)
		serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;

	q = &sk->sk_error_queue;
	spin_lock_irqsave(&q->lock, flags);
	tail = skb_peek_tail(q);
	if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
	    !skb_zerocopy_notify_extend(tail, lo, len)) {
		__skb_queue_tail(q, skb);
		skb = NULL;
	}
	spin_unlock_irqrestore(&q->lock, flags);

	sk_error_report(sk);

release:
	consume_skb(skb);
	sock_put(sk);
}

void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
			   bool success)
{
	struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);

	uarg_zc->zerocopy = uarg_zc->zerocopy & success;

	if (refcount_dec_and_test(&uarg->refcnt))
		__msg_zerocopy_callback(uarg_zc);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_callback);

void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
	struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk;

	atomic_dec(&sk->sk_zckey);
	uarg_to_msgzc(uarg)->len--;

	if (have_uref)
		msg_zerocopy_callback(NULL, uarg, true);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);

int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
			     struct msghdr *msg, int len,
			     struct ubuf_info *uarg)
{
	struct ubuf_info *orig_uarg = skb_zcopy(skb);
	int err, orig_len = skb->len;

	/* An skb can only point to one uarg. This edge case happens when
	 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
	 */
	if (orig_uarg && uarg != orig_uarg)
		return -EEXIST;

	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
		struct sock *save_sk = skb->sk;

		/* Streams do not free skb on error. Reset to prev state.
		 */
		iov_iter_revert(&msg->msg_iter, skb->len - orig_len);
		skb->sk = sk;
		___pskb_trim(skb, orig_len);
		skb->sk = save_sk;
		return err;
	}

	skb_zcopy_set(skb, uarg, NULL);
	return skb->len - orig_len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);

void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
	int i;

	skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		skb_frag_ref(skb, i);
}
EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);

static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
			      gfp_t gfp_mask)
{
	if (skb_zcopy(orig)) {
		if (skb_zcopy(nskb)) {
			/* !gfp_mask callers are verified to !skb_zcopy(nskb) */
			if (!gfp_mask) {
				WARN_ON_ONCE(1);
				return -ENOMEM;
			}
			if (skb_uarg(nskb) == skb_uarg(orig))
				return 0;
			if (skb_copy_ubufs(nskb, GFP_ATOMIC))
				return -EIO;
		}
		skb_zcopy_set(nskb, skb_uarg(orig), NULL);
	}
	return 0;
}

/**
 * skb_copy_ubufs - copy userspace skb frags buffers to kernel
 * @skb: the skb to modify
 * @gfp_mask: allocation priority
 *
 * This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
 * It will copy all frags into kernel and drop the reference
 * to userspace pages.
 *
 * If this function is called from an interrupt gfp_mask() must be
 * %GFP_ATOMIC.
 *
 * Returns 0 on success or a negative error code on failure
 * to allocate kernel memory to copy to.
 */
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
	int num_frags = skb_shinfo(skb)->nr_frags;
	struct page *page, *head = NULL;
	int i, new_frags;
	u32 d_off;

	if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
		return -EINVAL;

	if (!num_frags)
		goto release;

	new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	for (i = 0; i < new_frags; i++) {
		page = alloc_page(gfp_mask);
		if (!page) {
			while (head) {
				struct page *next = (struct page *)page_private(head);
				put_page(head);
				head = next;
			}
			return -ENOMEM;
		}
		set_page_private(page, (unsigned long)head);
		head = page;
	}

	page = head;
	d_off = 0;
	for (i = 0; i < num_frags; i++) {
		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
		u32 p_off, p_len, copied;
		struct page *p;
		u8 *vaddr;

		skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
				      p, p_off, p_len, copied) {
			u32 copy, done = 0;
			vaddr = kmap_atomic(p);

			while (done < p_len) {
				if (d_off == PAGE_SIZE) {
					d_off = 0;
					page = (struct page *)page_private(page);
				}
				copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
				memcpy(page_address(page) + d_off,
				       vaddr + p_off + done, copy);
				done += copy;
				d_off += copy;
			}
			kunmap_atomic(vaddr);
		}
	}

	/* skb frags release userspace buffers */
	for (i = 0; i < num_frags; i++)
		skb_frag_unref(skb, i);

	/* skb frags point to kernel buffers */
	for (i = 0; i < new_frags - 1; i++) {
		__skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
		head = (struct page *)page_private(head);
	}
	__skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
	skb_shinfo(skb)->nr_frags = new_frags;

release:
	skb_zcopy_clear(skb, false);
	return 0;
}
EXPORT_SYMBOL_GPL(skb_copy_ubufs);

/**
 * skb_clone - duplicate an sk_buff
 * @skb: buffer to clone
 * @gfp_mask: allocation priority
 *
 * Duplicate an &sk_buff. The new one is not owned by a socket. Both
 * copies share the same packet data but not structure. The new
 * buffer has a reference count of 1. If the allocation fails the
 * function returns %NULL otherwise the new buffer is returned.
 *
 * If this function is called from an interrupt gfp_mask() must be
 * %GFP_ATOMIC.
 */
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
	struct sk_buff_fclones *fclones = container_of(skb,
						       struct sk_buff_fclones,
						       skb1);
	struct sk_buff *n;

	if (skb_orphan_frags(skb, gfp_mask))
		return NULL;

	if (skb->fclone == SKB_FCLONE_ORIG &&
	    refcount_read(&fclones->fclone_ref) == 1) {
		n = &fclones->skb2;
		refcount_set(&fclones->fclone_ref, 2);
		n->fclone = SKB_FCLONE_CLONE;
	} else {
		if (skb_pfmemalloc(skb))
			gfp_mask |= __GFP_MEMALLOC;

		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
		if (!n)
			return NULL;

		n->fclone = SKB_FCLONE_UNAVAILABLE;
	}

	return __skb_clone(n, skb);
}
EXPORT_SYMBOL(skb_clone);

void skb_headers_offset_update(struct sk_buff *skb, int off)
{
	/* Only adjust this if it actually is csum_start rather than csum */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		skb->csum_start += off;
	/* {transport,network,mac}_header and tail are relative to skb->head */
	skb->transport_header += off;
	skb->network_header += off;
	if (skb_mac_header_was_set(skb))
		skb->mac_header += off;
	skb->inner_transport_header += off;
	skb->inner_network_header += off;
	skb->inner_mac_header += off;
}
EXPORT_SYMBOL(skb_headers_offset_update);

void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
	__copy_skb_header(new, old);

	skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
	skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
	skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
EXPORT_SYMBOL(skb_copy_header);

static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
	if (skb_pfmemalloc(skb))
		return SKB_ALLOC_RX;
	return 0;
}

/**
 * skb_copy - create private copy of an sk_buff
 * @skb: buffer to copy
 * @gfp_mask: allocation priority
 *
 * Make a copy of both an &sk_buff and its data. This is used when the
 * caller wishes to modify the data and needs a private copy of the
 * data to alter. Returns %NULL on failure or the pointer to the buffer
 * on success. The returned buffer has a reference count of 1.
 *
 * As a by-product this function converts a non-linear &sk_buff to a linear
 * one, so that the &sk_buff becomes completely private and the caller is
 * allowed to modify all the data of the returned buffer. This means that
 * this function is not recommended for use in circumstances when only
 * the header is going to be modified. Use pskb_copy() instead.
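 *
 * Typical usage (illustrative sketch only):
 *
 *	nskb = skb_copy(skb, GFP_ATOMIC);
 *	if (!nskb)
 *		return -ENOMEM;
 *	(nskb's linear data may now be modified freely; the copy is private)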
 */
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
	int headerlen = skb_headroom(skb);
	unsigned int size = skb_end_offset(skb) + skb->data_len;
	struct sk_buff *n = __alloc_skb(size, gfp_mask,
					skb_alloc_rx_flag(skb), NUMA_NO_NODE);

	if (!n)
		return NULL;

	/* Set the data pointer */
	skb_reserve(n, headerlen);
	/* Set the tail pointer and length */
	skb_put(n, skb->len);

	BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));

	skb_copy_header(n, skb);
	return n;
}
EXPORT_SYMBOL(skb_copy);

/**
 * __pskb_copy_fclone - create copy of an sk_buff with private head.
 * @skb: buffer to copy
 * @headroom: headroom of new skb
 * @gfp_mask: allocation priority
 * @fclone: if true allocate the copy of the skb from the fclone
 *	cache instead of the head cache; it is recommended to set this
 *	to true for the cases where the copy will likely be cloned
 *
 * Make a copy of both an &sk_buff and part of its data, located
 * in header. Fragmented data remain shared. This is used when
 * the caller wishes to modify only header of &sk_buff and needs
 * private copy of the header to alter. Returns %NULL on failure
 * or the pointer to the buffer on success.
 * The returned buffer has a reference count of 1.
 */
struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
				   gfp_t gfp_mask, bool fclone)
{
	unsigned int size = skb_headlen(skb) + headroom;
	int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0);
	struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE);

	if (!n)
		goto out;

	/* Set the data pointer */
	skb_reserve(n, headroom);
	/* Set the tail pointer and length */
	skb_put(n, skb_headlen(skb));
	/* Copy the bytes */
	skb_copy_from_linear_data(skb, n->data, n->len);

	n->truesize += skb->data_len;
	n->data_len = skb->data_len;
	n->len = skb->len;

	if (skb_shinfo(skb)->nr_frags) {
		int i;

		if (skb_orphan_frags(skb, gfp_mask) ||
		    skb_zerocopy_clone(n, skb, gfp_mask)) {
			kfree_skb(n);
			n = NULL;
			goto out;
		}
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
			skb_frag_ref(skb, i);
		}
		skb_shinfo(n)->nr_frags = i;
	}

	if (skb_has_frag_list(skb)) {
		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
		skb_clone_fraglist(n);
	}

	skb_copy_header(n, skb);
out:
	return n;
}
EXPORT_SYMBOL(__pskb_copy_fclone);

/**
 * pskb_expand_head - reallocate header of &sk_buff
 * @skb: buffer to reallocate
 * @nhead: room to add at head
 * @ntail: room to add at tail
 * @gfp_mask: allocation priority
 *
 * Expands (or creates identical copy, if @nhead and @ntail are zero)
 * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
 * reference count of 1. Returns zero on success, or an error if
 * expansion failed; in that case the &sk_buff is not changed.
 *
 * All the pointers pointing into skb header may change and must be
 * reloaded after call to this function.
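 *
 * A minimal sketch of the usual pattern (illustrative; "needed" is a
 * hypothetical headroom requirement computed by the caller):
 *
 *	if (skb_headroom(skb) < needed &&
 *	    pskb_expand_head(skb, SKB_DATA_ALIGN(needed - skb_headroom(skb)),
 *			     0, GFP_ATOMIC))
 *		goto drop;
 *	(any cached pointers into skb->head must be re-read after this call;
 *	 many callers use the skb_cow_head() helper for this kind of check)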
 */
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
		     gfp_t gfp_mask)
{
	int i, osize = skb_end_offset(skb);
	int size = osize + nhead + ntail;
	long off;
	u8 *data;

	BUG_ON(nhead < 0);

	BUG_ON(skb_shared(skb));

	skb_zcopy_downgrade_managed(skb);

	size = SKB_DATA_ALIGN(size);

	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;
	data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
			       gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		goto nodata;
	size = SKB_WITH_OVERHEAD(ksize(data));

	/* Copy only real data... and, alas, header. This should be
	 * optimized for the cases when header is void.
	 */
	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb),
	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));

	/*
	 * if shinfo is shared we must drop the old head gracefully, but if it
	 * is not we can just drop the old head and let the existing refcount
	 * be since all we did is relocate the values
	 */
	if (skb_cloned(skb)) {
		if (skb_orphan_frags(skb, gfp_mask))
			goto nofrags;
		if (skb_zcopy(skb))
			refcount_inc(&skb_uarg(skb)->refcnt);
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_frag_ref(skb, i);

		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);

		skb_release_data(skb);
	} else {
		skb_free_head(skb);
	}
	off = (data + nhead) - skb->head;

	skb->head = data;
	skb->head_frag = 0;
	skb->data += off;

	skb_set_end_offset(skb, size);
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	off = nhead;
#endif
	skb->tail += off;
	skb_headers_offset_update(skb, nhead);
	skb->cloned = 0;
	skb->hdr_len = 0;
	skb->nohdr = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);

	skb_metadata_clear(skb);

	/* It is not generally safe to change skb->truesize.
	 * For the moment, we really care of rx path, or
	 * when skb is orphaned (not attached to a socket).
	 */
	if (!skb->sk || skb->destructor == sock_edemux)
		skb->truesize += size - osize;

	return 0;

nofrags:
	kfree(data);
nodata:
	return -ENOMEM;
}
EXPORT_SYMBOL(pskb_expand_head);

/* Make private copy of skb with writable head and some headroom */

struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
	struct sk_buff *skb2;
	int delta = headroom - skb_headroom(skb);

	if (delta <= 0)
		skb2 = pskb_copy(skb, GFP_ATOMIC);
	else {
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0,
					     GFP_ATOMIC)) {
			kfree_skb(skb2);
			skb2 = NULL;
		}
	}
	return skb2;
}
EXPORT_SYMBOL(skb_realloc_headroom);

int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
{
	unsigned int saved_end_offset, saved_truesize;
	struct skb_shared_info *shinfo;
	int res;

	saved_end_offset = skb_end_offset(skb);
	saved_truesize = skb->truesize;

	res = pskb_expand_head(skb, 0, 0, pri);
	if (res)
		return res;

	skb->truesize = saved_truesize;

	if (likely(skb_end_offset(skb) == saved_end_offset))
		return 0;

	shinfo = skb_shinfo(skb);

	/* We are about to change back skb->end,
	 * we need to move skb_shinfo() to its new location.
	 */
	memmove(skb->head + saved_end_offset,
		shinfo,
		offsetof(struct skb_shared_info, frags[shinfo->nr_frags]));

	skb_set_end_offset(skb, saved_end_offset);

	return 0;
}

/**
 * skb_expand_head - reallocate header of &sk_buff
 * @skb: buffer to reallocate
 * @headroom: needed headroom
 *
 * Unlike skb_realloc_headroom, this one does not allocate a new skb
 * if possible; copies skb->sk to new skb as needed
 * and frees original skb in case of failures.
 *
 * It expects increased headroom and generates a warning otherwise.
 */

struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
{
	int delta = headroom - skb_headroom(skb);
	int osize = skb_end_offset(skb);
	struct sock *sk = skb->sk;

	if (WARN_ONCE(delta <= 0,
		      "%s is expecting an increase in the headroom", __func__))
		return skb;

	delta = SKB_DATA_ALIGN(delta);
	/* pskb_expand_head() might crash, if skb is shared. */
	if (skb_shared(skb) || !is_skb_wmem(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

		if (unlikely(!nskb))
			goto fail;

		if (sk)
			skb_set_owner_w(nskb, sk);
		consume_skb(skb);
		skb = nskb;
	}
	if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC))
		goto fail;

	if (sk && is_skb_wmem(skb)) {
		delta = skb_end_offset(skb) - osize;
		refcount_add(delta, &sk->sk_wmem_alloc);
		skb->truesize += delta;
	}
	return skb;

fail:
	kfree_skb(skb);
	return NULL;
}
EXPORT_SYMBOL(skb_expand_head);

/**
 * skb_copy_expand - copy and expand sk_buff
 * @skb: buffer to copy
 * @newheadroom: new free bytes at head
 * @newtailroom: new free bytes at tail
 * @gfp_mask: allocation priority
 *
 * Make a copy of both an &sk_buff and its data and while doing so
 * allocate additional space.
2022 * 2023 * This is used when the caller wishes to modify the data and needs a 2024 * private copy of the data to alter as well as more space for new fields. 2025 * Returns %NULL on failure or the pointer to the buffer 2026 * on success. The returned buffer has a reference count of 1. 2027 * 2028 * You must pass %GFP_ATOMIC as the allocation priority if this function 2029 * is called from an interrupt. 2030 */ 2031 struct sk_buff *skb_copy_expand(const struct sk_buff *skb, 2032 int newheadroom, int newtailroom, 2033 gfp_t gfp_mask) 2034 { 2035 /* 2036 * Allocate the copy buffer 2037 */ 2038 struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, 2039 gfp_mask, skb_alloc_rx_flag(skb), 2040 NUMA_NO_NODE); 2041 int oldheadroom = skb_headroom(skb); 2042 int head_copy_len, head_copy_off; 2043 2044 if (!n) 2045 return NULL; 2046 2047 skb_reserve(n, newheadroom); 2048 2049 /* Set the tail pointer and length */ 2050 skb_put(n, skb->len); 2051 2052 head_copy_len = oldheadroom; 2053 head_copy_off = 0; 2054 if (newheadroom <= head_copy_len) 2055 head_copy_len = newheadroom; 2056 else 2057 head_copy_off = newheadroom - head_copy_len; 2058 2059 /* Copy the linear header and data. */ 2060 BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, 2061 skb->len + head_copy_len)); 2062 2063 skb_copy_header(n, skb); 2064 2065 skb_headers_offset_update(n, newheadroom - oldheadroom); 2066 2067 return n; 2068 } 2069 EXPORT_SYMBOL(skb_copy_expand); 2070 2071 /** 2072 * __skb_pad - zero pad the tail of an skb 2073 * @skb: buffer to pad 2074 * @pad: space to pad 2075 * @free_on_error: free buffer on error 2076 * 2077 * Ensure that a buffer is followed by a padding area that is zero 2078 * filled. Used by network drivers which may DMA or transfer data 2079 * beyond the buffer end onto the wire. 2080 * 2081 * May return error in out of memory cases. The skb is freed on error 2082 * if @free_on_error is true. 2083 */ 2084 2085 int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) 2086 { 2087 int err; 2088 int ntail; 2089 2090 /* If the skbuff is non linear tailroom is always zero.. */ 2091 if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { 2092 memset(skb->data+skb->len, 0, pad); 2093 return 0; 2094 } 2095 2096 ntail = skb->data_len + pad - (skb->end - skb->tail); 2097 if (likely(skb_cloned(skb) || ntail > 0)) { 2098 err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); 2099 if (unlikely(err)) 2100 goto free_skb; 2101 } 2102 2103 /* FIXME: The use of this function with non-linear skb's really needs 2104 * to be audited. 2105 */ 2106 err = skb_linearize(skb); 2107 if (unlikely(err)) 2108 goto free_skb; 2109 2110 memset(skb->data + skb->len, 0, pad); 2111 return 0; 2112 2113 free_skb: 2114 if (free_on_error) 2115 kfree_skb(skb); 2116 return err; 2117 } 2118 EXPORT_SYMBOL(__skb_pad); 2119 2120 /** 2121 * pskb_put - add data to the tail of a potentially fragmented buffer 2122 * @skb: start of the buffer to use 2123 * @tail: tail fragment of the buffer to use 2124 * @len: amount of data to add 2125 * 2126 * This function extends the used data area of the potentially 2127 * fragmented buffer. @tail must be the last fragment of @skb -- or 2128 * @skb itself. If this would exceed the total buffer size the kernel 2129 * will panic. A pointer to the first byte of the extra data is 2130 * returned. 
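 *
 * Hedged sketch (names are illustrative): @trailer is the last fragment of
 * @skb, e.g. as located by skb_cow_data(), and @pad is the number of
 * padding bytes to append:
 *
 *	u8 *tail = pskb_put(skb, trailer, pad);
 *
 *	memset(tail, 0, pad);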
2131 */ 2132 2133 void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) 2134 { 2135 if (tail != skb) { 2136 skb->data_len += len; 2137 skb->len += len; 2138 } 2139 return skb_put(tail, len); 2140 } 2141 EXPORT_SYMBOL_GPL(pskb_put); 2142 2143 /** 2144 * skb_put - add data to a buffer 2145 * @skb: buffer to use 2146 * @len: amount of data to add 2147 * 2148 * This function extends the used data area of the buffer. If this would 2149 * exceed the total buffer size the kernel will panic. A pointer to the 2150 * first byte of the extra data is returned. 2151 */ 2152 void *skb_put(struct sk_buff *skb, unsigned int len) 2153 { 2154 void *tmp = skb_tail_pointer(skb); 2155 SKB_LINEAR_ASSERT(skb); 2156 skb->tail += len; 2157 skb->len += len; 2158 if (unlikely(skb->tail > skb->end)) 2159 skb_over_panic(skb, len, __builtin_return_address(0)); 2160 return tmp; 2161 } 2162 EXPORT_SYMBOL(skb_put); 2163 2164 /** 2165 * skb_push - add data to the start of a buffer 2166 * @skb: buffer to use 2167 * @len: amount of data to add 2168 * 2169 * This function extends the used data area of the buffer at the buffer 2170 * start. If this would exceed the total buffer headroom the kernel will 2171 * panic. A pointer to the first byte of the extra data is returned. 2172 */ 2173 void *skb_push(struct sk_buff *skb, unsigned int len) 2174 { 2175 skb->data -= len; 2176 skb->len += len; 2177 if (unlikely(skb->data < skb->head)) 2178 skb_under_panic(skb, len, __builtin_return_address(0)); 2179 return skb->data; 2180 } 2181 EXPORT_SYMBOL(skb_push); 2182 2183 /** 2184 * skb_pull - remove data from the start of a buffer 2185 * @skb: buffer to use 2186 * @len: amount of data to remove 2187 * 2188 * This function removes data from the start of a buffer, returning 2189 * the memory to the headroom. A pointer to the next data in the buffer 2190 * is returned. Once the data has been pulled future pushes will overwrite 2191 * the old data. 2192 */ 2193 void *skb_pull(struct sk_buff *skb, unsigned int len) 2194 { 2195 return skb_pull_inline(skb, len); 2196 } 2197 EXPORT_SYMBOL(skb_pull); 2198 2199 /** 2200 * skb_pull_data - remove data from the start of a buffer returning its 2201 * original position. 2202 * @skb: buffer to use 2203 * @len: amount of data to remove 2204 * 2205 * This function removes data from the start of a buffer, returning 2206 * the memory to the headroom. A pointer to the original data in the buffer 2207 * is returned after checking if there is enough data to pull. Once the 2208 * data has been pulled future pushes will overwrite the old data. 2209 */ 2210 void *skb_pull_data(struct sk_buff *skb, size_t len) 2211 { 2212 void *data = skb->data; 2213 2214 if (skb->len < len) 2215 return NULL; 2216 2217 skb_pull(skb, len); 2218 2219 return data; 2220 } 2221 EXPORT_SYMBOL(skb_pull_data); 2222 2223 /** 2224 * skb_trim - remove end from a buffer 2225 * @skb: buffer to alter 2226 * @len: new length 2227 * 2228 * Cut the length of a buffer down by removing data from the tail. If 2229 * the buffer is already under the length specified it is not modified. 2230 * The skb must be linear. 2231 */ 2232 void skb_trim(struct sk_buff *skb, unsigned int len) 2233 { 2234 if (skb->len > len) 2235 __skb_trim(skb, len); 2236 } 2237 EXPORT_SYMBOL(skb_trim); 2238 2239 /* Trims skb to length len. It can change skb pointers. 
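 *
 * Most callers should go through pskb_trim() (or pskb_trim_rcsum() on the
 * receive path) rather than calling this directly; a minimal, illustrative
 * sketch:
 *
 *	if (pskb_trim(skb, new_len))
 *		goto drop;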
2240 */ 2241 2242 int ___pskb_trim(struct sk_buff *skb, unsigned int len) 2243 { 2244 struct sk_buff **fragp; 2245 struct sk_buff *frag; 2246 int offset = skb_headlen(skb); 2247 int nfrags = skb_shinfo(skb)->nr_frags; 2248 int i; 2249 int err; 2250 2251 if (skb_cloned(skb) && 2252 unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) 2253 return err; 2254 2255 i = 0; 2256 if (offset >= len) 2257 goto drop_pages; 2258 2259 for (; i < nfrags; i++) { 2260 int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); 2261 2262 if (end < len) { 2263 offset = end; 2264 continue; 2265 } 2266 2267 skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); 2268 2269 drop_pages: 2270 skb_shinfo(skb)->nr_frags = i; 2271 2272 for (; i < nfrags; i++) 2273 skb_frag_unref(skb, i); 2274 2275 if (skb_has_frag_list(skb)) 2276 skb_drop_fraglist(skb); 2277 goto done; 2278 } 2279 2280 for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); 2281 fragp = &frag->next) { 2282 int end = offset + frag->len; 2283 2284 if (skb_shared(frag)) { 2285 struct sk_buff *nfrag; 2286 2287 nfrag = skb_clone(frag, GFP_ATOMIC); 2288 if (unlikely(!nfrag)) 2289 return -ENOMEM; 2290 2291 nfrag->next = frag->next; 2292 consume_skb(frag); 2293 frag = nfrag; 2294 *fragp = frag; 2295 } 2296 2297 if (end < len) { 2298 offset = end; 2299 continue; 2300 } 2301 2302 if (end > len && 2303 unlikely((err = pskb_trim(frag, len - offset)))) 2304 return err; 2305 2306 if (frag->next) 2307 skb_drop_list(&frag->next); 2308 break; 2309 } 2310 2311 done: 2312 if (len > skb_headlen(skb)) { 2313 skb->data_len -= skb->len - len; 2314 skb->len = len; 2315 } else { 2316 skb->len = len; 2317 skb->data_len = 0; 2318 skb_set_tail_pointer(skb, len); 2319 } 2320 2321 if (!skb->sk || skb->destructor == sock_edemux) 2322 skb_condense(skb); 2323 return 0; 2324 } 2325 EXPORT_SYMBOL(___pskb_trim); 2326 2327 /* Note : use pskb_trim_rcsum() instead of calling this directly 2328 */ 2329 int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) 2330 { 2331 if (skb->ip_summed == CHECKSUM_COMPLETE) { 2332 int delta = skb->len - len; 2333 2334 skb->csum = csum_block_sub(skb->csum, 2335 skb_checksum(skb, len, delta, 0), 2336 len); 2337 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { 2338 int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; 2339 int offset = skb_checksum_start_offset(skb) + skb->csum_offset; 2340 2341 if (offset + sizeof(__sum16) > hdlen) 2342 return -EINVAL; 2343 } 2344 return __pskb_trim(skb, len); 2345 } 2346 EXPORT_SYMBOL(pskb_trim_rcsum_slow); 2347 2348 /** 2349 * __pskb_pull_tail - advance tail of skb header 2350 * @skb: buffer to reallocate 2351 * @delta: number of bytes to advance tail 2352 * 2353 * The function makes a sense only on a fragmented &sk_buff, 2354 * it expands header moving its tail forward and copying necessary 2355 * data from fragmented part. 2356 * 2357 * &sk_buff MUST have reference count of 1. 2358 * 2359 * Returns %NULL (and &sk_buff does not change) if pull failed 2360 * or value of new tail of skb in the case of success. 2361 * 2362 * All the pointers pointing into skb header may change and must be 2363 * reloaded after call to this function. 2364 */ 2365 2366 /* Moves tail of skb head forward, copying data from fragmented part, 2367 * when it is necessary. 2368 * 1. It may fail due to malloc failure. 2369 * 2. It may change skb pointers. 2370 * 2371 * It is pretty complicated. Luckily, it is called only in exceptional cases. 
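 *
 * It is typically reached through pskb_may_pull() when a header extends
 * past the linear area; a hedged sketch:
 *
 *	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
 *		goto drop;
 *
 * After the check, the first sizeof(struct udphdr) bytes are guaranteed to
 * live in the linear area pointed to by skb->data.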
2372 */ 2373 void *__pskb_pull_tail(struct sk_buff *skb, int delta) 2374 { 2375 /* If skb has not enough free space at tail, get new one 2376 * plus 128 bytes for future expansions. If we have enough 2377 * room at tail, reallocate without expansion only if skb is cloned. 2378 */ 2379 int i, k, eat = (skb->tail + delta) - skb->end; 2380 2381 if (eat > 0 || skb_cloned(skb)) { 2382 if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, 2383 GFP_ATOMIC)) 2384 return NULL; 2385 } 2386 2387 BUG_ON(skb_copy_bits(skb, skb_headlen(skb), 2388 skb_tail_pointer(skb), delta)); 2389 2390 /* Optimization: no fragments, no reasons to preestimate 2391 * size of pulled pages. Superb. 2392 */ 2393 if (!skb_has_frag_list(skb)) 2394 goto pull_pages; 2395 2396 /* Estimate size of pulled pages. */ 2397 eat = delta; 2398 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2399 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2400 2401 if (size >= eat) 2402 goto pull_pages; 2403 eat -= size; 2404 } 2405 2406 /* If we need update frag list, we are in troubles. 2407 * Certainly, it is possible to add an offset to skb data, 2408 * but taking into account that pulling is expected to 2409 * be very rare operation, it is worth to fight against 2410 * further bloating skb head and crucify ourselves here instead. 2411 * Pure masohism, indeed. 8)8) 2412 */ 2413 if (eat) { 2414 struct sk_buff *list = skb_shinfo(skb)->frag_list; 2415 struct sk_buff *clone = NULL; 2416 struct sk_buff *insp = NULL; 2417 2418 do { 2419 if (list->len <= eat) { 2420 /* Eaten as whole. */ 2421 eat -= list->len; 2422 list = list->next; 2423 insp = list; 2424 } else { 2425 /* Eaten partially. */ 2426 2427 if (skb_shared(list)) { 2428 /* Sucks! We need to fork list. :-( */ 2429 clone = skb_clone(list, GFP_ATOMIC); 2430 if (!clone) 2431 return NULL; 2432 insp = list->next; 2433 list = clone; 2434 } else { 2435 /* This may be pulled without 2436 * problems. */ 2437 insp = list; 2438 } 2439 if (!pskb_pull(list, eat)) { 2440 kfree_skb(clone); 2441 return NULL; 2442 } 2443 break; 2444 } 2445 } while (eat); 2446 2447 /* Free pulled out fragments. */ 2448 while ((list = skb_shinfo(skb)->frag_list) != insp) { 2449 skb_shinfo(skb)->frag_list = list->next; 2450 consume_skb(list); 2451 } 2452 /* And insert new clone at head. */ 2453 if (clone) { 2454 clone->next = list; 2455 skb_shinfo(skb)->frag_list = clone; 2456 } 2457 } 2458 /* Success! Now we may commit changes to skb data. */ 2459 2460 pull_pages: 2461 eat = delta; 2462 k = 0; 2463 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2464 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2465 2466 if (size <= eat) { 2467 skb_frag_unref(skb, i); 2468 eat -= size; 2469 } else { 2470 skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; 2471 2472 *frag = skb_shinfo(skb)->frags[i]; 2473 if (eat) { 2474 skb_frag_off_add(frag, eat); 2475 skb_frag_size_sub(frag, eat); 2476 if (!i) 2477 goto end; 2478 eat = 0; 2479 } 2480 k++; 2481 } 2482 } 2483 skb_shinfo(skb)->nr_frags = k; 2484 2485 end: 2486 skb->tail += delta; 2487 skb->data_len -= delta; 2488 2489 if (!skb->data_len) 2490 skb_zcopy_clear(skb, false); 2491 2492 return skb_tail_pointer(skb); 2493 } 2494 EXPORT_SYMBOL(__pskb_pull_tail); 2495 2496 /** 2497 * skb_copy_bits - copy bits from skb to kernel buffer 2498 * @skb: source skb 2499 * @offset: offset in source 2500 * @to: destination buffer 2501 * @len: number of bytes to copy 2502 * 2503 * Copy the specified number of bytes from the source skb to the 2504 * destination buffer. 2505 * 2506 * CAUTION ! 
: 2507 * If its prototype is ever changed, 2508 * check arch/{*}/net/{*}.S files, 2509 * since it is called from BPF assembly code. 2510 */ 2511 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) 2512 { 2513 int start = skb_headlen(skb); 2514 struct sk_buff *frag_iter; 2515 int i, copy; 2516 2517 if (offset > (int)skb->len - len) 2518 goto fault; 2519 2520 /* Copy header. */ 2521 if ((copy = start - offset) > 0) { 2522 if (copy > len) 2523 copy = len; 2524 skb_copy_from_linear_data_offset(skb, offset, to, copy); 2525 if ((len -= copy) == 0) 2526 return 0; 2527 offset += copy; 2528 to += copy; 2529 } 2530 2531 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2532 int end; 2533 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 2534 2535 WARN_ON(start > offset + len); 2536 2537 end = start + skb_frag_size(f); 2538 if ((copy = end - offset) > 0) { 2539 u32 p_off, p_len, copied; 2540 struct page *p; 2541 u8 *vaddr; 2542 2543 if (copy > len) 2544 copy = len; 2545 2546 skb_frag_foreach_page(f, 2547 skb_frag_off(f) + offset - start, 2548 copy, p, p_off, p_len, copied) { 2549 vaddr = kmap_atomic(p); 2550 memcpy(to + copied, vaddr + p_off, p_len); 2551 kunmap_atomic(vaddr); 2552 } 2553 2554 if ((len -= copy) == 0) 2555 return 0; 2556 offset += copy; 2557 to += copy; 2558 } 2559 start = end; 2560 } 2561 2562 skb_walk_frags(skb, frag_iter) { 2563 int end; 2564 2565 WARN_ON(start > offset + len); 2566 2567 end = start + frag_iter->len; 2568 if ((copy = end - offset) > 0) { 2569 if (copy > len) 2570 copy = len; 2571 if (skb_copy_bits(frag_iter, offset - start, to, copy)) 2572 goto fault; 2573 if ((len -= copy) == 0) 2574 return 0; 2575 offset += copy; 2576 to += copy; 2577 } 2578 start = end; 2579 } 2580 2581 if (!len) 2582 return 0; 2583 2584 fault: 2585 return -EFAULT; 2586 } 2587 EXPORT_SYMBOL(skb_copy_bits); 2588 2589 /* 2590 * Callback from splice_to_pipe(), if we need to release some pages 2591 * at the end of the spd in case we error'ed out in filling the pipe. 2592 */ 2593 static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) 2594 { 2595 put_page(spd->pages[i]); 2596 } 2597 2598 static struct page *linear_to_page(struct page *page, unsigned int *len, 2599 unsigned int *offset, 2600 struct sock *sk) 2601 { 2602 struct page_frag *pfrag = sk_page_frag(sk); 2603 2604 if (!sk_page_frag_refill(sk, pfrag)) 2605 return NULL; 2606 2607 *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); 2608 2609 memcpy(page_address(pfrag->page) + pfrag->offset, 2610 page_address(page) + *offset, *len); 2611 *offset = pfrag->offset; 2612 pfrag->offset += *len; 2613 2614 return pfrag->page; 2615 } 2616 2617 static bool spd_can_coalesce(const struct splice_pipe_desc *spd, 2618 struct page *page, 2619 unsigned int offset) 2620 { 2621 return spd->nr_pages && 2622 spd->pages[spd->nr_pages - 1] == page && 2623 (spd->partial[spd->nr_pages - 1].offset + 2624 spd->partial[spd->nr_pages - 1].len == offset); 2625 } 2626 2627 /* 2628 * Fill page/offset/length into spd, if it can hold more pages. 
2629 */ 2630 static bool spd_fill_page(struct splice_pipe_desc *spd, 2631 struct pipe_inode_info *pipe, struct page *page, 2632 unsigned int *len, unsigned int offset, 2633 bool linear, 2634 struct sock *sk) 2635 { 2636 if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) 2637 return true; 2638 2639 if (linear) { 2640 page = linear_to_page(page, len, &offset, sk); 2641 if (!page) 2642 return true; 2643 } 2644 if (spd_can_coalesce(spd, page, offset)) { 2645 spd->partial[spd->nr_pages - 1].len += *len; 2646 return false; 2647 } 2648 get_page(page); 2649 spd->pages[spd->nr_pages] = page; 2650 spd->partial[spd->nr_pages].len = *len; 2651 spd->partial[spd->nr_pages].offset = offset; 2652 spd->nr_pages++; 2653 2654 return false; 2655 } 2656 2657 static bool __splice_segment(struct page *page, unsigned int poff, 2658 unsigned int plen, unsigned int *off, 2659 unsigned int *len, 2660 struct splice_pipe_desc *spd, bool linear, 2661 struct sock *sk, 2662 struct pipe_inode_info *pipe) 2663 { 2664 if (!*len) 2665 return true; 2666 2667 /* skip this segment if already processed */ 2668 if (*off >= plen) { 2669 *off -= plen; 2670 return false; 2671 } 2672 2673 /* ignore any bits we already processed */ 2674 poff += *off; 2675 plen -= *off; 2676 *off = 0; 2677 2678 do { 2679 unsigned int flen = min(*len, plen); 2680 2681 if (spd_fill_page(spd, pipe, page, &flen, poff, 2682 linear, sk)) 2683 return true; 2684 poff += flen; 2685 plen -= flen; 2686 *len -= flen; 2687 } while (*len && plen); 2688 2689 return false; 2690 } 2691 2692 /* 2693 * Map linear and fragment data from the skb to spd. It reports true if the 2694 * pipe is full or if we already spliced the requested length. 2695 */ 2696 static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, 2697 unsigned int *offset, unsigned int *len, 2698 struct splice_pipe_desc *spd, struct sock *sk) 2699 { 2700 int seg; 2701 struct sk_buff *iter; 2702 2703 /* map the linear part : 2704 * If skb->head_frag is set, this 'linear' part is backed by a 2705 * fragment, and if the head is not shared with any clones then 2706 * we can avoid a copy since we own the head portion of this page. 2707 */ 2708 if (__splice_segment(virt_to_page(skb->data), 2709 (unsigned long) skb->data & (PAGE_SIZE - 1), 2710 skb_headlen(skb), 2711 offset, len, spd, 2712 skb_head_is_locked(skb), 2713 sk, pipe)) 2714 return true; 2715 2716 /* 2717 * then map the fragments 2718 */ 2719 for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { 2720 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; 2721 2722 if (__splice_segment(skb_frag_page(f), 2723 skb_frag_off(f), skb_frag_size(f), 2724 offset, len, spd, false, sk, pipe)) 2725 return true; 2726 } 2727 2728 skb_walk_frags(skb, iter) { 2729 if (*offset >= iter->len) { 2730 *offset -= iter->len; 2731 continue; 2732 } 2733 /* __skb_splice_bits() only fails if the output has no room 2734 * left, so no point in going over the frag_list for the error 2735 * case. 2736 */ 2737 if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) 2738 return true; 2739 } 2740 2741 return false; 2742 } 2743 2744 /* 2745 * Map data from the skb to a pipe. Should handle both the linear part, 2746 * the fragments, and the frag list. 
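 *
 * Hedged usage sketch (as a protocol's splice_read helper might call it;
 * the names below are illustrative):
 *
 *	ret = skb_splice_bits(skb, sk, offset, pipe, want, flags);
 *	if (ret > 0)
 *		spliced += ret;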
2747 */ 2748 int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, 2749 struct pipe_inode_info *pipe, unsigned int tlen, 2750 unsigned int flags) 2751 { 2752 struct partial_page partial[MAX_SKB_FRAGS]; 2753 struct page *pages[MAX_SKB_FRAGS]; 2754 struct splice_pipe_desc spd = { 2755 .pages = pages, 2756 .partial = partial, 2757 .nr_pages_max = MAX_SKB_FRAGS, 2758 .ops = &nosteal_pipe_buf_ops, 2759 .spd_release = sock_spd_release, 2760 }; 2761 int ret = 0; 2762 2763 __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); 2764 2765 if (spd.nr_pages) 2766 ret = splice_to_pipe(pipe, &spd); 2767 2768 return ret; 2769 } 2770 EXPORT_SYMBOL_GPL(skb_splice_bits); 2771 2772 static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, 2773 struct kvec *vec, size_t num, size_t size) 2774 { 2775 struct socket *sock = sk->sk_socket; 2776 2777 if (!sock) 2778 return -EINVAL; 2779 return kernel_sendmsg(sock, msg, vec, num, size); 2780 } 2781 2782 static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, 2783 size_t size, int flags) 2784 { 2785 struct socket *sock = sk->sk_socket; 2786 2787 if (!sock) 2788 return -EINVAL; 2789 return kernel_sendpage(sock, page, offset, size, flags); 2790 } 2791 2792 typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, 2793 struct kvec *vec, size_t num, size_t size); 2794 typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, 2795 size_t size, int flags); 2796 static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, 2797 int len, sendmsg_func sendmsg, sendpage_func sendpage) 2798 { 2799 unsigned int orig_len = len; 2800 struct sk_buff *head = skb; 2801 unsigned short fragidx; 2802 int slen, ret; 2803 2804 do_frag_list: 2805 2806 /* Deal with head data */ 2807 while (offset < skb_headlen(skb) && len) { 2808 struct kvec kv; 2809 struct msghdr msg; 2810 2811 slen = min_t(int, len, skb_headlen(skb) - offset); 2812 kv.iov_base = skb->data + offset; 2813 kv.iov_len = slen; 2814 memset(&msg, 0, sizeof(msg)); 2815 msg.msg_flags = MSG_DONTWAIT; 2816 2817 ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, 2818 sendmsg_unlocked, sk, &msg, &kv, 1, slen); 2819 if (ret <= 0) 2820 goto error; 2821 2822 offset += ret; 2823 len -= ret; 2824 } 2825 2826 /* All the data was skb head? 
*/ 2827 if (!len) 2828 goto out; 2829 2830 /* Make offset relative to start of frags */ 2831 offset -= skb_headlen(skb); 2832 2833 /* Find where we are in frag list */ 2834 for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 2835 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 2836 2837 if (offset < skb_frag_size(frag)) 2838 break; 2839 2840 offset -= skb_frag_size(frag); 2841 } 2842 2843 for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 2844 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 2845 2846 slen = min_t(size_t, len, skb_frag_size(frag) - offset); 2847 2848 while (slen) { 2849 ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, 2850 sendpage_unlocked, sk, 2851 skb_frag_page(frag), 2852 skb_frag_off(frag) + offset, 2853 slen, MSG_DONTWAIT); 2854 if (ret <= 0) 2855 goto error; 2856 2857 len -= ret; 2858 offset += ret; 2859 slen -= ret; 2860 } 2861 2862 offset = 0; 2863 } 2864 2865 if (len) { 2866 /* Process any frag lists */ 2867 2868 if (skb == head) { 2869 if (skb_has_frag_list(skb)) { 2870 skb = skb_shinfo(skb)->frag_list; 2871 goto do_frag_list; 2872 } 2873 } else if (skb->next) { 2874 skb = skb->next; 2875 goto do_frag_list; 2876 } 2877 } 2878 2879 out: 2880 return orig_len - len; 2881 2882 error: 2883 return orig_len == len ? ret : orig_len - len; 2884 } 2885 2886 /* Send skb data on a socket. Socket must be locked. */ 2887 int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, 2888 int len) 2889 { 2890 return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, 2891 kernel_sendpage_locked); 2892 } 2893 EXPORT_SYMBOL_GPL(skb_send_sock_locked); 2894 2895 /* Send skb data on a socket. Socket must be unlocked. */ 2896 int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) 2897 { 2898 return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 2899 sendpage_unlocked); 2900 } 2901 2902 /** 2903 * skb_store_bits - store bits from kernel buffer to skb 2904 * @skb: destination buffer 2905 * @offset: offset in destination 2906 * @from: source buffer 2907 * @len: number of bytes to copy 2908 * 2909 * Copy the specified number of bytes from the source buffer to the 2910 * destination skb. This function handles all the messy bits of 2911 * traversing fragment lists and such. 
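 *
 * A minimal sketch (the offset name and the stored value are purely
 * illustrative):
 *
 *	__be16 zero = 0;
 *
 *	if (skb_store_bits(skb, csum_off, &zero, sizeof(zero)))
 *		return -EFAULT;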
2912 */ 2913 2914 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) 2915 { 2916 int start = skb_headlen(skb); 2917 struct sk_buff *frag_iter; 2918 int i, copy; 2919 2920 if (offset > (int)skb->len - len) 2921 goto fault; 2922 2923 if ((copy = start - offset) > 0) { 2924 if (copy > len) 2925 copy = len; 2926 skb_copy_to_linear_data_offset(skb, offset, from, copy); 2927 if ((len -= copy) == 0) 2928 return 0; 2929 offset += copy; 2930 from += copy; 2931 } 2932 2933 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2934 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2935 int end; 2936 2937 WARN_ON(start > offset + len); 2938 2939 end = start + skb_frag_size(frag); 2940 if ((copy = end - offset) > 0) { 2941 u32 p_off, p_len, copied; 2942 struct page *p; 2943 u8 *vaddr; 2944 2945 if (copy > len) 2946 copy = len; 2947 2948 skb_frag_foreach_page(frag, 2949 skb_frag_off(frag) + offset - start, 2950 copy, p, p_off, p_len, copied) { 2951 vaddr = kmap_atomic(p); 2952 memcpy(vaddr + p_off, from + copied, p_len); 2953 kunmap_atomic(vaddr); 2954 } 2955 2956 if ((len -= copy) == 0) 2957 return 0; 2958 offset += copy; 2959 from += copy; 2960 } 2961 start = end; 2962 } 2963 2964 skb_walk_frags(skb, frag_iter) { 2965 int end; 2966 2967 WARN_ON(start > offset + len); 2968 2969 end = start + frag_iter->len; 2970 if ((copy = end - offset) > 0) { 2971 if (copy > len) 2972 copy = len; 2973 if (skb_store_bits(frag_iter, offset - start, 2974 from, copy)) 2975 goto fault; 2976 if ((len -= copy) == 0) 2977 return 0; 2978 offset += copy; 2979 from += copy; 2980 } 2981 start = end; 2982 } 2983 if (!len) 2984 return 0; 2985 2986 fault: 2987 return -EFAULT; 2988 } 2989 EXPORT_SYMBOL(skb_store_bits); 2990 2991 /* Checksum skb data. */ 2992 __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, 2993 __wsum csum, const struct skb_checksum_ops *ops) 2994 { 2995 int start = skb_headlen(skb); 2996 int i, copy = start - offset; 2997 struct sk_buff *frag_iter; 2998 int pos = 0; 2999 3000 /* Checksum header. 
*/ 3001 if (copy > 0) { 3002 if (copy > len) 3003 copy = len; 3004 csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, 3005 skb->data + offset, copy, csum); 3006 if ((len -= copy) == 0) 3007 return csum; 3008 offset += copy; 3009 pos = copy; 3010 } 3011 3012 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3013 int end; 3014 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3015 3016 WARN_ON(start > offset + len); 3017 3018 end = start + skb_frag_size(frag); 3019 if ((copy = end - offset) > 0) { 3020 u32 p_off, p_len, copied; 3021 struct page *p; 3022 __wsum csum2; 3023 u8 *vaddr; 3024 3025 if (copy > len) 3026 copy = len; 3027 3028 skb_frag_foreach_page(frag, 3029 skb_frag_off(frag) + offset - start, 3030 copy, p, p_off, p_len, copied) { 3031 vaddr = kmap_atomic(p); 3032 csum2 = INDIRECT_CALL_1(ops->update, 3033 csum_partial_ext, 3034 vaddr + p_off, p_len, 0); 3035 kunmap_atomic(vaddr); 3036 csum = INDIRECT_CALL_1(ops->combine, 3037 csum_block_add_ext, csum, 3038 csum2, pos, p_len); 3039 pos += p_len; 3040 } 3041 3042 if (!(len -= copy)) 3043 return csum; 3044 offset += copy; 3045 } 3046 start = end; 3047 } 3048 3049 skb_walk_frags(skb, frag_iter) { 3050 int end; 3051 3052 WARN_ON(start > offset + len); 3053 3054 end = start + frag_iter->len; 3055 if ((copy = end - offset) > 0) { 3056 __wsum csum2; 3057 if (copy > len) 3058 copy = len; 3059 csum2 = __skb_checksum(frag_iter, offset - start, 3060 copy, 0, ops); 3061 csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, 3062 csum, csum2, pos, copy); 3063 if ((len -= copy) == 0) 3064 return csum; 3065 offset += copy; 3066 pos += copy; 3067 } 3068 start = end; 3069 } 3070 BUG_ON(len); 3071 3072 return csum; 3073 } 3074 EXPORT_SYMBOL(__skb_checksum); 3075 3076 __wsum skb_checksum(const struct sk_buff *skb, int offset, 3077 int len, __wsum csum) 3078 { 3079 const struct skb_checksum_ops ops = { 3080 .update = csum_partial_ext, 3081 .combine = csum_block_add_ext, 3082 }; 3083 3084 return __skb_checksum(skb, offset, len, csum, &ops); 3085 } 3086 EXPORT_SYMBOL(skb_checksum); 3087 3088 /* Both of above in one bottle. */ 3089 3090 __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, 3091 u8 *to, int len) 3092 { 3093 int start = skb_headlen(skb); 3094 int i, copy = start - offset; 3095 struct sk_buff *frag_iter; 3096 int pos = 0; 3097 __wsum csum = 0; 3098 3099 /* Copy header. 
*/ 3100 if (copy > 0) { 3101 if (copy > len) 3102 copy = len; 3103 csum = csum_partial_copy_nocheck(skb->data + offset, to, 3104 copy); 3105 if ((len -= copy) == 0) 3106 return csum; 3107 offset += copy; 3108 to += copy; 3109 pos = copy; 3110 } 3111 3112 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3113 int end; 3114 3115 WARN_ON(start > offset + len); 3116 3117 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 3118 if ((copy = end - offset) > 0) { 3119 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3120 u32 p_off, p_len, copied; 3121 struct page *p; 3122 __wsum csum2; 3123 u8 *vaddr; 3124 3125 if (copy > len) 3126 copy = len; 3127 3128 skb_frag_foreach_page(frag, 3129 skb_frag_off(frag) + offset - start, 3130 copy, p, p_off, p_len, copied) { 3131 vaddr = kmap_atomic(p); 3132 csum2 = csum_partial_copy_nocheck(vaddr + p_off, 3133 to + copied, 3134 p_len); 3135 kunmap_atomic(vaddr); 3136 csum = csum_block_add(csum, csum2, pos); 3137 pos += p_len; 3138 } 3139 3140 if (!(len -= copy)) 3141 return csum; 3142 offset += copy; 3143 to += copy; 3144 } 3145 start = end; 3146 } 3147 3148 skb_walk_frags(skb, frag_iter) { 3149 __wsum csum2; 3150 int end; 3151 3152 WARN_ON(start > offset + len); 3153 3154 end = start + frag_iter->len; 3155 if ((copy = end - offset) > 0) { 3156 if (copy > len) 3157 copy = len; 3158 csum2 = skb_copy_and_csum_bits(frag_iter, 3159 offset - start, 3160 to, copy); 3161 csum = csum_block_add(csum, csum2, pos); 3162 if ((len -= copy) == 0) 3163 return csum; 3164 offset += copy; 3165 to += copy; 3166 pos += copy; 3167 } 3168 start = end; 3169 } 3170 BUG_ON(len); 3171 return csum; 3172 } 3173 EXPORT_SYMBOL(skb_copy_and_csum_bits); 3174 3175 __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) 3176 { 3177 __sum16 sum; 3178 3179 sum = csum_fold(skb_checksum(skb, 0, len, skb->csum)); 3180 /* See comments in __skb_checksum_complete(). */ 3181 if (likely(!sum)) { 3182 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 3183 !skb->csum_complete_sw) 3184 netdev_rx_csum_fault(skb->dev, skb); 3185 } 3186 if (!skb_shared(skb)) 3187 skb->csum_valid = !sum; 3188 return sum; 3189 } 3190 EXPORT_SYMBOL(__skb_checksum_complete_head); 3191 3192 /* This function assumes skb->csum already holds pseudo header's checksum, 3193 * which has been changed from the hardware checksum, for example, by 3194 * __skb_checksum_validate_complete(). And, the original skb->csum must 3195 * have been validated unsuccessfully for CHECKSUM_COMPLETE case. 3196 * 3197 * It returns non-zero if the recomputed checksum is still invalid, otherwise 3198 * zero. The new checksum is stored back into skb->csum unless the skb is 3199 * shared. 3200 */ 3201 __sum16 __skb_checksum_complete(struct sk_buff *skb) 3202 { 3203 __wsum csum; 3204 __sum16 sum; 3205 3206 csum = skb_checksum(skb, 0, skb->len, 0); 3207 3208 sum = csum_fold(csum_add(skb->csum, csum)); 3209 /* This check is inverted, because we already knew the hardware 3210 * checksum is invalid before calling this function. So, if the 3211 * re-computed checksum is valid instead, then we have a mismatch 3212 * between the original skb->csum and skb_checksum(). This means either 3213 * the original hardware checksum is incorrect or we screw up skb->csum 3214 * when moving skb->data around. 
3215 */ 3216 if (likely(!sum)) { 3217 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 3218 !skb->csum_complete_sw) 3219 netdev_rx_csum_fault(skb->dev, skb); 3220 } 3221 3222 if (!skb_shared(skb)) { 3223 /* Save full packet checksum */ 3224 skb->csum = csum; 3225 skb->ip_summed = CHECKSUM_COMPLETE; 3226 skb->csum_complete_sw = 1; 3227 skb->csum_valid = !sum; 3228 } 3229 3230 return sum; 3231 } 3232 EXPORT_SYMBOL(__skb_checksum_complete); 3233 3234 static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) 3235 { 3236 net_warn_ratelimited( 3237 "%s: attempt to compute crc32c without libcrc32c.ko\n", 3238 __func__); 3239 return 0; 3240 } 3241 3242 static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, 3243 int offset, int len) 3244 { 3245 net_warn_ratelimited( 3246 "%s: attempt to compute crc32c without libcrc32c.ko\n", 3247 __func__); 3248 return 0; 3249 } 3250 3251 static const struct skb_checksum_ops default_crc32c_ops = { 3252 .update = warn_crc32c_csum_update, 3253 .combine = warn_crc32c_csum_combine, 3254 }; 3255 3256 const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = 3257 &default_crc32c_ops; 3258 EXPORT_SYMBOL(crc32c_csum_stub); 3259 3260 /** 3261 * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() 3262 * @from: source buffer 3263 * 3264 * Calculates the amount of linear headroom needed in the 'to' skb passed 3265 * into skb_zerocopy(). 3266 */ 3267 unsigned int 3268 skb_zerocopy_headlen(const struct sk_buff *from) 3269 { 3270 unsigned int hlen = 0; 3271 3272 if (!from->head_frag || 3273 skb_headlen(from) < L1_CACHE_BYTES || 3274 skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { 3275 hlen = skb_headlen(from); 3276 if (!hlen) 3277 hlen = from->len; 3278 } 3279 3280 if (skb_has_frag_list(from)) 3281 hlen = from->len; 3282 3283 return hlen; 3284 } 3285 EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); 3286 3287 /** 3288 * skb_zerocopy - Zero copy skb to skb 3289 * @to: destination buffer 3290 * @from: source buffer 3291 * @len: number of bytes to copy from source buffer 3292 * @hlen: size of linear headroom in destination buffer 3293 * 3294 * Copies up to `len` bytes from `from` to `to` by creating references 3295 * to the frags in the source buffer. 3296 * 3297 * The `hlen` as calculated by skb_zerocopy_headlen() specifies the 3298 * headroom in the `to` buffer. 
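 *
 * Hedged sketch pairing it with skb_zerocopy_headlen(); the allocation and
 * error handling are illustrative only:
 *
 *	hlen = skb_zerocopy_headlen(from);
 *	to = alloc_skb(hlen, GFP_ATOMIC);
 *	if (!to)
 *		return -ENOMEM;
 *	err = skb_zerocopy(to, from, from->len, hlen);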
3299 * 3300 * Return value: 3301 * 0: everything is OK 3302 * -ENOMEM: couldn't orphan frags of @from due to lack of memory 3303 * -EFAULT: skb_copy_bits() found some problem with skb geometry 3304 */ 3305 int 3306 skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) 3307 { 3308 int i, j = 0; 3309 int plen = 0; /* length of skb->head fragment */ 3310 int ret; 3311 struct page *page; 3312 unsigned int offset; 3313 3314 BUG_ON(!from->head_frag && !hlen); 3315 3316 /* dont bother with small payloads */ 3317 if (len <= skb_tailroom(to)) 3318 return skb_copy_bits(from, 0, skb_put(to, len), len); 3319 3320 if (hlen) { 3321 ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); 3322 if (unlikely(ret)) 3323 return ret; 3324 len -= hlen; 3325 } else { 3326 plen = min_t(int, skb_headlen(from), len); 3327 if (plen) { 3328 page = virt_to_head_page(from->head); 3329 offset = from->data - (unsigned char *)page_address(page); 3330 __skb_fill_page_desc(to, 0, page, offset, plen); 3331 get_page(page); 3332 j = 1; 3333 len -= plen; 3334 } 3335 } 3336 3337 skb_len_add(to, len + plen); 3338 3339 if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { 3340 skb_tx_error(from); 3341 return -ENOMEM; 3342 } 3343 skb_zerocopy_clone(to, from, GFP_ATOMIC); 3344 3345 for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { 3346 int size; 3347 3348 if (!len) 3349 break; 3350 skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; 3351 size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), 3352 len); 3353 skb_frag_size_set(&skb_shinfo(to)->frags[j], size); 3354 len -= size; 3355 skb_frag_ref(to, j); 3356 j++; 3357 } 3358 skb_shinfo(to)->nr_frags = j; 3359 3360 return 0; 3361 } 3362 EXPORT_SYMBOL_GPL(skb_zerocopy); 3363 3364 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) 3365 { 3366 __wsum csum; 3367 long csstart; 3368 3369 if (skb->ip_summed == CHECKSUM_PARTIAL) 3370 csstart = skb_checksum_start_offset(skb); 3371 else 3372 csstart = skb_headlen(skb); 3373 3374 BUG_ON(csstart > skb_headlen(skb)); 3375 3376 skb_copy_from_linear_data(skb, to, csstart); 3377 3378 csum = 0; 3379 if (csstart != skb->len) 3380 csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, 3381 skb->len - csstart); 3382 3383 if (skb->ip_summed == CHECKSUM_PARTIAL) { 3384 long csstuff = csstart + skb->csum_offset; 3385 3386 *((__sum16 *)(to + csstuff)) = csum_fold(csum); 3387 } 3388 } 3389 EXPORT_SYMBOL(skb_copy_and_csum_dev); 3390 3391 /** 3392 * skb_dequeue - remove from the head of the queue 3393 * @list: list to dequeue from 3394 * 3395 * Remove the head of the list. The list lock is taken so the function 3396 * may be used safely with other locking list functions. The head item is 3397 * returned or %NULL if the list is empty. 3398 */ 3399 3400 struct sk_buff *skb_dequeue(struct sk_buff_head *list) 3401 { 3402 unsigned long flags; 3403 struct sk_buff *result; 3404 3405 spin_lock_irqsave(&list->lock, flags); 3406 result = __skb_dequeue(list); 3407 spin_unlock_irqrestore(&list->lock, flags); 3408 return result; 3409 } 3410 EXPORT_SYMBOL(skb_dequeue); 3411 3412 /** 3413 * skb_dequeue_tail - remove from the tail of the queue 3414 * @list: list to dequeue from 3415 * 3416 * Remove the tail of the list. The list lock is taken so the function 3417 * may be used safely with other locking list functions. The tail item is 3418 * returned or %NULL if the list is empty. 
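 *
 * Hedged sketch: drop the most recently queued buffer once a queue exceeds
 * a caller-chosen @limit (an illustrative policy, not a kernel rule):
 *
 *	if (skb_queue_len(&queue) > limit) {
 *		struct sk_buff *drop = skb_dequeue_tail(&queue);
 *
 *		kfree_skb(drop);
 *	}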
3419 */
3420 struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
3421 {
3422 unsigned long flags;
3423 struct sk_buff *result;
3424 
3425 spin_lock_irqsave(&list->lock, flags);
3426 result = __skb_dequeue_tail(list);
3427 spin_unlock_irqrestore(&list->lock, flags);
3428 return result;
3429 }
3430 EXPORT_SYMBOL(skb_dequeue_tail);
3431 
3432 /**
3433 * skb_queue_purge - empty a list
3434 * @list: list to empty
3435 *
3436 * Delete all buffers on an &sk_buff list. Each buffer is removed from
3437 * the list and one reference dropped. This function takes the list
3438 * lock and is atomic with respect to other list locking functions.
3439 */
3440 void skb_queue_purge(struct sk_buff_head *list)
3441 {
3442 struct sk_buff *skb;
3443 while ((skb = skb_dequeue(list)) != NULL)
3444 kfree_skb(skb);
3445 }
3446 EXPORT_SYMBOL(skb_queue_purge);
3447 
3448 /**
3449 * skb_rbtree_purge - empty a skb rbtree
3450 * @root: root of the rbtree to empty
3451 * Return value: the sum of truesizes of all purged skbs.
3452 *
3453 * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
3454 * the rbtree and one reference dropped. This function does not take
3455 * any lock. Synchronization should be handled by the caller (e.g., TCP
3456 * out-of-order queue is protected by the socket lock).
3457 */
3458 unsigned int skb_rbtree_purge(struct rb_root *root)
3459 {
3460 struct rb_node *p = rb_first(root);
3461 unsigned int sum = 0;
3462 
3463 while (p) {
3464 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
3465 
3466 p = rb_next(p);
3467 rb_erase(&skb->rbnode, root);
3468 sum += skb->truesize;
3469 kfree_skb(skb);
3470 }
3471 return sum;
3472 }
3473 
3474 /**
3475 * skb_queue_head - queue a buffer at the list head
3476 * @list: list to use
3477 * @newsk: buffer to queue
3478 *
3479 * Queue a buffer at the start of the list. This function takes the
3480 * list lock and can be used safely with other locking &sk_buff
3481 * functions.
3482 *
3483 * A buffer cannot be placed on two lists at the same time.
3484 */
3485 void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
3486 {
3487 unsigned long flags;
3488 
3489 spin_lock_irqsave(&list->lock, flags);
3490 __skb_queue_head(list, newsk);
3491 spin_unlock_irqrestore(&list->lock, flags);
3492 }
3493 EXPORT_SYMBOL(skb_queue_head);
3494 
3495 /**
3496 * skb_queue_tail - queue a buffer at the list tail
3497 * @list: list to use
3498 * @newsk: buffer to queue
3499 *
3500 * Queue a buffer at the tail of the list. This function takes the
3501 * list lock and can be used safely with other locking &sk_buff
3502 * functions.
3503 *
3504 * A buffer cannot be placed on two lists at the same time.
3505 */
3506 void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
3507 {
3508 unsigned long flags;
3509 
3510 spin_lock_irqsave(&list->lock, flags);
3511 __skb_queue_tail(list, newsk);
3512 spin_unlock_irqrestore(&list->lock, flags);
3513 }
3514 EXPORT_SYMBOL(skb_queue_tail);
3515 
3516 /**
3517 * skb_unlink - remove a buffer from a list
3518 * @skb: buffer to remove
3519 * @list: list to use
3520 *
3521 * Remove a packet from a list. The list locks are taken and this
3522 * function is atomic with respect to other locked list calls.
3523 *
3524 * You must know what list the SKB is on.
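 *
 * Hedged sketch (the receive queue is only an example of a list the
 * buffer might be on):
 *
 *	skb_unlink(skb, &sk->sk_receive_queue);
 *	kfree_skb(skb);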
3525 */ 3526 void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) 3527 { 3528 unsigned long flags; 3529 3530 spin_lock_irqsave(&list->lock, flags); 3531 __skb_unlink(skb, list); 3532 spin_unlock_irqrestore(&list->lock, flags); 3533 } 3534 EXPORT_SYMBOL(skb_unlink); 3535 3536 /** 3537 * skb_append - append a buffer 3538 * @old: buffer to insert after 3539 * @newsk: buffer to insert 3540 * @list: list to use 3541 * 3542 * Place a packet after a given packet in a list. The list locks are taken 3543 * and this function is atomic with respect to other list locked calls. 3544 * A buffer cannot be placed on two lists at the same time. 3545 */ 3546 void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) 3547 { 3548 unsigned long flags; 3549 3550 spin_lock_irqsave(&list->lock, flags); 3551 __skb_queue_after(list, old, newsk); 3552 spin_unlock_irqrestore(&list->lock, flags); 3553 } 3554 EXPORT_SYMBOL(skb_append); 3555 3556 static inline void skb_split_inside_header(struct sk_buff *skb, 3557 struct sk_buff* skb1, 3558 const u32 len, const int pos) 3559 { 3560 int i; 3561 3562 skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), 3563 pos - len); 3564 /* And move data appendix as is. */ 3565 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 3566 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; 3567 3568 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; 3569 skb_shinfo(skb)->nr_frags = 0; 3570 skb1->data_len = skb->data_len; 3571 skb1->len += skb1->data_len; 3572 skb->data_len = 0; 3573 skb->len = len; 3574 skb_set_tail_pointer(skb, len); 3575 } 3576 3577 static inline void skb_split_no_header(struct sk_buff *skb, 3578 struct sk_buff* skb1, 3579 const u32 len, int pos) 3580 { 3581 int i, k = 0; 3582 const int nfrags = skb_shinfo(skb)->nr_frags; 3583 3584 skb_shinfo(skb)->nr_frags = 0; 3585 skb1->len = skb1->data_len = skb->len - len; 3586 skb->len = len; 3587 skb->data_len = len - pos; 3588 3589 for (i = 0; i < nfrags; i++) { 3590 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 3591 3592 if (pos + size > len) { 3593 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; 3594 3595 if (pos < len) { 3596 /* Split frag. 3597 * We have two variants in this case: 3598 * 1. Move all the frag to the second 3599 * part, if it is possible. F.e. 3600 * this approach is mandatory for TUX, 3601 * where splitting is expensive. 3602 * 2. Split is accurately. We make this. 3603 */ 3604 skb_frag_ref(skb, i); 3605 skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); 3606 skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); 3607 skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); 3608 skb_shinfo(skb)->nr_frags++; 3609 } 3610 k++; 3611 } else 3612 skb_shinfo(skb)->nr_frags++; 3613 pos += size; 3614 } 3615 skb_shinfo(skb1)->nr_frags = k; 3616 } 3617 3618 /** 3619 * skb_split - Split fragmented skb to two parts at length len. 3620 * @skb: the buffer to split 3621 * @skb1: the buffer to receive the second part 3622 * @len: new length for skb 3623 */ 3624 void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) 3625 { 3626 int pos = skb_headlen(skb); 3627 const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; 3628 3629 skb_zcopy_downgrade_managed(skb); 3630 3631 skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; 3632 skb_zerocopy_clone(skb1, skb, 0); 3633 if (len < pos) /* Split line is inside header. 
*/ 3634 skb_split_inside_header(skb, skb1, len, pos); 3635 else /* Second chunk has no header, nothing to copy. */ 3636 skb_split_no_header(skb, skb1, len, pos); 3637 } 3638 EXPORT_SYMBOL(skb_split); 3639 3640 /* Shifting from/to a cloned skb is a no-go. 3641 * 3642 * Caller cannot keep skb_shinfo related pointers past calling here! 3643 */ 3644 static int skb_prepare_for_shift(struct sk_buff *skb) 3645 { 3646 return skb_unclone_keeptruesize(skb, GFP_ATOMIC); 3647 } 3648 3649 /** 3650 * skb_shift - Shifts paged data partially from skb to another 3651 * @tgt: buffer into which tail data gets added 3652 * @skb: buffer from which the paged data comes from 3653 * @shiftlen: shift up to this many bytes 3654 * 3655 * Attempts to shift up to shiftlen worth of bytes, which may be less than 3656 * the length of the skb, from skb to tgt. Returns number bytes shifted. 3657 * It's up to caller to free skb if everything was shifted. 3658 * 3659 * If @tgt runs out of frags, the whole operation is aborted. 3660 * 3661 * Skb cannot include anything else but paged data while tgt is allowed 3662 * to have non-paged data as well. 3663 * 3664 * TODO: full sized shift could be optimized but that would need 3665 * specialized skb free'er to handle frags without up-to-date nr_frags. 3666 */ 3667 int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) 3668 { 3669 int from, to, merge, todo; 3670 skb_frag_t *fragfrom, *fragto; 3671 3672 BUG_ON(shiftlen > skb->len); 3673 3674 if (skb_headlen(skb)) 3675 return 0; 3676 if (skb_zcopy(tgt) || skb_zcopy(skb)) 3677 return 0; 3678 3679 todo = shiftlen; 3680 from = 0; 3681 to = skb_shinfo(tgt)->nr_frags; 3682 fragfrom = &skb_shinfo(skb)->frags[from]; 3683 3684 /* Actual merge is delayed until the point when we know we can 3685 * commit all, so that we don't have to undo partial changes 3686 */ 3687 if (!to || 3688 !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), 3689 skb_frag_off(fragfrom))) { 3690 merge = -1; 3691 } else { 3692 merge = to - 1; 3693 3694 todo -= skb_frag_size(fragfrom); 3695 if (todo < 0) { 3696 if (skb_prepare_for_shift(skb) || 3697 skb_prepare_for_shift(tgt)) 3698 return 0; 3699 3700 /* All previous frag pointers might be stale! 
*/ 3701 fragfrom = &skb_shinfo(skb)->frags[from]; 3702 fragto = &skb_shinfo(tgt)->frags[merge]; 3703 3704 skb_frag_size_add(fragto, shiftlen); 3705 skb_frag_size_sub(fragfrom, shiftlen); 3706 skb_frag_off_add(fragfrom, shiftlen); 3707 3708 goto onlymerged; 3709 } 3710 3711 from++; 3712 } 3713 3714 /* Skip full, not-fitting skb to avoid expensive operations */ 3715 if ((shiftlen == skb->len) && 3716 (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) 3717 return 0; 3718 3719 if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) 3720 return 0; 3721 3722 while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { 3723 if (to == MAX_SKB_FRAGS) 3724 return 0; 3725 3726 fragfrom = &skb_shinfo(skb)->frags[from]; 3727 fragto = &skb_shinfo(tgt)->frags[to]; 3728 3729 if (todo >= skb_frag_size(fragfrom)) { 3730 *fragto = *fragfrom; 3731 todo -= skb_frag_size(fragfrom); 3732 from++; 3733 to++; 3734 3735 } else { 3736 __skb_frag_ref(fragfrom); 3737 skb_frag_page_copy(fragto, fragfrom); 3738 skb_frag_off_copy(fragto, fragfrom); 3739 skb_frag_size_set(fragto, todo); 3740 3741 skb_frag_off_add(fragfrom, todo); 3742 skb_frag_size_sub(fragfrom, todo); 3743 todo = 0; 3744 3745 to++; 3746 break; 3747 } 3748 } 3749 3750 /* Ready to "commit" this state change to tgt */ 3751 skb_shinfo(tgt)->nr_frags = to; 3752 3753 if (merge >= 0) { 3754 fragfrom = &skb_shinfo(skb)->frags[0]; 3755 fragto = &skb_shinfo(tgt)->frags[merge]; 3756 3757 skb_frag_size_add(fragto, skb_frag_size(fragfrom)); 3758 __skb_frag_unref(fragfrom, skb->pp_recycle); 3759 } 3760 3761 /* Reposition in the original skb */ 3762 to = 0; 3763 while (from < skb_shinfo(skb)->nr_frags) 3764 skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; 3765 skb_shinfo(skb)->nr_frags = to; 3766 3767 BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); 3768 3769 onlymerged: 3770 /* Most likely the tgt won't ever need its checksum anymore, skb on 3771 * the other hand might need it if it needs to be resent 3772 */ 3773 tgt->ip_summed = CHECKSUM_PARTIAL; 3774 skb->ip_summed = CHECKSUM_PARTIAL; 3775 3776 skb_len_add(skb, -shiftlen); 3777 skb_len_add(tgt, shiftlen); 3778 3779 return shiftlen; 3780 } 3781 3782 /** 3783 * skb_prepare_seq_read - Prepare a sequential read of skb data 3784 * @skb: the buffer to read 3785 * @from: lower offset of data to be read 3786 * @to: upper offset of data to be read 3787 * @st: state variable 3788 * 3789 * Initializes the specified state variable. Must be called before 3790 * invoking skb_seq_read() for the first time. 3791 */ 3792 void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, 3793 unsigned int to, struct skb_seq_state *st) 3794 { 3795 st->lower_offset = from; 3796 st->upper_offset = to; 3797 st->root_skb = st->cur_skb = skb; 3798 st->frag_idx = st->stepped_offset = 0; 3799 st->frag_data = NULL; 3800 st->frag_off = 0; 3801 } 3802 EXPORT_SYMBOL(skb_prepare_seq_read); 3803 3804 /** 3805 * skb_seq_read - Sequentially read skb data 3806 * @consumed: number of bytes consumed by the caller so far 3807 * @data: destination pointer for data to be returned 3808 * @st: state variable 3809 * 3810 * Reads a block of skb data at @consumed relative to the 3811 * lower offset specified to skb_prepare_seq_read(). Assigns 3812 * the head of the data block to @data and returns the length 3813 * of the block or 0 if the end of the skb data or the upper 3814 * offset has been reached. 3815 * 3816 * The caller is not required to consume all of the data 3817 * returned, i.e. 
@consumed is typically set to the number 3818 * of bytes already consumed and the next call to 3819 * skb_seq_read() will return the remaining part of the block. 3820 * 3821 * Note 1: The size of each block of data returned can be arbitrary, 3822 * this limitation is the cost for zerocopy sequential 3823 * reads of potentially non linear data. 3824 * 3825 * Note 2: Fragment lists within fragments are not implemented 3826 * at the moment, state->root_skb could be replaced with 3827 * a stack for this purpose. 3828 */ 3829 unsigned int skb_seq_read(unsigned int consumed, const u8 **data, 3830 struct skb_seq_state *st) 3831 { 3832 unsigned int block_limit, abs_offset = consumed + st->lower_offset; 3833 skb_frag_t *frag; 3834 3835 if (unlikely(abs_offset >= st->upper_offset)) { 3836 if (st->frag_data) { 3837 kunmap_atomic(st->frag_data); 3838 st->frag_data = NULL; 3839 } 3840 return 0; 3841 } 3842 3843 next_skb: 3844 block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; 3845 3846 if (abs_offset < block_limit && !st->frag_data) { 3847 *data = st->cur_skb->data + (abs_offset - st->stepped_offset); 3848 return block_limit - abs_offset; 3849 } 3850 3851 if (st->frag_idx == 0 && !st->frag_data) 3852 st->stepped_offset += skb_headlen(st->cur_skb); 3853 3854 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { 3855 unsigned int pg_idx, pg_off, pg_sz; 3856 3857 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; 3858 3859 pg_idx = 0; 3860 pg_off = skb_frag_off(frag); 3861 pg_sz = skb_frag_size(frag); 3862 3863 if (skb_frag_must_loop(skb_frag_page(frag))) { 3864 pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; 3865 pg_off = offset_in_page(pg_off + st->frag_off); 3866 pg_sz = min_t(unsigned int, pg_sz - st->frag_off, 3867 PAGE_SIZE - pg_off); 3868 } 3869 3870 block_limit = pg_sz + st->stepped_offset; 3871 if (abs_offset < block_limit) { 3872 if (!st->frag_data) 3873 st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); 3874 3875 *data = (u8 *)st->frag_data + pg_off + 3876 (abs_offset - st->stepped_offset); 3877 3878 return block_limit - abs_offset; 3879 } 3880 3881 if (st->frag_data) { 3882 kunmap_atomic(st->frag_data); 3883 st->frag_data = NULL; 3884 } 3885 3886 st->stepped_offset += pg_sz; 3887 st->frag_off += pg_sz; 3888 if (st->frag_off == skb_frag_size(frag)) { 3889 st->frag_off = 0; 3890 st->frag_idx++; 3891 } 3892 } 3893 3894 if (st->frag_data) { 3895 kunmap_atomic(st->frag_data); 3896 st->frag_data = NULL; 3897 } 3898 3899 if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { 3900 st->cur_skb = skb_shinfo(st->root_skb)->frag_list; 3901 st->frag_idx = 0; 3902 goto next_skb; 3903 } else if (st->cur_skb->next) { 3904 st->cur_skb = st->cur_skb->next; 3905 st->frag_idx = 0; 3906 goto next_skb; 3907 } 3908 3909 return 0; 3910 } 3911 EXPORT_SYMBOL(skb_seq_read); 3912 3913 /** 3914 * skb_abort_seq_read - Abort a sequential read of skb data 3915 * @st: state variable 3916 * 3917 * Must be called if skb_seq_read() was not called until it 3918 * returned 0. 
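 *
 * A hedged sketch of the full sequence (the processing step is
 * illustrative):
 *
 *	struct skb_seq_state st;
 *	unsigned int consumed = 0, len;
 *	const u8 *data;
 *
 *	skb_prepare_seq_read(skb, 0, skb->len, &st);
 *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
 *		process(data, len);
 *		consumed += len;
 *	}
 *
 * Here process() stands in for whatever the caller does with each block;
 * skb_abort_seq_read() is only needed if the loop stops before
 * skb_seq_read() has returned 0.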
3919 */ 3920 void skb_abort_seq_read(struct skb_seq_state *st) 3921 { 3922 if (st->frag_data) 3923 kunmap_atomic(st->frag_data); 3924 } 3925 EXPORT_SYMBOL(skb_abort_seq_read); 3926 3927 #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) 3928 3929 static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, 3930 struct ts_config *conf, 3931 struct ts_state *state) 3932 { 3933 return skb_seq_read(offset, text, TS_SKB_CB(state)); 3934 } 3935 3936 static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) 3937 { 3938 skb_abort_seq_read(TS_SKB_CB(state)); 3939 } 3940 3941 /** 3942 * skb_find_text - Find a text pattern in skb data 3943 * @skb: the buffer to look in 3944 * @from: search offset 3945 * @to: search limit 3946 * @config: textsearch configuration 3947 * 3948 * Finds a pattern in the skb data according to the specified 3949 * textsearch configuration. Use textsearch_next() to retrieve 3950 * subsequent occurrences of the pattern. Returns the offset 3951 * to the first occurrence or UINT_MAX if no match was found. 3952 */ 3953 unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, 3954 unsigned int to, struct ts_config *config) 3955 { 3956 struct ts_state state; 3957 unsigned int ret; 3958 3959 BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); 3960 3961 config->get_next_block = skb_ts_get_next_block; 3962 config->finish = skb_ts_finish; 3963 3964 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); 3965 3966 ret = textsearch_find(config, &state); 3967 return (ret <= to - from ? ret : UINT_MAX); 3968 } 3969 EXPORT_SYMBOL(skb_find_text); 3970 3971 int skb_append_pagefrags(struct sk_buff *skb, struct page *page, 3972 int offset, size_t size) 3973 { 3974 int i = skb_shinfo(skb)->nr_frags; 3975 3976 if (skb_can_coalesce(skb, i, page, offset)) { 3977 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); 3978 } else if (i < MAX_SKB_FRAGS) { 3979 skb_zcopy_downgrade_managed(skb); 3980 get_page(page); 3981 skb_fill_page_desc(skb, i, page, offset, size); 3982 } else { 3983 return -EMSGSIZE; 3984 } 3985 3986 return 0; 3987 } 3988 EXPORT_SYMBOL_GPL(skb_append_pagefrags); 3989 3990 /** 3991 * skb_pull_rcsum - pull skb and update receive checksum 3992 * @skb: buffer to update 3993 * @len: length of data pulled 3994 * 3995 * This function performs an skb_pull on the packet and updates 3996 * the CHECKSUM_COMPLETE checksum. It should be used on 3997 * receive path processing instead of skb_pull unless you know 3998 * that the checksum difference is zero (e.g., a valid IP header) 3999 * or you are setting ip_summed to CHECKSUM_NONE. 
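 *
 * Hedged sketch (pulling an encapsulation header of @hdr_len bytes; the
 * name is illustrative):
 *
 *	if (!pskb_may_pull(skb, hdr_len))
 *		goto drop;
 *	skb_pull_rcsum(skb, hdr_len);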
4000 */ 4001 void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) 4002 { 4003 unsigned char *data = skb->data; 4004 4005 BUG_ON(len > skb->len); 4006 __skb_pull(skb, len); 4007 skb_postpull_rcsum(skb, data, len); 4008 return skb->data; 4009 } 4010 EXPORT_SYMBOL_GPL(skb_pull_rcsum); 4011 4012 static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) 4013 { 4014 skb_frag_t head_frag; 4015 struct page *page; 4016 4017 page = virt_to_head_page(frag_skb->head); 4018 __skb_frag_set_page(&head_frag, page); 4019 skb_frag_off_set(&head_frag, frag_skb->data - 4020 (unsigned char *)page_address(page)); 4021 skb_frag_size_set(&head_frag, skb_headlen(frag_skb)); 4022 return head_frag; 4023 } 4024 4025 struct sk_buff *skb_segment_list(struct sk_buff *skb, 4026 netdev_features_t features, 4027 unsigned int offset) 4028 { 4029 struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; 4030 unsigned int tnl_hlen = skb_tnl_header_len(skb); 4031 unsigned int delta_truesize = 0; 4032 unsigned int delta_len = 0; 4033 struct sk_buff *tail = NULL; 4034 struct sk_buff *nskb, *tmp; 4035 int len_diff, err; 4036 4037 skb_push(skb, -skb_network_offset(skb) + offset); 4038 4039 skb_shinfo(skb)->frag_list = NULL; 4040 4041 do { 4042 nskb = list_skb; 4043 list_skb = list_skb->next; 4044 4045 err = 0; 4046 delta_truesize += nskb->truesize; 4047 if (skb_shared(nskb)) { 4048 tmp = skb_clone(nskb, GFP_ATOMIC); 4049 if (tmp) { 4050 consume_skb(nskb); 4051 nskb = tmp; 4052 err = skb_unclone(nskb, GFP_ATOMIC); 4053 } else { 4054 err = -ENOMEM; 4055 } 4056 } 4057 4058 if (!tail) 4059 skb->next = nskb; 4060 else 4061 tail->next = nskb; 4062 4063 if (unlikely(err)) { 4064 nskb->next = list_skb; 4065 goto err_linearize; 4066 } 4067 4068 tail = nskb; 4069 4070 delta_len += nskb->len; 4071 4072 skb_push(nskb, -skb_network_offset(nskb) + offset); 4073 4074 skb_release_head_state(nskb); 4075 len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); 4076 __copy_skb_header(nskb, skb); 4077 4078 skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); 4079 nskb->transport_header += len_diff; 4080 skb_copy_from_linear_data_offset(skb, -tnl_hlen, 4081 nskb->data - tnl_hlen, 4082 offset + tnl_hlen); 4083 4084 if (skb_needs_linearize(nskb, features) && 4085 __skb_linearize(nskb)) 4086 goto err_linearize; 4087 4088 } while (list_skb); 4089 4090 skb->truesize = skb->truesize - delta_truesize; 4091 skb->data_len = skb->data_len - delta_len; 4092 skb->len = skb->len - delta_len; 4093 4094 skb_gso_reset(skb); 4095 4096 skb->prev = tail; 4097 4098 if (skb_needs_linearize(skb, features) && 4099 __skb_linearize(skb)) 4100 goto err_linearize; 4101 4102 skb_get(skb); 4103 4104 return skb; 4105 4106 err_linearize: 4107 kfree_skb_list(skb->next); 4108 skb->next = NULL; 4109 return ERR_PTR(-ENOMEM); 4110 } 4111 EXPORT_SYMBOL_GPL(skb_segment_list); 4112 4113 /** 4114 * skb_segment - Perform protocol segmentation on skb. 4115 * @head_skb: buffer to segment 4116 * @features: features for the output path (see dev->features) 4117 * 4118 * This function performs segmentation on the given skb. It returns 4119 * a pointer to the first in a list of new skbs for the segments. 4120 * In case of error it returns ERR_PTR(err). 
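 *
 * Hedged sketch of walking the returned segment list (the transmit step is
 * illustrative):
 *
 *	struct sk_buff *segs, *seg, *next;
 *
 *	segs = skb_segment(skb, features);
 *	if (IS_ERR(segs))
 *		return PTR_ERR(segs);
 *
 *	skb_list_walk_safe(segs, seg, next) {
 *		skb_mark_not_on_list(seg);
 *		xmit_one(seg);
 *	}
 *
 * xmit_one() above is a placeholder for handing each segment to the lower
 * layer.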
4121 */ 4122 struct sk_buff *skb_segment(struct sk_buff *head_skb, 4123 netdev_features_t features) 4124 { 4125 struct sk_buff *segs = NULL; 4126 struct sk_buff *tail = NULL; 4127 struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; 4128 skb_frag_t *frag = skb_shinfo(head_skb)->frags; 4129 unsigned int mss = skb_shinfo(head_skb)->gso_size; 4130 unsigned int doffset = head_skb->data - skb_mac_header(head_skb); 4131 struct sk_buff *frag_skb = head_skb; 4132 unsigned int offset = doffset; 4133 unsigned int tnl_hlen = skb_tnl_header_len(head_skb); 4134 unsigned int partial_segs = 0; 4135 unsigned int headroom; 4136 unsigned int len = head_skb->len; 4137 __be16 proto; 4138 bool csum, sg; 4139 int nfrags = skb_shinfo(head_skb)->nr_frags; 4140 int err = -ENOMEM; 4141 int i = 0; 4142 int pos; 4143 4144 if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) && 4145 (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) { 4146 /* gso_size is untrusted, and we have a frag_list with a linear 4147 * non head_frag head. 4148 * 4149 * (we assume checking the first list_skb member suffices; 4150 * i.e if either of the list_skb members have non head_frag 4151 * head, then the first one has too). 4152 * 4153 * If head_skb's headlen does not fit requested gso_size, it 4154 * means that the frag_list members do NOT terminate on exact 4155 * gso_size boundaries. Hence we cannot perform skb_frag_t page 4156 * sharing. Therefore we must fallback to copying the frag_list 4157 * skbs; we do so by disabling SG. 4158 */ 4159 if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) 4160 features &= ~NETIF_F_SG; 4161 } 4162 4163 __skb_push(head_skb, doffset); 4164 proto = skb_network_protocol(head_skb, NULL); 4165 if (unlikely(!proto)) 4166 return ERR_PTR(-EINVAL); 4167 4168 sg = !!(features & NETIF_F_SG); 4169 csum = !!can_checksum_protocol(features, proto); 4170 4171 if (sg && csum && (mss != GSO_BY_FRAGS)) { 4172 if (!(features & NETIF_F_GSO_PARTIAL)) { 4173 struct sk_buff *iter; 4174 unsigned int frag_len; 4175 4176 if (!list_skb || 4177 !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) 4178 goto normal; 4179 4180 /* If we get here then all the required 4181 * GSO features except frag_list are supported. 4182 * Try to split the SKB to multiple GSO SKBs 4183 * with no frag_list. 4184 * Currently we can do that only when the buffers don't 4185 * have a linear part and all the buffers except 4186 * the last are of the same length. 4187 */ 4188 frag_len = list_skb->len; 4189 skb_walk_frags(head_skb, iter) { 4190 if (frag_len != iter->len && iter->next) 4191 goto normal; 4192 if (skb_headlen(iter) && !iter->head_frag) 4193 goto normal; 4194 4195 len -= iter->len; 4196 } 4197 4198 if (len != frag_len) 4199 goto normal; 4200 } 4201 4202 /* GSO partial only requires that we trim off any excess that 4203 * doesn't fit into an MSS sized block, so take care of that 4204 * now. 
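 * (Worked example, for illustration: with len == 65160 and mss == 1448,
 * partial_segs below is 45 and the mss used by the segmentation loop
 * becomes 65160; the per-segment gso_size/gso_segs are restored in the
 * "if (partial_segs)" block near the end of this function. When
 * len < 2 * mss, partial_segs is reset to 0 and nothing changes.)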
4205 */ 4206 partial_segs = len / mss; 4207 if (partial_segs > 1) 4208 mss *= partial_segs; 4209 else 4210 partial_segs = 0; 4211 } 4212 4213 normal: 4214 headroom = skb_headroom(head_skb); 4215 pos = skb_headlen(head_skb); 4216 4217 do { 4218 struct sk_buff *nskb; 4219 skb_frag_t *nskb_frag; 4220 int hsize; 4221 int size; 4222 4223 if (unlikely(mss == GSO_BY_FRAGS)) { 4224 len = list_skb->len; 4225 } else { 4226 len = head_skb->len - offset; 4227 if (len > mss) 4228 len = mss; 4229 } 4230 4231 hsize = skb_headlen(head_skb) - offset; 4232 4233 if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && 4234 (skb_headlen(list_skb) == len || sg)) { 4235 BUG_ON(skb_headlen(list_skb) > len); 4236 4237 i = 0; 4238 nfrags = skb_shinfo(list_skb)->nr_frags; 4239 frag = skb_shinfo(list_skb)->frags; 4240 frag_skb = list_skb; 4241 pos += skb_headlen(list_skb); 4242 4243 while (pos < offset + len) { 4244 BUG_ON(i >= nfrags); 4245 4246 size = skb_frag_size(frag); 4247 if (pos + size > offset + len) 4248 break; 4249 4250 i++; 4251 pos += size; 4252 frag++; 4253 } 4254 4255 nskb = skb_clone(list_skb, GFP_ATOMIC); 4256 list_skb = list_skb->next; 4257 4258 if (unlikely(!nskb)) 4259 goto err; 4260 4261 if (unlikely(pskb_trim(nskb, len))) { 4262 kfree_skb(nskb); 4263 goto err; 4264 } 4265 4266 hsize = skb_end_offset(nskb); 4267 if (skb_cow_head(nskb, doffset + headroom)) { 4268 kfree_skb(nskb); 4269 goto err; 4270 } 4271 4272 nskb->truesize += skb_end_offset(nskb) - hsize; 4273 skb_release_head_state(nskb); 4274 __skb_push(nskb, doffset); 4275 } else { 4276 if (hsize < 0) 4277 hsize = 0; 4278 if (hsize > len || !sg) 4279 hsize = len; 4280 4281 nskb = __alloc_skb(hsize + doffset + headroom, 4282 GFP_ATOMIC, skb_alloc_rx_flag(head_skb), 4283 NUMA_NO_NODE); 4284 4285 if (unlikely(!nskb)) 4286 goto err; 4287 4288 skb_reserve(nskb, headroom); 4289 __skb_put(nskb, doffset); 4290 } 4291 4292 if (segs) 4293 tail->next = nskb; 4294 else 4295 segs = nskb; 4296 tail = nskb; 4297 4298 __copy_skb_header(nskb, head_skb); 4299 4300 skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); 4301 skb_reset_mac_len(nskb); 4302 4303 skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, 4304 nskb->data - tnl_hlen, 4305 doffset + tnl_hlen); 4306 4307 if (nskb->len == len + doffset) 4308 goto perform_csum_check; 4309 4310 if (!sg) { 4311 if (!csum) { 4312 if (!nskb->remcsum_offload) 4313 nskb->ip_summed = CHECKSUM_NONE; 4314 SKB_GSO_CB(nskb)->csum = 4315 skb_copy_and_csum_bits(head_skb, offset, 4316 skb_put(nskb, 4317 len), 4318 len); 4319 SKB_GSO_CB(nskb)->csum_start = 4320 skb_headroom(nskb) + doffset; 4321 } else { 4322 if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) 4323 goto err; 4324 } 4325 continue; 4326 } 4327 4328 nskb_frag = skb_shinfo(nskb)->frags; 4329 4330 skb_copy_from_linear_data_offset(head_skb, offset, 4331 skb_put(nskb, hsize), hsize); 4332 4333 skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & 4334 SKBFL_SHARED_FRAG; 4335 4336 if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || 4337 skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) 4338 goto err; 4339 4340 while (pos < offset + len) { 4341 if (i >= nfrags) { 4342 i = 0; 4343 nfrags = skb_shinfo(list_skb)->nr_frags; 4344 frag = skb_shinfo(list_skb)->frags; 4345 frag_skb = list_skb; 4346 if (!skb_headlen(list_skb)) { 4347 BUG_ON(!nfrags); 4348 } else { 4349 BUG_ON(!list_skb->head_frag); 4350 4351 /* to make room for head_frag. 
*/ 4352 i--; 4353 frag--; 4354 } 4355 if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || 4356 skb_zerocopy_clone(nskb, frag_skb, 4357 GFP_ATOMIC)) 4358 goto err; 4359 4360 list_skb = list_skb->next; 4361 } 4362 4363 if (unlikely(skb_shinfo(nskb)->nr_frags >= 4364 MAX_SKB_FRAGS)) { 4365 net_warn_ratelimited( 4366 "skb_segment: too many frags: %u %u\n", 4367 pos, mss); 4368 err = -EINVAL; 4369 goto err; 4370 } 4371 4372 *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; 4373 __skb_frag_ref(nskb_frag); 4374 size = skb_frag_size(nskb_frag); 4375 4376 if (pos < offset) { 4377 skb_frag_off_add(nskb_frag, offset - pos); 4378 skb_frag_size_sub(nskb_frag, offset - pos); 4379 } 4380 4381 skb_shinfo(nskb)->nr_frags++; 4382 4383 if (pos + size <= offset + len) { 4384 i++; 4385 frag++; 4386 pos += size; 4387 } else { 4388 skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); 4389 goto skip_fraglist; 4390 } 4391 4392 nskb_frag++; 4393 } 4394 4395 skip_fraglist: 4396 nskb->data_len = len - hsize; 4397 nskb->len += nskb->data_len; 4398 nskb->truesize += nskb->data_len; 4399 4400 perform_csum_check: 4401 if (!csum) { 4402 if (skb_has_shared_frag(nskb) && 4403 __skb_linearize(nskb)) 4404 goto err; 4405 4406 if (!nskb->remcsum_offload) 4407 nskb->ip_summed = CHECKSUM_NONE; 4408 SKB_GSO_CB(nskb)->csum = 4409 skb_checksum(nskb, doffset, 4410 nskb->len - doffset, 0); 4411 SKB_GSO_CB(nskb)->csum_start = 4412 skb_headroom(nskb) + doffset; 4413 } 4414 } while ((offset += len) < head_skb->len); 4415 4416 /* Some callers want to get the end of the list. 4417 * Put it in segs->prev to avoid walking the list. 4418 * (see validate_xmit_skb_list() for example) 4419 */ 4420 segs->prev = tail; 4421 4422 if (partial_segs) { 4423 struct sk_buff *iter; 4424 int type = skb_shinfo(head_skb)->gso_type; 4425 unsigned short gso_size = skb_shinfo(head_skb)->gso_size; 4426 4427 /* Update type to add partial and then remove dodgy if set */ 4428 type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; 4429 type &= ~SKB_GSO_DODGY; 4430 4431 /* Update GSO info and prepare to start updating headers on 4432 * our way back down the stack of protocols. 4433 */ 4434 for (iter = segs; iter; iter = iter->next) { 4435 skb_shinfo(iter)->gso_size = gso_size; 4436 skb_shinfo(iter)->gso_segs = partial_segs; 4437 skb_shinfo(iter)->gso_type = type; 4438 SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; 4439 } 4440 4441 if (tail->len - doffset <= gso_size) 4442 skb_shinfo(tail)->gso_size = 0; 4443 else if (tail != segs) 4444 skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); 4445 } 4446 4447 /* Following permits correct backpressure, for protocols 4448 * using skb_set_owner_w(). 4449 * Idea is to tranfert ownership from head_skb to last segment. 
4450 */ 4451 if (head_skb->destructor == sock_wfree) { 4452 swap(tail->truesize, head_skb->truesize); 4453 swap(tail->destructor, head_skb->destructor); 4454 swap(tail->sk, head_skb->sk); 4455 } 4456 return segs; 4457 4458 err: 4459 kfree_skb_list(segs); 4460 return ERR_PTR(err); 4461 } 4462 EXPORT_SYMBOL_GPL(skb_segment); 4463 4464 #ifdef CONFIG_SKB_EXTENSIONS 4465 #define SKB_EXT_ALIGN_VALUE 8 4466 #define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) 4467 4468 static const u8 skb_ext_type_len[] = { 4469 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 4470 [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), 4471 #endif 4472 #ifdef CONFIG_XFRM 4473 [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), 4474 #endif 4475 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) 4476 [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), 4477 #endif 4478 #if IS_ENABLED(CONFIG_MPTCP) 4479 [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), 4480 #endif 4481 #if IS_ENABLED(CONFIG_MCTP_FLOWS) 4482 [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), 4483 #endif 4484 }; 4485 4486 static __always_inline unsigned int skb_ext_total_length(void) 4487 { 4488 return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + 4489 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 4490 skb_ext_type_len[SKB_EXT_BRIDGE_NF] + 4491 #endif 4492 #ifdef CONFIG_XFRM 4493 skb_ext_type_len[SKB_EXT_SEC_PATH] + 4494 #endif 4495 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) 4496 skb_ext_type_len[TC_SKB_EXT] + 4497 #endif 4498 #if IS_ENABLED(CONFIG_MPTCP) 4499 skb_ext_type_len[SKB_EXT_MPTCP] + 4500 #endif 4501 #if IS_ENABLED(CONFIG_MCTP_FLOWS) 4502 skb_ext_type_len[SKB_EXT_MCTP] + 4503 #endif 4504 0; 4505 } 4506 4507 static void skb_extensions_init(void) 4508 { 4509 BUILD_BUG_ON(SKB_EXT_NUM >= 8); 4510 BUILD_BUG_ON(skb_ext_total_length() > 255); 4511 4512 skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", 4513 SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), 4514 0, 4515 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4516 NULL); 4517 } 4518 #else 4519 static void skb_extensions_init(void) {} 4520 #endif 4521 4522 void __init skb_init(void) 4523 { 4524 skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", 4525 sizeof(struct sk_buff), 4526 0, 4527 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4528 offsetof(struct sk_buff, cb), 4529 sizeof_field(struct sk_buff, cb), 4530 NULL); 4531 skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", 4532 sizeof(struct sk_buff_fclones), 4533 0, 4534 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 4535 NULL); 4536 skb_extensions_init(); 4537 } 4538 4539 static int 4540 __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, 4541 unsigned int recursion_level) 4542 { 4543 int start = skb_headlen(skb); 4544 int i, copy = start - offset; 4545 struct sk_buff *frag_iter; 4546 int elt = 0; 4547 4548 if (unlikely(recursion_level >= 24)) 4549 return -EMSGSIZE; 4550 4551 if (copy > 0) { 4552 if (copy > len) 4553 copy = len; 4554 sg_set_buf(sg, skb->data + offset, copy); 4555 elt++; 4556 if ((len -= copy) == 0) 4557 return elt; 4558 offset += copy; 4559 } 4560 4561 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 4562 int end; 4563 4564 WARN_ON(start > offset + len); 4565 4566 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 4567 if ((copy = end - offset) > 0) { 4568 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 4569 if (unlikely(elt && sg_is_last(&sg[elt - 1]))) 4570 return -EMSGSIZE; 4571 4572 if (copy > len) 4573 copy = len; 4574 sg_set_page(&sg[elt], skb_frag_page(frag), copy, 4575 
skb_frag_off(frag) + offset - start); 4576 elt++; 4577 if (!(len -= copy)) 4578 return elt; 4579 offset += copy; 4580 } 4581 start = end; 4582 } 4583 4584 skb_walk_frags(skb, frag_iter) { 4585 int end, ret; 4586 4587 WARN_ON(start > offset + len); 4588 4589 end = start + frag_iter->len; 4590 if ((copy = end - offset) > 0) { 4591 if (unlikely(elt && sg_is_last(&sg[elt - 1]))) 4592 return -EMSGSIZE; 4593 4594 if (copy > len) 4595 copy = len; 4596 ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, 4597 copy, recursion_level + 1); 4598 if (unlikely(ret < 0)) 4599 return ret; 4600 elt += ret; 4601 if ((len -= copy) == 0) 4602 return elt; 4603 offset += copy; 4604 } 4605 start = end; 4606 } 4607 BUG_ON(len); 4608 return elt; 4609 } 4610 4611 /** 4612 * skb_to_sgvec - Fill a scatter-gather list from a socket buffer 4613 * @skb: Socket buffer containing the buffers to be mapped 4614 * @sg: The scatter-gather list to map into 4615 * @offset: The offset into the buffer's contents to start mapping 4616 * @len: Length of buffer space to be mapped 4617 * 4618 * Fill the specified scatter-gather list with mappings/pointers into a 4619 * region of the buffer space attached to a socket buffer. Returns either 4620 * the number of scatterlist items used, or -EMSGSIZE if the contents 4621 * could not fit. 4622 */ 4623 int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) 4624 { 4625 int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); 4626 4627 if (nsg <= 0) 4628 return nsg; 4629 4630 sg_mark_end(&sg[nsg - 1]); 4631 4632 return nsg; 4633 } 4634 EXPORT_SYMBOL_GPL(skb_to_sgvec); 4635 4636 /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to 4637 * the given sglist without marking the sg which contains the last skb data as 4638 * the end. So the caller can manipulate the sg list at will when padding new 4639 * data after the first call, without calling sg_unmark_end to expand the sg list. 4640 * 4641 * Scenario to use skb_to_sgvec_nomark: 4642 * 1. sg_init_table 4643 * 2. skb_to_sgvec_nomark(payload1) 4644 * 3. skb_to_sgvec_nomark(payload2) 4645 * 4646 * This is equivalent to: 4647 * 1. sg_init_table 4648 * 2. skb_to_sgvec(payload1) 4649 * 3. sg_unmark_end 4650 * 4. skb_to_sgvec(payload2) 4651 * 4652 * When conditionally mapping multiple payloads, skb_to_sgvec_nomark 4653 * is preferable. 4654 */ 4655 int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, 4656 int offset, int len) 4657 { 4658 return __skb_to_sgvec(skb, sg, offset, len, 0); 4659 } 4660 EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); 4661 4662 4663 4664 /** 4665 * skb_cow_data - Check that a socket buffer's data buffers are writable 4666 * @skb: The socket buffer to check. 4667 * @tailbits: Amount of trailing space to be added 4668 * @trailer: Returned pointer to the skb where the @tailbits space begins 4669 * 4670 * Make sure that the data buffers attached to a socket buffer are 4671 * writable. If they are not, private copies are made of the data buffers 4672 * and the socket buffer is set to use these instead. 4673 * 4674 * If @tailbits is given, make sure that there is space to write @tailbits 4675 * bytes of data beyond current end of socket buffer. @trailer will be 4676 * set to point to the skb in which this space begins. 4677 * 4678 * The number of scatterlist elements required to completely map the 4679 * COW'd and extended socket buffer will be returned.
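 *
 * A hedged usage sketch of the usual pairing with skb_to_sgvec() on an
 * IPsec-like transmit path (padlen and the scatterlist sizing are
 * placeholders; error handling is trimmed):
 *
 *	struct sk_buff *trailer;
 *	int nfrags;
 *
 *	nfrags = skb_cow_data(skb, padlen, &trailer);
 *	if (nfrags < 0)
 *		return nfrags;
 *	pskb_put(skb, trailer, padlen);
 *	if (skb_to_sgvec(skb, sg, 0, skb->len) < 0)
 *		return -EMSGSIZE;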
4680 */ 4681 int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) 4682 { 4683 int copyflag; 4684 int elt; 4685 struct sk_buff *skb1, **skb_p; 4686 4687 /* If skb is cloned or its head is paged, reallocate 4688 * head pulling out all the pages (pages are considered not writable 4689 * at the moment even if they are anonymous). 4690 */ 4691 if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && 4692 !__pskb_pull_tail(skb, __skb_pagelen(skb))) 4693 return -ENOMEM; 4694 4695 /* Easy case. Most of packets will go this way. */ 4696 if (!skb_has_frag_list(skb)) { 4697 /* A little of trouble, not enough of space for trailer. 4698 * This should not happen, when stack is tuned to generate 4699 * good frames. OK, on miss we reallocate and reserve even more 4700 * space, 128 bytes is fair. */ 4701 4702 if (skb_tailroom(skb) < tailbits && 4703 pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) 4704 return -ENOMEM; 4705 4706 /* Voila! */ 4707 *trailer = skb; 4708 return 1; 4709 } 4710 4711 /* Misery. We are in troubles, going to mincer fragments... */ 4712 4713 elt = 1; 4714 skb_p = &skb_shinfo(skb)->frag_list; 4715 copyflag = 0; 4716 4717 while ((skb1 = *skb_p) != NULL) { 4718 int ntail = 0; 4719 4720 /* The fragment is partially pulled by someone, 4721 * this can happen on input. Copy it and everything 4722 * after it. */ 4723 4724 if (skb_shared(skb1)) 4725 copyflag = 1; 4726 4727 /* If the skb is the last, worry about trailer. */ 4728 4729 if (skb1->next == NULL && tailbits) { 4730 if (skb_shinfo(skb1)->nr_frags || 4731 skb_has_frag_list(skb1) || 4732 skb_tailroom(skb1) < tailbits) 4733 ntail = tailbits + 128; 4734 } 4735 4736 if (copyflag || 4737 skb_cloned(skb1) || 4738 ntail || 4739 skb_shinfo(skb1)->nr_frags || 4740 skb_has_frag_list(skb1)) { 4741 struct sk_buff *skb2; 4742 4743 /* Fuck, we are miserable poor guys... */ 4744 if (ntail == 0) 4745 skb2 = skb_copy(skb1, GFP_ATOMIC); 4746 else 4747 skb2 = skb_copy_expand(skb1, 4748 skb_headroom(skb1), 4749 ntail, 4750 GFP_ATOMIC); 4751 if (unlikely(skb2 == NULL)) 4752 return -ENOMEM; 4753 4754 if (skb1->sk) 4755 skb_set_owner_w(skb2, skb1->sk); 4756 4757 /* Looking around. Are we still alive? 4758 * OK, link new skb, drop old one */ 4759 4760 skb2->next = skb1->next; 4761 *skb_p = skb2; 4762 kfree_skb(skb1); 4763 skb1 = skb2; 4764 } 4765 elt++; 4766 *trailer = skb1; 4767 skb_p = &skb1->next; 4768 } 4769 4770 return elt; 4771 } 4772 EXPORT_SYMBOL_GPL(skb_cow_data); 4773 4774 static void sock_rmem_free(struct sk_buff *skb) 4775 { 4776 struct sock *sk = skb->sk; 4777 4778 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 4779 } 4780 4781 static void skb_set_err_queue(struct sk_buff *skb) 4782 { 4783 /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. 4784 * So, it is safe to (mis)use it to mark skbs on the error queue. 
4785 */ 4786 skb->pkt_type = PACKET_OUTGOING; 4787 BUILD_BUG_ON(PACKET_OUTGOING == 0); 4788 } 4789 4790 /* 4791 * Note: We dont mem charge error packets (no sk_forward_alloc changes) 4792 */ 4793 int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) 4794 { 4795 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= 4796 (unsigned int)READ_ONCE(sk->sk_rcvbuf)) 4797 return -ENOMEM; 4798 4799 skb_orphan(skb); 4800 skb->sk = sk; 4801 skb->destructor = sock_rmem_free; 4802 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 4803 skb_set_err_queue(skb); 4804 4805 /* before exiting rcu section, make sure dst is refcounted */ 4806 skb_dst_force(skb); 4807 4808 skb_queue_tail(&sk->sk_error_queue, skb); 4809 if (!sock_flag(sk, SOCK_DEAD)) 4810 sk_error_report(sk); 4811 return 0; 4812 } 4813 EXPORT_SYMBOL(sock_queue_err_skb); 4814 4815 static bool is_icmp_err_skb(const struct sk_buff *skb) 4816 { 4817 return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || 4818 SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); 4819 } 4820 4821 struct sk_buff *sock_dequeue_err_skb(struct sock *sk) 4822 { 4823 struct sk_buff_head *q = &sk->sk_error_queue; 4824 struct sk_buff *skb, *skb_next = NULL; 4825 bool icmp_next = false; 4826 unsigned long flags; 4827 4828 spin_lock_irqsave(&q->lock, flags); 4829 skb = __skb_dequeue(q); 4830 if (skb && (skb_next = skb_peek(q))) { 4831 icmp_next = is_icmp_err_skb(skb_next); 4832 if (icmp_next) 4833 sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; 4834 } 4835 spin_unlock_irqrestore(&q->lock, flags); 4836 4837 if (is_icmp_err_skb(skb) && !icmp_next) 4838 sk->sk_err = 0; 4839 4840 if (skb_next) 4841 sk_error_report(sk); 4842 4843 return skb; 4844 } 4845 EXPORT_SYMBOL(sock_dequeue_err_skb); 4846 4847 /** 4848 * skb_clone_sk - create clone of skb, and take reference to socket 4849 * @skb: the skb to clone 4850 * 4851 * This function creates a clone of a buffer that holds a reference on 4852 * sk_refcnt. Buffers created via this function are meant to be 4853 * returned using sock_queue_err_skb, or free via kfree_skb. 4854 * 4855 * When passing buffers allocated with this function to sock_queue_err_skb 4856 * it is necessary to wrap the call with sock_hold/sock_put in order to 4857 * prevent the socket from being released prior to being enqueued on 4858 * the sk_error_queue. 4859 */ 4860 struct sk_buff *skb_clone_sk(struct sk_buff *skb) 4861 { 4862 struct sock *sk = skb->sk; 4863 struct sk_buff *clone; 4864 4865 if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) 4866 return NULL; 4867 4868 clone = skb_clone(skb, GFP_ATOMIC); 4869 if (!clone) { 4870 sock_put(sk); 4871 return NULL; 4872 } 4873 4874 clone->sk = sk; 4875 clone->destructor = sock_efree; 4876 4877 return clone; 4878 } 4879 EXPORT_SYMBOL(skb_clone_sk); 4880 4881 static void __skb_complete_tx_timestamp(struct sk_buff *skb, 4882 struct sock *sk, 4883 int tstype, 4884 bool opt_stats) 4885 { 4886 struct sock_exterr_skb *serr; 4887 int err; 4888 4889 BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); 4890 4891 serr = SKB_EXT_ERR(skb); 4892 memset(serr, 0, sizeof(*serr)); 4893 serr->ee.ee_errno = ENOMSG; 4894 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; 4895 serr->ee.ee_info = tstype; 4896 serr->opt_stats = opt_stats; 4897 serr->header.h4.iif = skb->dev ? 
skb->dev->ifindex : 0; 4898 if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { 4899 serr->ee.ee_data = skb_shinfo(skb)->tskey; 4900 if (sk_is_tcp(sk)) 4901 serr->ee.ee_data -= atomic_read(&sk->sk_tskey); 4902 } 4903 4904 err = sock_queue_err_skb(sk, skb); 4905 4906 if (err) 4907 kfree_skb(skb); 4908 } 4909 4910 static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) 4911 { 4912 bool ret; 4913 4914 if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) 4915 return true; 4916 4917 read_lock_bh(&sk->sk_callback_lock); 4918 ret = sk->sk_socket && sk->sk_socket->file && 4919 file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); 4920 read_unlock_bh(&sk->sk_callback_lock); 4921 return ret; 4922 } 4923 4924 void skb_complete_tx_timestamp(struct sk_buff *skb, 4925 struct skb_shared_hwtstamps *hwtstamps) 4926 { 4927 struct sock *sk = skb->sk; 4928 4929 if (!skb_may_tx_timestamp(sk, false)) 4930 goto err; 4931 4932 /* Take a reference to prevent skb_orphan() from freeing the socket, 4933 * but only if the socket refcount is not zero. 4934 */ 4935 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 4936 *skb_hwtstamps(skb) = *hwtstamps; 4937 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); 4938 sock_put(sk); 4939 return; 4940 } 4941 4942 err: 4943 kfree_skb(skb); 4944 } 4945 EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); 4946 4947 void __skb_tstamp_tx(struct sk_buff *orig_skb, 4948 const struct sk_buff *ack_skb, 4949 struct skb_shared_hwtstamps *hwtstamps, 4950 struct sock *sk, int tstype) 4951 { 4952 struct sk_buff *skb; 4953 bool tsonly, opt_stats = false; 4954 4955 if (!sk) 4956 return; 4957 4958 if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && 4959 skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) 4960 return; 4961 4962 tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; 4963 if (!skb_may_tx_timestamp(sk, tsonly)) 4964 return; 4965 4966 if (tsonly) { 4967 #ifdef CONFIG_INET 4968 if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && 4969 sk_is_tcp(sk)) { 4970 skb = tcp_get_timestamping_opt_stats(sk, orig_skb, 4971 ack_skb); 4972 opt_stats = true; 4973 } else 4974 #endif 4975 skb = alloc_skb(0, GFP_ATOMIC); 4976 } else { 4977 skb = skb_clone(orig_skb, GFP_ATOMIC); 4978 } 4979 if (!skb) 4980 return; 4981 4982 if (tsonly) { 4983 skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & 4984 SKBTX_ANY_TSTAMP; 4985 skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; 4986 } 4987 4988 if (hwtstamps) 4989 *skb_hwtstamps(skb) = *hwtstamps; 4990 else 4991 __net_timestamp(skb); 4992 4993 __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); 4994 } 4995 EXPORT_SYMBOL_GPL(__skb_tstamp_tx); 4996 4997 void skb_tstamp_tx(struct sk_buff *orig_skb, 4998 struct skb_shared_hwtstamps *hwtstamps) 4999 { 5000 return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, 5001 SCM_TSTAMP_SND); 5002 } 5003 EXPORT_SYMBOL_GPL(skb_tstamp_tx); 5004 5005 void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) 5006 { 5007 struct sock *sk = skb->sk; 5008 struct sock_exterr_skb *serr; 5009 int err = 1; 5010 5011 skb->wifi_acked_valid = 1; 5012 skb->wifi_acked = acked; 5013 5014 serr = SKB_EXT_ERR(skb); 5015 memset(serr, 0, sizeof(*serr)); 5016 serr->ee.ee_errno = ENOMSG; 5017 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; 5018 5019 /* Take a reference to prevent skb_orphan() from freeing the socket, 5020 * but only if the socket refcount is not zero. 
5021 */ 5022 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 5023 err = sock_queue_err_skb(sk, skb); 5024 sock_put(sk); 5025 } 5026 if (err) 5027 kfree_skb(skb); 5028 } 5029 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); 5030 5031 /** 5032 * skb_partial_csum_set - set up and verify partial csum values for packet 5033 * @skb: the skb to set 5034 * @start: the number of bytes after skb->data to start checksumming. 5035 * @off: the offset from start to place the checksum. 5036 * 5037 * For untrusted partially-checksummed packets, we need to make sure the values 5038 * for skb->csum_start and skb->csum_offset are valid so we don't oops. 5039 * 5040 * This function checks and sets those values and skb->ip_summed: if this 5041 * returns false you should drop the packet. 5042 */ 5043 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) 5044 { 5045 u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); 5046 u32 csum_start = skb_headroom(skb) + (u32)start; 5047 5048 if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { 5049 net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", 5050 start, off, skb_headroom(skb), skb_headlen(skb)); 5051 return false; 5052 } 5053 skb->ip_summed = CHECKSUM_PARTIAL; 5054 skb->csum_start = csum_start; 5055 skb->csum_offset = off; 5056 skb_set_transport_header(skb, start); 5057 return true; 5058 } 5059 EXPORT_SYMBOL_GPL(skb_partial_csum_set); 5060 5061 static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, 5062 unsigned int max) 5063 { 5064 if (skb_headlen(skb) >= len) 5065 return 0; 5066 5067 /* If we need to pullup then pullup to the max, so we 5068 * won't need to do it again. 5069 */ 5070 if (max > skb->len) 5071 max = skb->len; 5072 5073 if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) 5074 return -ENOMEM; 5075 5076 if (skb_headlen(skb) < len) 5077 return -EPROTO; 5078 5079 return 0; 5080 } 5081 5082 #define MAX_TCP_HDR_LEN (15 * 4) 5083 5084 static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, 5085 typeof(IPPROTO_IP) proto, 5086 unsigned int off) 5087 { 5088 int err; 5089 5090 switch (proto) { 5091 case IPPROTO_TCP: 5092 err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), 5093 off + MAX_TCP_HDR_LEN); 5094 if (!err && !skb_partial_csum_set(skb, off, 5095 offsetof(struct tcphdr, 5096 check))) 5097 err = -EPROTO; 5098 return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; 5099 5100 case IPPROTO_UDP: 5101 err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), 5102 off + sizeof(struct udphdr)); 5103 if (!err && !skb_partial_csum_set(skb, off, 5104 offsetof(struct udphdr, 5105 check))) 5106 err = -EPROTO; 5107 return err ? ERR_PTR(err) : &udp_hdr(skb)->check; 5108 } 5109 5110 return ERR_PTR(-EPROTO); 5111 } 5112 5113 /* This value should be large enough to cover a tagged ethernet header plus 5114 * maximally sized IP and TCP or UDP headers. 
5115 */ 5116 #define MAX_IP_HDR_LEN 128 5117 5118 static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) 5119 { 5120 unsigned int off; 5121 bool fragment; 5122 __sum16 *csum; 5123 int err; 5124 5125 fragment = false; 5126 5127 err = skb_maybe_pull_tail(skb, 5128 sizeof(struct iphdr), 5129 MAX_IP_HDR_LEN); 5130 if (err < 0) 5131 goto out; 5132 5133 if (ip_is_fragment(ip_hdr(skb))) 5134 fragment = true; 5135 5136 off = ip_hdrlen(skb); 5137 5138 err = -EPROTO; 5139 5140 if (fragment) 5141 goto out; 5142 5143 csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); 5144 if (IS_ERR(csum)) 5145 return PTR_ERR(csum); 5146 5147 if (recalculate) 5148 *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, 5149 ip_hdr(skb)->daddr, 5150 skb->len - off, 5151 ip_hdr(skb)->protocol, 0); 5152 err = 0; 5153 5154 out: 5155 return err; 5156 } 5157 5158 /* This value should be large enough to cover a tagged ethernet header plus 5159 * an IPv6 header, all options, and a maximal TCP or UDP header. 5160 */ 5161 #define MAX_IPV6_HDR_LEN 256 5162 5163 #define OPT_HDR(type, skb, off) \ 5164 (type *)(skb_network_header(skb) + (off)) 5165 5166 static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) 5167 { 5168 int err; 5169 u8 nexthdr; 5170 unsigned int off; 5171 unsigned int len; 5172 bool fragment; 5173 bool done; 5174 __sum16 *csum; 5175 5176 fragment = false; 5177 done = false; 5178 5179 off = sizeof(struct ipv6hdr); 5180 5181 err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); 5182 if (err < 0) 5183 goto out; 5184 5185 nexthdr = ipv6_hdr(skb)->nexthdr; 5186 5187 len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); 5188 while (off <= len && !done) { 5189 switch (nexthdr) { 5190 case IPPROTO_DSTOPTS: 5191 case IPPROTO_HOPOPTS: 5192 case IPPROTO_ROUTING: { 5193 struct ipv6_opt_hdr *hp; 5194 5195 err = skb_maybe_pull_tail(skb, 5196 off + 5197 sizeof(struct ipv6_opt_hdr), 5198 MAX_IPV6_HDR_LEN); 5199 if (err < 0) 5200 goto out; 5201 5202 hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); 5203 nexthdr = hp->nexthdr; 5204 off += ipv6_optlen(hp); 5205 break; 5206 } 5207 case IPPROTO_AH: { 5208 struct ip_auth_hdr *hp; 5209 5210 err = skb_maybe_pull_tail(skb, 5211 off + 5212 sizeof(struct ip_auth_hdr), 5213 MAX_IPV6_HDR_LEN); 5214 if (err < 0) 5215 goto out; 5216 5217 hp = OPT_HDR(struct ip_auth_hdr, skb, off); 5218 nexthdr = hp->nexthdr; 5219 off += ipv6_authlen(hp); 5220 break; 5221 } 5222 case IPPROTO_FRAGMENT: { 5223 struct frag_hdr *hp; 5224 5225 err = skb_maybe_pull_tail(skb, 5226 off + 5227 sizeof(struct frag_hdr), 5228 MAX_IPV6_HDR_LEN); 5229 if (err < 0) 5230 goto out; 5231 5232 hp = OPT_HDR(struct frag_hdr, skb, off); 5233 5234 if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) 5235 fragment = true; 5236 5237 nexthdr = hp->nexthdr; 5238 off += sizeof(struct frag_hdr); 5239 break; 5240 } 5241 default: 5242 done = true; 5243 break; 5244 } 5245 } 5246 5247 err = -EPROTO; 5248 5249 if (!done || fragment) 5250 goto out; 5251 5252 csum = skb_checksum_setup_ip(skb, nexthdr, off); 5253 if (IS_ERR(csum)) 5254 return PTR_ERR(csum); 5255 5256 if (recalculate) 5257 *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, 5258 &ipv6_hdr(skb)->daddr, 5259 skb->len - off, nexthdr, 0); 5260 err = 0; 5261 5262 out: 5263 return err; 5264 } 5265 5266 /** 5267 * skb_checksum_setup - set up partial checksum offset 5268 * @skb: the skb to set up 5269 * @recalculate: if true the pseudo-header checksum will be recalculated 5270 */ 5271 int skb_checksum_setup(struct sk_buff *skb, bool recalculate) 5272 { 
5273 int err; 5274 5275 switch (skb->protocol) { 5276 case htons(ETH_P_IP): 5277 err = skb_checksum_setup_ipv4(skb, recalculate); 5278 break; 5279 5280 case htons(ETH_P_IPV6): 5281 err = skb_checksum_setup_ipv6(skb, recalculate); 5282 break; 5283 5284 default: 5285 err = -EPROTO; 5286 break; 5287 } 5288 5289 return err; 5290 } 5291 EXPORT_SYMBOL(skb_checksum_setup); 5292 5293 /** 5294 * skb_checksum_maybe_trim - maybe trims the given skb 5295 * @skb: the skb to check 5296 * @transport_len: the data length beyond the network header 5297 * 5298 * Checks whether the given skb has data beyond the given transport length. 5299 * If so, returns a cloned skb trimmed to this transport length. 5300 * Otherwise returns the provided skb. Returns NULL in error cases 5301 * (e.g. transport_len exceeds skb length or out-of-memory). 5302 * 5303 * Caller needs to set the skb transport header and free any returned skb if it 5304 * differs from the provided skb. 5305 */ 5306 static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, 5307 unsigned int transport_len) 5308 { 5309 struct sk_buff *skb_chk; 5310 unsigned int len = skb_transport_offset(skb) + transport_len; 5311 int ret; 5312 5313 if (skb->len < len) 5314 return NULL; 5315 else if (skb->len == len) 5316 return skb; 5317 5318 skb_chk = skb_clone(skb, GFP_ATOMIC); 5319 if (!skb_chk) 5320 return NULL; 5321 5322 ret = pskb_trim_rcsum(skb_chk, len); 5323 if (ret) { 5324 kfree_skb(skb_chk); 5325 return NULL; 5326 } 5327 5328 return skb_chk; 5329 } 5330 5331 /** 5332 * skb_checksum_trimmed - validate checksum of an skb 5333 * @skb: the skb to check 5334 * @transport_len: the data length beyond the network header 5335 * @skb_chkf: checksum function to use 5336 * 5337 * Applies the given checksum function skb_chkf to the provided skb. 5338 * Returns a checked and maybe trimmed skb. Returns NULL on error. 5339 * 5340 * If the skb has data beyond the given transport length, then a 5341 * trimmed & cloned skb is checked and returned. 5342 * 5343 * Caller needs to set the skb transport header and free any returned skb if it 5344 * differs from the provided skb. 
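 *
 * A hedged usage sketch (loosely modelled on the IGMP/MLD validation
 * callers; my_csum_check() and transport_len are placeholders for a
 * checksum callback of type __sum16 (*)(struct sk_buff *) and the
 * expected payload length):
 *
 *	skb_set_transport_header(skb, ip_hdrlen(skb));
 *	skb_chk = skb_checksum_trimmed(skb, transport_len, my_csum_check);
 *	if (!skb_chk)
 *		return -EINVAL;
 *	... use skb_chk ...
 *	if (skb_chk != skb)
 *		kfree_skb(skb_chk);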
5345 */ 5346 struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, 5347 unsigned int transport_len, 5348 __sum16(*skb_chkf)(struct sk_buff *skb)) 5349 { 5350 struct sk_buff *skb_chk; 5351 unsigned int offset = skb_transport_offset(skb); 5352 __sum16 ret; 5353 5354 skb_chk = skb_checksum_maybe_trim(skb, transport_len); 5355 if (!skb_chk) 5356 goto err; 5357 5358 if (!pskb_may_pull(skb_chk, offset)) 5359 goto err; 5360 5361 skb_pull_rcsum(skb_chk, offset); 5362 ret = skb_chkf(skb_chk); 5363 skb_push_rcsum(skb_chk, offset); 5364 5365 if (ret) 5366 goto err; 5367 5368 return skb_chk; 5369 5370 err: 5371 if (skb_chk && skb_chk != skb) 5372 kfree_skb(skb_chk); 5373 5374 return NULL; 5375 5376 } 5377 EXPORT_SYMBOL(skb_checksum_trimmed); 5378 5379 void __skb_warn_lro_forwarding(const struct sk_buff *skb) 5380 { 5381 net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", 5382 skb->dev->name); 5383 } 5384 EXPORT_SYMBOL(__skb_warn_lro_forwarding); 5385 5386 void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) 5387 { 5388 if (head_stolen) { 5389 skb_release_head_state(skb); 5390 kmem_cache_free(skbuff_head_cache, skb); 5391 } else { 5392 __kfree_skb(skb); 5393 } 5394 } 5395 EXPORT_SYMBOL(kfree_skb_partial); 5396 5397 /** 5398 * skb_try_coalesce - try to merge skb to prior one 5399 * @to: prior buffer 5400 * @from: buffer to add 5401 * @fragstolen: pointer to boolean 5402 * @delta_truesize: how much more was allocated than was requested 5403 */ 5404 bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, 5405 bool *fragstolen, int *delta_truesize) 5406 { 5407 struct skb_shared_info *to_shinfo, *from_shinfo; 5408 int i, delta, len = from->len; 5409 5410 *fragstolen = false; 5411 5412 if (skb_cloned(to)) 5413 return false; 5414 5415 /* In general, avoid mixing slab allocated and page_pool allocated 5416 * pages within the same SKB. However when @to is not pp_recycle and 5417 * @from is cloned, we can transition frag pages from page_pool to 5418 * reference counted. 5419 * 5420 * On the other hand, don't allow coalescing two pp_recycle SKBs if 5421 * @from is cloned, in case the SKB is using page_pool fragment 5422 * references (PP_FLAG_PAGE_FRAG). Since we only take full page 5423 * references for cloned SKBs at the moment that would result in 5424 * inconsistent reference counts. 
5425 */ 5426 if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from))) 5427 return false; 5428 5429 if (len <= skb_tailroom(to)) { 5430 if (len) 5431 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); 5432 *delta_truesize = 0; 5433 return true; 5434 } 5435 5436 to_shinfo = skb_shinfo(to); 5437 from_shinfo = skb_shinfo(from); 5438 if (to_shinfo->frag_list || from_shinfo->frag_list) 5439 return false; 5440 if (skb_zcopy(to) || skb_zcopy(from)) 5441 return false; 5442 5443 if (skb_headlen(from) != 0) { 5444 struct page *page; 5445 unsigned int offset; 5446 5447 if (to_shinfo->nr_frags + 5448 from_shinfo->nr_frags >= MAX_SKB_FRAGS) 5449 return false; 5450 5451 if (skb_head_is_locked(from)) 5452 return false; 5453 5454 delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 5455 5456 page = virt_to_head_page(from->head); 5457 offset = from->data - (unsigned char *)page_address(page); 5458 5459 skb_fill_page_desc(to, to_shinfo->nr_frags, 5460 page, offset, skb_headlen(from)); 5461 *fragstolen = true; 5462 } else { 5463 if (to_shinfo->nr_frags + 5464 from_shinfo->nr_frags > MAX_SKB_FRAGS) 5465 return false; 5466 5467 delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); 5468 } 5469 5470 WARN_ON_ONCE(delta < len); 5471 5472 memcpy(to_shinfo->frags + to_shinfo->nr_frags, 5473 from_shinfo->frags, 5474 from_shinfo->nr_frags * sizeof(skb_frag_t)); 5475 to_shinfo->nr_frags += from_shinfo->nr_frags; 5476 5477 if (!skb_cloned(from)) 5478 from_shinfo->nr_frags = 0; 5479 5480 /* if the skb is not cloned this does nothing 5481 * since we set nr_frags to 0. 5482 */ 5483 for (i = 0; i < from_shinfo->nr_frags; i++) 5484 __skb_frag_ref(&from_shinfo->frags[i]); 5485 5486 to->truesize += delta; 5487 to->len += len; 5488 to->data_len += len; 5489 5490 *delta_truesize = delta; 5491 return true; 5492 } 5493 EXPORT_SYMBOL(skb_try_coalesce); 5494 5495 /** 5496 * skb_scrub_packet - scrub an skb 5497 * 5498 * @skb: buffer to clean 5499 * @xnet: packet is crossing netns 5500 * 5501 * skb_scrub_packet can be used after encapsulating or decapsulating a packet 5502 * into/from a tunnel. Some information has to be cleared during these 5503 * operations. 5504 * skb_scrub_packet can also be used to clean an skb before injecting it into 5505 * another namespace (@xnet == true). We have to clear all information in the 5506 * skb that could impact namespace isolation. 5507 */ 5508 void skb_scrub_packet(struct sk_buff *skb, bool xnet) 5509 { 5510 skb->pkt_type = PACKET_HOST; 5511 skb->skb_iif = 0; 5512 skb->ignore_df = 0; 5513 skb_dst_drop(skb); 5514 skb_ext_reset(skb); 5515 nf_reset_ct(skb); 5516 nf_reset_trace(skb); 5517 5518 #ifdef CONFIG_NET_SWITCHDEV 5519 skb->offload_fwd_mark = 0; 5520 skb->offload_l3_fwd_mark = 0; 5521 #endif 5522 5523 if (!xnet) 5524 return; 5525 5526 ipvs_reset(skb); 5527 skb->mark = 0; 5528 skb_clear_tstamp(skb); 5529 } 5530 EXPORT_SYMBOL_GPL(skb_scrub_packet); 5531 5532 /** 5533 * skb_gso_transport_seglen - Return length of individual segments of a gso packet 5534 * 5535 * @skb: GSO skb 5536 * 5537 * skb_gso_transport_seglen is used to determine the real size of the 5538 * individual segments, including Layer4 headers (TCP/UDP). 5539 * 5540 * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
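 * E.g. a non-encapsulated TCPv4 GSO skb with a 20 byte TCP header and a
 * gso_size of 1448 yields 20 + 1448 = 1468 (illustrative figures).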
5541 */ 5542 static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) 5543 { 5544 const struct skb_shared_info *shinfo = skb_shinfo(skb); 5545 unsigned int thlen = 0; 5546 5547 if (skb->encapsulation) { 5548 thlen = skb_inner_transport_header(skb) - 5549 skb_transport_header(skb); 5550 5551 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 5552 thlen += inner_tcp_hdrlen(skb); 5553 } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { 5554 thlen = tcp_hdrlen(skb); 5555 } else if (unlikely(skb_is_gso_sctp(skb))) { 5556 thlen = sizeof(struct sctphdr); 5557 } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { 5558 thlen = sizeof(struct udphdr); 5559 } 5560 /* UFO sets gso_size to the size of the fragmentation 5561 * payload, i.e. the size of the L4 (UDP) header is already 5562 * accounted for. 5563 */ 5564 return thlen + shinfo->gso_size; 5565 } 5566 5567 /** 5568 * skb_gso_network_seglen - Return length of individual segments of a gso packet 5569 * 5570 * @skb: GSO skb 5571 * 5572 * skb_gso_network_seglen is used to determine the real size of the 5573 * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). 5574 * 5575 * The MAC/L2 header is not accounted for. 5576 */ 5577 static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) 5578 { 5579 unsigned int hdr_len = skb_transport_header(skb) - 5580 skb_network_header(skb); 5581 5582 return hdr_len + skb_gso_transport_seglen(skb); 5583 } 5584 5585 /** 5586 * skb_gso_mac_seglen - Return length of individual segments of a gso packet 5587 * 5588 * @skb: GSO skb 5589 * 5590 * skb_gso_mac_seglen is used to determine the real size of the 5591 * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 5592 * headers (TCP/UDP). 5593 */ 5594 static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) 5595 { 5596 unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 5597 5598 return hdr_len + skb_gso_transport_seglen(skb); 5599 } 5600 5601 /** 5602 * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS 5603 * 5604 * There are a couple of instances where we have a GSO skb, and we 5605 * want to determine what size it would be after it is segmented. 5606 * 5607 * We might want to check: 5608 * - L3+L4+payload size (e.g. IP forwarding) 5609 * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) 5610 * 5611 * This is a helper to do that correctly considering GSO_BY_FRAGS. 5612 * 5613 * @skb: GSO skb 5614 * 5615 * @seg_len: The segmented length (from skb_gso_*_seglen). In the 5616 * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. 5617 * 5618 * @max_len: The maximum permissible length. 5619 * 5620 * Returns true if the segmented length <= max length. 5621 */ 5622 static inline bool skb_gso_size_check(const struct sk_buff *skb, 5623 unsigned int seg_len, 5624 unsigned int max_len) { 5625 const struct skb_shared_info *shinfo = skb_shinfo(skb); 5626 const struct sk_buff *iter; 5627 5628 if (shinfo->gso_size != GSO_BY_FRAGS) 5629 return seg_len <= max_len; 5630 5631 /* Undo this so we can re-use header sizes */ 5632 seg_len -= GSO_BY_FRAGS; 5633 5634 skb_walk_frags(skb, iter) { 5635 if (seg_len + skb_headlen(iter) > max_len) 5636 return false; 5637 } 5638 5639 return true; 5640 } 5641 5642 /** 5643 * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? 
5644 * 5645 * @skb: GSO skb 5646 * @mtu: MTU to validate against 5647 * 5648 * skb_gso_validate_network_len validates if a given skb will fit a 5649 * wanted MTU once split. It considers L3 headers, L4 headers, and the 5650 * payload. 5651 */ 5652 bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) 5653 { 5654 return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); 5655 } 5656 EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); 5657 5658 /** 5659 * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? 5660 * 5661 * @skb: GSO skb 5662 * @len: length to validate against 5663 * 5664 * skb_gso_validate_mac_len validates if a given skb will fit a wanted 5665 * length once split, including L2, L3 and L4 headers and the payload. 5666 */ 5667 bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) 5668 { 5669 return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); 5670 } 5671 EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); 5672 5673 static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) 5674 { 5675 int mac_len, meta_len; 5676 void *meta; 5677 5678 if (skb_cow(skb, skb_headroom(skb)) < 0) { 5679 kfree_skb(skb); 5680 return NULL; 5681 } 5682 5683 mac_len = skb->data - skb_mac_header(skb); 5684 if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { 5685 memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), 5686 mac_len - VLAN_HLEN - ETH_TLEN); 5687 } 5688 5689 meta_len = skb_metadata_len(skb); 5690 if (meta_len) { 5691 meta = skb_metadata_end(skb) - meta_len; 5692 memmove(meta + VLAN_HLEN, meta, meta_len); 5693 } 5694 5695 skb->mac_header += VLAN_HLEN; 5696 return skb; 5697 } 5698 5699 struct sk_buff *skb_vlan_untag(struct sk_buff *skb) 5700 { 5701 struct vlan_hdr *vhdr; 5702 u16 vlan_tci; 5703 5704 if (unlikely(skb_vlan_tag_present(skb))) { 5705 /* vlan_tci is already set-up so leave this for another time */ 5706 return skb; 5707 } 5708 5709 skb = skb_share_check(skb, GFP_ATOMIC); 5710 if (unlikely(!skb)) 5711 goto err_free; 5712 /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ 5713 if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) 5714 goto err_free; 5715 5716 vhdr = (struct vlan_hdr *)skb->data; 5717 vlan_tci = ntohs(vhdr->h_vlan_TCI); 5718 __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); 5719 5720 skb_pull_rcsum(skb, VLAN_HLEN); 5721 vlan_set_encap_proto(skb, vhdr); 5722 5723 skb = skb_reorder_vlan_header(skb); 5724 if (unlikely(!skb)) 5725 goto err_free; 5726 5727 skb_reset_network_header(skb); 5728 if (!skb_transport_header_was_set(skb)) 5729 skb_reset_transport_header(skb); 5730 skb_reset_mac_len(skb); 5731 5732 return skb; 5733 5734 err_free: 5735 kfree_skb(skb); 5736 return NULL; 5737 } 5738 EXPORT_SYMBOL(skb_vlan_untag); 5739 5740 int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) 5741 { 5742 if (!pskb_may_pull(skb, write_len)) 5743 return -ENOMEM; 5744 5745 if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) 5746 return 0; 5747 5748 return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 5749 } 5750 EXPORT_SYMBOL(skb_ensure_writable); 5751 5752 /* remove VLAN header from packet and update csum accordingly. 
5753 * expects a non skb_vlan_tag_present skb with a vlan tag payload 5754 */ 5755 int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) 5756 { 5757 struct vlan_hdr *vhdr; 5758 int offset = skb->data - skb_mac_header(skb); 5759 int err; 5760 5761 if (WARN_ONCE(offset, 5762 "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", 5763 offset)) { 5764 return -EINVAL; 5765 } 5766 5767 err = skb_ensure_writable(skb, VLAN_ETH_HLEN); 5768 if (unlikely(err)) 5769 return err; 5770 5771 skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 5772 5773 vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); 5774 *vlan_tci = ntohs(vhdr->h_vlan_TCI); 5775 5776 memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); 5777 __skb_pull(skb, VLAN_HLEN); 5778 5779 vlan_set_encap_proto(skb, vhdr); 5780 skb->mac_header += VLAN_HLEN; 5781 5782 if (skb_network_offset(skb) < ETH_HLEN) 5783 skb_set_network_header(skb, ETH_HLEN); 5784 5785 skb_reset_mac_len(skb); 5786 5787 return err; 5788 } 5789 EXPORT_SYMBOL(__skb_vlan_pop); 5790 5791 /* Pop a vlan tag either from hwaccel or from payload. 5792 * Expects skb->data at mac header. 5793 */ 5794 int skb_vlan_pop(struct sk_buff *skb) 5795 { 5796 u16 vlan_tci; 5797 __be16 vlan_proto; 5798 int err; 5799 5800 if (likely(skb_vlan_tag_present(skb))) { 5801 __vlan_hwaccel_clear_tag(skb); 5802 } else { 5803 if (unlikely(!eth_type_vlan(skb->protocol))) 5804 return 0; 5805 5806 err = __skb_vlan_pop(skb, &vlan_tci); 5807 if (err) 5808 return err; 5809 } 5810 /* move next vlan tag to hw accel tag */ 5811 if (likely(!eth_type_vlan(skb->protocol))) 5812 return 0; 5813 5814 vlan_proto = skb->protocol; 5815 err = __skb_vlan_pop(skb, &vlan_tci); 5816 if (unlikely(err)) 5817 return err; 5818 5819 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 5820 return 0; 5821 } 5822 EXPORT_SYMBOL(skb_vlan_pop); 5823 5824 /* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). 5825 * Expects skb->data at mac header. 5826 */ 5827 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) 5828 { 5829 if (skb_vlan_tag_present(skb)) { 5830 int offset = skb->data - skb_mac_header(skb); 5831 int err; 5832 5833 if (WARN_ONCE(offset, 5834 "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", 5835 offset)) { 5836 return -EINVAL; 5837 } 5838 5839 err = __vlan_insert_tag(skb, skb->vlan_proto, 5840 skb_vlan_tag_get(skb)); 5841 if (err) 5842 return err; 5843 5844 skb->protocol = skb->vlan_proto; 5845 skb->mac_len += VLAN_HLEN; 5846 5847 skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 5848 } 5849 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 5850 return 0; 5851 } 5852 EXPORT_SYMBOL(skb_vlan_push); 5853 5854 /** 5855 * skb_eth_pop() - Drop the Ethernet header at the head of a packet 5856 * 5857 * @skb: Socket buffer to modify 5858 * 5859 * Drop the Ethernet header of @skb. 5860 * 5861 * Expects that skb->data points to the mac header and that no VLAN tags are 5862 * present. 5863 * 5864 * Returns 0 on success, -errno otherwise. 
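 *
 * A hedged usage sketch (hypothetical L3 tunnel ingress; new_dst/new_src
 * and the drop label are placeholders) pairing this with skb_eth_push()
 * below:
 *
 *	if (skb_eth_pop(skb))
 *		goto drop;
 *	... process as an L3 packet ...
 *	if (skb_eth_push(skb, new_dst, new_src))
 *		goto drop;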
5865 */ 5866 int skb_eth_pop(struct sk_buff *skb) 5867 { 5868 if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || 5869 skb_network_offset(skb) < ETH_HLEN) 5870 return -EPROTO; 5871 5872 skb_pull_rcsum(skb, ETH_HLEN); 5873 skb_reset_mac_header(skb); 5874 skb_reset_mac_len(skb); 5875 5876 return 0; 5877 } 5878 EXPORT_SYMBOL(skb_eth_pop); 5879 5880 /** 5881 * skb_eth_push() - Add a new Ethernet header at the head of a packet 5882 * 5883 * @skb: Socket buffer to modify 5884 * @dst: Destination MAC address of the new header 5885 * @src: Source MAC address of the new header 5886 * 5887 * Prepend @skb with a new Ethernet header. 5888 * 5889 * Expects that skb->data points to the mac header, which must be empty. 5890 * 5891 * Returns 0 on success, -errno otherwise. 5892 */ 5893 int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, 5894 const unsigned char *src) 5895 { 5896 struct ethhdr *eth; 5897 int err; 5898 5899 if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) 5900 return -EPROTO; 5901 5902 err = skb_cow_head(skb, sizeof(*eth)); 5903 if (err < 0) 5904 return err; 5905 5906 skb_push(skb, sizeof(*eth)); 5907 skb_reset_mac_header(skb); 5908 skb_reset_mac_len(skb); 5909 5910 eth = eth_hdr(skb); 5911 ether_addr_copy(eth->h_dest, dst); 5912 ether_addr_copy(eth->h_source, src); 5913 eth->h_proto = skb->protocol; 5914 5915 skb_postpush_rcsum(skb, eth, sizeof(*eth)); 5916 5917 return 0; 5918 } 5919 EXPORT_SYMBOL(skb_eth_push); 5920 5921 /* Update the ethertype of hdr and the skb csum value if required. */ 5922 static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, 5923 __be16 ethertype) 5924 { 5925 if (skb->ip_summed == CHECKSUM_COMPLETE) { 5926 __be16 diff[] = { ~hdr->h_proto, ethertype }; 5927 5928 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 5929 } 5930 5931 hdr->h_proto = ethertype; 5932 } 5933 5934 /** 5935 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of 5936 * the packet 5937 * 5938 * @skb: buffer 5939 * @mpls_lse: MPLS label stack entry to push 5940 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) 5941 * @mac_len: length of the MAC header 5942 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is 5943 * ethernet 5944 * 5945 * Expects skb->data at mac header. 5946 * 5947 * Returns 0 on success, -errno otherwise. 5948 */ 5949 int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, 5950 int mac_len, bool ethernet) 5951 { 5952 struct mpls_shim_hdr *lse; 5953 int err; 5954 5955 if (unlikely(!eth_p_mpls(mpls_proto))) 5956 return -EINVAL; 5957 5958 /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. 
*/ 5959 if (skb->encapsulation) 5960 return -EINVAL; 5961 5962 err = skb_cow_head(skb, MPLS_HLEN); 5963 if (unlikely(err)) 5964 return err; 5965 5966 if (!skb->inner_protocol) { 5967 skb_set_inner_network_header(skb, skb_network_offset(skb)); 5968 skb_set_inner_protocol(skb, skb->protocol); 5969 } 5970 5971 skb_push(skb, MPLS_HLEN); 5972 memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), 5973 mac_len); 5974 skb_reset_mac_header(skb); 5975 skb_set_network_header(skb, mac_len); 5976 skb_reset_mac_len(skb); 5977 5978 lse = mpls_hdr(skb); 5979 lse->label_stack_entry = mpls_lse; 5980 skb_postpush_rcsum(skb, lse, MPLS_HLEN); 5981 5982 if (ethernet && mac_len >= ETH_HLEN) 5983 skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); 5984 skb->protocol = mpls_proto; 5985 5986 return 0; 5987 } 5988 EXPORT_SYMBOL_GPL(skb_mpls_push); 5989 5990 /** 5991 * skb_mpls_pop() - pop the outermost MPLS header 5992 * 5993 * @skb: buffer 5994 * @next_proto: ethertype of header after popped MPLS header 5995 * @mac_len: length of the MAC header 5996 * @ethernet: flag to indicate if the packet is ethernet 5997 * 5998 * Expects skb->data at mac header. 5999 * 6000 * Returns 0 on success, -errno otherwise. 6001 */ 6002 int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, 6003 bool ethernet) 6004 { 6005 int err; 6006 6007 if (unlikely(!eth_p_mpls(skb->protocol))) 6008 return 0; 6009 6010 err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); 6011 if (unlikely(err)) 6012 return err; 6013 6014 skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); 6015 memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), 6016 mac_len); 6017 6018 __skb_pull(skb, MPLS_HLEN); 6019 skb_reset_mac_header(skb); 6020 skb_set_network_header(skb, mac_len); 6021 6022 if (ethernet && mac_len >= ETH_HLEN) { 6023 struct ethhdr *hdr; 6024 6025 /* use mpls_hdr() to get ethertype to account for VLANs. */ 6026 hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); 6027 skb_mod_eth_type(skb, hdr, next_proto); 6028 } 6029 skb->protocol = next_proto; 6030 6031 return 0; 6032 } 6033 EXPORT_SYMBOL_GPL(skb_mpls_pop); 6034 6035 /** 6036 * skb_mpls_update_lse() - modify outermost MPLS header and update csum 6037 * 6038 * @skb: buffer 6039 * @mpls_lse: new MPLS label stack entry to update to 6040 * 6041 * Expects skb->data at mac header. 6042 * 6043 * Returns 0 on success, -errno otherwise. 6044 */ 6045 int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) 6046 { 6047 int err; 6048 6049 if (unlikely(!eth_p_mpls(skb->protocol))) 6050 return -EINVAL; 6051 6052 err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); 6053 if (unlikely(err)) 6054 return err; 6055 6056 if (skb->ip_summed == CHECKSUM_COMPLETE) { 6057 __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; 6058 6059 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 6060 } 6061 6062 mpls_hdr(skb)->label_stack_entry = mpls_lse; 6063 6064 return 0; 6065 } 6066 EXPORT_SYMBOL_GPL(skb_mpls_update_lse); 6067 6068 /** 6069 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header 6070 * 6071 * @skb: buffer 6072 * 6073 * Expects skb->data at mac header. 6074 * 6075 * Returns 0 on success, -errno otherwise. 
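 *
 * A hedged usage sketch (e.g. a dec_ttl style datapath action; the drop
 * label is a placeholder). A non-zero return means the packet is not
 * MPLS, the header could not be pulled, or the TTL reached zero:
 *
 *	if (skb_mpls_dec_ttl(skb))
 *		goto drop;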
6076 */ 6077 int skb_mpls_dec_ttl(struct sk_buff *skb) 6078 { 6079 u32 lse; 6080 u8 ttl; 6081 6082 if (unlikely(!eth_p_mpls(skb->protocol))) 6083 return -EINVAL; 6084 6085 if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) 6086 return -ENOMEM; 6087 6088 lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); 6089 ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; 6090 if (!--ttl) 6091 return -EINVAL; 6092 6093 lse &= ~MPLS_LS_TTL_MASK; 6094 lse |= ttl << MPLS_LS_TTL_SHIFT; 6095 6096 return skb_mpls_update_lse(skb, cpu_to_be32(lse)); 6097 } 6098 EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); 6099 6100 /** 6101 * alloc_skb_with_frags - allocate skb with page frags 6102 * 6103 * @header_len: size of linear part 6104 * @data_len: needed length in frags 6105 * @max_page_order: max page order desired. 6106 * @errcode: pointer to error code if any 6107 * @gfp_mask: allocation mask 6108 * 6109 * This can be used to allocate a paged skb, given a maximal order for frags. 6110 */ 6111 struct sk_buff *alloc_skb_with_frags(unsigned long header_len, 6112 unsigned long data_len, 6113 int max_page_order, 6114 int *errcode, 6115 gfp_t gfp_mask) 6116 { 6117 int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; 6118 unsigned long chunk; 6119 struct sk_buff *skb; 6120 struct page *page; 6121 int i; 6122 6123 *errcode = -EMSGSIZE; 6124 /* Note this test could be relaxed, if we succeed to allocate 6125 * high order pages... 6126 */ 6127 if (npages > MAX_SKB_FRAGS) 6128 return NULL; 6129 6130 *errcode = -ENOBUFS; 6131 skb = alloc_skb(header_len, gfp_mask); 6132 if (!skb) 6133 return NULL; 6134 6135 skb->truesize += npages << PAGE_SHIFT; 6136 6137 for (i = 0; npages > 0; i++) { 6138 int order = max_page_order; 6139 6140 while (order) { 6141 if (npages >= 1 << order) { 6142 page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | 6143 __GFP_COMP | 6144 __GFP_NOWARN, 6145 order); 6146 if (page) 6147 goto fill_page; 6148 /* Do not retry other high order allocations */ 6149 order = 1; 6150 max_page_order = 0; 6151 } 6152 order--; 6153 } 6154 page = alloc_page(gfp_mask); 6155 if (!page) 6156 goto failure; 6157 fill_page: 6158 chunk = min_t(unsigned long, data_len, 6159 PAGE_SIZE << order); 6160 skb_fill_page_desc(skb, i, page, 0, chunk); 6161 data_len -= chunk; 6162 npages -= 1 << order; 6163 } 6164 return skb; 6165 6166 failure: 6167 kfree_skb(skb); 6168 return NULL; 6169 } 6170 EXPORT_SYMBOL(alloc_skb_with_frags); 6171 6172 /* carve out the first off bytes from skb when off < headlen */ 6173 static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, 6174 const int headlen, gfp_t gfp_mask) 6175 { 6176 int i; 6177 int size = skb_end_offset(skb); 6178 int new_hlen = headlen - off; 6179 u8 *data; 6180 6181 size = SKB_DATA_ALIGN(size); 6182 6183 if (skb_pfmemalloc(skb)) 6184 gfp_mask |= __GFP_MEMALLOC; 6185 data = kmalloc_reserve(size + 6186 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), 6187 gfp_mask, NUMA_NO_NODE, NULL); 6188 if (!data) 6189 return -ENOMEM; 6190 6191 size = SKB_WITH_OVERHEAD(ksize(data)); 6192 6193 /* Copy real data, and all frags */ 6194 skb_copy_from_linear_data_offset(skb, off, data, new_hlen); 6195 skb->len -= off; 6196 6197 memcpy((struct skb_shared_info *)(data + size), 6198 skb_shinfo(skb), 6199 offsetof(struct skb_shared_info, 6200 frags[skb_shinfo(skb)->nr_frags])); 6201 if (skb_cloned(skb)) { 6202 /* drop the old head gracefully */ 6203 if (skb_orphan_frags(skb, gfp_mask)) { 6204 kfree(data); 6205 return -ENOMEM; 6206 } 6207 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 6208 

/* carve out the first off bytes from skb when off < headlen */
static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
				    const int headlen, gfp_t gfp_mask)
{
	int i;
	int size = skb_end_offset(skb);
	int new_hlen = headlen - off;
	u8 *data;

	size = SKB_DATA_ALIGN(size);

	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;
	data = kmalloc_reserve(size +
			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
			       gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		return -ENOMEM;

	size = SKB_WITH_OVERHEAD(ksize(data));

	/* Copy real data, and all frags */
	skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
	skb->len -= off;

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb),
	       offsetof(struct skb_shared_info,
			frags[skb_shinfo(skb)->nr_frags]));
	if (skb_cloned(skb)) {
		/* drop the old head gracefully */
		if (skb_orphan_frags(skb, gfp_mask)) {
			kfree(data);
			return -ENOMEM;
		}
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_frag_ref(skb, i);
		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);
		skb_release_data(skb);
	} else {
		/* we can reuse the existing refcount - all we did was
		 * relocate values
		 */
		skb_free_head(skb);
	}

	skb->head = data;
	skb->data = data;
	skb->head_frag = 0;
	skb_set_end_offset(skb, size);
	skb_set_tail_pointer(skb, skb_headlen(skb));
	skb_headers_offset_update(skb, 0);
	skb->cloned = 0;
	skb->hdr_len = 0;
	skb->nohdr = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);

	return 0;
}

static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);

/* carve out the first eat bytes from skb's frag_list. May recurse into
 * pskb_carve()
 */
static int pskb_carve_frag_list(struct sk_buff *skb,
				struct skb_shared_info *shinfo, int eat,
				gfp_t gfp_mask)
{
	struct sk_buff *list = shinfo->frag_list;
	struct sk_buff *clone = NULL;
	struct sk_buff *insp = NULL;

	do {
		if (!list) {
			pr_err("Not enough bytes to eat. Want %d\n", eat);
			return -EFAULT;
		}
		if (list->len <= eat) {
			/* Eaten as whole. */
			eat -= list->len;
			list = list->next;
			insp = list;
		} else {
			/* Eaten partially. */
			if (skb_shared(list)) {
				clone = skb_clone(list, gfp_mask);
				if (!clone)
					return -ENOMEM;
				insp = list->next;
				list = clone;
			} else {
				/* This may be pulled without problems. */
				insp = list;
			}
			if (pskb_carve(list, eat, gfp_mask) < 0) {
				kfree_skb(clone);
				return -ENOMEM;
			}
			break;
		}
	} while (eat);

	/* Free pulled out fragments. */
	while ((list = shinfo->frag_list) != insp) {
		shinfo->frag_list = list->next;
		consume_skb(list);
	}
	/* And insert new clone at head. */
	if (clone) {
		clone->next = list;
		shinfo->frag_list = clone;
	}
	return 0;
}
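
/* Editorial note (not part of the original source): a worked example of the
 * carve path selection.  Carving off 300 bytes from an skb with
 * skb_headlen() == 100 and two 400 byte frags takes
 * pskb_carve_inside_nonlinear() below with off == 300 and pos == 100: the
 * first frag is kept but advanced via skb_frag_off_add()/skb_frag_size_sub()
 * so it now starts 200 bytes in, the second frag is copied unchanged, and
 * skb->len drops from 900 to 600 with all of it now in the frags
 * (skb->data_len == skb->len).
 */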

/* carve off first len bytes from skb. Split line (off) is in the
 * non-linear part of skb
 */
static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
				       int pos, gfp_t gfp_mask)
{
	int i, k = 0;
	int size = skb_end_offset(skb);
	u8 *data;
	const int nfrags = skb_shinfo(skb)->nr_frags;
	struct skb_shared_info *shinfo;

	size = SKB_DATA_ALIGN(size);

	if (skb_pfmemalloc(skb))
		gfp_mask |= __GFP_MEMALLOC;
	data = kmalloc_reserve(size +
			       SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
			       gfp_mask, NUMA_NO_NODE, NULL);
	if (!data)
		return -ENOMEM;

	size = SKB_WITH_OVERHEAD(ksize(data));

	memcpy((struct skb_shared_info *)(data + size),
	       skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
	if (skb_orphan_frags(skb, gfp_mask)) {
		kfree(data);
		return -ENOMEM;
	}
	shinfo = (struct skb_shared_info *)(data + size);
	for (i = 0; i < nfrags; i++) {
		int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (pos + fsize > off) {
			shinfo->frags[k] = skb_shinfo(skb)->frags[i];

			if (pos < off) {
				/* Split frag.
				 * We have two variants in this case:
				 * 1. Move the whole frag to the second
				 *    part, if it is possible. F.e. this
				 *    approach is mandatory for TUX,
				 *    where splitting is expensive.
				 * 2. Split accurately at the offset.
				 *    This is what we do here.
				 */
				skb_frag_off_add(&shinfo->frags[0], off - pos);
				skb_frag_size_sub(&shinfo->frags[0], off - pos);
			}
			skb_frag_ref(skb, i);
			k++;
		}
		pos += fsize;
	}
	shinfo->nr_frags = k;
	if (skb_has_frag_list(skb))
		skb_clone_fraglist(skb);

	/* split line is in frag list */
	if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
		/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
		if (skb_has_frag_list(skb))
			kfree_skb_list(skb_shinfo(skb)->frag_list);
		kfree(data);
		return -ENOMEM;
	}
	skb_release_data(skb);

	skb->head = data;
	skb->head_frag = 0;
	skb->data = data;
	skb_set_end_offset(skb, size);
	skb_reset_tail_pointer(skb);
	skb_headers_offset_update(skb, 0);
	skb->cloned = 0;
	skb->hdr_len = 0;
	skb->nohdr = 0;
	skb->len -= off;
	skb->data_len = skb->len;
	atomic_set(&skb_shinfo(skb)->dataref, 1);
	return 0;
}

/* remove len bytes from the beginning of the skb */
static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
{
	int headlen = skb_headlen(skb);

	if (len < headlen)
		return pskb_carve_inside_header(skb, len, headlen, gfp);
	else
		return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
}

/* Extract to_copy bytes starting at off from skb, and return them in
 * a new skb
 */
struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
			     int to_copy, gfp_t gfp)
{
	struct sk_buff *clone = skb_clone(skb, gfp);

	if (!clone)
		return NULL;

	if (pskb_carve(clone, off, gfp) < 0 ||
	    pskb_trim(clone, to_copy)) {
		kfree_skb(clone);
		return NULL;
	}
	return clone;
}
EXPORT_SYMBOL(pskb_extract);

/**
 * skb_condense - try to get rid of fragments/frag_list if possible
 * @skb: buffer
 *
 * Can be used to save memory before skb is added to a busy queue.
 * If the packet has bytes in frags and enough tail room in skb->head,
 * pull all of them, so that we can free the frags right away and adjust
 * the truesize.
 * Notes:
 *	We do not reallocate skb->head thus can not fail.
 *	Caller must re-evaluate skb->truesize if needed.
 */
void skb_condense(struct sk_buff *skb)
{
	if (skb->data_len) {
		if (skb->data_len > skb->end - skb->tail ||
		    skb_cloned(skb))
			return;

		/* Nice, we can free page frag(s) right now */
		__pskb_pull_tail(skb, skb->data_len);
	}
	/* At this point, skb->truesize might be overestimated,
	 * because the skb had fragments, and fragments do not report
	 * their truesize.
	 * When we pulled their content into skb->head, the fragments
	 * were freed, but __pskb_pull_tail() could not possibly
	 * adjust skb->truesize, not knowing the frag truesize.
	 */
	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
}
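
/* Editorial example (not part of the original source): pskb_extract() leaves
 * the original skb untouched, so it suits receive paths that need a private
 * copy of a byte range (the RDS/TCP receive path is an in-tree user).  A
 * minimal sketch, with hdr_len and payload_len being hypothetical values:
 *
 *	struct sk_buff *part;
 *
 *	part = pskb_extract(skb, hdr_len, payload_len, GFP_ATOMIC);
 *	if (!part)
 *		return -ENOMEM;
 *
 * On success, 'part' holds the payload_len bytes starting at offset hdr_len
 * of the original skb, which must still be released by its owner as usual.
 */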

#ifdef CONFIG_SKB_EXTENSIONS
static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
{
	return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
}

/**
 * __skb_ext_alloc - allocate a new skb extensions storage
 *
 * @flags: See kmalloc().
 *
 * Returns the newly allocated pointer. The pointer can later be attached
 * to an skb via __skb_ext_set().
 * Note: the caller must treat the skb_ext as opaque data.
 */
struct skb_ext *__skb_ext_alloc(gfp_t flags)
{
	struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);

	if (new) {
		memset(new->offset, 0, sizeof(new->offset));
		refcount_set(&new->refcnt, 1);
	}

	return new;
}

static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
					 unsigned int old_active)
{
	struct skb_ext *new;

	if (refcount_read(&old->refcnt) == 1)
		return old;

	new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
	if (!new)
		return NULL;

	memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
	refcount_set(&new->refcnt, 1);

#ifdef CONFIG_XFRM
	if (old_active & (1 << SKB_EXT_SEC_PATH)) {
		struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
		unsigned int i;

		for (i = 0; i < sp->len; i++)
			xfrm_state_hold(sp->xvec[i]);
	}
#endif
	__skb_ext_put(old);
	return new;
}

/**
 * __skb_ext_set - attach the specified extension storage to this skb
 * @skb: buffer
 * @id: extension id
 * @ext: extension storage previously allocated via __skb_ext_alloc()
 *
 * Existing extensions, if any, are cleared.
 *
 * Returns the pointer to the extension.
 */
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
		    struct skb_ext *ext)
{
	unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext);

	skb_ext_put(skb);
	newlen = newoff + skb_ext_type_len[id];
	ext->chunks = newlen;
	ext->offset[id] = newoff;
	skb->extensions = ext;
	skb->active_extensions = 1 << id;
	return skb_ext_get_ptr(ext, id);
}
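
/* Editorial example (not part of the original source): __skb_ext_alloc() and
 * __skb_ext_set() let a caller allocate the extension storage in a context
 * of its own choosing (e.g. with GFP_KERNEL) and attach it later, instead of
 * relying on the GFP_ATOMIC allocation inside skb_ext_add().  The use of
 * SKB_EXT_SEC_PATH below is purely illustrative and assumes CONFIG_XFRM:
 *
 *	struct skb_ext *ext;
 *	struct sec_path *sp;
 *
 *	ext = __skb_ext_alloc(GFP_KERNEL);
 *	if (!ext)
 *		return -ENOMEM;
 *
 *	sp = __skb_ext_set(skb, SKB_EXT_SEC_PATH, ext);
 *	sp->len = 0;
 *
 * Remember that __skb_ext_set() drops any extensions already attached to the
 * skb, as documented above, and that the returned storage is not zeroed.
 */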

/**
 * skb_ext_add - allocate space for given extension, COW if needed
 * @skb: buffer
 * @id: extension to allocate space for
 *
 * Allocates enough space for the given extension.
 * If the extension is already present, a pointer to that extension
 * is returned.
 *
 * If the skb was cloned, COW applies and the returned memory can be
 * modified without changing the extension space of cloned buffers.
 *
 * Returns pointer to the extension or NULL on allocation failure.
 */
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
{
	struct skb_ext *new, *old = NULL;
	unsigned int newlen, newoff;

	if (skb->active_extensions) {
		old = skb->extensions;

		new = skb_ext_maybe_cow(old, skb->active_extensions);
		if (!new)
			return NULL;

		if (__skb_ext_exist(new, id))
			goto set_active;

		newoff = new->chunks;
	} else {
		newoff = SKB_EXT_CHUNKSIZEOF(*new);

		new = __skb_ext_alloc(GFP_ATOMIC);
		if (!new)
			return NULL;
	}

	newlen = newoff + skb_ext_type_len[id];
	new->chunks = newlen;
	new->offset[id] = newoff;
set_active:
	skb->slow_gro = 1;
	skb->extensions = new;
	skb->active_extensions |= 1 << id;
	return skb_ext_get_ptr(new, id);
}
EXPORT_SYMBOL(skb_ext_add);

#ifdef CONFIG_XFRM
static void skb_ext_put_sp(struct sec_path *sp)
{
	unsigned int i;

	for (i = 0; i < sp->len; i++)
		xfrm_state_put(sp->xvec[i]);
}
#endif

#ifdef CONFIG_MCTP_FLOWS
static void skb_ext_put_mctp(struct mctp_flow *flow)
{
	if (flow->key)
		mctp_key_unref(flow->key);
}
#endif

void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
	struct skb_ext *ext = skb->extensions;

	skb->active_extensions &= ~(1 << id);
	if (skb->active_extensions == 0) {
		skb->extensions = NULL;
		__skb_ext_put(ext);
#ifdef CONFIG_XFRM
	} else if (id == SKB_EXT_SEC_PATH &&
		   refcount_read(&ext->refcnt) == 1) {
		struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);

		skb_ext_put_sp(sp);
		sp->len = 0;
#endif
	}
}
EXPORT_SYMBOL(__skb_ext_del);

void __skb_ext_put(struct skb_ext *ext)
{
	/* If this is the last clone, nothing can increment
	 * it after the check passes. This avoids one atomic op.
	 */
	if (refcount_read(&ext->refcnt) == 1)
		goto free_now;

	if (!refcount_dec_and_test(&ext->refcnt))
		return;
free_now:
#ifdef CONFIG_XFRM
	if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
		skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
#endif
#ifdef CONFIG_MCTP_FLOWS
	if (__skb_ext_exist(ext, SKB_EXT_MCTP))
		skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP));
#endif

	kmem_cache_free(skbuff_ext_cache, ext);
}
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */
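
/* Editorial example (not part of the original source): the common extension
 * lifecycle pairs skb_ext_add() on the producer side with skb_ext_find() and
 * skb_ext_del() (from skbuff.h) on the consumer side.  SKB_EXT_MPTCP is used
 * here only as an example id; skb_ext_add() does not zero the storage, hence
 * the memset():
 *
 *	struct mptcp_ext *mpext;
 *
 *	mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
 *	if (!mpext)
 *		return -ENOMEM;
 *	memset(mpext, 0, sizeof(*mpext));
 *
 *	mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
 *	if (mpext)
 *		skb_ext_del(skb, SKB_EXT_MPTCP);
 */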

/**
 * skb_attempt_defer_free - queue skb for remote freeing
 * @skb: buffer
 *
 * Put @skb in a per-cpu list, using the CPU which
 * allocated the skb/pages to reduce false sharing
 * and memory zone spinlock contention.
 */
void skb_attempt_defer_free(struct sk_buff *skb)
{
	int cpu = skb->alloc_cpu;
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int defer_max;
	bool kick;

	if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
	    !cpu_online(cpu) ||
	    cpu == raw_smp_processor_id()) {
nodefer:	__kfree_skb(skb);
		return;
	}

	sd = &per_cpu(softnet_data, cpu);
	defer_max = READ_ONCE(sysctl_skb_defer_max);
	if (READ_ONCE(sd->defer_count) >= defer_max)
		goto nodefer;

	spin_lock_irqsave(&sd->defer_lock, flags);
	/* Send an IPI every time the queue reaches half capacity. */
	kick = sd->defer_count == (defer_max >> 1);
	/* Paired with the READ_ONCE() a few lines above */
	WRITE_ONCE(sd->defer_count, sd->defer_count + 1);

	skb->next = sd->defer_list;
	/* Paired with READ_ONCE() in skb_defer_free_flush() */
	WRITE_ONCE(sd->defer_list, skb);
	spin_unlock_irqrestore(&sd->defer_lock, flags);

	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
	 * if we are unlucky enough (this seems very unlikely).
	 */
	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
		smp_call_function_single_async(cpu, &sd->defer_csd);
}
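
/* Editorial note (not part of the original source): the per-cpu list built
 * above is drained on the owning CPU by skb_defer_free_flush() in
 * net/core/dev.c, roughly along these lines (simplified sketch):
 *
 *	spin_lock_irqsave(&sd->defer_lock, flags);
 *	skb = sd->defer_list;
 *	sd->defer_list = NULL;
 *	sd->defer_count = 0;
 *	spin_unlock_irqrestore(&sd->defer_lock, flags);
 *
 *	while (skb) {
 *		next = skb->next;
 *		napi_consume_skb(skb, 1);
 *		skb = next;
 *	}
 *
 * TCP calls skb_attempt_defer_free() from its receive path so that buffers
 * allocated by a remote CPU are freed on the CPU that allocated them.
 */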