// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>
 *			Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *	Fixes:
 *		Alan Cox	:	Fixed the worst of the load
 *					balancer bugs.
 *		Dave Platt	:	Interrupt stacking fix.
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Changed buffer format.
 *		Alan Cox	:	destructor hook for AF_UNIX etc.
 *		Linus Torvalds	:	Better skb_clone.
 *		Alan Cox	:	Added skb_copy.
 *		Alan Cox	:	Added all the changed routines Linus
 *					only put in the headers
 *		Ray VanTassle	:	Fixed --skb->lock in free
 *		Alan Cox	:	skb_copy copy arp field
 *		Andi Kleen	:	slabified it.
 *		Robert Olsson	:	Removed skb_head_pool
 *
 *	NOTE:
 *		The __skb_ routines should be called with interrupts
 *	disabled, or you better be *real* sure that the operation is atomic
 *	with respect to whatever list is being frobbed (e.g. via lock_sock()
 *	or via disabling bottom half handlers, etc).
 */

/*
 *	The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/bitfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>
#include <linux/crc32.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/gro.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool/helpers.h>
#include <net/psp/types.h>
#include <net/dropreason.h>
#include <net/xdp_sock.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/textsearch.h>

#include "dev.h"
#include "devmem.h"
#include "netmem_priv.h"
#include "sock_destructor.h"

#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif

#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)
#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
					       GRO_MAX_HEAD_PAD))

/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
 * size, and we can differentiate heads from skb_small_head_cache
 * vs system slabs by looking at their size (skb_end_offset()).
 */
#define SKB_SMALL_HEAD_CACHE_SIZE				\
	(is_power_of_2(SKB_SMALL_HEAD_SIZE) ?			\
		(SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :	\
		SKB_SMALL_HEAD_SIZE)

#define SKB_SMALL_HEAD_HEADROOM						\
	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)

/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
 * iov_iter_bvec(). These static asserts ensure the cast is valid as long as
 * the netmem is a page.
 */
static_assert(offsetof(struct bio_vec, bv_page) ==
	      offsetof(skb_frag_t, netmem));
static_assert(sizeof_field(struct bio_vec, bv_page) ==
	      sizeof_field(skb_frag_t, netmem));

static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
static_assert(sizeof_field(struct bio_vec, bv_len) ==
	      sizeof_field(skb_frag_t, len));

static_assert(offsetof(struct bio_vec, bv_offset) ==
	      offsetof(skb_frag_t, offset));
static_assert(sizeof_field(struct bio_vec, bv_offset) ==
	      sizeof_field(skb_frag_t, offset));

#undef FN
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
static const char * const drop_reasons[] = {
	[SKB_CONSUMED] = "CONSUMED",
	DEFINE_DROP_REASON(FN, FN)
};

static const struct drop_reason_list drop_reasons_core = {
	.reasons = drop_reasons,
	.n_reasons = ARRAY_SIZE(drop_reasons),
};

const struct drop_reason_list __rcu *
drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
	[SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
};
EXPORT_SYMBOL(drop_reasons_by_subsys);

/**
 * drop_reasons_register_subsys - register another drop reason subsystem
 * @subsys: the subsystem to register, must not be the core
 * @list: the list of drop reasons within the subsystem, must point to
 *	a statically initialized list
 */
void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
				  const struct drop_reason_list *list)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	/* must point to statically allocated memory, so INIT is OK */
	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
}
EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);

/**
 * drop_reasons_unregister_subsys - unregister a drop reason subsystem
 * @subsys: the subsystem to remove, must not be the core
 *
 * Note: This will synchronize_rcu() to ensure no users when it returns.
 */
void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
{
	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
		 "invalid subsystem %d\n", subsys))
		return;

	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);

	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);
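
/* Illustrative sketch (not part of this file): how a subsystem other than
 * the core could publish its own drop reason strings through the helpers
 * above. The subsystem id SKB_DROP_REASON_SUBSYS_FOO and the reason names
 * are hypothetical; only struct drop_reason_list and the register/unregister
 * helpers are taken from the code above.
 *
 *	static const char * const foo_drop_reasons[] = {
 *		"FOO_BAD_HDR",
 *		"FOO_NO_BUFFER",
 *	};
 *
 *	static const struct drop_reason_list drop_reasons_foo = {
 *		.reasons = foo_drop_reasons,
 *		.n_reasons = ARRAY_SIZE(foo_drop_reasons),
 *	};
 *
 *	drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_FOO,
 *				     &drop_reasons_foo);
 *	...
 *	drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_FOO);
 */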

/**
 * skb_panic - private function for out-of-line support
 * @skb: buffer
 * @sz: size
 * @addr: address
 * @msg: skb_over_panic or skb_under_panic
 *
 * Out-of-line support for skb_put() and skb_push().
 * Called via the wrapper skb_over_panic() or skb_under_panic().
 * Keep out of line to prevent kernel bloat.
 * __builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
		      const char msg[])
{
	pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
		 msg, addr, skb->len, sz, skb->head, skb->data,
		 (unsigned long)skb->tail, (unsigned long)skb->end,
		 skb->dev ? skb->dev->name : "<NULL>");
	BUG();
}

static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
	skb_panic(skb, sz, addr, __func__);
}

#define NAPI_SKB_CACHE_SIZE	128
#define NAPI_SKB_CACHE_BULK	32
#define NAPI_SKB_CACHE_FREE	32

struct napi_alloc_cache {
	local_lock_t bh_lock;
	struct page_frag_cache page;
	unsigned int skb_count;
	void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	void *data;

	fragsz = SKB_DATA_ALIGN(fragsz);

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
	data = __page_frag_alloc_align(&nc->page, fragsz,
				       GFP_ATOMIC | __GFP_NOWARN, align_mask);
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
	return data;

}
EXPORT_SYMBOL(__napi_alloc_frag_align);

void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
	void *data;

	if (in_hardirq() || irqs_disabled()) {
		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);

		fragsz = SKB_DATA_ALIGN(fragsz);
		data = __page_frag_alloc_align(nc, fragsz,
					       GFP_ATOMIC | __GFP_NOWARN,
					       align_mask);
	} else {
		local_bh_disable();
		data = __napi_alloc_frag_align(fragsz, align_mask);
		local_bh_enable();
	}
	return data;
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);

/* Cache kmem_cache_size(net_hotdata.skbuff_cache) to help the compiler
 * remove dead code (and skbuff_cache_size) when CONFIG_KASAN is unset.
 */
static u32 skbuff_cache_size __read_mostly;

static struct sk_buff *napi_skb_cache_get(bool alloc)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	struct sk_buff *skb;

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
	if (unlikely(!nc->skb_count)) {
		if (alloc)
			nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
							      GFP_ATOMIC | __GFP_NOWARN,
							      NAPI_SKB_CACHE_BULK,
							      nc->skb_cache);
		if (unlikely(!nc->skb_count)) {
			local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
			return NULL;
		}
	}

	skb = nc->skb_cache[--nc->skb_count];
	if (nc->skb_count)
		prefetch(nc->skb_cache[nc->skb_count - 1]);
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
	kasan_mempool_unpoison_object(skb, skbuff_cache_size);

	return skb;
}

/**
 * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
 * @skbs: pointer to an at least @n-sized array to fill with skb pointers
 * @n: number of entries to provide
 *
 * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
 * the pointers into the provided array @skbs. If there are fewer entries
 * available, tries to replenish the cache and bulk-allocates the diff from
 * the MM layer if needed.
 * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are
 * ready for {,__}build_skb_around() and don't have any data buffers attached.
 * Must be called *only* from the BH context.
 *
 * Return: number of successfully allocated skbs (@n if no actual allocation
 * needed or kmem_cache_alloc_bulk() didn't fail).
 */
u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	u32 bulk, total = n;

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);

	if (nc->skb_count >= n)
		goto get;

	/* Not enough cached skbs. Try refilling the cache first */
	bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
	nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
					       GFP_ATOMIC | __GFP_NOWARN, bulk,
					       &nc->skb_cache[nc->skb_count]);
	if (likely(nc->skb_count >= n))
		goto get;

	/* Still not enough. Bulk-allocate the missing part directly, zeroed */
	n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
				   GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN,
				   n - nc->skb_count, &skbs[nc->skb_count]);
	if (likely(nc->skb_count >= n))
		goto get;

	/* kmem_cache didn't allocate the number we need, limit the output */
	total -= n - nc->skb_count;
	n = nc->skb_count;

get:
	for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
		skbs[i] = nc->skb_cache[base + i];

		kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size);
		memset(skbs[i], 0, offsetof(struct sk_buff, tail));
	}

	nc->skb_count -= n;
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

	return total;
}
EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk);

static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
					 unsigned int size)
{
	struct skb_shared_info *shinfo;

	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	/* Assumes caller memset cleared SKB */
	skb->truesize = SKB_TRUESIZE(size);
	refcount_set(&skb->users, 1);
	skb->head = data;
	skb->data = data;
	skb_reset_tail_pointer(skb);
	skb_set_end_offset(skb, size);
	skb->mac_header = (typeof(skb->mac_header))~0U;
	skb->transport_header = (typeof(skb->transport_header))~0U;
	skb->alloc_cpu = raw_smp_processor_id();
	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	skb_set_kcov_handle(skb, kcov_common_handle());
}

static inline void *__slab_build_skb(void *data, unsigned int *size)
{
	void *resized;

	/* Must find the allocation size (and grow it to match). */
	*size = ksize(data);
	/* krealloc() will immediately return "data" when
	 * "ksize(data)" is requested: it is the existing upper
	 * bounds. As a result, GFP_ATOMIC will be ignored. Note
	 * that this "new" pointer needs to be passed back to the
	 * caller for use so the __alloc_size hinting will be
	 * tracked correctly.
	 */
	resized = krealloc(data, *size, GFP_ATOMIC);
	WARN_ON_ONCE(resized != data);
	return resized;
}

/* build_skb() variant which can operate on slab buffers.
 * Note that this should be used sparingly as slab buffers
 * cannot be combined efficiently by GRO!
 */
struct sk_buff *slab_build_skb(void *data)
{
	struct sk_buff *skb;
	unsigned int size;

	skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
			       GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	data = __slab_build_skb(data, &size);
	__finalize_skb_around(skb, data, size);

	return skb;
}
EXPORT_SYMBOL(slab_build_skb);

/* Caller must provide SKB that is memset cleared */
static void __build_skb_around(struct sk_buff *skb, void *data,
			       unsigned int frag_size)
{
	unsigned int size = frag_size;

	/* frag_size == 0 is considered deprecated now. Callers
	 * using slab buffer should use slab_build_skb() instead.
	 */
	if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
		data = __slab_build_skb(data, &size);

	__finalize_skb_around(skb, data, size);
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data (must not be 0)
 *
 * Allocate a new &sk_buff. Caller provides space holding head and
 * skb_shared_info. @data must have been allocated from the page
 * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
 * allocation is deprecated, and callers should use slab_build_skb()
 * instead.)
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 *  Before IO, driver allocates only data buffer where NIC put incoming frame
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 *  before giving packet to stack.
 *  RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
			       GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/* build_skb() is wrapper over __build_skb(), that specifically
 * takes care of skb->head and skb->pfmemalloc
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __build_skb(data, frag_size);

	if (likely(skb && frag_size)) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb);

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
				 void *data, unsigned int frag_size)
{
	if (unlikely(!skb))
		return NULL;

	__build_skb_around(skb, data, frag_size);

	if (frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}
	return skb;
}
EXPORT_SYMBOL(build_skb_around);
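
/* Illustrative sketch (not from this file): the RX pattern described in the
 * notes above, as seen from a driver that receives into page fragments and
 * then attaches an skb head with build_skb(). The rx_buf layout,
 * "frag_truesize" and "pkt_len" are hypothetical driver state; the calls are
 * build_skb() above plus skb_reserve()/skb_put() from skbuff.h.
 * frag_truesize must cover NET_SKB_PAD, the frame itself and the tailroom
 * needed for SKB_DATA_ALIGN(sizeof(struct skb_shared_info)).
 *
 *	skb = build_skb(rx_buf->va, rx_buf->frag_truesize);
 *	if (unlikely(!skb))
 *		goto drop;
 *	skb_reserve(skb, NET_SKB_PAD);		// headroom left by the driver
 *	skb_put(skb, pkt_len);			// bytes written by the NIC
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	napi_gro_receive(napi, skb);
 */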

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __build_skb() that uses NAPI percpu caches to obtain
 * skbuff_head instead of inplace allocation.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb;

	skb = napi_skb_cache_get(true);
	if (unlikely(!skb))
		return NULL;

	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, frag_size);

	return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Version of __napi_build_skb() that takes care of skb->head_frag
 * and skb->pfmemalloc when the data is a page or page fragment.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
	struct sk_buff *skb = __napi_build_skb(data, frag_size);

	if (likely(skb) && frag_size) {
		skb->head_frag = 1;
		skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
	}

	return skb;
}
EXPORT_SYMBOL(napi_build_skb);

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free.
 */
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
			     bool *pfmemalloc)
{
	bool ret_pfmemalloc = false;
	size_t obj_size;
	void *obj;

	obj_size = SKB_HEAD_ALIGN(*size);
	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
					    flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					    node);
		*size = SKB_SMALL_HEAD_CACHE_SIZE;
		if (obj || !(gfp_pfmemalloc_allowed(flags)))
			goto out;
		/* Try again but now we are using pfmemalloc reserves */
		ret_pfmemalloc = true;
		obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
		goto out;
	}

	obj_size = kmalloc_size_roundup(obj_size);
	/* The following cast might truncate high-order bits of obj_size, this
	 * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
	 */
	*size = (unsigned int)obj_size;

	/*
	 * Try a regular allocation, when that fails and we're not entitled
	 * to the reserves, fail.
	 */
	obj = kmalloc_node_track_caller(obj_size,
					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
					node);
	if (obj || !(gfp_pfmemalloc_allowed(flags)))
		goto out;

	/* Try again but now we are using pfmemalloc reserves */
	ret_pfmemalloc = true;
	obj = kmalloc_node_track_caller(obj_size, flags, node);

out:
	if (pfmemalloc)
		*pfmemalloc = ret_pfmemalloc;

	return obj;
}

/* Allocate a new skbuff. We do this ourselves so we can fill in a few
 * 'private' fields and also do memory statistics to find all the
 * [BEEP] leaks.
 *
 */

/**
 * __alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @gfp_mask: allocation mask
 * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *	instead of head cache and allocate a cloned (child) skb.
 *	If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *	allocations in case the data is required for writeback
 * @node: numa node to allocate memory on
 *
 * Allocate a new &sk_buff. The returned buffer has no headroom and a
 * tail room of at least size bytes. The object has a reference count
 * of one. The return is the buffer. On a failure the return is %NULL.
 *
 * Buffers may only be allocated from interrupts using a @gfp_mask of
 * %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int flags, int node)
{
	struct sk_buff *skb = NULL;
	struct kmem_cache *cache;
	bool pfmemalloc;
	u8 *data;

	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
		gfp_mask |= __GFP_MEMALLOC;

	if (flags & SKB_ALLOC_FCLONE) {
		cache = net_hotdata.skbuff_fclone_cache;
		goto fallback;
	}
	cache = net_hotdata.skbuff_cache;
	if (unlikely(node != NUMA_NO_NODE && node != numa_mem_id()))
		goto fallback;

	if (flags & SKB_ALLOC_NAPI) {
		skb = napi_skb_cache_get(true);
		if (unlikely(!skb))
			return NULL;
	} else if (!in_hardirq() && !irqs_disabled()) {
		local_bh_disable();
		skb = napi_skb_cache_get(false);
		local_bh_enable();
	}

	if (!skb) {
fallback:
		skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
		if (unlikely(!skb))
			return NULL;
	}
	prefetchw(skb);

	/* We do our best to align skb_shared_info on a separate cache
	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
	 * Both skb->head and skb_shared_info are cache line aligned.
	 */
	data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
	if (unlikely(!data))
		goto nodata;
	/* kmalloc_size_roundup() might give us more room than requested.
	 * Put skb_shared_info exactly at the end of allocated zone,
	 * to allow max possible filling before reallocation.
	 */
	prefetchw(data + SKB_WITH_OVERHEAD(size));

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	__build_skb_around(skb, data, size);
	skb->pfmemalloc = pfmemalloc;

	if (flags & SKB_ALLOC_FCLONE) {
		struct sk_buff_fclones *fclones;

		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		skb->fclone = SKB_FCLONE_ORIG;
		refcount_set(&fclones->fclone_ref, 1);
	}

	return skb;

nodata:
	kmem_cache_free(cache, skb);
	return NULL;
}
EXPORT_SYMBOL(__alloc_skb);
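
/* Illustrative sketch (not from this file): building a packet for transmit
 * with alloc_skb(), the GFP-flag wrapper around __alloc_skb() declared in
 * skbuff.h. "payload_len" and fill_payload() are hypothetical; the
 * skb_reserve()/skb_put()/skb_push() pattern is the standard one.
 *
 *	unsigned int hlen = LL_RESERVED_SPACE(dev);
 *	struct sk_buff *skb;
 *
 *	skb = alloc_skb(hlen + payload_len, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, hlen);				// room for headers
 *	fill_payload(skb_put(skb, payload_len));	// append payload
 *	// protocol code later skb_push()es its headers into the headroom
 */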

/**
 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 * @dev: network device to receive on
 * @len: length to allocate
 * @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 * Allocate a new &sk_buff and assign it a usage count of one. The
 * buffer has NET_SKB_PAD headroom built in. Users should allocate
 * the headroom they think they need without accounting for the
 * built in space. The built in space is used for optimisations.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
				   gfp_t gfp_mask)
{
	struct page_frag_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	len += NET_SKB_PAD;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	if (in_hardirq() || irqs_disabled()) {
		nc = this_cpu_ptr(&netdev_alloc_cache);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
	} else {
		local_bh_disable();
		local_lock_nested_bh(&napi_alloc_cache.bh_lock);

		nc = this_cpu_ptr(&napi_alloc_cache.page);
		data = page_frag_alloc(nc, len, gfp_mask);
		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);

		local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
		local_bh_enable();
	}

	if (unlikely(!data))
		return NULL;

	skb = __build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD);
	skb->dev = dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);
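
/* Illustrative sketch (not from this file): copy-mode RX in a driver that
 * uses the netdev_alloc_skb() wrapper from skbuff.h. "rx_desc"/"rx_ring" are
 * hypothetical; note that the NET_SKB_PAD headroom is already reserved by
 * the helper, so the driver only asks for the frame length.
 *
 *	struct sk_buff *skb;
 *
 *	skb = netdev_alloc_skb(netdev, rx_desc->pkt_len);
 *	if (unlikely(!skb)) {
 *		netdev->stats.rx_dropped++;
 *		return;
 *	}
 *	skb_put_data(skb, rx_ring->buf, rx_desc->pkt_len);
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	netif_rx(skb);
 */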

/**
 * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 * @napi: napi instance this buffer was allocated for
 * @len: length to allocate
 *
 * Allocate a new sk_buff for use in NAPI receive. This buffer will
 * attempt to allocate the head from a special reserved region used
 * only for NAPI Rx allocation. By doing this we can save several
 * CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 * %NULL is returned if there is no free memory.
 */
struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
{
	gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
	struct napi_alloc_cache *nc;
	struct sk_buff *skb;
	bool pfmemalloc;
	void *data;

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());
	len += NET_SKB_PAD + NET_IP_ALIGN;

	/* If requested length is either too small or too big,
	 * we use kmalloc() for skb->head allocation.
	 */
	if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
	    len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
	    (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
				  NUMA_NO_NODE);
		if (!skb)
			goto skb_fail;
		goto skb_success;
	}

	len = SKB_HEAD_ALIGN(len);

	if (sk_memalloc_socks())
		gfp_mask |= __GFP_MEMALLOC;

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
	nc = this_cpu_ptr(&napi_alloc_cache);

	data = page_frag_alloc(&nc->page, len, gfp_mask);
	pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

	if (unlikely(!data))
		return NULL;

	skb = __napi_build_skb(data, len);
	if (unlikely(!skb)) {
		skb_free_frag(data);
		return NULL;
	}

	if (pfmemalloc)
		skb->pfmemalloc = 1;
	skb->head_frag = 1;

skb_success:
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
	skb->dev = napi->dev;

skb_fail:
	return skb;
}
EXPORT_SYMBOL(napi_alloc_skb);

void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
			    int off, int size, unsigned int truesize)
{
	DEBUG_NET_WARN_ON_ONCE(size > truesize);

	skb_fill_netmem_desc(skb, i, netmem, off, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_add_rx_frag_netmem);

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
			  unsigned int truesize)
{
	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

	DEBUG_NET_WARN_ON_ONCE(size > truesize);

	skb_frag_size_add(frag, size);
	skb->len += size;
	skb->data_len += size;
	skb->truesize += truesize;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);
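
/* Illustrative sketch (not from this file): a NAPI poll loop that builds a
 * small linear head with napi_alloc_skb() and hangs the rest of the frame
 * off it as a page fragment before handing it to GRO. "hdr_len",
 * "frame_va", "frag_page" and the offsets are hypothetical driver state;
 * skb_add_rx_frag() is the page-based wrapper around
 * skb_add_rx_frag_netmem() above.
 *
 *	skb = napi_alloc_skb(napi, hdr_len);
 *	if (unlikely(!skb))
 *		goto drop;
 *	skb_put_data(skb, frame_va, hdr_len);		// copy the headers
 *	skb_add_rx_frag(skb, 0, frag_page, frag_off + hdr_len,
 *			frame_len - hdr_len, rx_buf_truesize);
 *	skb->protocol = eth_type_trans(skb, napi->dev);
 *	napi_gro_receive(napi, skb);
 */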

static void skb_drop_list(struct sk_buff **listp)
{
	kfree_skb_list(*listp);
	*listp = NULL;
}

static inline void skb_drop_fraglist(struct sk_buff *skb)
{
	skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
	struct sk_buff *list;

	skb_walk_frags(skb, list)
		skb_get(list);
}

int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
		    unsigned int headroom)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
	u32 size, truesize, len, max_head_size, off;
	struct sk_buff *skb = *pskb, *nskb;
	int err, i, head_off;
	void *data;

	/* XDP does not support fraglist so we need to linearize
	 * the skb.
	 */
	if (skb_has_frag_list(skb))
		return -EOPNOTSUPP;

	max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
	if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
		return -ENOMEM;

	size = min_t(u32, skb->len, max_head_size);
	truesize = SKB_HEAD_ALIGN(size) + headroom;
	data = page_pool_dev_alloc_va(pool, &truesize);
	if (!data)
		return -ENOMEM;

	nskb = napi_build_skb(data, truesize);
	if (!nskb) {
		page_pool_free_va(pool, data, true);
		return -ENOMEM;
	}

	skb_reserve(nskb, headroom);
	skb_copy_header(nskb, skb);
	skb_mark_for_recycle(nskb);

	err = skb_copy_bits(skb, 0, nskb->data, size);
	if (err) {
		consume_skb(nskb);
		return err;
	}
	skb_put(nskb, size);

	head_off = skb_headroom(nskb) - skb_headroom(skb);
	skb_headers_offset_update(nskb, head_off);

	off = size;
	len = skb->len - off;
	for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
		struct page *page;
		u32 page_off;

		size = min_t(u32, len, PAGE_SIZE);
		truesize = size;

		page = page_pool_dev_alloc(pool, &page_off, &truesize);
		if (!page) {
			consume_skb(nskb);
			return -ENOMEM;
		}

		skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
		err = skb_copy_bits(skb, off, page_address(page) + page_off,
				    size);
		if (err) {
			consume_skb(nskb);
			return err;
		}

		len -= size;
		off += size;
	}

	consume_skb(skb);
	*pskb = nskb;

	return 0;
#else
	return -EOPNOTSUPP;
#endif
}
EXPORT_SYMBOL(skb_pp_cow_data);

int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
			 const struct bpf_prog *prog)
{
	if (!prog->aux->xdp_has_frags)
		return -EINVAL;

	return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);
}
EXPORT_SYMBOL(skb_cow_data_for_xdp);

#if IS_ENABLED(CONFIG_PAGE_POOL)
bool napi_pp_put_page(netmem_ref netmem)
{
	netmem = netmem_compound_head(netmem);

	if (unlikely(!netmem_is_pp(netmem)))
		return false;

	page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);

	return true;
}
EXPORT_SYMBOL(napi_pp_put_page);
#endif

static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
		return false;
	return napi_pp_put_page(page_to_netmem(virt_to_page(data)));
}
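
/* Illustrative sketch (not from this file): how a page_pool based driver
 * opts an skb into the recycling path exercised by skb_pp_recycle() and
 * napi_pp_put_page() above. The pool setup and buffer bookkeeping are
 * hypothetical; skb_mark_for_recycle() simply sets skb->pp_recycle so that
 * the head and frags are returned to the pool instead of the page allocator
 * when the skb is freed.
 *
 *	data = page_pool_dev_alloc_va(pool, &truesize);
 *	...
 *	skb = napi_build_skb(data, truesize);
 *	if (likely(skb))
 *		skb_mark_for_recycle(skb);
 */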

/**
 * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
 * @skb: page pool aware skb
 *
 * Increase the fragment reference count (pp_ref_count) of a skb. This is
 * intended to gain fragment references only for page pool aware skbs,
 * i.e. when skb->pp_recycle is true, and not for fragments in a
 * non-pp-recycling skb. It has a fallback to increase references on normal
 * pages, as page pool aware skbs may also have normal page fragments.
 */
static int skb_pp_frag_ref(struct sk_buff *skb)
{
	struct skb_shared_info *shinfo;
	netmem_ref head_netmem;
	int i;

	if (!skb->pp_recycle)
		return -EINVAL;

	shinfo = skb_shinfo(skb);

	for (i = 0; i < shinfo->nr_frags; i++) {
		head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
		if (likely(netmem_is_pp(head_netmem)))
			page_pool_ref_netmem(head_netmem);
		else
			page_ref_inc(netmem_to_page(head_netmem));
	}
	return 0;
}

static void skb_kfree_head(void *head, unsigned int end_offset)
{
	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
		kmem_cache_free(net_hotdata.skb_small_head_cache, head);
	else
		kfree(head);
}

static void skb_free_head(struct sk_buff *skb)
{
	unsigned char *head = skb->head;

	if (skb->head_frag) {
		if (skb_pp_recycle(skb, head))
			return;
		skb_free_frag(head);
	} else {
		skb_kfree_head(head, skb_end_offset(skb));
	}
}

static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int i;

	if (!skb_data_unref(skb, shinfo))
		goto exit;

	if (skb_zcopy(skb)) {
		bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

		skb_zcopy_clear(skb, true);
		if (skip_unref)
			goto free_head;
	}

	for (i = 0; i < shinfo->nr_frags; i++)
		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

free_head:
	if (shinfo->frag_list)
		kfree_skb_list_reason(shinfo->frag_list, reason);

	skb_free_head(skb);
exit:
	/* When we clone an SKB we copy the recycling bit. The pp_recycle
	 * bit is only set on the head though, so in order to avoid races
	 * while trying to recycle fragments on __skb_frag_unref() we need
	 * to make one SKB responsible for triggering the recycle path.
	 * So disable the recycling bit if an SKB is cloned and we have
	 * additional references to the fragmented part of the SKB.
	 * Eventually the last SKB will have the recycling bit set and its
	 * dataref set to 0, which will trigger the recycling.
	 */
	skb->pp_recycle = 0;
}

/*
 * Free an skbuff by memory without cleaning the state.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
	struct sk_buff_fclones *fclones;

	switch (skb->fclone) {
	case SKB_FCLONE_UNAVAILABLE:
		kmem_cache_free(net_hotdata.skbuff_cache, skb);
		return;

	case SKB_FCLONE_ORIG:
		fclones = container_of(skb, struct sk_buff_fclones, skb1);

		/* We usually free the clone (TX completion) before original skb
		 * This test would have no chance to be true for the clone,
		 * while here, branch prediction will be good.
		 */
		if (refcount_read(&fclones->fclone_ref) == 1)
			goto fastpath;
		break;

	default: /* SKB_FCLONE_CLONE */
		fclones = container_of(skb, struct sk_buff_fclones, skb2);
		break;
	}
	if (!refcount_dec_and_test(&fclones->fclone_ref))
		return;
fastpath:
	kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
}

void skb_release_head_state(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	if (skb->destructor) {
		DEBUG_NET_WARN_ON_ONCE(in_hardirq());
#ifdef CONFIG_INET
		INDIRECT_CALL_4(skb->destructor,
				tcp_wfree, __sock_wfree, sock_wfree,
				xsk_destruct_skb,
				skb);
#else
		INDIRECT_CALL_2(skb->destructor,
				sock_wfree, xsk_destruct_skb,
				skb);

#endif
		skb->destructor = NULL;
		skb->sk = NULL;
	}
	nf_reset_ct(skb);
	skb_ext_reset(skb);
}

/* Free everything but the sk_buff shell. */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
	skb_release_head_state(skb);
	if (likely(skb->head))
		skb_release_data(skb, reason);
}

/**
 * __kfree_skb - private function
 * @skb: buffer
 *
 * Free an sk_buff. Release anything attached to the buffer.
 * Clean the state. This is an internal helper function. Users should
 * always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
	kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

static __always_inline
bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
			  enum skb_drop_reason reason)
{
	if (unlikely(!skb_unref(skb)))
		return false;

	DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
			       u32_get_bits(reason,
					    SKB_DROP_REASON_SUBSYS_MASK) >=
				SKB_DROP_REASON_SUBSYS_NUM);

	if (reason == SKB_CONSUMED)
		trace_consume_skb(skb, __builtin_return_address(0));
	else
		trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);
	return true;
}

/**
 * sk_skb_reason_drop - free an sk_buff with special reason
 * @sk: the socket to receive @skb, or NULL if not applicable
 * @skb: buffer to free
 * @reason: reason why this skb is dropped
 *
 * Drop a reference to the buffer and free it if the usage count has hit
 * zero. Meanwhile, pass the receiving socket and drop reason to
 * 'kfree_skb' tracepoint.
 */
void __fix_address
sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
	if (__sk_skb_reason_drop(sk, skb, reason))
		__kfree_skb(skb);
}
EXPORT_SYMBOL(sk_skb_reason_drop);

#define KFREE_SKB_BULK_SIZE	16

struct skb_free_array {
	unsigned int skb_count;
	void *skb_array[KFREE_SKB_BULK_SIZE];
};

static void kfree_skb_add_bulk(struct sk_buff *skb,
			       struct skb_free_array *sa,
			       enum skb_drop_reason reason)
{
	/* if SKB is a clone, don't handle this case */
	if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
		__kfree_skb(skb);
		return;
	}

	skb_release_all(skb, reason);
	sa->skb_array[sa->skb_count++] = skb;

	if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
		kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE,
				     sa->skb_array);
		sa->skb_count = 0;
	}
}

void __fix_address
kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
{
	struct skb_free_array sa;

	sa.skb_count = 0;

	while (segs) {
		struct sk_buff *next = segs->next;

		if (__sk_skb_reason_drop(NULL, segs, reason)) {
			skb_poison_list(segs);
			kfree_skb_add_bulk(segs, &sa, reason);
		}

		segs = next;
	}

	if (sa.skb_count)
		kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array);
}
EXPORT_SYMBOL(kfree_skb_list_reason);
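
/* Illustrative sketch (not from this file): how a protocol RX handler would
 * typically feed a specific reason into this machinery via the
 * kfree_skb_reason() wrapper from skbuff.h, so tracing users see why the
 * packet died instead of NOT_SPECIFIED. "struct foohdr" and foo_csum_ok()
 * are hypothetical; the reason values are from enum skb_drop_reason.
 *
 *	if (!pskb_may_pull(skb, sizeof(struct foohdr))) {
 *		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_SMALL);
 *		return NET_RX_DROP;
 *	}
 *	if (unlikely(!foo_csum_ok(skb))) {
 *		kfree_skb_reason(skb, SKB_DROP_REASON_IP_CSUM);
 *		return NET_RX_DROP;
 *	}
 */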

/* Dump skb information and contents.
 *
 * Must only be called from net_ratelimit()-ed paths.
 *
 * Dumps whole packets if full_pkt, only headers otherwise.
 */
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
{
	struct skb_shared_info *sh = skb_shinfo(skb);
	struct net_device *dev = skb->dev;
	struct sock *sk = skb->sk;
	struct sk_buff *list_skb;
	bool has_mac, has_trans;
	int headroom, tailroom;
	int i, len, seg_len;

	if (full_pkt)
		len = skb->len;
	else
		len = min_t(int, skb->len, MAX_HEADER + 128);

	headroom = skb_headroom(skb);
	tailroom = skb_tailroom(skb);

	has_mac = skb_mac_header_was_set(skb);
	has_trans = skb_transport_header_was_set(skb);

	printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
	       "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"
	       "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
	       "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
	       "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n"
	       "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n"
	       "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",
	       level, skb->len, headroom, skb_headlen(skb), tailroom,
	       has_mac ? skb->mac_header : -1,
	       has_mac ? skb_mac_header_len(skb) : -1,
	       skb->mac_len,
	       skb->network_header,
	       has_trans ? skb_network_header_len(skb) : -1,
	       has_trans ? skb->transport_header : -1,
	       sh->tx_flags, sh->nr_frags,
	       sh->gso_size, sh->gso_type, sh->gso_segs,
	       skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed,
	       skb->csum_complete_sw, skb->csum_valid, skb->csum_level,
	       skb->hash, skb->sw_hash, skb->l4_hash,
	       ntohs(skb->protocol), skb->pkt_type, skb->skb_iif,
	       skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all,
	       skb->encapsulation, skb->inner_protocol, skb->inner_mac_header,
	       skb->inner_network_header, skb->inner_transport_header);

	if (dev)
		printk("%sdev name=%s feat=%pNF\n",
		       level, dev->name, &dev->features);
	if (sk)
		printk("%ssk family=%hu type=%u proto=%u\n",
		       level, sk->sk_family, sk->sk_type, sk->sk_protocol);

	if (full_pkt && headroom)
		print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb->head, headroom, false);

	seg_len = min_t(int, skb_headlen(skb), len);
	if (seg_len)
		print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb->data, seg_len, false);
	len -= seg_len;

	if (full_pkt && tailroom)
		print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
			       16, 1, skb_tail_pointer(skb), tailroom, false);

	for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		u32 p_off, p_len, copied;
		struct page *p;
		u8 *vaddr;

		if (skb_frag_is_net_iov(frag)) {
			printk("%sskb frag %d: not readable\n", level, i);
			len -= skb_frag_size(frag);
			if (!len)
				break;
			continue;
		}

		skb_frag_foreach_page(frag, skb_frag_off(frag),
				      skb_frag_size(frag), p, p_off, p_len,
				      copied) {
			seg_len = min_t(int, p_len, len);
			vaddr = kmap_atomic(p);
			print_hex_dump(level, "skb frag: ",
				       DUMP_PREFIX_OFFSET,
				       16, 1, vaddr + p_off, seg_len, false);
			kunmap_atomic(vaddr);
			len -= seg_len;
			if (!len)
				break;
		}
	}

	if (full_pkt && skb_has_frag_list(skb)) {
		printk("skb fraglist:\n");
		skb_walk_frags(skb, list_skb)
			skb_dump(level, list_skb, true);
	}
}
EXPORT_SYMBOL(skb_dump);

/**
 * skb_tx_error - report an sk_buff xmit error
 * @skb: buffer that triggered an error
 *
 * Report xmit error if a device callback is tracking this skb.
 * skb must be freed afterwards.
 */
void skb_tx_error(struct sk_buff *skb)
{
	if (skb) {
		skb_zcopy_downgrade_managed(skb);
		skb_zcopy_clear(skb, true);
	}
}
EXPORT_SYMBOL(skb_tx_error);

#ifdef CONFIG_TRACEPOINTS
/**
 * consume_skb - free an skbuff
 * @skb: buffer to free
 *
 * Drop a ref to the buffer and free it if the usage count has hit zero.
 * Functions identically to kfree_skb, but kfree_skb assumes that the frame
 * is being dropped after a failure and notes that.
 */
void consume_skb(struct sk_buff *skb)
{
	if (!skb_unref(skb))
		return;

	trace_consume_skb(skb, __builtin_return_address(0));
	__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
#endif

/**
 * __consume_stateless_skb - free an skbuff, assuming it is stateless
 * @skb: buffer to free
 *
 * Like consume_skb(), but this variant assumes that this is the last
 * skb reference and all the head states have been already dropped.
 */
void __consume_stateless_skb(struct sk_buff *skb)
{
	trace_consume_skb(skb, __builtin_return_address(0));
	skb_release_data(skb, SKB_CONSUMED);
	kfree_skbmem(skb);
}

static void napi_skb_cache_put(struct sk_buff *skb)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	if (!kasan_mempool_poison_object(skb))
		return;

	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
	nc->skb_cache[nc->skb_count++] = skb;

	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
		u32 i, remaining = NAPI_SKB_CACHE_SIZE - NAPI_SKB_CACHE_FREE;

		for (i = remaining; i < NAPI_SKB_CACHE_SIZE; i++)
			kasan_mempool_unpoison_object(nc->skb_cache[i],
						      skbuff_cache_size);

		kmem_cache_free_bulk(net_hotdata.skbuff_cache,
				     NAPI_SKB_CACHE_FREE,
				     nc->skb_cache + remaining);
		nc->skb_count = remaining;
	}
	local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
}

void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
{
	skb_release_all(skb, reason);
	napi_skb_cache_put(skb);
}

void napi_skb_free_stolen_head(struct sk_buff *skb)
{
	if (unlikely(skb->slow_gro)) {
		nf_reset_ct(skb);
		skb_dst_drop(skb);
		skb_ext_put(skb);
		skb_orphan(skb);
		skb->slow_gro = 0;
	}
	napi_skb_cache_put(skb);
}

void napi_consume_skb(struct sk_buff *skb, int budget)
{
	/* Zero budget indicates a non-NAPI context called us, like netpoll */
	if (unlikely(!budget || !skb)) {
		dev_consume_skb_any(skb);
		return;
	}

	DEBUG_NET_WARN_ON_ONCE(!in_softirq());

	if (skb->alloc_cpu != smp_processor_id() && !skb_shared(skb)) {
		skb_release_head_state(skb);
		return skb_attempt_defer_free(skb);
	}

	if (!skb_unref(skb))
		return;

	/* if reaching here SKB is ready to free */
	trace_consume_skb(skb, __builtin_return_address(0));

	/* if SKB is a clone, don't handle this case */
	if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
		__kfree_skb(skb);
		return;
	}

	skb_release_all(skb, SKB_CONSUMED);
	napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);

/* Make sure a field is contained by headers group */
#define CHECK_SKB_FIELD(field) \
	BUILD_BUG_ON(offsetof(struct sk_buff, field) !=		\
		     offsetof(struct sk_buff, headers.field));	\

static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
	new->tstamp = old->tstamp;
	/* We do not copy old->sk */
	new->dev = old->dev;
	memcpy(new->cb, old->cb, sizeof(old->cb));
	skb_dst_copy(new, old);
	__skb_ext_copy(new, old);
	__nf_copy(new, old, false);

	/* Note : this field could be in the headers group.
	 * It is not yet because we do not want to have a 16 bit hole
	 */
	new->queue_mapping = old->queue_mapping;

	memcpy(&new->headers, &old->headers, sizeof(new->headers));
	CHECK_SKB_FIELD(protocol);
	CHECK_SKB_FIELD(csum);
	CHECK_SKB_FIELD(hash);
	CHECK_SKB_FIELD(priority);
	CHECK_SKB_FIELD(skb_iif);
	CHECK_SKB_FIELD(vlan_proto);
	CHECK_SKB_FIELD(vlan_tci);
	CHECK_SKB_FIELD(transport_header);
	CHECK_SKB_FIELD(network_header);
	CHECK_SKB_FIELD(mac_header);
	CHECK_SKB_FIELD(inner_protocol);
	CHECK_SKB_FIELD(inner_transport_header);
	CHECK_SKB_FIELD(inner_network_header);
	CHECK_SKB_FIELD(inner_mac_header);
	CHECK_SKB_FIELD(mark);
#ifdef CONFIG_NETWORK_SECMARK
	CHECK_SKB_FIELD(secmark);
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
	CHECK_SKB_FIELD(napi_id);
#endif
	CHECK_SKB_FIELD(alloc_cpu);
#ifdef CONFIG_XPS
	CHECK_SKB_FIELD(sender_cpu);
#endif
#ifdef CONFIG_NET_SCHED
	CHECK_SKB_FIELD(tc_index);
#endif

}

/*
 * You should not add any new code to this function. Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
#define C(x) n->x = skb->x

	n->next = n->prev = NULL;
	n->sk = NULL;
	__copy_skb_header(n, skb);

	C(len);
	C(data_len);
	C(mac_len);
	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
	n->cloned = 1;
	n->nohdr = 0;
	n->peeked = 0;
	C(pfmemalloc);
	C(pp_recycle);
	n->destructor = NULL;
	C(tail);
	C(end);
	C(head);
	C(head_frag);
	C(data);
	C(truesize);
	refcount_set(&n->users, 1);

	atomic_inc(&(skb_shinfo(skb)->dataref));
	skb->cloned = 1;

	return n;
#undef C
}

/**
 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
 * @first: first sk_buff of the msg
 */
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
{
	struct sk_buff *n;

	n = alloc_skb(0, GFP_ATOMIC);
	if (!n)
		return NULL;

	n->len = first->len;
	n->data_len = first->len;
	n->truesize = first->truesize;

	skb_shinfo(n)->frag_list = first;

	__copy_skb_header(n, first);
	n->destructor = NULL;

	return n;
}
EXPORT_SYMBOL_GPL(alloc_skb_for_msg);

/**
 * skb_morph - morph one skb into another
 * @dst: the skb to receive the contents
 * @src: the skb to supply the contents
 *
 * This is identical to skb_clone except that the target skb is
 * supplied by the user.
 *
 * The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
	skb_release_all(dst, SKB_CONSUMED);
	return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);

int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
	unsigned long max_pg, num_pg, new_pg, old_pg, rlim;
	struct user_struct *user;

	if (capable(CAP_IPC_LOCK) || !size)
		return 0;

	rlim = rlimit(RLIMIT_MEMLOCK);
	if (rlim == RLIM_INFINITY)
		return 0;

	num_pg = (size >> PAGE_SHIFT) + 2;	/* worst case */
	max_pg = rlim >> PAGE_SHIFT;
	user = mmp->user ? : current_user();

	old_pg = atomic_long_read(&user->locked_vm);
	do {
		new_pg = old_pg + num_pg;
		if (new_pg > max_pg)
			return -ENOBUFS;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg));

	if (!mmp->user) {
		mmp->user = get_uid(user);
		mmp->num_pg = num_pg;
	} else {
		mmp->num_pg += num_pg;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);

void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
	if (mmp->user) {
		atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
		free_uid(mmp->user);
	}
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);

static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size,
					    bool devmem)
{
	struct ubuf_info_msgzc *uarg;
	struct sk_buff *skb;

	WARN_ON_ONCE(!in_task());

	skb = sock_omalloc(sk, 0, GFP_KERNEL);
	if (!skb)
		return NULL;

	BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
	uarg = (void *)skb->cb;
	uarg->mmp.user = NULL;

	if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) {
		kfree_skb(skb);
		return NULL;
	}

	uarg->ubuf.ops = &msg_zerocopy_ubuf_ops;
	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
	uarg->len = 1;
	uarg->bytelen = size;
	uarg->zerocopy = 1;
	uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
	refcount_set(&uarg->ubuf.refcnt, 1);
	sock_hold(sk);

	return &uarg->ubuf;
}

static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
{
	return container_of((void *)uarg, struct sk_buff, cb);
}

struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
				       struct ubuf_info *uarg, bool devmem)
{
	if (uarg) {
		struct ubuf_info_msgzc *uarg_zc;
		const u32 byte_limit = 1 << 19;		/* limit to a few TSO */
		u32 bytelen, next;

		/* there might be non MSG_ZEROCOPY users */
		if (uarg->ops != &msg_zerocopy_ubuf_ops)
			return NULL;

		/* realloc only when socket is locked (TCP, UDP cork),
		 * so uarg->len and sk_zckey access is serialized
		 */
		if (!sock_owned_by_user(sk)) {
			WARN_ON_ONCE(1);
			return NULL;
		}

		uarg_zc = uarg_to_msgzc(uarg);
		bytelen = uarg_zc->bytelen + size;
		if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) {
			/* TCP can create new skb to attach new uarg */
			if (sk->sk_type == SOCK_STREAM)
				goto new_alloc;
			return NULL;
		}

		next = (u32)atomic_read(&sk->sk_zckey);
		if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
			if (likely(!devmem) &&
			    mm_account_pinned_pages(&uarg_zc->mmp, size))
				return NULL;
			uarg_zc->len++;
			uarg_zc->bytelen = bytelen;
			atomic_set(&sk->sk_zckey, ++next);

			/* no extra ref when appending to datagram (MSG_MORE) */
			if (sk->sk_type == SOCK_STREAM)
				net_zcopy_get(uarg);

			return uarg;
		}
	}

new_alloc:
	return msg_zerocopy_alloc(sk, size, devmem);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);

static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
	u32 old_lo, old_hi;
	u64 sum_len;

	old_lo = serr->ee.ee_info;
	old_hi = serr->ee.ee_data;
	sum_len = old_hi - old_lo + 1ULL + len;

	if (sum_len >= (1ULL << 32))
		return false;

	if (lo != old_hi + 1)
		return false;

	serr->ee.ee_data += len;
	return true;
}

static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
{
	struct sk_buff *tail, *skb = skb_from_uarg(uarg);
	struct sock_exterr_skb *serr;
	struct sock *sk = skb->sk;
	struct sk_buff_head *q;
	unsigned long flags;
	bool is_zerocopy;
	u32 lo, hi;
	u16 len;

	mm_unaccount_pinned_pages(&uarg->mmp);

	/* if !len, there was only 1 call, and it was aborted
	 * so do not queue a completion notification
	 */
	if (!uarg->len || sock_flag(sk, SOCK_DEAD))
		goto release;

	len = uarg->len;
	lo = uarg->id;
	hi = uarg->id + len - 1;
	is_zerocopy = uarg->zerocopy;

	serr = SKB_EXT_ERR(skb);
	memset(serr, 0, sizeof(*serr));
	serr->ee.ee_errno = 0;
	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
	serr->ee.ee_data = hi;
	serr->ee.ee_info = lo;
	if (!is_zerocopy)
		serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;

	q = &sk->sk_error_queue;
	spin_lock_irqsave(&q->lock, flags);
	tail = skb_peek_tail(q);
	if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
	    !skb_zerocopy_notify_extend(tail, lo, len)) {
		__skb_queue_tail(q, skb);
		skb = NULL;
	}
	spin_unlock_irqrestore(&q->lock, flags);

	sk_error_report(sk);

release:
	consume_skb(skb);
	sock_put(sk);
}

static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
				  bool success)
{
	struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);

	uarg_zc->zerocopy = uarg_zc->zerocopy & success;

	if (refcount_dec_and_test(&uarg->refcnt))
		__msg_zerocopy_callback(uarg_zc);
}

void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
	struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk;

	atomic_dec(&sk->sk_zckey);
	uarg_to_msgzc(uarg)->len--;

	if (have_uref)
		msg_zerocopy_complete(NULL, uarg, true);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);

const struct ubuf_info_ops msg_zerocopy_ubuf_ops = {
	.complete = msg_zerocopy_complete,
};
EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
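
/* Illustrative sketch (not from this file): the userspace side of the
 * MSG_ZEROCOPY completions produced by __msg_zerocopy_callback() above,
 * roughly as described in Documentation/networking/msg_zerocopy.rst. Error
 * handling is omitted; the range [ee_info, ee_data] tells the caller which
 * send() notifications completed and whose buffers may be reused.
 *
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one));
 *	send(fd, buf, len, MSG_ZEROCOPY);
 *	...
 *	recvmsg(fd, &msg, MSG_ERRQUEUE);
 *	cm = CMSG_FIRSTHDR(&msg);
 *	serr = (struct sock_extended_err *)CMSG_DATA(cm);
 *	if (serr->ee_origin == SO_EE_ORIGIN_ZEROCOPY)
 *		// notifications serr->ee_info..serr->ee_data completed;
 *		// SO_EE_CODE_ZEROCOPY_COPIED means the kernel fell back
 *		// to copying
 */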

int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
			     struct msghdr *msg, int len,
			     struct ubuf_info *uarg,
			     struct net_devmem_dmabuf_binding *binding)
{
	int err, orig_len = skb->len;

	if (uarg->ops->link_skb) {
		err = uarg->ops->link_skb(skb, uarg);
		if (err)
			return err;
	} else {
		struct ubuf_info *orig_uarg = skb_zcopy(skb);

		/* An skb can only point to one uarg. This edge case happens
		 * when TCP appends to an skb, but zerocopy_realloc triggered
		 * a new alloc.
		 */
		if (orig_uarg && uarg != orig_uarg)
			return -EEXIST;
	}

	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
				      binding);
	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
		struct sock *save_sk = skb->sk;

		/* Streams do not free skb on error. Reset to prev state. */
		iov_iter_revert(&msg->msg_iter, skb->len - orig_len);
		skb->sk = sk;
		___pskb_trim(skb, orig_len);
		skb->sk = save_sk;
		return err;
	}

	skb_zcopy_set(skb, uarg, NULL);
	return skb->len - orig_len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);

void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
	int i;

	skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		skb_frag_ref(skb, i);
}
EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);

static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
			      gfp_t gfp_mask)
{
	if (skb_zcopy(orig)) {
		if (skb_zcopy(nskb)) {
			/* !gfp_mask callers are verified to !skb_zcopy(nskb) */
			if (!gfp_mask) {
				WARN_ON_ONCE(1);
				return -ENOMEM;
			}
			if (skb_uarg(nskb) == skb_uarg(orig))
				return 0;
			if (skb_copy_ubufs(nskb, GFP_ATOMIC))
				return -EIO;
		}
		skb_zcopy_set(nskb, skb_uarg(orig), NULL);
	}
	return 0;
}

/**
 * skb_copy_ubufs - copy userspace skb frags buffers to kernel
 * @skb: the skb to modify
 * @gfp_mask: allocation priority
 *
 * This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
 * It will copy all frags into kernel and drop the reference
 * to userspace pages.
 *
 * If this function is called from an interrupt gfp_mask() must be
 * %GFP_ATOMIC.
 *
 * Returns 0 on success or a negative error code on failure
 * to allocate kernel memory to copy to.
 */
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
	int num_frags = skb_shinfo(skb)->nr_frags;
	struct page *page, *head = NULL;
	int i, order, psize, new_frags;
	u32 d_off;

	if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
		return -EINVAL;

	if (!skb_frags_readable(skb))
		return -EFAULT;

	if (!num_frags)
		goto release;

	/* We might have to allocate high order pages, so compute what minimum
	 * page order is needed.
1986 */ 1987 order = 0; 1988 while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb)) 1989 order++; 1990 psize = (PAGE_SIZE << order); 1991 1992 new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order); 1993 for (i = 0; i < new_frags; i++) { 1994 page = alloc_pages(gfp_mask | __GFP_COMP, order); 1995 if (!page) { 1996 while (head) { 1997 struct page *next = (struct page *)page_private(head); 1998 put_page(head); 1999 head = next; 2000 } 2001 return -ENOMEM; 2002 } 2003 set_page_private(page, (unsigned long)head); 2004 head = page; 2005 } 2006 2007 page = head; 2008 d_off = 0; 2009 for (i = 0; i < num_frags; i++) { 2010 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 2011 u32 p_off, p_len, copied; 2012 struct page *p; 2013 u8 *vaddr; 2014 2015 skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), 2016 p, p_off, p_len, copied) { 2017 u32 copy, done = 0; 2018 vaddr = kmap_atomic(p); 2019 2020 while (done < p_len) { 2021 if (d_off == psize) { 2022 d_off = 0; 2023 page = (struct page *)page_private(page); 2024 } 2025 copy = min_t(u32, psize - d_off, p_len - done); 2026 memcpy(page_address(page) + d_off, 2027 vaddr + p_off + done, copy); 2028 done += copy; 2029 d_off += copy; 2030 } 2031 kunmap_atomic(vaddr); 2032 } 2033 } 2034 2035 /* skb frags release userspace buffers */ 2036 for (i = 0; i < num_frags; i++) 2037 skb_frag_unref(skb, i); 2038 2039 /* skb frags point to kernel buffers */ 2040 for (i = 0; i < new_frags - 1; i++) { 2041 __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize); 2042 head = (struct page *)page_private(head); 2043 } 2044 __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0, 2045 d_off); 2046 skb_shinfo(skb)->nr_frags = new_frags; 2047 2048 release: 2049 skb_zcopy_clear(skb, false); 2050 return 0; 2051 } 2052 EXPORT_SYMBOL_GPL(skb_copy_ubufs); 2053 2054 /** 2055 * skb_clone - duplicate an sk_buff 2056 * @skb: buffer to clone 2057 * @gfp_mask: allocation priority 2058 * 2059 * Duplicate an &sk_buff. The new one is not owned by a socket. Both 2060 * copies share the same packet data but not structure. The new 2061 * buffer has a reference count of 1. If the allocation fails the 2062 * function returns %NULL otherwise the new buffer is returned. 2063 * 2064 * If this function is called from an interrupt gfp_mask() must be 2065 * %GFP_ATOMIC. 
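 *
 * Illustrative usage sketch (hypothetical caller, not taken from this file):
 *
 *	struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
 *
 *	if (!nskb)
 *		return -ENOMEM;
 *
 * Both buffers then share the same payload; the data is only released
 * once every clone has been freed with consume_skb() or kfree_skb().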
2066 */ 2067 2068 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) 2069 { 2070 struct sk_buff_fclones *fclones = container_of(skb, 2071 struct sk_buff_fclones, 2072 skb1); 2073 struct sk_buff *n; 2074 2075 if (skb_orphan_frags(skb, gfp_mask)) 2076 return NULL; 2077 2078 if (skb->fclone == SKB_FCLONE_ORIG && 2079 refcount_read(&fclones->fclone_ref) == 1) { 2080 n = &fclones->skb2; 2081 refcount_set(&fclones->fclone_ref, 2); 2082 n->fclone = SKB_FCLONE_CLONE; 2083 } else { 2084 if (skb_pfmemalloc(skb)) 2085 gfp_mask |= __GFP_MEMALLOC; 2086 2087 n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask); 2088 if (!n) 2089 return NULL; 2090 2091 n->fclone = SKB_FCLONE_UNAVAILABLE; 2092 } 2093 2094 return __skb_clone(n, skb); 2095 } 2096 EXPORT_SYMBOL(skb_clone); 2097 2098 void skb_headers_offset_update(struct sk_buff *skb, int off) 2099 { 2100 /* Only adjust this if it actually is csum_start rather than csum */ 2101 if (skb->ip_summed == CHECKSUM_PARTIAL) 2102 skb->csum_start += off; 2103 /* {transport,network,mac}_header and tail are relative to skb->head */ 2104 skb->transport_header += off; 2105 skb->network_header += off; 2106 if (skb_mac_header_was_set(skb)) 2107 skb->mac_header += off; 2108 skb->inner_transport_header += off; 2109 skb->inner_network_header += off; 2110 skb->inner_mac_header += off; 2111 } 2112 EXPORT_SYMBOL(skb_headers_offset_update); 2113 2114 void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) 2115 { 2116 __copy_skb_header(new, old); 2117 2118 skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; 2119 skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; 2120 skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; 2121 } 2122 EXPORT_SYMBOL(skb_copy_header); 2123 2124 static inline int skb_alloc_rx_flag(const struct sk_buff *skb) 2125 { 2126 if (skb_pfmemalloc(skb)) 2127 return SKB_ALLOC_RX; 2128 return 0; 2129 } 2130 2131 /** 2132 * skb_copy - create private copy of an sk_buff 2133 * @skb: buffer to copy 2134 * @gfp_mask: allocation priority 2135 * 2136 * Make a copy of both an &sk_buff and its data. This is used when the 2137 * caller wishes to modify the data and needs a private copy of the 2138 * data to alter. Returns %NULL on failure or the pointer to the buffer 2139 * on success. The returned buffer has a reference count of 1. 2140 * 2141 * As by-product this function converts non-linear &sk_buff to linear 2142 * one, so that &sk_buff becomes completely private and caller is allowed 2143 * to modify all the data of returned buffer. This means that this 2144 * function is not recommended for use in circumstances when only 2145 * header is going to be modified. Use pskb_copy() instead. 
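 *
 * Illustrative usage sketch (hypothetical caller, not taken from this file):
 *
 *	struct sk_buff *priv = skb_copy(skb, GFP_ATOMIC);
 *
 *	if (!priv)
 *		return -ENOMEM;
 *
 * The returned buffer is linear and fully private, so both its headers
 * and its payload may be rewritten without touching the original skb.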
2146 */ 2147 2148 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) 2149 { 2150 struct sk_buff *n; 2151 unsigned int size; 2152 int headerlen; 2153 2154 if (!skb_frags_readable(skb)) 2155 return NULL; 2156 2157 if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) 2158 return NULL; 2159 2160 headerlen = skb_headroom(skb); 2161 size = skb_end_offset(skb) + skb->data_len; 2162 n = __alloc_skb(size, gfp_mask, 2163 skb_alloc_rx_flag(skb), NUMA_NO_NODE); 2164 if (!n) 2165 return NULL; 2166 2167 /* Set the data pointer */ 2168 skb_reserve(n, headerlen); 2169 /* Set the tail pointer and length */ 2170 skb_put(n, skb->len); 2171 2172 BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); 2173 2174 skb_copy_header(n, skb); 2175 return n; 2176 } 2177 EXPORT_SYMBOL(skb_copy); 2178 2179 /** 2180 * __pskb_copy_fclone - create copy of an sk_buff with private head. 2181 * @skb: buffer to copy 2182 * @headroom: headroom of new skb 2183 * @gfp_mask: allocation priority 2184 * @fclone: if true allocate the copy of the skb from the fclone 2185 * cache instead of the head cache; it is recommended to set this 2186 * to true for the cases where the copy will likely be cloned 2187 * 2188 * Make a copy of both an &sk_buff and part of its data, located 2189 * in header. Fragmented data remain shared. This is used when 2190 * the caller wishes to modify only header of &sk_buff and needs 2191 * private copy of the header to alter. Returns %NULL on failure 2192 * or the pointer to the buffer on success. 2193 * The returned buffer has a reference count of 1. 2194 */ 2195 2196 struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, 2197 gfp_t gfp_mask, bool fclone) 2198 { 2199 unsigned int size = skb_headlen(skb) + headroom; 2200 int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); 2201 struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); 2202 2203 if (!n) 2204 goto out; 2205 2206 /* Set the data pointer */ 2207 skb_reserve(n, headroom); 2208 /* Set the tail pointer and length */ 2209 skb_put(n, skb_headlen(skb)); 2210 /* Copy the bytes */ 2211 skb_copy_from_linear_data(skb, n->data, n->len); 2212 2213 n->truesize += skb->data_len; 2214 n->data_len = skb->data_len; 2215 n->len = skb->len; 2216 2217 if (skb_shinfo(skb)->nr_frags) { 2218 int i; 2219 2220 if (skb_orphan_frags(skb, gfp_mask) || 2221 skb_zerocopy_clone(n, skb, gfp_mask)) { 2222 kfree_skb(n); 2223 n = NULL; 2224 goto out; 2225 } 2226 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2227 skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; 2228 skb_frag_ref(skb, i); 2229 } 2230 skb_shinfo(n)->nr_frags = i; 2231 } 2232 2233 if (skb_has_frag_list(skb)) { 2234 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; 2235 skb_clone_fraglist(n); 2236 } 2237 2238 skb_copy_header(n, skb); 2239 out: 2240 return n; 2241 } 2242 EXPORT_SYMBOL(__pskb_copy_fclone); 2243 2244 /** 2245 * pskb_expand_head - reallocate header of &sk_buff 2246 * @skb: buffer to reallocate 2247 * @nhead: room to add at head 2248 * @ntail: room to add at tail 2249 * @gfp_mask: allocation priority 2250 * 2251 * Expands (or creates identical copy, if @nhead and @ntail are zero) 2252 * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have 2253 * reference count of 1. Returns zero in the case of success or error, 2254 * if expansion failed. In the last case, &sk_buff is not changed. 
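 * (That is: zero on success, or a negative errno with the &sk_buff left
 * untouched when the reallocation failed.)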
2255 * 2256 * All the pointers pointing into skb header may change and must be 2257 * reloaded after call to this function. 2258 * 2259 * Note: If you skb_push() the start of the buffer after reallocating the 2260 * header, call skb_postpush_data_move() first to move the metadata out of 2261 * the way before writing to &sk_buff->data. 2262 */ 2263 2264 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, 2265 gfp_t gfp_mask) 2266 { 2267 unsigned int osize = skb_end_offset(skb); 2268 unsigned int size = osize + nhead + ntail; 2269 long off; 2270 u8 *data; 2271 int i; 2272 2273 BUG_ON(nhead < 0); 2274 2275 BUG_ON(skb_shared(skb)); 2276 2277 skb_zcopy_downgrade_managed(skb); 2278 2279 if (skb_pfmemalloc(skb)) 2280 gfp_mask |= __GFP_MEMALLOC; 2281 2282 data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); 2283 if (!data) 2284 goto nodata; 2285 size = SKB_WITH_OVERHEAD(size); 2286 2287 /* Copy only real data... and, alas, header. This should be 2288 * optimized for the cases when header is void. 2289 */ 2290 memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); 2291 2292 memcpy((struct skb_shared_info *)(data + size), 2293 skb_shinfo(skb), 2294 offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); 2295 2296 /* 2297 * if shinfo is shared we must drop the old head gracefully, but if it 2298 * is not we can just drop the old head and let the existing refcount 2299 * be since all we did is relocate the values 2300 */ 2301 if (skb_cloned(skb)) { 2302 if (skb_orphan_frags(skb, gfp_mask)) 2303 goto nofrags; 2304 if (skb_zcopy(skb)) 2305 refcount_inc(&skb_uarg(skb)->refcnt); 2306 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 2307 skb_frag_ref(skb, i); 2308 2309 if (skb_has_frag_list(skb)) 2310 skb_clone_fraglist(skb); 2311 2312 skb_release_data(skb, SKB_CONSUMED); 2313 } else { 2314 skb_free_head(skb); 2315 } 2316 off = (data + nhead) - skb->head; 2317 2318 skb->head = data; 2319 skb->head_frag = 0; 2320 skb->data += off; 2321 2322 skb_set_end_offset(skb, size); 2323 #ifdef NET_SKBUFF_DATA_USES_OFFSET 2324 off = nhead; 2325 #endif 2326 skb->tail += off; 2327 skb_headers_offset_update(skb, nhead); 2328 skb->cloned = 0; 2329 skb->hdr_len = 0; 2330 skb->nohdr = 0; 2331 atomic_set(&skb_shinfo(skb)->dataref, 1); 2332 2333 /* It is not generally safe to change skb->truesize. 2334 * For the moment, we really care of rx path, or 2335 * when skb is orphaned (not attached to a socket). 
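 * In those cases no socket memory accounting currently charges this
 * skb, so adjusting truesize cannot unbalance it.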
2336 */ 2337 if (!skb->sk || skb->destructor == sock_edemux) 2338 skb->truesize += size - osize; 2339 2340 return 0; 2341 2342 nofrags: 2343 skb_kfree_head(data, size); 2344 nodata: 2345 return -ENOMEM; 2346 } 2347 EXPORT_SYMBOL(pskb_expand_head); 2348 2349 /* Make private copy of skb with writable head and some headroom */ 2350 2351 struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) 2352 { 2353 struct sk_buff *skb2; 2354 int delta = headroom - skb_headroom(skb); 2355 2356 if (delta <= 0) 2357 skb2 = pskb_copy(skb, GFP_ATOMIC); 2358 else { 2359 skb2 = skb_clone(skb, GFP_ATOMIC); 2360 if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, 2361 GFP_ATOMIC)) { 2362 kfree_skb(skb2); 2363 skb2 = NULL; 2364 } 2365 } 2366 return skb2; 2367 } 2368 EXPORT_SYMBOL(skb_realloc_headroom); 2369 2370 /* Note: We plan to rework this in linux-6.4 */ 2371 int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) 2372 { 2373 unsigned int saved_end_offset, saved_truesize; 2374 struct skb_shared_info *shinfo; 2375 int res; 2376 2377 saved_end_offset = skb_end_offset(skb); 2378 saved_truesize = skb->truesize; 2379 2380 res = pskb_expand_head(skb, 0, 0, pri); 2381 if (res) 2382 return res; 2383 2384 skb->truesize = saved_truesize; 2385 2386 if (likely(skb_end_offset(skb) == saved_end_offset)) 2387 return 0; 2388 2389 /* We can not change skb->end if the original or new value 2390 * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head(). 2391 */ 2392 if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM || 2393 skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) { 2394 /* We think this path should not be taken. 2395 * Add a temporary trace to warn us just in case. 2396 */ 2397 pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n", 2398 saved_end_offset, skb_end_offset(skb)); 2399 WARN_ON_ONCE(1); 2400 return 0; 2401 } 2402 2403 shinfo = skb_shinfo(skb); 2404 2405 /* We are about to change back skb->end, 2406 * we need to move skb_shinfo() to its new location. 2407 */ 2408 memmove(skb->head + saved_end_offset, 2409 shinfo, 2410 offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); 2411 2412 skb_set_end_offset(skb, saved_end_offset); 2413 2414 return 0; 2415 } 2416 2417 /** 2418 * skb_expand_head - reallocate header of &sk_buff 2419 * @skb: buffer to reallocate 2420 * @headroom: needed headroom 2421 * 2422 * Unlike skb_realloc_headroom, this one does not allocate a new skb 2423 * if possible; copies skb->sk to new skb as needed 2424 * and frees original skb in case of failures. 2425 * 2426 * It expect increased headroom and generates warning otherwise. 2427 */ 2428 2429 struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) 2430 { 2431 int delta = headroom - skb_headroom(skb); 2432 int osize = skb_end_offset(skb); 2433 struct sock *sk = skb->sk; 2434 2435 if (WARN_ONCE(delta <= 0, 2436 "%s is expecting an increase in the headroom", __func__)) 2437 return skb; 2438 2439 delta = SKB_DATA_ALIGN(delta); 2440 /* pskb_expand_head() might crash, if skb is shared. 
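 * (or is not a writable wmem skb), so take a private clone in that
 * case and expand the clone instead.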
*/ 2441 if (skb_shared(skb) || !is_skb_wmem(skb)) { 2442 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); 2443 2444 if (unlikely(!nskb)) 2445 goto fail; 2446 2447 if (sk) 2448 skb_set_owner_w(nskb, sk); 2449 consume_skb(skb); 2450 skb = nskb; 2451 } 2452 if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) 2453 goto fail; 2454 2455 if (sk && is_skb_wmem(skb)) { 2456 delta = skb_end_offset(skb) - osize; 2457 refcount_add(delta, &sk->sk_wmem_alloc); 2458 skb->truesize += delta; 2459 } 2460 return skb; 2461 2462 fail: 2463 kfree_skb(skb); 2464 return NULL; 2465 } 2466 EXPORT_SYMBOL(skb_expand_head); 2467 2468 /** 2469 * skb_copy_expand - copy and expand sk_buff 2470 * @skb: buffer to copy 2471 * @newheadroom: new free bytes at head 2472 * @newtailroom: new free bytes at tail 2473 * @gfp_mask: allocation priority 2474 * 2475 * Make a copy of both an &sk_buff and its data and while doing so 2476 * allocate additional space. 2477 * 2478 * This is used when the caller wishes to modify the data and needs a 2479 * private copy of the data to alter as well as more space for new fields. 2480 * Returns %NULL on failure or the pointer to the buffer 2481 * on success. The returned buffer has a reference count of 1. 2482 * 2483 * You must pass %GFP_ATOMIC as the allocation priority if this function 2484 * is called from an interrupt. 2485 */ 2486 struct sk_buff *skb_copy_expand(const struct sk_buff *skb, 2487 int newheadroom, int newtailroom, 2488 gfp_t gfp_mask) 2489 { 2490 /* 2491 * Allocate the copy buffer 2492 */ 2493 int head_copy_len, head_copy_off; 2494 struct sk_buff *n; 2495 int oldheadroom; 2496 2497 if (!skb_frags_readable(skb)) 2498 return NULL; 2499 2500 if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) 2501 return NULL; 2502 2503 oldheadroom = skb_headroom(skb); 2504 n = __alloc_skb(newheadroom + skb->len + newtailroom, 2505 gfp_mask, skb_alloc_rx_flag(skb), 2506 NUMA_NO_NODE); 2507 if (!n) 2508 return NULL; 2509 2510 skb_reserve(n, newheadroom); 2511 2512 /* Set the tail pointer and length */ 2513 skb_put(n, skb->len); 2514 2515 head_copy_len = oldheadroom; 2516 head_copy_off = 0; 2517 if (newheadroom <= head_copy_len) 2518 head_copy_len = newheadroom; 2519 else 2520 head_copy_off = newheadroom - head_copy_len; 2521 2522 /* Copy the linear header and data. */ 2523 BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, 2524 skb->len + head_copy_len)); 2525 2526 skb_copy_header(n, skb); 2527 2528 skb_headers_offset_update(n, newheadroom - oldheadroom); 2529 2530 return n; 2531 } 2532 EXPORT_SYMBOL(skb_copy_expand); 2533 2534 /** 2535 * __skb_pad - zero pad the tail of an skb 2536 * @skb: buffer to pad 2537 * @pad: space to pad 2538 * @free_on_error: free buffer on error 2539 * 2540 * Ensure that a buffer is followed by a padding area that is zero 2541 * filled. Used by network drivers which may DMA or transfer data 2542 * beyond the buffer end onto the wire. 2543 * 2544 * May return error in out of memory cases. The skb is freed on error 2545 * if @free_on_error is true. 2546 */ 2547 2548 int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) 2549 { 2550 int err; 2551 int ntail; 2552 2553 /* If the skbuff is non linear tailroom is always zero.. 
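 * so the fast path below only triggers for linear, uncloned buffers
 * that already have enough room at the tail.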
*/ 2554 if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { 2555 memset(skb->data+skb->len, 0, pad); 2556 return 0; 2557 } 2558 2559 ntail = skb->data_len + pad - (skb->end - skb->tail); 2560 if (likely(skb_cloned(skb) || ntail > 0)) { 2561 err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); 2562 if (unlikely(err)) 2563 goto free_skb; 2564 } 2565 2566 /* FIXME: The use of this function with non-linear skb's really needs 2567 * to be audited. 2568 */ 2569 err = skb_linearize(skb); 2570 if (unlikely(err)) 2571 goto free_skb; 2572 2573 memset(skb->data + skb->len, 0, pad); 2574 return 0; 2575 2576 free_skb: 2577 if (free_on_error) 2578 kfree_skb(skb); 2579 return err; 2580 } 2581 EXPORT_SYMBOL(__skb_pad); 2582 2583 /** 2584 * pskb_put - add data to the tail of a potentially fragmented buffer 2585 * @skb: start of the buffer to use 2586 * @tail: tail fragment of the buffer to use 2587 * @len: amount of data to add 2588 * 2589 * This function extends the used data area of the potentially 2590 * fragmented buffer. @tail must be the last fragment of @skb -- or 2591 * @skb itself. If this would exceed the total buffer size the kernel 2592 * will panic. A pointer to the first byte of the extra data is 2593 * returned. 2594 */ 2595 2596 void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) 2597 { 2598 if (tail != skb) { 2599 skb->data_len += len; 2600 skb->len += len; 2601 } 2602 return skb_put(tail, len); 2603 } 2604 EXPORT_SYMBOL_GPL(pskb_put); 2605 2606 /** 2607 * skb_put - add data to a buffer 2608 * @skb: buffer to use 2609 * @len: amount of data to add 2610 * 2611 * This function extends the used data area of the buffer. If this would 2612 * exceed the total buffer size the kernel will panic. A pointer to the 2613 * first byte of the extra data is returned. 2614 */ 2615 void *skb_put(struct sk_buff *skb, unsigned int len) 2616 { 2617 void *tmp = skb_tail_pointer(skb); 2618 SKB_LINEAR_ASSERT(skb); 2619 skb->tail += len; 2620 skb->len += len; 2621 if (unlikely(skb->tail > skb->end)) 2622 skb_over_panic(skb, len, __builtin_return_address(0)); 2623 return tmp; 2624 } 2625 EXPORT_SYMBOL(skb_put); 2626 2627 /** 2628 * skb_push - add data to the start of a buffer 2629 * @skb: buffer to use 2630 * @len: amount of data to add 2631 * 2632 * This function extends the used data area of the buffer at the buffer 2633 * start. If this would exceed the total buffer headroom the kernel will 2634 * panic. A pointer to the first byte of the extra data is returned. 2635 */ 2636 void *skb_push(struct sk_buff *skb, unsigned int len) 2637 { 2638 skb->data -= len; 2639 skb->len += len; 2640 if (unlikely(skb->data < skb->head)) 2641 skb_under_panic(skb, len, __builtin_return_address(0)); 2642 return skb->data; 2643 } 2644 EXPORT_SYMBOL(skb_push); 2645 2646 /** 2647 * skb_pull - remove data from the start of a buffer 2648 * @skb: buffer to use 2649 * @len: amount of data to remove 2650 * 2651 * This function removes data from the start of a buffer, returning 2652 * the memory to the headroom. A pointer to the next data in the buffer 2653 * is returned. Once the data has been pulled future pushes will overwrite 2654 * the old data. 2655 */ 2656 void *skb_pull(struct sk_buff *skb, unsigned int len) 2657 { 2658 return skb_pull_inline(skb, len); 2659 } 2660 EXPORT_SYMBOL(skb_pull); 2661 2662 /** 2663 * skb_pull_data - remove data from the start of a buffer returning its 2664 * original position. 
2665 * @skb: buffer to use 2666 * @len: amount of data to remove 2667 * 2668 * This function removes data from the start of a buffer, returning 2669 * the memory to the headroom. A pointer to the original data in the buffer 2670 * is returned after checking if there is enough data to pull. Once the 2671 * data has been pulled future pushes will overwrite the old data. 2672 */ 2673 void *skb_pull_data(struct sk_buff *skb, size_t len) 2674 { 2675 void *data = skb->data; 2676 2677 if (skb->len < len) 2678 return NULL; 2679 2680 skb_pull(skb, len); 2681 2682 return data; 2683 } 2684 EXPORT_SYMBOL(skb_pull_data); 2685 2686 /** 2687 * skb_trim - remove end from a buffer 2688 * @skb: buffer to alter 2689 * @len: new length 2690 * 2691 * Cut the length of a buffer down by removing data from the tail. If 2692 * the buffer is already under the length specified it is not modified. 2693 * The skb must be linear. 2694 */ 2695 void skb_trim(struct sk_buff *skb, unsigned int len) 2696 { 2697 if (skb->len > len) 2698 __skb_trim(skb, len); 2699 } 2700 EXPORT_SYMBOL(skb_trim); 2701 2702 /* Trims skb to length len. It can change skb pointers. 2703 */ 2704 2705 int ___pskb_trim(struct sk_buff *skb, unsigned int len) 2706 { 2707 struct sk_buff **fragp; 2708 struct sk_buff *frag; 2709 int offset = skb_headlen(skb); 2710 int nfrags = skb_shinfo(skb)->nr_frags; 2711 int i; 2712 int err; 2713 2714 if (skb_cloned(skb) && 2715 unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) 2716 return err; 2717 2718 i = 0; 2719 if (offset >= len) 2720 goto drop_pages; 2721 2722 for (; i < nfrags; i++) { 2723 int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); 2724 2725 if (end < len) { 2726 offset = end; 2727 continue; 2728 } 2729 2730 skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); 2731 2732 drop_pages: 2733 skb_shinfo(skb)->nr_frags = i; 2734 2735 for (; i < nfrags; i++) 2736 skb_frag_unref(skb, i); 2737 2738 if (skb_has_frag_list(skb)) 2739 skb_drop_fraglist(skb); 2740 goto done; 2741 } 2742 2743 for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); 2744 fragp = &frag->next) { 2745 int end = offset + frag->len; 2746 2747 if (skb_shared(frag)) { 2748 struct sk_buff *nfrag; 2749 2750 nfrag = skb_clone(frag, GFP_ATOMIC); 2751 if (unlikely(!nfrag)) 2752 return -ENOMEM; 2753 2754 nfrag->next = frag->next; 2755 consume_skb(frag); 2756 frag = nfrag; 2757 *fragp = frag; 2758 } 2759 2760 if (end < len) { 2761 offset = end; 2762 continue; 2763 } 2764 2765 if (end > len && 2766 unlikely((err = pskb_trim(frag, len - offset)))) 2767 return err; 2768 2769 if (frag->next) 2770 skb_drop_list(&frag->next); 2771 break; 2772 } 2773 2774 done: 2775 if (len > skb_headlen(skb)) { 2776 skb->data_len -= skb->len - len; 2777 skb->len = len; 2778 } else { 2779 skb->len = len; 2780 skb->data_len = 0; 2781 skb_set_tail_pointer(skb, len); 2782 } 2783 2784 if (!skb->sk || skb->destructor == sock_edemux) 2785 skb_condense(skb); 2786 return 0; 2787 } 2788 EXPORT_SYMBOL(___pskb_trim); 2789 2790 /* Note : use pskb_trim_rcsum() instead of calling this directly 2791 */ 2792 int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) 2793 { 2794 if (skb->ip_summed == CHECKSUM_COMPLETE) { 2795 int delta = skb->len - len; 2796 2797 skb->csum = csum_block_sub(skb->csum, 2798 skb_checksum(skb, len, delta, 0), 2799 len); 2800 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { 2801 int hdlen = (len > skb_headlen(skb)) ? 
skb_headlen(skb) : len;
2802 int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
2803
2804 if (offset + sizeof(__sum16) > hdlen)
2805 return -EINVAL;
2806 }
2807 return __pskb_trim(skb, len);
2808 }
2809 EXPORT_SYMBOL(pskb_trim_rcsum_slow);
2810
2811 /**
2812 * __pskb_pull_tail - advance tail of skb header
2813 * @skb: buffer to reallocate
2814 * @delta: number of bytes to advance tail
2815 *
2816 * This function only makes sense on a fragmented &sk_buff;
2817 * it expands the header by moving its tail forward and copying the
2818 * necessary data from the fragmented part.
2819 *
2820 * &sk_buff MUST have reference count of 1.
2821 *
2822 * Returns %NULL (and the &sk_buff is unchanged) if the pull failed,
2823 * or the value of the new tail of the skb on success.
2824 *
2825 * All the pointers pointing into skb header may change and must be
2826 * reloaded after call to this function.
2827 */
2828
2829 /* Moves the tail of the skb head forward, copying data from the
2830 * fragmented part when necessary.
2831 * 1. It may fail due to memory allocation failure.
2832 * 2. It may change skb pointers.
2833 *
2834 * It is pretty complicated. Luckily, it is called only in exceptional cases.
2835 */
2836 void *__pskb_pull_tail(struct sk_buff *skb, int delta)
2837 {
2838 /* If the skb does not have enough free space at the tail, get a new
2839 * one plus 128 bytes for future expansions. If we have enough room
2840 * at the tail, reallocate without expansion only if the skb is cloned.
2841 */
2842 int i, k, eat = (skb->tail + delta) - skb->end;
2843
2844 if (!skb_frags_readable(skb))
2845 return NULL;
2846
2847 if (eat > 0 || skb_cloned(skb)) {
2848 if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
2849 GFP_ATOMIC))
2850 return NULL;
2851 }
2852
2853 BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
2854 skb_tail_pointer(skb), delta));
2855
2856 /* Optimization: no frag list, so there is no reason to pre-estimate
2857 * the size of the pulled pages. Superb.
2858 */
2859 if (!skb_has_frag_list(skb))
2860 goto pull_pages;
2861
2862 /* Estimate size of pulled pages. */
2863 eat = delta;
2864 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2865 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2866
2867 if (size >= eat)
2868 goto pull_pages;
2869 eat -= size;
2870 }
2871
2872 /* If we need to update the frag list, we are in trouble.
2873 * Certainly, it is possible to add an offset to the skb data,
2874 * but taking into account that pulling is expected to be a
2875 * very rare operation, it is worth fighting against further
2876 * bloating of the skb head and crucifying ourselves here instead.
2877 * Pure masochism, indeed. 8)8)
2878 */
2879 if (eat) {
2880 struct sk_buff *list = skb_shinfo(skb)->frag_list;
2881 struct sk_buff *clone = NULL;
2882 struct sk_buff *insp = NULL;
2883
2884 do {
2885 if (list->len <= eat) {
2886 /* Eaten as a whole. */
2887 eat -= list->len;
2888 list = list->next;
2889 insp = list;
2890 } else {
2891 /* Eaten partially. */
2892 if (skb_is_gso(skb) && !list->head_frag &&
2893 skb_headlen(list))
2894 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2895
2896 if (skb_shared(list)) {
2897 /* Sucks! We need to fork the list. :-( */
2898 clone = skb_clone(list, GFP_ATOMIC);
2899 if (!clone)
2900 return NULL;
2901 insp = list->next;
2902 list = clone;
2903 } else {
2904 /* This may be pulled without
2905 * problems. */
2906 insp = list;
2907 }
2908 if (!pskb_pull(list, eat)) {
2909 kfree_skb(clone);
2910 return NULL;
2911 }
2912 break;
2913 }
2914 } while (eat);
2915
2916 /* Free the pulled out fragments.
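 * Everything before 'insp' has already been copied into the linear
 * head above, so those frag_list skbs can be released now.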
*/ 2917 while ((list = skb_shinfo(skb)->frag_list) != insp) { 2918 skb_shinfo(skb)->frag_list = list->next; 2919 consume_skb(list); 2920 } 2921 /* And insert new clone at head. */ 2922 if (clone) { 2923 clone->next = list; 2924 skb_shinfo(skb)->frag_list = clone; 2925 } 2926 } 2927 /* Success! Now we may commit changes to skb data. */ 2928 2929 pull_pages: 2930 eat = delta; 2931 k = 0; 2932 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2933 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2934 2935 if (size <= eat) { 2936 skb_frag_unref(skb, i); 2937 eat -= size; 2938 } else { 2939 skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; 2940 2941 *frag = skb_shinfo(skb)->frags[i]; 2942 if (eat) { 2943 skb_frag_off_add(frag, eat); 2944 skb_frag_size_sub(frag, eat); 2945 if (!i) 2946 goto end; 2947 eat = 0; 2948 } 2949 k++; 2950 } 2951 } 2952 skb_shinfo(skb)->nr_frags = k; 2953 2954 end: 2955 skb->tail += delta; 2956 skb->data_len -= delta; 2957 2958 if (!skb->data_len) 2959 skb_zcopy_clear(skb, false); 2960 2961 return skb_tail_pointer(skb); 2962 } 2963 EXPORT_SYMBOL(__pskb_pull_tail); 2964 2965 /** 2966 * skb_copy_bits - copy bits from skb to kernel buffer 2967 * @skb: source skb 2968 * @offset: offset in source 2969 * @to: destination buffer 2970 * @len: number of bytes to copy 2971 * 2972 * Copy the specified number of bytes from the source skb to the 2973 * destination buffer. 2974 * 2975 * CAUTION ! : 2976 * If its prototype is ever changed, 2977 * check arch/{*}/net/{*}.S files, 2978 * since it is called from BPF assembly code. 2979 */ 2980 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) 2981 { 2982 int start = skb_headlen(skb); 2983 struct sk_buff *frag_iter; 2984 int i, copy; 2985 2986 if (offset > (int)skb->len - len) 2987 goto fault; 2988 2989 /* Copy header. */ 2990 if ((copy = start - offset) > 0) { 2991 if (copy > len) 2992 copy = len; 2993 skb_copy_from_linear_data_offset(skb, offset, to, copy); 2994 if ((len -= copy) == 0) 2995 return 0; 2996 offset += copy; 2997 to += copy; 2998 } 2999 3000 if (!skb_frags_readable(skb)) 3001 goto fault; 3002 3003 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3004 int end; 3005 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 3006 3007 WARN_ON(start > offset + len); 3008 3009 end = start + skb_frag_size(f); 3010 if ((copy = end - offset) > 0) { 3011 u32 p_off, p_len, copied; 3012 struct page *p; 3013 u8 *vaddr; 3014 3015 if (copy > len) 3016 copy = len; 3017 3018 skb_frag_foreach_page(f, 3019 skb_frag_off(f) + offset - start, 3020 copy, p, p_off, p_len, copied) { 3021 vaddr = kmap_atomic(p); 3022 memcpy(to + copied, vaddr + p_off, p_len); 3023 kunmap_atomic(vaddr); 3024 } 3025 3026 if ((len -= copy) == 0) 3027 return 0; 3028 offset += copy; 3029 to += copy; 3030 } 3031 start = end; 3032 } 3033 3034 skb_walk_frags(skb, frag_iter) { 3035 int end; 3036 3037 WARN_ON(start > offset + len); 3038 3039 end = start + frag_iter->len; 3040 if ((copy = end - offset) > 0) { 3041 if (copy > len) 3042 copy = len; 3043 if (skb_copy_bits(frag_iter, offset - start, to, copy)) 3044 goto fault; 3045 if ((len -= copy) == 0) 3046 return 0; 3047 offset += copy; 3048 to += copy; 3049 } 3050 start = end; 3051 } 3052 3053 if (!len) 3054 return 0; 3055 3056 fault: 3057 return -EFAULT; 3058 } 3059 EXPORT_SYMBOL(skb_copy_bits); 3060 3061 /* 3062 * Callback from splice_to_pipe(), if we need to release some pages 3063 * at the end of the spd in case we error'ed out in filling the pipe. 
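 * Each page in the spd holds a reference taken in spd_fill_page(),
 * so it is dropped here.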
3064 */ 3065 static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) 3066 { 3067 put_page(spd->pages[i]); 3068 } 3069 3070 static struct page *linear_to_page(struct page *page, unsigned int *len, 3071 unsigned int *offset, 3072 struct sock *sk) 3073 { 3074 struct page_frag *pfrag = sk_page_frag(sk); 3075 3076 if (!sk_page_frag_refill(sk, pfrag)) 3077 return NULL; 3078 3079 *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); 3080 3081 memcpy(page_address(pfrag->page) + pfrag->offset, 3082 page_address(page) + *offset, *len); 3083 *offset = pfrag->offset; 3084 pfrag->offset += *len; 3085 3086 return pfrag->page; 3087 } 3088 3089 static bool spd_can_coalesce(const struct splice_pipe_desc *spd, 3090 struct page *page, 3091 unsigned int offset) 3092 { 3093 return spd->nr_pages && 3094 spd->pages[spd->nr_pages - 1] == page && 3095 (spd->partial[spd->nr_pages - 1].offset + 3096 spd->partial[spd->nr_pages - 1].len == offset); 3097 } 3098 3099 /* 3100 * Fill page/offset/length into spd, if it can hold more pages. 3101 */ 3102 static bool spd_fill_page(struct splice_pipe_desc *spd, struct page *page, 3103 unsigned int *len, unsigned int offset, bool linear, 3104 struct sock *sk) 3105 { 3106 if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) 3107 return true; 3108 3109 if (linear) { 3110 page = linear_to_page(page, len, &offset, sk); 3111 if (!page) 3112 return true; 3113 } 3114 if (spd_can_coalesce(spd, page, offset)) { 3115 spd->partial[spd->nr_pages - 1].len += *len; 3116 return false; 3117 } 3118 get_page(page); 3119 spd->pages[spd->nr_pages] = page; 3120 spd->partial[spd->nr_pages].len = *len; 3121 spd->partial[spd->nr_pages].offset = offset; 3122 spd->nr_pages++; 3123 3124 return false; 3125 } 3126 3127 static bool __splice_segment(struct page *page, unsigned int poff, 3128 unsigned int plen, unsigned int *off, 3129 unsigned int *len, 3130 struct splice_pipe_desc *spd, bool linear, 3131 struct sock *sk) 3132 { 3133 if (!*len) 3134 return true; 3135 3136 /* skip this segment if already processed */ 3137 if (*off >= plen) { 3138 *off -= plen; 3139 return false; 3140 } 3141 3142 /* ignore any bits we already processed */ 3143 poff += *off; 3144 plen -= *off; 3145 *off = 0; 3146 3147 do { 3148 unsigned int flen = min(*len, plen); 3149 3150 if (spd_fill_page(spd, page, &flen, poff, linear, sk)) 3151 return true; 3152 poff += flen; 3153 plen -= flen; 3154 *len -= flen; 3155 if (!*len) 3156 return true; 3157 } while (plen); 3158 3159 return false; 3160 } 3161 3162 /* 3163 * Map linear and fragment data from the skb to spd. It reports true if the 3164 * pipe is full or if we already spliced the requested length. 3165 */ 3166 static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, 3167 unsigned int *offset, unsigned int *len, 3168 struct splice_pipe_desc *spd, struct sock *sk) 3169 { 3170 struct sk_buff *iter; 3171 int seg; 3172 3173 /* map the linear part : 3174 * If skb->head_frag is set, this 'linear' part is backed by a 3175 * fragment, and if the head is not shared with any clones then 3176 * we can avoid a copy since we own the head portion of this page. 
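 * Otherwise the segment is passed as 'linear' and linear_to_page()
 * copies the data into a private page frag first.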
3177 */ 3178 if (__splice_segment(virt_to_page(skb->data), 3179 (unsigned long) skb->data & (PAGE_SIZE - 1), 3180 skb_headlen(skb), 3181 offset, len, spd, 3182 skb_head_is_locked(skb), 3183 sk)) 3184 return true; 3185 3186 /* 3187 * then map the fragments 3188 */ 3189 if (!skb_frags_readable(skb)) 3190 return false; 3191 3192 for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { 3193 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; 3194 3195 if (WARN_ON_ONCE(!skb_frag_page(f))) 3196 return false; 3197 3198 if (__splice_segment(skb_frag_page(f), 3199 skb_frag_off(f), skb_frag_size(f), 3200 offset, len, spd, false, sk)) 3201 return true; 3202 } 3203 3204 skb_walk_frags(skb, iter) { 3205 if (*offset >= iter->len) { 3206 *offset -= iter->len; 3207 continue; 3208 } 3209 /* __skb_splice_bits() only fails if the output has no room 3210 * left, so no point in going over the frag_list for the error 3211 * case. 3212 */ 3213 if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) 3214 return true; 3215 } 3216 3217 return false; 3218 } 3219 3220 /* 3221 * Map data from the skb to a pipe. Should handle both the linear part, 3222 * the fragments, and the frag list. 3223 */ 3224 int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, 3225 struct pipe_inode_info *pipe, unsigned int tlen, 3226 unsigned int flags) 3227 { 3228 struct partial_page partial[MAX_SKB_FRAGS]; 3229 struct page *pages[MAX_SKB_FRAGS]; 3230 struct splice_pipe_desc spd = { 3231 .pages = pages, 3232 .partial = partial, 3233 .nr_pages_max = MAX_SKB_FRAGS, 3234 .ops = &nosteal_pipe_buf_ops, 3235 .spd_release = sock_spd_release, 3236 }; 3237 int ret = 0; 3238 3239 __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); 3240 3241 if (spd.nr_pages) 3242 ret = splice_to_pipe(pipe, &spd); 3243 3244 return ret; 3245 } 3246 EXPORT_SYMBOL_GPL(skb_splice_bits); 3247 3248 static int sendmsg_locked(struct sock *sk, struct msghdr *msg) 3249 { 3250 struct socket *sock = sk->sk_socket; 3251 size_t size = msg_data_left(msg); 3252 3253 if (!sock) 3254 return -EINVAL; 3255 3256 if (!sock->ops->sendmsg_locked) 3257 return sock_no_sendmsg_locked(sk, msg, size); 3258 3259 return sock->ops->sendmsg_locked(sk, msg, size); 3260 } 3261 3262 static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) 3263 { 3264 struct socket *sock = sk->sk_socket; 3265 3266 if (!sock) 3267 return -EINVAL; 3268 return sock_sendmsg(sock, msg); 3269 } 3270 3271 typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); 3272 static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, 3273 int len, sendmsg_func sendmsg, int flags) 3274 { 3275 int more_hint = sk_is_tcp(sk) ? MSG_MORE : 0; 3276 unsigned int orig_len = len; 3277 struct sk_buff *head = skb; 3278 unsigned short fragidx; 3279 int slen, ret; 3280 3281 do_frag_list: 3282 3283 /* Deal with head data */ 3284 while (offset < skb_headlen(skb) && len) { 3285 struct kvec kv; 3286 struct msghdr msg; 3287 3288 slen = min_t(int, len, skb_headlen(skb) - offset); 3289 kv.iov_base = skb->data + offset; 3290 kv.iov_len = slen; 3291 memset(&msg, 0, sizeof(msg)); 3292 msg.msg_flags = MSG_DONTWAIT | flags; 3293 if (slen < len) 3294 msg.msg_flags |= more_hint; 3295 3296 iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); 3297 ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, 3298 sendmsg_unlocked, sk, &msg); 3299 if (ret <= 0) 3300 goto error; 3301 3302 offset += ret; 3303 len -= ret; 3304 } 3305 3306 /* All the data was skb head? 
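 * If so we are done; otherwise continue with the page frags and,
 * if needed, the frag list below.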
*/ 3307 if (!len) 3308 goto out; 3309 3310 /* Make offset relative to start of frags */ 3311 offset -= skb_headlen(skb); 3312 3313 /* Find where we are in frag list */ 3314 for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 3315 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 3316 3317 if (offset < skb_frag_size(frag)) 3318 break; 3319 3320 offset -= skb_frag_size(frag); 3321 } 3322 3323 for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { 3324 skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; 3325 3326 slen = min_t(size_t, len, skb_frag_size(frag) - offset); 3327 3328 while (slen) { 3329 struct bio_vec bvec; 3330 struct msghdr msg = { 3331 .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | 3332 flags, 3333 }; 3334 3335 if (slen < len) 3336 msg.msg_flags |= more_hint; 3337 bvec_set_page(&bvec, skb_frag_page(frag), slen, 3338 skb_frag_off(frag) + offset); 3339 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, 3340 slen); 3341 3342 ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, 3343 sendmsg_unlocked, sk, &msg); 3344 if (ret <= 0) 3345 goto error; 3346 3347 len -= ret; 3348 offset += ret; 3349 slen -= ret; 3350 } 3351 3352 offset = 0; 3353 } 3354 3355 if (len) { 3356 /* Process any frag lists */ 3357 3358 if (skb == head) { 3359 if (skb_has_frag_list(skb)) { 3360 skb = skb_shinfo(skb)->frag_list; 3361 goto do_frag_list; 3362 } 3363 } else if (skb->next) { 3364 skb = skb->next; 3365 goto do_frag_list; 3366 } 3367 } 3368 3369 out: 3370 return orig_len - len; 3371 3372 error: 3373 return orig_len == len ? ret : orig_len - len; 3374 } 3375 3376 /* Send skb data on a socket. Socket must be locked. */ 3377 int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, 3378 int len) 3379 { 3380 return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0); 3381 } 3382 EXPORT_SYMBOL_GPL(skb_send_sock_locked); 3383 3384 int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, 3385 int offset, int len, int flags) 3386 { 3387 return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags); 3388 } 3389 EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags); 3390 3391 /* Send skb data on a socket. Socket must be unlocked. */ 3392 int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) 3393 { 3394 return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0); 3395 } 3396 3397 /** 3398 * skb_store_bits - store bits from kernel buffer to skb 3399 * @skb: destination buffer 3400 * @offset: offset in destination 3401 * @from: source buffer 3402 * @len: number of bytes to copy 3403 * 3404 * Copy the specified number of bytes from the source buffer to the 3405 * destination skb. This function handles all the messy bits of 3406 * traversing fragment lists and such. 
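 *
 * Returns 0 on success or -EFAULT if the requested range falls outside
 * the buffer or its frags are not readable by the CPU.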
3407 */ 3408 3409 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) 3410 { 3411 int start = skb_headlen(skb); 3412 struct sk_buff *frag_iter; 3413 int i, copy; 3414 3415 if (offset > (int)skb->len - len) 3416 goto fault; 3417 3418 if ((copy = start - offset) > 0) { 3419 if (copy > len) 3420 copy = len; 3421 skb_copy_to_linear_data_offset(skb, offset, from, copy); 3422 if ((len -= copy) == 0) 3423 return 0; 3424 offset += copy; 3425 from += copy; 3426 } 3427 3428 if (!skb_frags_readable(skb)) 3429 goto fault; 3430 3431 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3432 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3433 int end; 3434 3435 WARN_ON(start > offset + len); 3436 3437 end = start + skb_frag_size(frag); 3438 if ((copy = end - offset) > 0) { 3439 u32 p_off, p_len, copied; 3440 struct page *p; 3441 u8 *vaddr; 3442 3443 if (copy > len) 3444 copy = len; 3445 3446 skb_frag_foreach_page(frag, 3447 skb_frag_off(frag) + offset - start, 3448 copy, p, p_off, p_len, copied) { 3449 vaddr = kmap_atomic(p); 3450 memcpy(vaddr + p_off, from + copied, p_len); 3451 kunmap_atomic(vaddr); 3452 } 3453 3454 if ((len -= copy) == 0) 3455 return 0; 3456 offset += copy; 3457 from += copy; 3458 } 3459 start = end; 3460 } 3461 3462 skb_walk_frags(skb, frag_iter) { 3463 int end; 3464 3465 WARN_ON(start > offset + len); 3466 3467 end = start + frag_iter->len; 3468 if ((copy = end - offset) > 0) { 3469 if (copy > len) 3470 copy = len; 3471 if (skb_store_bits(frag_iter, offset - start, 3472 from, copy)) 3473 goto fault; 3474 if ((len -= copy) == 0) 3475 return 0; 3476 offset += copy; 3477 from += copy; 3478 } 3479 start = end; 3480 } 3481 if (!len) 3482 return 0; 3483 3484 fault: 3485 return -EFAULT; 3486 } 3487 EXPORT_SYMBOL(skb_store_bits); 3488 3489 /* Checksum skb data. */ 3490 __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum) 3491 { 3492 int start = skb_headlen(skb); 3493 int i, copy = start - offset; 3494 struct sk_buff *frag_iter; 3495 int pos = 0; 3496 3497 /* Checksum header. 
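 * (the linear part of the buffer), then walk the page frags and
 * the frag list below.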
*/ 3498 if (copy > 0) { 3499 if (copy > len) 3500 copy = len; 3501 csum = csum_partial(skb->data + offset, copy, csum); 3502 if ((len -= copy) == 0) 3503 return csum; 3504 offset += copy; 3505 pos = copy; 3506 } 3507 3508 if (WARN_ON_ONCE(!skb_frags_readable(skb))) 3509 return 0; 3510 3511 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3512 int end; 3513 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3514 3515 WARN_ON(start > offset + len); 3516 3517 end = start + skb_frag_size(frag); 3518 if ((copy = end - offset) > 0) { 3519 u32 p_off, p_len, copied; 3520 struct page *p; 3521 __wsum csum2; 3522 u8 *vaddr; 3523 3524 if (copy > len) 3525 copy = len; 3526 3527 skb_frag_foreach_page(frag, 3528 skb_frag_off(frag) + offset - start, 3529 copy, p, p_off, p_len, copied) { 3530 vaddr = kmap_atomic(p); 3531 csum2 = csum_partial(vaddr + p_off, p_len, 0); 3532 kunmap_atomic(vaddr); 3533 csum = csum_block_add(csum, csum2, pos); 3534 pos += p_len; 3535 } 3536 3537 if (!(len -= copy)) 3538 return csum; 3539 offset += copy; 3540 } 3541 start = end; 3542 } 3543 3544 skb_walk_frags(skb, frag_iter) { 3545 int end; 3546 3547 WARN_ON(start > offset + len); 3548 3549 end = start + frag_iter->len; 3550 if ((copy = end - offset) > 0) { 3551 __wsum csum2; 3552 if (copy > len) 3553 copy = len; 3554 csum2 = skb_checksum(frag_iter, offset - start, copy, 3555 0); 3556 csum = csum_block_add(csum, csum2, pos); 3557 if ((len -= copy) == 0) 3558 return csum; 3559 offset += copy; 3560 pos += copy; 3561 } 3562 start = end; 3563 } 3564 BUG_ON(len); 3565 3566 return csum; 3567 } 3568 EXPORT_SYMBOL(skb_checksum); 3569 3570 /* Both of above in one bottle. */ 3571 3572 __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, 3573 u8 *to, int len) 3574 { 3575 int start = skb_headlen(skb); 3576 int i, copy = start - offset; 3577 struct sk_buff *frag_iter; 3578 int pos = 0; 3579 __wsum csum = 0; 3580 3581 /* Copy header. 
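 * (the linear part of the buffer), accumulating the checksum of the
 * copied bytes as we go.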
*/ 3582 if (copy > 0) { 3583 if (copy > len) 3584 copy = len; 3585 csum = csum_partial_copy_nocheck(skb->data + offset, to, 3586 copy); 3587 if ((len -= copy) == 0) 3588 return csum; 3589 offset += copy; 3590 to += copy; 3591 pos = copy; 3592 } 3593 3594 if (!skb_frags_readable(skb)) 3595 return 0; 3596 3597 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3598 int end; 3599 3600 WARN_ON(start > offset + len); 3601 3602 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 3603 if ((copy = end - offset) > 0) { 3604 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3605 u32 p_off, p_len, copied; 3606 struct page *p; 3607 __wsum csum2; 3608 u8 *vaddr; 3609 3610 if (copy > len) 3611 copy = len; 3612 3613 skb_frag_foreach_page(frag, 3614 skb_frag_off(frag) + offset - start, 3615 copy, p, p_off, p_len, copied) { 3616 vaddr = kmap_atomic(p); 3617 csum2 = csum_partial_copy_nocheck(vaddr + p_off, 3618 to + copied, 3619 p_len); 3620 kunmap_atomic(vaddr); 3621 csum = csum_block_add(csum, csum2, pos); 3622 pos += p_len; 3623 } 3624 3625 if (!(len -= copy)) 3626 return csum; 3627 offset += copy; 3628 to += copy; 3629 } 3630 start = end; 3631 } 3632 3633 skb_walk_frags(skb, frag_iter) { 3634 __wsum csum2; 3635 int end; 3636 3637 WARN_ON(start > offset + len); 3638 3639 end = start + frag_iter->len; 3640 if ((copy = end - offset) > 0) { 3641 if (copy > len) 3642 copy = len; 3643 csum2 = skb_copy_and_csum_bits(frag_iter, 3644 offset - start, 3645 to, copy); 3646 csum = csum_block_add(csum, csum2, pos); 3647 if ((len -= copy) == 0) 3648 return csum; 3649 offset += copy; 3650 to += copy; 3651 pos += copy; 3652 } 3653 start = end; 3654 } 3655 BUG_ON(len); 3656 return csum; 3657 } 3658 EXPORT_SYMBOL(skb_copy_and_csum_bits); 3659 3660 #ifdef CONFIG_NET_CRC32C 3661 u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc) 3662 { 3663 int start = skb_headlen(skb); 3664 int i, copy = start - offset; 3665 struct sk_buff *frag_iter; 3666 3667 if (copy > 0) { 3668 copy = min(copy, len); 3669 crc = crc32c(crc, skb->data + offset, copy); 3670 len -= copy; 3671 if (len == 0) 3672 return crc; 3673 offset += copy; 3674 } 3675 3676 if (WARN_ON_ONCE(!skb_frags_readable(skb))) 3677 return 0; 3678 3679 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3680 int end; 3681 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3682 3683 WARN_ON(start > offset + len); 3684 3685 end = start + skb_frag_size(frag); 3686 copy = end - offset; 3687 if (copy > 0) { 3688 u32 p_off, p_len, copied; 3689 struct page *p; 3690 u8 *vaddr; 3691 3692 copy = min(copy, len); 3693 skb_frag_foreach_page(frag, 3694 skb_frag_off(frag) + offset - start, 3695 copy, p, p_off, p_len, copied) { 3696 vaddr = kmap_atomic(p); 3697 crc = crc32c(crc, vaddr + p_off, p_len); 3698 kunmap_atomic(vaddr); 3699 } 3700 len -= copy; 3701 if (len == 0) 3702 return crc; 3703 offset += copy; 3704 } 3705 start = end; 3706 } 3707 3708 skb_walk_frags(skb, frag_iter) { 3709 int end; 3710 3711 WARN_ON(start > offset + len); 3712 3713 end = start + frag_iter->len; 3714 copy = end - offset; 3715 if (copy > 0) { 3716 copy = min(copy, len); 3717 crc = skb_crc32c(frag_iter, offset - start, copy, crc); 3718 len -= copy; 3719 if (len == 0) 3720 return crc; 3721 offset += copy; 3722 } 3723 start = end; 3724 } 3725 BUG_ON(len); 3726 3727 return crc; 3728 } 3729 EXPORT_SYMBOL(skb_crc32c); 3730 #endif /* CONFIG_NET_CRC32C */ 3731 3732 __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) 3733 { 3734 __sum16 sum; 3735 3736 sum = csum_fold(skb_checksum(skb, 0, len, 
skb->csum)); 3737 /* See comments in __skb_checksum_complete(). */ 3738 if (likely(!sum)) { 3739 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 3740 !skb->csum_complete_sw) 3741 netdev_rx_csum_fault(skb->dev, skb); 3742 } 3743 if (!skb_shared(skb)) 3744 skb->csum_valid = !sum; 3745 return sum; 3746 } 3747 EXPORT_SYMBOL(__skb_checksum_complete_head); 3748 3749 /* This function assumes skb->csum already holds pseudo header's checksum, 3750 * which has been changed from the hardware checksum, for example, by 3751 * __skb_checksum_validate_complete(). And, the original skb->csum must 3752 * have been validated unsuccessfully for CHECKSUM_COMPLETE case. 3753 * 3754 * It returns non-zero if the recomputed checksum is still invalid, otherwise 3755 * zero. The new checksum is stored back into skb->csum unless the skb is 3756 * shared. 3757 */ 3758 __sum16 __skb_checksum_complete(struct sk_buff *skb) 3759 { 3760 __wsum csum; 3761 __sum16 sum; 3762 3763 csum = skb_checksum(skb, 0, skb->len, 0); 3764 3765 sum = csum_fold(csum_add(skb->csum, csum)); 3766 /* This check is inverted, because we already knew the hardware 3767 * checksum is invalid before calling this function. So, if the 3768 * re-computed checksum is valid instead, then we have a mismatch 3769 * between the original skb->csum and skb_checksum(). This means either 3770 * the original hardware checksum is incorrect or we screw up skb->csum 3771 * when moving skb->data around. 3772 */ 3773 if (likely(!sum)) { 3774 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 3775 !skb->csum_complete_sw) 3776 netdev_rx_csum_fault(skb->dev, skb); 3777 } 3778 3779 if (!skb_shared(skb)) { 3780 /* Save full packet checksum */ 3781 skb->csum = csum; 3782 skb->ip_summed = CHECKSUM_COMPLETE; 3783 skb->csum_complete_sw = 1; 3784 skb->csum_valid = !sum; 3785 } 3786 3787 return sum; 3788 } 3789 EXPORT_SYMBOL(__skb_checksum_complete); 3790 3791 /** 3792 * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() 3793 * @from: source buffer 3794 * 3795 * Calculates the amount of linear headroom needed in the 'to' skb passed 3796 * into skb_zerocopy(). 3797 */ 3798 unsigned int 3799 skb_zerocopy_headlen(const struct sk_buff *from) 3800 { 3801 unsigned int hlen = 0; 3802 3803 if (!from->head_frag || 3804 skb_headlen(from) < L1_CACHE_BYTES || 3805 skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { 3806 hlen = skb_headlen(from); 3807 if (!hlen) 3808 hlen = from->len; 3809 } 3810 3811 if (skb_has_frag_list(from)) 3812 hlen = from->len; 3813 3814 return hlen; 3815 } 3816 EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); 3817 3818 /** 3819 * skb_zerocopy - Zero copy skb to skb 3820 * @to: destination buffer 3821 * @from: source buffer 3822 * @len: number of bytes to copy from source buffer 3823 * @hlen: size of linear headroom in destination buffer 3824 * 3825 * Copies up to `len` bytes from `from` to `to` by creating references 3826 * to the frags in the source buffer. 3827 * 3828 * The `hlen` as calculated by skb_zerocopy_headlen() specifies the 3829 * headroom in the `to` buffer. 
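 *
 * Illustrative pairing with skb_zerocopy_headlen() (hypothetical caller;
 * `extra` stands for caller-specific additional tailroom):
 *
 *	hlen = skb_zerocopy_headlen(from);
 *	to = alloc_skb(hlen + extra, GFP_ATOMIC);
 *	if (to)
 *		err = skb_zerocopy(to, from, len, hlen);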
3830 * 3831 * Return value: 3832 * 0: everything is OK 3833 * -ENOMEM: couldn't orphan frags of @from due to lack of memory 3834 * -EFAULT: skb_copy_bits() found some problem with skb geometry 3835 */ 3836 int 3837 skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) 3838 { 3839 int i, j = 0; 3840 int plen = 0; /* length of skb->head fragment */ 3841 int ret; 3842 struct page *page; 3843 unsigned int offset; 3844 3845 BUG_ON(!from->head_frag && !hlen); 3846 3847 /* dont bother with small payloads */ 3848 if (len <= skb_tailroom(to)) 3849 return skb_copy_bits(from, 0, skb_put(to, len), len); 3850 3851 if (hlen) { 3852 ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); 3853 if (unlikely(ret)) 3854 return ret; 3855 len -= hlen; 3856 } else { 3857 plen = min_t(int, skb_headlen(from), len); 3858 if (plen) { 3859 page = virt_to_head_page(from->head); 3860 offset = from->data - (unsigned char *)page_address(page); 3861 __skb_fill_netmem_desc(to, 0, page_to_netmem(page), 3862 offset, plen); 3863 get_page(page); 3864 j = 1; 3865 len -= plen; 3866 } 3867 } 3868 3869 skb_len_add(to, len + plen); 3870 3871 if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { 3872 skb_tx_error(from); 3873 return -ENOMEM; 3874 } 3875 skb_zerocopy_clone(to, from, GFP_ATOMIC); 3876 3877 for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { 3878 int size; 3879 3880 if (!len) 3881 break; 3882 skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; 3883 size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), 3884 len); 3885 skb_frag_size_set(&skb_shinfo(to)->frags[j], size); 3886 len -= size; 3887 skb_frag_ref(to, j); 3888 j++; 3889 } 3890 skb_shinfo(to)->nr_frags = j; 3891 3892 return 0; 3893 } 3894 EXPORT_SYMBOL_GPL(skb_zerocopy); 3895 3896 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) 3897 { 3898 __wsum csum; 3899 long csstart; 3900 3901 if (skb->ip_summed == CHECKSUM_PARTIAL) 3902 csstart = skb_checksum_start_offset(skb); 3903 else 3904 csstart = skb_headlen(skb); 3905 3906 BUG_ON(csstart > skb_headlen(skb)); 3907 3908 skb_copy_from_linear_data(skb, to, csstart); 3909 3910 csum = 0; 3911 if (csstart != skb->len) 3912 csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, 3913 skb->len - csstart); 3914 3915 if (skb->ip_summed == CHECKSUM_PARTIAL) { 3916 long csstuff = csstart + skb->csum_offset; 3917 3918 *((__sum16 *)(to + csstuff)) = csum_fold(csum); 3919 } 3920 } 3921 EXPORT_SYMBOL(skb_copy_and_csum_dev); 3922 3923 /** 3924 * skb_dequeue - remove from the head of the queue 3925 * @list: list to dequeue from 3926 * 3927 * Remove the head of the list. The list lock is taken so the function 3928 * may be used safely with other locking list functions. The head item is 3929 * returned or %NULL if the list is empty. 3930 */ 3931 3932 struct sk_buff *skb_dequeue(struct sk_buff_head *list) 3933 { 3934 unsigned long flags; 3935 struct sk_buff *result; 3936 3937 spin_lock_irqsave(&list->lock, flags); 3938 result = __skb_dequeue(list); 3939 spin_unlock_irqrestore(&list->lock, flags); 3940 return result; 3941 } 3942 EXPORT_SYMBOL(skb_dequeue); 3943 3944 /** 3945 * skb_dequeue_tail - remove from the tail of the queue 3946 * @list: list to dequeue from 3947 * 3948 * Remove the tail of the list. The list lock is taken so the function 3949 * may be used safely with other locking list functions. The tail item is 3950 * returned or %NULL if the list is empty. 
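 *
 * Illustrative drain loop (hypothetical caller, 'queue' being the
 * caller's &sk_buff_head):
 *
 *	while ((skb = skb_dequeue_tail(&queue)) != NULL)
 *		kfree_skb(skb);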
3951 */ 3952 struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) 3953 { 3954 unsigned long flags; 3955 struct sk_buff *result; 3956 3957 spin_lock_irqsave(&list->lock, flags); 3958 result = __skb_dequeue_tail(list); 3959 spin_unlock_irqrestore(&list->lock, flags); 3960 return result; 3961 } 3962 EXPORT_SYMBOL(skb_dequeue_tail); 3963 3964 /** 3965 * skb_queue_purge_reason - empty a list 3966 * @list: list to empty 3967 * @reason: drop reason 3968 * 3969 * Delete all buffers on an &sk_buff list. Each buffer is removed from 3970 * the list and one reference dropped. This function takes the list 3971 * lock and is atomic with respect to other list locking functions. 3972 */ 3973 void skb_queue_purge_reason(struct sk_buff_head *list, 3974 enum skb_drop_reason reason) 3975 { 3976 struct sk_buff_head tmp; 3977 unsigned long flags; 3978 3979 if (skb_queue_empty_lockless(list)) 3980 return; 3981 3982 __skb_queue_head_init(&tmp); 3983 3984 spin_lock_irqsave(&list->lock, flags); 3985 skb_queue_splice_init(list, &tmp); 3986 spin_unlock_irqrestore(&list->lock, flags); 3987 3988 __skb_queue_purge_reason(&tmp, reason); 3989 } 3990 EXPORT_SYMBOL(skb_queue_purge_reason); 3991 3992 /** 3993 * skb_rbtree_purge - empty a skb rbtree 3994 * @root: root of the rbtree to empty 3995 * Return value: the sum of truesizes of all purged skbs. 3996 * 3997 * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from 3998 * the list and one reference dropped. This function does not take 3999 * any lock. Synchronization should be handled by the caller (e.g., TCP 4000 * out-of-order queue is protected by the socket lock). 4001 */ 4002 unsigned int skb_rbtree_purge(struct rb_root *root) 4003 { 4004 struct rb_node *p = rb_first(root); 4005 unsigned int sum = 0; 4006 4007 while (p) { 4008 struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); 4009 4010 p = rb_next(p); 4011 rb_erase(&skb->rbnode, root); 4012 sum += skb->truesize; 4013 kfree_skb(skb); 4014 } 4015 return sum; 4016 } 4017 4018 void skb_errqueue_purge(struct sk_buff_head *list) 4019 { 4020 struct sk_buff *skb, *next; 4021 struct sk_buff_head kill; 4022 unsigned long flags; 4023 4024 __skb_queue_head_init(&kill); 4025 4026 spin_lock_irqsave(&list->lock, flags); 4027 skb_queue_walk_safe(list, skb, next) { 4028 if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY || 4029 SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) 4030 continue; 4031 __skb_unlink(skb, list); 4032 __skb_queue_tail(&kill, skb); 4033 } 4034 spin_unlock_irqrestore(&list->lock, flags); 4035 __skb_queue_purge(&kill); 4036 } 4037 EXPORT_SYMBOL(skb_errqueue_purge); 4038 4039 /** 4040 * skb_queue_head - queue a buffer at the list head 4041 * @list: list to use 4042 * @newsk: buffer to queue 4043 * 4044 * Queue a buffer at the start of the list. This function takes the 4045 * list lock and can be used safely with other locking &sk_buff functions 4046 * safely. 4047 * 4048 * A buffer cannot be placed on two lists at the same time. 4049 */ 4050 void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) 4051 { 4052 unsigned long flags; 4053 4054 spin_lock_irqsave(&list->lock, flags); 4055 __skb_queue_head(list, newsk); 4056 spin_unlock_irqrestore(&list->lock, flags); 4057 } 4058 EXPORT_SYMBOL(skb_queue_head); 4059 4060 /** 4061 * skb_queue_tail - queue a buffer at the list tail 4062 * @list: list to use 4063 * @newsk: buffer to queue 4064 * 4065 * Queue a buffer at the tail of the list. 
This function takes the 4066 * list lock and can be used safely with other locking &sk_buff functions 4067 * safely. 4068 * 4069 * A buffer cannot be placed on two lists at the same time. 4070 */ 4071 void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) 4072 { 4073 unsigned long flags; 4074 4075 spin_lock_irqsave(&list->lock, flags); 4076 __skb_queue_tail(list, newsk); 4077 spin_unlock_irqrestore(&list->lock, flags); 4078 } 4079 EXPORT_SYMBOL(skb_queue_tail); 4080 4081 /** 4082 * skb_unlink - remove a buffer from a list 4083 * @skb: buffer to remove 4084 * @list: list to use 4085 * 4086 * Remove a packet from a list. The list locks are taken and this 4087 * function is atomic with respect to other list locked calls 4088 * 4089 * You must know what list the SKB is on. 4090 */ 4091 void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) 4092 { 4093 unsigned long flags; 4094 4095 spin_lock_irqsave(&list->lock, flags); 4096 __skb_unlink(skb, list); 4097 spin_unlock_irqrestore(&list->lock, flags); 4098 } 4099 EXPORT_SYMBOL(skb_unlink); 4100 4101 /** 4102 * skb_append - append a buffer 4103 * @old: buffer to insert after 4104 * @newsk: buffer to insert 4105 * @list: list to use 4106 * 4107 * Place a packet after a given packet in a list. The list locks are taken 4108 * and this function is atomic with respect to other list locked calls. 4109 * A buffer cannot be placed on two lists at the same time. 4110 */ 4111 void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) 4112 { 4113 unsigned long flags; 4114 4115 spin_lock_irqsave(&list->lock, flags); 4116 __skb_queue_after(list, old, newsk); 4117 spin_unlock_irqrestore(&list->lock, flags); 4118 } 4119 EXPORT_SYMBOL(skb_append); 4120 4121 static inline void skb_split_inside_header(struct sk_buff *skb, 4122 struct sk_buff* skb1, 4123 const u32 len, const int pos) 4124 { 4125 int i; 4126 4127 skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), 4128 pos - len); 4129 /* And move data appendix as is. */ 4130 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 4131 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; 4132 4133 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; 4134 skb1->unreadable = skb->unreadable; 4135 skb_shinfo(skb)->nr_frags = 0; 4136 skb1->data_len = skb->data_len; 4137 skb1->len += skb1->data_len; 4138 skb->data_len = 0; 4139 skb->len = len; 4140 skb_set_tail_pointer(skb, len); 4141 } 4142 4143 static inline void skb_split_no_header(struct sk_buff *skb, 4144 struct sk_buff* skb1, 4145 const u32 len, int pos) 4146 { 4147 int i, k = 0; 4148 const int nfrags = skb_shinfo(skb)->nr_frags; 4149 4150 skb_shinfo(skb)->nr_frags = 0; 4151 skb1->len = skb1->data_len = skb->len - len; 4152 skb->len = len; 4153 skb->data_len = len - pos; 4154 4155 for (i = 0; i < nfrags; i++) { 4156 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 4157 4158 if (pos + size > len) { 4159 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; 4160 4161 if (pos < len) { 4162 /* Split frag. 4163 * We have two variants in this case: 4164 * 1. Move all the frag to the second 4165 * part, if it is possible. F.e. 4166 * this approach is mandatory for TUX, 4167 * where splitting is expensive. 4168 * 2. Split is accurately. We make this. 
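 *
 * A small worked example of variant 2, with illustrative numbers
 * only: if pos (the running offset of this frag) is 100, the
 * split point len is 150 and the frag itself holds 200 bytes,
 * then skb keeps the first len - pos = 50 bytes of the frag,
 * skb1's copy has its offset advanced and its size reduced by
 * those 50 bytes, and an extra page reference is taken because
 * both skbs now point into the same page.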
4169 */ 4170 skb_frag_ref(skb, i); 4171 skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos); 4172 skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); 4173 skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); 4174 skb_shinfo(skb)->nr_frags++; 4175 } 4176 k++; 4177 } else 4178 skb_shinfo(skb)->nr_frags++; 4179 pos += size; 4180 } 4181 skb_shinfo(skb1)->nr_frags = k; 4182 4183 skb1->unreadable = skb->unreadable; 4184 } 4185 4186 /** 4187 * skb_split - Split fragmented skb to two parts at length len. 4188 * @skb: the buffer to split 4189 * @skb1: the buffer to receive the second part 4190 * @len: new length for skb 4191 */ 4192 void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) 4193 { 4194 int pos = skb_headlen(skb); 4195 const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; 4196 4197 skb_zcopy_downgrade_managed(skb); 4198 4199 skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; 4200 skb_zerocopy_clone(skb1, skb, 0); 4201 if (len < pos) /* Split line is inside header. */ 4202 skb_split_inside_header(skb, skb1, len, pos); 4203 else /* Second chunk has no header, nothing to copy. */ 4204 skb_split_no_header(skb, skb1, len, pos); 4205 } 4206 EXPORT_SYMBOL(skb_split); 4207 4208 /* Shifting from/to a cloned skb is a no-go. 4209 * 4210 * Caller cannot keep skb_shinfo related pointers past calling here! 4211 */ 4212 static int skb_prepare_for_shift(struct sk_buff *skb) 4213 { 4214 return skb_unclone_keeptruesize(skb, GFP_ATOMIC); 4215 } 4216 4217 /** 4218 * skb_shift - Shifts paged data partially from skb to another 4219 * @tgt: buffer into which tail data gets added 4220 * @skb: buffer from which the paged data comes from 4221 * @shiftlen: shift up to this many bytes 4222 * 4223 * Attempts to shift up to shiftlen worth of bytes, which may be less than 4224 * the length of the skb, from skb to tgt. Returns number bytes shifted. 4225 * It's up to caller to free skb if everything was shifted. 4226 * 4227 * If @tgt runs out of frags, the whole operation is aborted. 4228 * 4229 * Skb cannot include anything else but paged data while tgt is allowed 4230 * to have non-paged data as well. 4231 * 4232 * TODO: full sized shift could be optimized but that would need 4233 * specialized skb free'er to handle frags without up-to-date nr_frags. 4234 */ 4235 int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) 4236 { 4237 int from, to, merge, todo; 4238 skb_frag_t *fragfrom, *fragto; 4239 4240 BUG_ON(shiftlen > skb->len); 4241 4242 if (skb_headlen(skb)) 4243 return 0; 4244 if (skb_zcopy(tgt) || skb_zcopy(skb)) 4245 return 0; 4246 4247 DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle); 4248 DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb)); 4249 4250 todo = shiftlen; 4251 from = 0; 4252 to = skb_shinfo(tgt)->nr_frags; 4253 fragfrom = &skb_shinfo(skb)->frags[from]; 4254 4255 /* Actual merge is delayed until the point when we know we can 4256 * commit all, so that we don't have to undo partial changes 4257 */ 4258 if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), 4259 skb_frag_off(fragfrom))) { 4260 merge = -1; 4261 } else { 4262 merge = to - 1; 4263 4264 todo -= skb_frag_size(fragfrom); 4265 if (todo < 0) { 4266 if (skb_prepare_for_shift(skb) || 4267 skb_prepare_for_shift(tgt)) 4268 return 0; 4269 4270 /* All previous frag pointers might be stale! 
*/ 4271 fragfrom = &skb_shinfo(skb)->frags[from]; 4272 fragto = &skb_shinfo(tgt)->frags[merge]; 4273 4274 skb_frag_size_add(fragto, shiftlen); 4275 skb_frag_size_sub(fragfrom, shiftlen); 4276 skb_frag_off_add(fragfrom, shiftlen); 4277 4278 goto onlymerged; 4279 } 4280 4281 from++; 4282 } 4283 4284 /* Skip full, not-fitting skb to avoid expensive operations */ 4285 if ((shiftlen == skb->len) && 4286 (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) 4287 return 0; 4288 4289 if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) 4290 return 0; 4291 4292 while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { 4293 if (to == MAX_SKB_FRAGS) 4294 return 0; 4295 4296 fragfrom = &skb_shinfo(skb)->frags[from]; 4297 fragto = &skb_shinfo(tgt)->frags[to]; 4298 4299 if (todo >= skb_frag_size(fragfrom)) { 4300 *fragto = *fragfrom; 4301 todo -= skb_frag_size(fragfrom); 4302 from++; 4303 to++; 4304 4305 } else { 4306 __skb_frag_ref(fragfrom); 4307 skb_frag_page_copy(fragto, fragfrom); 4308 skb_frag_off_copy(fragto, fragfrom); 4309 skb_frag_size_set(fragto, todo); 4310 4311 skb_frag_off_add(fragfrom, todo); 4312 skb_frag_size_sub(fragfrom, todo); 4313 todo = 0; 4314 4315 to++; 4316 break; 4317 } 4318 } 4319 4320 /* Ready to "commit" this state change to tgt */ 4321 skb_shinfo(tgt)->nr_frags = to; 4322 4323 if (merge >= 0) { 4324 fragfrom = &skb_shinfo(skb)->frags[0]; 4325 fragto = &skb_shinfo(tgt)->frags[merge]; 4326 4327 skb_frag_size_add(fragto, skb_frag_size(fragfrom)); 4328 __skb_frag_unref(fragfrom, skb->pp_recycle); 4329 } 4330 4331 /* Reposition in the original skb */ 4332 to = 0; 4333 while (from < skb_shinfo(skb)->nr_frags) 4334 skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; 4335 skb_shinfo(skb)->nr_frags = to; 4336 4337 BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); 4338 4339 onlymerged: 4340 /* Most likely the tgt won't ever need its checksum anymore, skb on 4341 * the other hand might need it if it needs to be resent 4342 */ 4343 tgt->ip_summed = CHECKSUM_PARTIAL; 4344 skb->ip_summed = CHECKSUM_PARTIAL; 4345 4346 skb_len_add(skb, -shiftlen); 4347 skb_len_add(tgt, shiftlen); 4348 4349 return shiftlen; 4350 } 4351 4352 /** 4353 * skb_prepare_seq_read - Prepare a sequential read of skb data 4354 * @skb: the buffer to read 4355 * @from: lower offset of data to be read 4356 * @to: upper offset of data to be read 4357 * @st: state variable 4358 * 4359 * Initializes the specified state variable. Must be called before 4360 * invoking skb_seq_read() for the first time. 4361 */ 4362 void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, 4363 unsigned int to, struct skb_seq_state *st) 4364 { 4365 st->lower_offset = from; 4366 st->upper_offset = to; 4367 st->root_skb = st->cur_skb = skb; 4368 st->frag_idx = st->stepped_offset = 0; 4369 st->frag_data = NULL; 4370 st->frag_off = 0; 4371 } 4372 EXPORT_SYMBOL(skb_prepare_seq_read); 4373 4374 /** 4375 * skb_seq_read - Sequentially read skb data 4376 * @consumed: number of bytes consumed by the caller so far 4377 * @data: destination pointer for data to be returned 4378 * @st: state variable 4379 * 4380 * Reads a block of skb data at @consumed relative to the 4381 * lower offset specified to skb_prepare_seq_read(). Assigns 4382 * the head of the data block to @data and returns the length 4383 * of the block or 0 if the end of the skb data or the upper 4384 * offset has been reached. 4385 * 4386 * The caller is not required to consume all of the data 4387 * returned, i.e. 
@consumed is typically set to the number 4388 * of bytes already consumed and the next call to 4389 * skb_seq_read() will return the remaining part of the block. 4390 * 4391 * Note 1: The size of each block of data returned can be arbitrary, 4392 * this limitation is the cost for zerocopy sequential 4393 * reads of potentially non linear data. 4394 * 4395 * Note 2: Fragment lists within fragments are not implemented 4396 * at the moment, state->root_skb could be replaced with 4397 * a stack for this purpose. 4398 */ 4399 unsigned int skb_seq_read(unsigned int consumed, const u8 **data, 4400 struct skb_seq_state *st) 4401 { 4402 unsigned int block_limit, abs_offset = consumed + st->lower_offset; 4403 skb_frag_t *frag; 4404 4405 if (unlikely(abs_offset >= st->upper_offset)) { 4406 if (st->frag_data) { 4407 kunmap_atomic(st->frag_data); 4408 st->frag_data = NULL; 4409 } 4410 return 0; 4411 } 4412 4413 next_skb: 4414 block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; 4415 4416 if (abs_offset < block_limit && !st->frag_data) { 4417 *data = st->cur_skb->data + (abs_offset - st->stepped_offset); 4418 return block_limit - abs_offset; 4419 } 4420 4421 if (!skb_frags_readable(st->cur_skb)) 4422 return 0; 4423 4424 if (st->frag_idx == 0 && !st->frag_data) 4425 st->stepped_offset += skb_headlen(st->cur_skb); 4426 4427 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { 4428 unsigned int pg_idx, pg_off, pg_sz; 4429 4430 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; 4431 4432 pg_idx = 0; 4433 pg_off = skb_frag_off(frag); 4434 pg_sz = skb_frag_size(frag); 4435 4436 if (skb_frag_must_loop(skb_frag_page(frag))) { 4437 pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; 4438 pg_off = offset_in_page(pg_off + st->frag_off); 4439 pg_sz = min_t(unsigned int, pg_sz - st->frag_off, 4440 PAGE_SIZE - pg_off); 4441 } 4442 4443 block_limit = pg_sz + st->stepped_offset; 4444 if (abs_offset < block_limit) { 4445 if (!st->frag_data) 4446 st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); 4447 4448 *data = (u8 *)st->frag_data + pg_off + 4449 (abs_offset - st->stepped_offset); 4450 4451 return block_limit - abs_offset; 4452 } 4453 4454 if (st->frag_data) { 4455 kunmap_atomic(st->frag_data); 4456 st->frag_data = NULL; 4457 } 4458 4459 st->stepped_offset += pg_sz; 4460 st->frag_off += pg_sz; 4461 if (st->frag_off == skb_frag_size(frag)) { 4462 st->frag_off = 0; 4463 st->frag_idx++; 4464 } 4465 } 4466 4467 if (st->frag_data) { 4468 kunmap_atomic(st->frag_data); 4469 st->frag_data = NULL; 4470 } 4471 4472 if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { 4473 st->cur_skb = skb_shinfo(st->root_skb)->frag_list; 4474 st->frag_idx = 0; 4475 goto next_skb; 4476 } else if (st->cur_skb->next) { 4477 st->cur_skb = st->cur_skb->next; 4478 st->frag_idx = 0; 4479 goto next_skb; 4480 } 4481 4482 return 0; 4483 } 4484 EXPORT_SYMBOL(skb_seq_read); 4485 4486 /** 4487 * skb_abort_seq_read - Abort a sequential read of skb data 4488 * @st: state variable 4489 * 4490 * Must be called if skb_seq_read() was not called until it 4491 * returned 0. 
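 *
 * A minimal usage sketch of the sequential read API; process() is a
 * hypothetical per-block handler, not a function defined here:
 *
 *	struct skb_seq_state st;
 *	unsigned int consumed = 0, len;
 *	const u8 *data;
 *
 *	skb_prepare_seq_read(skb, 0, skb->len, &st);
 *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
 *		if (process(data, len)) {
 *			skb_abort_seq_read(&st);
 *			break;
 *		}
 *		consumed += len;
 *	}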
4492 */ 4493 void skb_abort_seq_read(struct skb_seq_state *st) 4494 { 4495 if (st->frag_data) 4496 kunmap_atomic(st->frag_data); 4497 } 4498 EXPORT_SYMBOL(skb_abort_seq_read); 4499 4500 /** 4501 * skb_copy_seq_read() - copy from a skb_seq_state to a buffer 4502 * @st: source skb_seq_state 4503 * @offset: offset in source 4504 * @to: destination buffer 4505 * @len: number of bytes to copy 4506 * 4507 * Copy @len bytes from @offset bytes into the source @st to the destination 4508 * buffer @to. `offset` should increase (or be unchanged) with each subsequent 4509 * call to this function. If offset needs to decrease from the previous use `st` 4510 * should be reset first. 4511 * 4512 * Return: 0 on success or -EINVAL if the copy ended early 4513 */ 4514 int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len) 4515 { 4516 const u8 *data; 4517 u32 sqlen; 4518 4519 for (;;) { 4520 sqlen = skb_seq_read(offset, &data, st); 4521 if (sqlen == 0) 4522 return -EINVAL; 4523 if (sqlen >= len) { 4524 memcpy(to, data, len); 4525 return 0; 4526 } 4527 memcpy(to, data, sqlen); 4528 to += sqlen; 4529 offset += sqlen; 4530 len -= sqlen; 4531 } 4532 } 4533 EXPORT_SYMBOL(skb_copy_seq_read); 4534 4535 #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) 4536 4537 static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, 4538 struct ts_config *conf, 4539 struct ts_state *state) 4540 { 4541 return skb_seq_read(offset, text, TS_SKB_CB(state)); 4542 } 4543 4544 static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) 4545 { 4546 skb_abort_seq_read(TS_SKB_CB(state)); 4547 } 4548 4549 /** 4550 * skb_find_text - Find a text pattern in skb data 4551 * @skb: the buffer to look in 4552 * @from: search offset 4553 * @to: search limit 4554 * @config: textsearch configuration 4555 * 4556 * Finds a pattern in the skb data according to the specified 4557 * textsearch configuration. Use textsearch_next() to retrieve 4558 * subsequent occurrences of the pattern. Returns the offset 4559 * to the first occurrence or UINT_MAX if no match was found. 4560 */ 4561 unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, 4562 unsigned int to, struct ts_config *config) 4563 { 4564 unsigned int patlen = config->ops->get_pattern_len(config); 4565 struct ts_state state; 4566 unsigned int ret; 4567 4568 BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); 4569 4570 config->get_next_block = skb_ts_get_next_block; 4571 config->finish = skb_ts_finish; 4572 4573 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); 4574 4575 ret = textsearch_find(config, &state); 4576 return (ret + patlen <= to - from ? 
ret : UINT_MAX); 4577 } 4578 EXPORT_SYMBOL(skb_find_text); 4579 4580 int skb_append_pagefrags(struct sk_buff *skb, struct page *page, 4581 int offset, size_t size, size_t max_frags) 4582 { 4583 int i = skb_shinfo(skb)->nr_frags; 4584 4585 if (skb_can_coalesce(skb, i, page, offset)) { 4586 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); 4587 } else if (i < max_frags) { 4588 skb_zcopy_downgrade_managed(skb); 4589 get_page(page); 4590 skb_fill_page_desc_noacc(skb, i, page, offset, size); 4591 } else { 4592 return -EMSGSIZE; 4593 } 4594 4595 return 0; 4596 } 4597 EXPORT_SYMBOL_GPL(skb_append_pagefrags); 4598 4599 /** 4600 * skb_pull_rcsum - pull skb and update receive checksum 4601 * @skb: buffer to update 4602 * @len: length of data pulled 4603 * 4604 * This function performs an skb_pull on the packet and updates 4605 * the CHECKSUM_COMPLETE checksum. It should be used on 4606 * receive path processing instead of skb_pull unless you know 4607 * that the checksum difference is zero (e.g., a valid IP header) 4608 * or you are setting ip_summed to CHECKSUM_NONE. 4609 */ 4610 void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) 4611 { 4612 unsigned char *data = skb->data; 4613 4614 BUG_ON(len > skb->len); 4615 __skb_pull(skb, len); 4616 skb_postpull_rcsum(skb, data, len); 4617 return skb->data; 4618 } 4619 EXPORT_SYMBOL_GPL(skb_pull_rcsum); 4620 4621 static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) 4622 { 4623 skb_frag_t head_frag; 4624 struct page *page; 4625 4626 page = virt_to_head_page(frag_skb->head); 4627 skb_frag_fill_page_desc(&head_frag, page, frag_skb->data - 4628 (unsigned char *)page_address(page), 4629 skb_headlen(frag_skb)); 4630 return head_frag; 4631 } 4632 4633 struct sk_buff *skb_segment_list(struct sk_buff *skb, 4634 netdev_features_t features, 4635 unsigned int offset) 4636 { 4637 struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; 4638 unsigned int tnl_hlen = skb_tnl_header_len(skb); 4639 unsigned int delta_truesize = 0; 4640 unsigned int delta_len = 0; 4641 struct sk_buff *tail = NULL; 4642 struct sk_buff *nskb, *tmp; 4643 int len_diff, err; 4644 4645 skb_push(skb, -skb_network_offset(skb) + offset); 4646 4647 /* Ensure the head is writeable before touching the shared info */ 4648 err = skb_unclone(skb, GFP_ATOMIC); 4649 if (err) 4650 goto err_linearize; 4651 4652 skb_shinfo(skb)->frag_list = NULL; 4653 4654 while (list_skb) { 4655 nskb = list_skb; 4656 list_skb = list_skb->next; 4657 4658 err = 0; 4659 delta_truesize += nskb->truesize; 4660 if (skb_shared(nskb)) { 4661 tmp = skb_clone(nskb, GFP_ATOMIC); 4662 if (tmp) { 4663 consume_skb(nskb); 4664 nskb = tmp; 4665 err = skb_unclone(nskb, GFP_ATOMIC); 4666 } else { 4667 err = -ENOMEM; 4668 } 4669 } 4670 4671 if (!tail) 4672 skb->next = nskb; 4673 else 4674 tail->next = nskb; 4675 4676 if (unlikely(err)) { 4677 nskb->next = list_skb; 4678 goto err_linearize; 4679 } 4680 4681 tail = nskb; 4682 4683 delta_len += nskb->len; 4684 4685 skb_push(nskb, -skb_network_offset(nskb) + offset); 4686 4687 skb_release_head_state(nskb); 4688 len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); 4689 __copy_skb_header(nskb, skb); 4690 4691 skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); 4692 nskb->transport_header += len_diff; 4693 skb_copy_from_linear_data_offset(skb, -tnl_hlen, 4694 nskb->data - tnl_hlen, 4695 offset + tnl_hlen); 4696 4697 if (skb_needs_linearize(nskb, features) && 4698 __skb_linearize(nskb)) 4699 goto err_linearize; 4700 } 4701 4702 
skb->truesize = skb->truesize - delta_truesize; 4703 skb->data_len = skb->data_len - delta_len; 4704 skb->len = skb->len - delta_len; 4705 4706 skb_gso_reset(skb); 4707 4708 skb->prev = tail; 4709 4710 if (skb_needs_linearize(skb, features) && 4711 __skb_linearize(skb)) 4712 goto err_linearize; 4713 4714 skb_get(skb); 4715 4716 return skb; 4717 4718 err_linearize: 4719 kfree_skb_list(skb->next); 4720 skb->next = NULL; 4721 return ERR_PTR(-ENOMEM); 4722 } 4723 EXPORT_SYMBOL_GPL(skb_segment_list); 4724 4725 /** 4726 * skb_segment - Perform protocol segmentation on skb. 4727 * @head_skb: buffer to segment 4728 * @features: features for the output path (see dev->features) 4729 * 4730 * This function performs segmentation on the given skb. It returns 4731 * a pointer to the first in a list of new skbs for the segments. 4732 * In case of error it returns ERR_PTR(err). 4733 */ 4734 struct sk_buff *skb_segment(struct sk_buff *head_skb, 4735 netdev_features_t features) 4736 { 4737 struct sk_buff *segs = NULL; 4738 struct sk_buff *tail = NULL; 4739 struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; 4740 unsigned int mss = skb_shinfo(head_skb)->gso_size; 4741 unsigned int doffset = head_skb->data - skb_mac_header(head_skb); 4742 unsigned int offset = doffset; 4743 unsigned int tnl_hlen = skb_tnl_header_len(head_skb); 4744 unsigned int partial_segs = 0; 4745 unsigned int headroom; 4746 unsigned int len = head_skb->len; 4747 struct sk_buff *frag_skb; 4748 skb_frag_t *frag; 4749 __be16 proto; 4750 bool csum, sg; 4751 int err = -ENOMEM; 4752 int i = 0; 4753 int nfrags, pos; 4754 4755 if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && 4756 mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { 4757 struct sk_buff *check_skb; 4758 4759 for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { 4760 if (skb_headlen(check_skb) && !check_skb->head_frag) { 4761 /* gso_size is untrusted, and we have a frag_list with 4762 * a linear non head_frag item. 4763 * 4764 * If head_skb's headlen does not fit requested gso_size, 4765 * it means that the frag_list members do NOT terminate 4766 * on exact gso_size boundaries. Hence we cannot perform 4767 * skb_frag_t page sharing. Therefore we must fallback to 4768 * copying the frag_list skbs; we do so by disabling SG. 4769 */ 4770 features &= ~NETIF_F_SG; 4771 break; 4772 } 4773 } 4774 } 4775 4776 __skb_push(head_skb, doffset); 4777 proto = skb_network_protocol(head_skb, NULL); 4778 if (unlikely(!proto)) 4779 return ERR_PTR(-EINVAL); 4780 4781 sg = !!(features & NETIF_F_SG); 4782 csum = !!can_checksum_protocol(features, proto); 4783 4784 if (sg && csum && (mss != GSO_BY_FRAGS)) { 4785 if (!(features & NETIF_F_GSO_PARTIAL)) { 4786 struct sk_buff *iter; 4787 unsigned int frag_len; 4788 4789 if (!list_skb || 4790 !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) 4791 goto normal; 4792 4793 /* If we get here then all the required 4794 * GSO features except frag_list are supported. 4795 * Try to split the SKB to multiple GSO SKBs 4796 * with no frag_list. 4797 * Currently we can do that only when the buffers don't 4798 * have a linear part and all the buffers except 4799 * the last are of the same length. 
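 * (The head skb's own payload, i.e. what remains after
 * subtracting every frag_list member, must also match that
 * common length; the len != frag_len check below enforces
 * this.)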
4800 */ 4801 frag_len = list_skb->len; 4802 skb_walk_frags(head_skb, iter) { 4803 if (frag_len != iter->len && iter->next) 4804 goto normal; 4805 if (skb_headlen(iter) && !iter->head_frag) 4806 goto normal; 4807 4808 len -= iter->len; 4809 } 4810 4811 if (len != frag_len) 4812 goto normal; 4813 } 4814 4815 /* GSO partial only requires that we trim off any excess that 4816 * doesn't fit into an MSS sized block, so take care of that 4817 * now. 4818 * Cap len to not accidentally hit GSO_BY_FRAGS. 4819 */ 4820 partial_segs = min(len, GSO_BY_FRAGS - 1) / mss; 4821 if (partial_segs > 1) 4822 mss *= partial_segs; 4823 else 4824 partial_segs = 0; 4825 } 4826 4827 normal: 4828 headroom = skb_headroom(head_skb); 4829 pos = skb_headlen(head_skb); 4830 4831 if (skb_orphan_frags(head_skb, GFP_ATOMIC)) 4832 return ERR_PTR(-ENOMEM); 4833 4834 nfrags = skb_shinfo(head_skb)->nr_frags; 4835 frag = skb_shinfo(head_skb)->frags; 4836 frag_skb = head_skb; 4837 4838 do { 4839 struct sk_buff *nskb; 4840 skb_frag_t *nskb_frag; 4841 int hsize; 4842 int size; 4843 4844 if (unlikely(mss == GSO_BY_FRAGS)) { 4845 len = list_skb->len; 4846 } else { 4847 len = head_skb->len - offset; 4848 if (len > mss) 4849 len = mss; 4850 } 4851 4852 hsize = skb_headlen(head_skb) - offset; 4853 4854 if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) && 4855 (skb_headlen(list_skb) == len || sg)) { 4856 BUG_ON(skb_headlen(list_skb) > len); 4857 4858 nskb = skb_clone(list_skb, GFP_ATOMIC); 4859 if (unlikely(!nskb)) 4860 goto err; 4861 4862 i = 0; 4863 nfrags = skb_shinfo(list_skb)->nr_frags; 4864 frag = skb_shinfo(list_skb)->frags; 4865 frag_skb = list_skb; 4866 pos += skb_headlen(list_skb); 4867 4868 while (pos < offset + len) { 4869 BUG_ON(i >= nfrags); 4870 4871 size = skb_frag_size(frag); 4872 if (pos + size > offset + len) 4873 break; 4874 4875 i++; 4876 pos += size; 4877 frag++; 4878 } 4879 4880 list_skb = list_skb->next; 4881 4882 if (unlikely(pskb_trim(nskb, len))) { 4883 kfree_skb(nskb); 4884 goto err; 4885 } 4886 4887 hsize = skb_end_offset(nskb); 4888 if (skb_cow_head(nskb, doffset + headroom)) { 4889 kfree_skb(nskb); 4890 goto err; 4891 } 4892 4893 nskb->truesize += skb_end_offset(nskb) - hsize; 4894 skb_release_head_state(nskb); 4895 __skb_push(nskb, doffset); 4896 } else { 4897 if (hsize < 0) 4898 hsize = 0; 4899 if (hsize > len || !sg) 4900 hsize = len; 4901 4902 nskb = __alloc_skb(hsize + doffset + headroom, 4903 GFP_ATOMIC, skb_alloc_rx_flag(head_skb), 4904 NUMA_NO_NODE); 4905 4906 if (unlikely(!nskb)) 4907 goto err; 4908 4909 skb_reserve(nskb, headroom); 4910 __skb_put(nskb, doffset); 4911 } 4912 4913 if (segs) 4914 tail->next = nskb; 4915 else 4916 segs = nskb; 4917 tail = nskb; 4918 4919 __copy_skb_header(nskb, head_skb); 4920 4921 skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); 4922 skb_reset_mac_len(nskb); 4923 4924 skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, 4925 nskb->data - tnl_hlen, 4926 doffset + tnl_hlen); 4927 4928 if (nskb->len == len + doffset) 4929 goto perform_csum_check; 4930 4931 if (!sg) { 4932 if (!csum) { 4933 if (!nskb->remcsum_offload) 4934 nskb->ip_summed = CHECKSUM_NONE; 4935 SKB_GSO_CB(nskb)->csum = 4936 skb_copy_and_csum_bits(head_skb, offset, 4937 skb_put(nskb, 4938 len), 4939 len); 4940 SKB_GSO_CB(nskb)->csum_start = 4941 skb_headroom(nskb) + doffset; 4942 } else { 4943 if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) 4944 goto err; 4945 } 4946 continue; 4947 } 4948 4949 nskb_frag = skb_shinfo(nskb)->frags; 4950 4951 
skb_copy_from_linear_data_offset(head_skb, offset, 4952 skb_put(nskb, hsize), hsize); 4953 4954 skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & 4955 SKBFL_SHARED_FRAG; 4956 4957 if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) 4958 goto err; 4959 4960 while (pos < offset + len) { 4961 if (i >= nfrags) { 4962 if (skb_orphan_frags(list_skb, GFP_ATOMIC) || 4963 skb_zerocopy_clone(nskb, list_skb, 4964 GFP_ATOMIC)) 4965 goto err; 4966 4967 i = 0; 4968 nfrags = skb_shinfo(list_skb)->nr_frags; 4969 frag = skb_shinfo(list_skb)->frags; 4970 frag_skb = list_skb; 4971 if (!skb_headlen(list_skb)) { 4972 BUG_ON(!nfrags); 4973 } else { 4974 BUG_ON(!list_skb->head_frag); 4975 4976 /* to make room for head_frag. */ 4977 i--; 4978 frag--; 4979 } 4980 4981 list_skb = list_skb->next; 4982 } 4983 4984 if (unlikely(skb_shinfo(nskb)->nr_frags >= 4985 MAX_SKB_FRAGS)) { 4986 net_warn_ratelimited( 4987 "skb_segment: too many frags: %u %u\n", 4988 pos, mss); 4989 err = -EINVAL; 4990 goto err; 4991 } 4992 4993 *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; 4994 __skb_frag_ref(nskb_frag); 4995 size = skb_frag_size(nskb_frag); 4996 4997 if (pos < offset) { 4998 skb_frag_off_add(nskb_frag, offset - pos); 4999 skb_frag_size_sub(nskb_frag, offset - pos); 5000 } 5001 5002 skb_shinfo(nskb)->nr_frags++; 5003 5004 if (pos + size <= offset + len) { 5005 i++; 5006 frag++; 5007 pos += size; 5008 } else { 5009 skb_frag_size_sub(nskb_frag, pos + size - (offset + len)); 5010 goto skip_fraglist; 5011 } 5012 5013 nskb_frag++; 5014 } 5015 5016 skip_fraglist: 5017 nskb->data_len = len - hsize; 5018 nskb->len += nskb->data_len; 5019 nskb->truesize += nskb->data_len; 5020 5021 perform_csum_check: 5022 if (!csum) { 5023 if (skb_has_shared_frag(nskb) && 5024 __skb_linearize(nskb)) 5025 goto err; 5026 5027 if (!nskb->remcsum_offload) 5028 nskb->ip_summed = CHECKSUM_NONE; 5029 SKB_GSO_CB(nskb)->csum = 5030 skb_checksum(nskb, doffset, 5031 nskb->len - doffset, 0); 5032 SKB_GSO_CB(nskb)->csum_start = 5033 skb_headroom(nskb) + doffset; 5034 } 5035 } while ((offset += len) < head_skb->len); 5036 5037 /* Some callers want to get the end of the list. 5038 * Put it in segs->prev to avoid walking the list. 5039 * (see validate_xmit_skb_list() for example) 5040 */ 5041 segs->prev = tail; 5042 5043 if (partial_segs) { 5044 struct sk_buff *iter; 5045 int type = skb_shinfo(head_skb)->gso_type; 5046 unsigned short gso_size = skb_shinfo(head_skb)->gso_size; 5047 5048 /* Update type to add partial and then remove dodgy if set */ 5049 type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; 5050 type &= ~SKB_GSO_DODGY; 5051 5052 /* Update GSO info and prepare to start updating headers on 5053 * our way back down the stack of protocols. 5054 */ 5055 for (iter = segs; iter; iter = iter->next) { 5056 skb_shinfo(iter)->gso_size = gso_size; 5057 skb_shinfo(iter)->gso_segs = partial_segs; 5058 skb_shinfo(iter)->gso_type = type; 5059 SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; 5060 } 5061 5062 if (tail->len - doffset <= gso_size) 5063 skb_shinfo(tail)->gso_size = 0; 5064 else if (tail != segs) 5065 skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); 5066 } 5067 5068 /* Following permits correct backpressure, for protocols 5069 * using skb_set_owner_w(). 5070 * Idea is to tranfert ownership from head_skb to last segment. 
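 * Concretely, the swaps of truesize, destructor and sk below move the
 * sk_wmem_alloc charge taken by skb_set_owner_w() from head_skb to the
 * last segment, so the send buffer space is only released once that
 * final segment has actually been freed.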
5071 */ 5072 if (head_skb->destructor == sock_wfree) { 5073 swap(tail->truesize, head_skb->truesize); 5074 swap(tail->destructor, head_skb->destructor); 5075 swap(tail->sk, head_skb->sk); 5076 } 5077 return segs; 5078 5079 err: 5080 kfree_skb_list(segs); 5081 return ERR_PTR(err); 5082 } 5083 EXPORT_SYMBOL_GPL(skb_segment); 5084 5085 #ifdef CONFIG_SKB_EXTENSIONS 5086 #define SKB_EXT_ALIGN_VALUE 8 5087 #define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) 5088 5089 static const u8 skb_ext_type_len[] = { 5090 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) 5091 [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), 5092 #endif 5093 #ifdef CONFIG_XFRM 5094 [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), 5095 #endif 5096 #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) 5097 [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), 5098 #endif 5099 #if IS_ENABLED(CONFIG_MPTCP) 5100 [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), 5101 #endif 5102 #if IS_ENABLED(CONFIG_MCTP_FLOWS) 5103 [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), 5104 #endif 5105 #if IS_ENABLED(CONFIG_INET_PSP) 5106 [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), 5107 #endif 5108 }; 5109 5110 static __always_inline unsigned int skb_ext_total_length(void) 5111 { 5112 unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext); 5113 int i; 5114 5115 for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++) 5116 l += skb_ext_type_len[i]; 5117 5118 return l; 5119 } 5120 5121 static void skb_extensions_init(void) 5122 { 5123 BUILD_BUG_ON(SKB_EXT_NUM >= 8); 5124 #if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL) 5125 BUILD_BUG_ON(skb_ext_total_length() > 255); 5126 #endif 5127 5128 skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache", 5129 SKB_EXT_ALIGN_VALUE * skb_ext_total_length(), 5130 0, 5131 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 5132 NULL); 5133 } 5134 #else 5135 static void skb_extensions_init(void) {} 5136 #endif 5137 5138 /* The SKB kmem_cache slab is critical for network performance. Never 5139 * merge/alias the slab with similar sized objects. This avoids fragmentation 5140 * that hurts performance of kmem_cache_{alloc,free}_bulk APIs. 5141 */ 5142 #ifndef CONFIG_SLUB_TINY 5143 #define FLAG_SKB_NO_MERGE SLAB_NO_MERGE 5144 #else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */ 5145 #define FLAG_SKB_NO_MERGE 0 5146 #endif 5147 5148 void __init skb_init(void) 5149 { 5150 net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", 5151 sizeof(struct sk_buff), 5152 0, 5153 SLAB_HWCACHE_ALIGN|SLAB_PANIC| 5154 FLAG_SKB_NO_MERGE, 5155 offsetof(struct sk_buff, cb), 5156 sizeof_field(struct sk_buff, cb), 5157 NULL); 5158 skbuff_cache_size = kmem_cache_size(net_hotdata.skbuff_cache); 5159 5160 net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", 5161 sizeof(struct sk_buff_fclones), 5162 0, 5163 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 5164 NULL); 5165 /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. 5166 * struct skb_shared_info is located at the end of skb->head, 5167 * and should not be copied to/from user. 
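 * (The call below therefore whitelists only bytes
 * 0 .. SKB_SMALL_HEAD_HEADROOM - 1 of each object, via the
 * useroffset and usersize arguments 0 and SKB_SMALL_HEAD_HEADROOM.)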
5168 */ 5169 net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head", 5170 SKB_SMALL_HEAD_CACHE_SIZE, 5171 0, 5172 SLAB_HWCACHE_ALIGN | SLAB_PANIC, 5173 0, 5174 SKB_SMALL_HEAD_HEADROOM, 5175 NULL); 5176 skb_extensions_init(); 5177 } 5178 5179 static int 5180 __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, 5181 unsigned int recursion_level) 5182 { 5183 int start = skb_headlen(skb); 5184 int i, copy = start - offset; 5185 struct sk_buff *frag_iter; 5186 int elt = 0; 5187 5188 if (unlikely(recursion_level >= 24)) 5189 return -EMSGSIZE; 5190 5191 if (copy > 0) { 5192 if (copy > len) 5193 copy = len; 5194 sg_set_buf(sg, skb->data + offset, copy); 5195 elt++; 5196 if ((len -= copy) == 0) 5197 return elt; 5198 offset += copy; 5199 } 5200 5201 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 5202 int end; 5203 5204 WARN_ON(start > offset + len); 5205 5206 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 5207 if ((copy = end - offset) > 0) { 5208 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 5209 if (unlikely(elt && sg_is_last(&sg[elt - 1]))) 5210 return -EMSGSIZE; 5211 5212 if (copy > len) 5213 copy = len; 5214 sg_set_page(&sg[elt], skb_frag_page(frag), copy, 5215 skb_frag_off(frag) + offset - start); 5216 elt++; 5217 if (!(len -= copy)) 5218 return elt; 5219 offset += copy; 5220 } 5221 start = end; 5222 } 5223 5224 skb_walk_frags(skb, frag_iter) { 5225 int end, ret; 5226 5227 WARN_ON(start > offset + len); 5228 5229 end = start + frag_iter->len; 5230 if ((copy = end - offset) > 0) { 5231 if (unlikely(elt && sg_is_last(&sg[elt - 1]))) 5232 return -EMSGSIZE; 5233 5234 if (copy > len) 5235 copy = len; 5236 ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, 5237 copy, recursion_level + 1); 5238 if (unlikely(ret < 0)) 5239 return ret; 5240 elt += ret; 5241 if ((len -= copy) == 0) 5242 return elt; 5243 offset += copy; 5244 } 5245 start = end; 5246 } 5247 BUG_ON(len); 5248 return elt; 5249 } 5250 5251 /** 5252 * skb_to_sgvec - Fill a scatter-gather list from a socket buffer 5253 * @skb: Socket buffer containing the buffers to be mapped 5254 * @sg: The scatter-gather list to map into 5255 * @offset: The offset into the buffer's contents to start mapping 5256 * @len: Length of buffer space to be mapped 5257 * 5258 * Fill the specified scatter-gather list with mappings/pointers into a 5259 * region of the buffer space attached to a socket buffer. Returns either 5260 * the number of scatterlist items used, or -EMSGSIZE if the contents 5261 * could not fit. 5262 */ 5263 int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) 5264 { 5265 int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); 5266 5267 if (nsg <= 0) 5268 return nsg; 5269 5270 sg_mark_end(&sg[nsg - 1]); 5271 5272 return nsg; 5273 } 5274 EXPORT_SYMBOL_GPL(skb_to_sgvec); 5275 5276 /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given 5277 * sglist without mark the sg which contain last skb data as the end. 5278 * So the caller can mannipulate sg list as will when padding new data after 5279 * the first call without calling sg_unmark_end to expend sg list. 5280 * 5281 * Scenario to use skb_to_sgvec_nomark: 5282 * 1. sg_init_table 5283 * 2. skb_to_sgvec_nomark(payload1) 5284 * 3. skb_to_sgvec_nomark(payload2) 5285 * 5286 * This is equivalent to: 5287 * 1. sg_init_table 5288 * 2. skb_to_sgvec(payload1) 5289 * 3. sg_unmark_end 5290 * 4. 
skb_to_sgvec(payload2) 5291 * 5292 * When mapping multiple payload conditionally, skb_to_sgvec_nomark 5293 * is more preferable. 5294 */ 5295 int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, 5296 int offset, int len) 5297 { 5298 return __skb_to_sgvec(skb, sg, offset, len, 0); 5299 } 5300 EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); 5301 5302 5303 5304 /** 5305 * skb_cow_data - Check that a socket buffer's data buffers are writable 5306 * @skb: The socket buffer to check. 5307 * @tailbits: Amount of trailing space to be added 5308 * @trailer: Returned pointer to the skb where the @tailbits space begins 5309 * 5310 * Make sure that the data buffers attached to a socket buffer are 5311 * writable. If they are not, private copies are made of the data buffers 5312 * and the socket buffer is set to use these instead. 5313 * 5314 * If @tailbits is given, make sure that there is space to write @tailbits 5315 * bytes of data beyond current end of socket buffer. @trailer will be 5316 * set to point to the skb in which this space begins. 5317 * 5318 * The number of scatterlist elements required to completely map the 5319 * COW'd and extended socket buffer will be returned. 5320 */ 5321 int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) 5322 { 5323 int copyflag; 5324 int elt; 5325 struct sk_buff *skb1, **skb_p; 5326 5327 /* If skb is cloned or its head is paged, reallocate 5328 * head pulling out all the pages (pages are considered not writable 5329 * at the moment even if they are anonymous). 5330 */ 5331 if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && 5332 !__pskb_pull_tail(skb, __skb_pagelen(skb))) 5333 return -ENOMEM; 5334 5335 /* Easy case. Most of packets will go this way. */ 5336 if (!skb_has_frag_list(skb)) { 5337 /* A little of trouble, not enough of space for trailer. 5338 * This should not happen, when stack is tuned to generate 5339 * good frames. OK, on miss we reallocate and reserve even more 5340 * space, 128 bytes is fair. */ 5341 5342 if (skb_tailroom(skb) < tailbits && 5343 pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) 5344 return -ENOMEM; 5345 5346 /* Voila! */ 5347 *trailer = skb; 5348 return 1; 5349 } 5350 5351 /* Misery. We are in troubles, going to mincer fragments... */ 5352 5353 elt = 1; 5354 skb_p = &skb_shinfo(skb)->frag_list; 5355 copyflag = 0; 5356 5357 while ((skb1 = *skb_p) != NULL) { 5358 int ntail = 0; 5359 5360 /* The fragment is partially pulled by someone, 5361 * this can happen on input. Copy it and everything 5362 * after it. */ 5363 5364 if (skb_shared(skb1)) 5365 copyflag = 1; 5366 5367 /* If the skb is the last, worry about trailer. */ 5368 5369 if (skb1->next == NULL && tailbits) { 5370 if (skb_shinfo(skb1)->nr_frags || 5371 skb_has_frag_list(skb1) || 5372 skb_tailroom(skb1) < tailbits) 5373 ntail = tailbits + 128; 5374 } 5375 5376 if (copyflag || 5377 skb_cloned(skb1) || 5378 ntail || 5379 skb_shinfo(skb1)->nr_frags || 5380 skb_has_frag_list(skb1)) { 5381 struct sk_buff *skb2; 5382 5383 /* Fuck, we are miserable poor guys... */ 5384 if (ntail == 0) 5385 skb2 = skb_copy(skb1, GFP_ATOMIC); 5386 else 5387 skb2 = skb_copy_expand(skb1, 5388 skb_headroom(skb1), 5389 ntail, 5390 GFP_ATOMIC); 5391 if (unlikely(skb2 == NULL)) 5392 return -ENOMEM; 5393 5394 if (skb1->sk) 5395 skb_set_owner_w(skb2, skb1->sk); 5396 5397 /* Looking around. Are we still alive? 
5398 * OK, link new skb, drop old one */ 5399 5400 skb2->next = skb1->next; 5401 *skb_p = skb2; 5402 kfree_skb(skb1); 5403 skb1 = skb2; 5404 } 5405 elt++; 5406 *trailer = skb1; 5407 skb_p = &skb1->next; 5408 } 5409 5410 return elt; 5411 } 5412 EXPORT_SYMBOL_GPL(skb_cow_data); 5413 5414 static void sock_rmem_free(struct sk_buff *skb) 5415 { 5416 struct sock *sk = skb->sk; 5417 5418 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 5419 } 5420 5421 static void skb_set_err_queue(struct sk_buff *skb) 5422 { 5423 /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. 5424 * So, it is safe to (mis)use it to mark skbs on the error queue. 5425 */ 5426 skb->pkt_type = PACKET_OUTGOING; 5427 BUILD_BUG_ON(PACKET_OUTGOING == 0); 5428 } 5429 5430 /* 5431 * Note: We dont mem charge error packets (no sk_forward_alloc changes) 5432 */ 5433 int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) 5434 { 5435 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= 5436 (unsigned int)READ_ONCE(sk->sk_rcvbuf)) 5437 return -ENOMEM; 5438 5439 skb_orphan(skb); 5440 skb->sk = sk; 5441 skb->destructor = sock_rmem_free; 5442 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 5443 skb_set_err_queue(skb); 5444 5445 /* before exiting rcu section, make sure dst is refcounted */ 5446 skb_dst_force(skb); 5447 5448 skb_queue_tail(&sk->sk_error_queue, skb); 5449 if (!sock_flag(sk, SOCK_DEAD)) 5450 sk_error_report(sk); 5451 return 0; 5452 } 5453 EXPORT_SYMBOL(sock_queue_err_skb); 5454 5455 static bool is_icmp_err_skb(const struct sk_buff *skb) 5456 { 5457 return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || 5458 SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); 5459 } 5460 5461 struct sk_buff *sock_dequeue_err_skb(struct sock *sk) 5462 { 5463 struct sk_buff_head *q = &sk->sk_error_queue; 5464 struct sk_buff *skb, *skb_next = NULL; 5465 bool icmp_next = false; 5466 unsigned long flags; 5467 5468 if (skb_queue_empty_lockless(q)) 5469 return NULL; 5470 5471 spin_lock_irqsave(&q->lock, flags); 5472 skb = __skb_dequeue(q); 5473 if (skb && (skb_next = skb_peek(q))) { 5474 icmp_next = is_icmp_err_skb(skb_next); 5475 if (icmp_next) 5476 sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno; 5477 } 5478 spin_unlock_irqrestore(&q->lock, flags); 5479 5480 if (is_icmp_err_skb(skb) && !icmp_next) 5481 sk->sk_err = 0; 5482 5483 if (skb_next) 5484 sk_error_report(sk); 5485 5486 return skb; 5487 } 5488 EXPORT_SYMBOL(sock_dequeue_err_skb); 5489 5490 /** 5491 * skb_clone_sk - create clone of skb, and take reference to socket 5492 * @skb: the skb to clone 5493 * 5494 * This function creates a clone of a buffer that holds a reference on 5495 * sk_refcnt. Buffers created via this function are meant to be 5496 * returned using sock_queue_err_skb, or free via kfree_skb. 5497 * 5498 * When passing buffers allocated with this function to sock_queue_err_skb 5499 * it is necessary to wrap the call with sock_hold/sock_put in order to 5500 * prevent the socket from being released prior to being enqueued on 5501 * the sk_error_queue. 
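 *
 * A minimal sketch of that pattern, with error handling trimmed to the
 * essentials:
 *
 *	struct sk_buff *clone = skb_clone_sk(skb);
 *
 *	if (clone) {
 *		struct sock *sk = clone->sk;
 *
 *		sock_hold(sk);
 *		if (sock_queue_err_skb(sk, clone))
 *			kfree_skb(clone);
 *		sock_put(sk);
 *	}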
5502 */ 5503 struct sk_buff *skb_clone_sk(struct sk_buff *skb) 5504 { 5505 struct sock *sk = skb->sk; 5506 struct sk_buff *clone; 5507 5508 if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) 5509 return NULL; 5510 5511 clone = skb_clone(skb, GFP_ATOMIC); 5512 if (!clone) { 5513 sock_put(sk); 5514 return NULL; 5515 } 5516 5517 clone->sk = sk; 5518 clone->destructor = sock_efree; 5519 5520 return clone; 5521 } 5522 EXPORT_SYMBOL(skb_clone_sk); 5523 5524 static void __skb_complete_tx_timestamp(struct sk_buff *skb, 5525 struct sock *sk, 5526 int tstype, 5527 bool opt_stats) 5528 { 5529 struct sock_exterr_skb *serr; 5530 int err; 5531 5532 BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); 5533 5534 serr = SKB_EXT_ERR(skb); 5535 memset(serr, 0, sizeof(*serr)); 5536 serr->ee.ee_errno = ENOMSG; 5537 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; 5538 serr->ee.ee_info = tstype; 5539 serr->opt_stats = opt_stats; 5540 serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; 5541 if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { 5542 serr->ee.ee_data = skb_shinfo(skb)->tskey; 5543 if (sk_is_tcp(sk)) 5544 serr->ee.ee_data -= atomic_read(&sk->sk_tskey); 5545 } 5546 5547 err = sock_queue_err_skb(sk, skb); 5548 5549 if (err) 5550 kfree_skb(skb); 5551 } 5552 5553 static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) 5554 { 5555 bool ret; 5556 5557 if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data))) 5558 return true; 5559 5560 read_lock_bh(&sk->sk_callback_lock); 5561 ret = sk->sk_socket && sk->sk_socket->file && 5562 file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW); 5563 read_unlock_bh(&sk->sk_callback_lock); 5564 return ret; 5565 } 5566 5567 void skb_complete_tx_timestamp(struct sk_buff *skb, 5568 struct skb_shared_hwtstamps *hwtstamps) 5569 { 5570 struct sock *sk = skb->sk; 5571 5572 if (!skb_may_tx_timestamp(sk, false)) 5573 goto err; 5574 5575 /* Take a reference to prevent skb_orphan() from freeing the socket, 5576 * but only if the socket refcount is not zero. 5577 */ 5578 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 5579 *skb_hwtstamps(skb) = *hwtstamps; 5580 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); 5581 sock_put(sk); 5582 return; 5583 } 5584 5585 err: 5586 kfree_skb(skb); 5587 } 5588 EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); 5589 5590 static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb, 5591 struct skb_shared_hwtstamps *hwtstamps, 5592 int tstype) 5593 { 5594 switch (tstype) { 5595 case SCM_TSTAMP_SCHED: 5596 return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP; 5597 case SCM_TSTAMP_SND: 5598 return skb_shinfo(skb)->tx_flags & (hwtstamps ? 
SKBTX_HW_TSTAMP_NOBPF : 5599 SKBTX_SW_TSTAMP); 5600 case SCM_TSTAMP_ACK: 5601 return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK; 5602 case SCM_TSTAMP_COMPLETION: 5603 return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP; 5604 } 5605 5606 return false; 5607 } 5608 5609 static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb, 5610 struct skb_shared_hwtstamps *hwtstamps, 5611 struct sock *sk, 5612 int tstype) 5613 { 5614 int op; 5615 5616 switch (tstype) { 5617 case SCM_TSTAMP_SCHED: 5618 op = BPF_SOCK_OPS_TSTAMP_SCHED_CB; 5619 break; 5620 case SCM_TSTAMP_SND: 5621 if (hwtstamps) { 5622 op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB; 5623 *skb_hwtstamps(skb) = *hwtstamps; 5624 } else { 5625 op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB; 5626 } 5627 break; 5628 case SCM_TSTAMP_ACK: 5629 op = BPF_SOCK_OPS_TSTAMP_ACK_CB; 5630 break; 5631 default: 5632 return; 5633 } 5634 5635 bpf_skops_tx_timestamping(sk, skb, op); 5636 } 5637 5638 void __skb_tstamp_tx(struct sk_buff *orig_skb, 5639 const struct sk_buff *ack_skb, 5640 struct skb_shared_hwtstamps *hwtstamps, 5641 struct sock *sk, int tstype) 5642 { 5643 struct sk_buff *skb; 5644 bool tsonly, opt_stats = false; 5645 u32 tsflags; 5646 5647 if (!sk) 5648 return; 5649 5650 if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF) 5651 skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps, 5652 sk, tstype); 5653 5654 if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype)) 5655 return; 5656 5657 tsflags = READ_ONCE(sk->sk_tsflags); 5658 if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && 5659 skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) 5660 return; 5661 5662 tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; 5663 if (!skb_may_tx_timestamp(sk, tsonly)) 5664 return; 5665 5666 if (tsonly) { 5667 #ifdef CONFIG_INET 5668 if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && 5669 sk_is_tcp(sk)) { 5670 skb = tcp_get_timestamping_opt_stats(sk, orig_skb, 5671 ack_skb); 5672 opt_stats = true; 5673 } else 5674 #endif 5675 skb = alloc_skb(0, GFP_ATOMIC); 5676 } else { 5677 skb = skb_clone(orig_skb, GFP_ATOMIC); 5678 5679 if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { 5680 kfree_skb(skb); 5681 return; 5682 } 5683 } 5684 if (!skb) 5685 return; 5686 5687 if (tsonly) { 5688 skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & 5689 SKBTX_ANY_TSTAMP; 5690 skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; 5691 } 5692 5693 if (hwtstamps) 5694 *skb_hwtstamps(skb) = *hwtstamps; 5695 else 5696 __net_timestamp(skb); 5697 5698 __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); 5699 } 5700 EXPORT_SYMBOL_GPL(__skb_tstamp_tx); 5701 5702 void skb_tstamp_tx(struct sk_buff *orig_skb, 5703 struct skb_shared_hwtstamps *hwtstamps) 5704 { 5705 return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, 5706 SCM_TSTAMP_SND); 5707 } 5708 EXPORT_SYMBOL_GPL(skb_tstamp_tx); 5709 5710 #ifdef CONFIG_WIRELESS 5711 void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) 5712 { 5713 struct sock *sk = skb->sk; 5714 struct sock_exterr_skb *serr; 5715 int err = 1; 5716 5717 skb->wifi_acked_valid = 1; 5718 skb->wifi_acked = acked; 5719 5720 serr = SKB_EXT_ERR(skb); 5721 memset(serr, 0, sizeof(*serr)); 5722 serr->ee.ee_errno = ENOMSG; 5723 serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; 5724 5725 /* Take a reference to prevent skb_orphan() from freeing the socket, 5726 * but only if the socket refcount is not zero. 
5727 */ 5728 if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { 5729 err = sock_queue_err_skb(sk, skb); 5730 sock_put(sk); 5731 } 5732 if (err) 5733 kfree_skb(skb); 5734 } 5735 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); 5736 #endif /* CONFIG_WIRELESS */ 5737 5738 /** 5739 * skb_partial_csum_set - set up and verify partial csum values for packet 5740 * @skb: the skb to set 5741 * @start: the number of bytes after skb->data to start checksumming. 5742 * @off: the offset from start to place the checksum. 5743 * 5744 * For untrusted partially-checksummed packets, we need to make sure the values 5745 * for skb->csum_start and skb->csum_offset are valid so we don't oops. 5746 * 5747 * This function checks and sets those values and skb->ip_summed: if this 5748 * returns false you should drop the packet. 5749 */ 5750 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) 5751 { 5752 u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); 5753 u32 csum_start = skb_headroom(skb) + (u32)start; 5754 5755 if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) { 5756 net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", 5757 start, off, skb_headroom(skb), skb_headlen(skb)); 5758 return false; 5759 } 5760 skb->ip_summed = CHECKSUM_PARTIAL; 5761 skb->csum_start = csum_start; 5762 skb->csum_offset = off; 5763 skb->transport_header = csum_start; 5764 return true; 5765 } 5766 EXPORT_SYMBOL_GPL(skb_partial_csum_set); 5767 5768 static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, 5769 unsigned int max) 5770 { 5771 if (skb_headlen(skb) >= len) 5772 return 0; 5773 5774 /* If we need to pullup then pullup to the max, so we 5775 * won't need to do it again. 5776 */ 5777 if (max > skb->len) 5778 max = skb->len; 5779 5780 if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) 5781 return -ENOMEM; 5782 5783 if (skb_headlen(skb) < len) 5784 return -EPROTO; 5785 5786 return 0; 5787 } 5788 5789 #define MAX_TCP_HDR_LEN (15 * 4) 5790 5791 static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, 5792 typeof(IPPROTO_IP) proto, 5793 unsigned int off) 5794 { 5795 int err; 5796 5797 switch (proto) { 5798 case IPPROTO_TCP: 5799 err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), 5800 off + MAX_TCP_HDR_LEN); 5801 if (!err && !skb_partial_csum_set(skb, off, 5802 offsetof(struct tcphdr, 5803 check))) 5804 err = -EPROTO; 5805 return err ? ERR_PTR(err) : &tcp_hdr(skb)->check; 5806 5807 case IPPROTO_UDP: 5808 err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr), 5809 off + sizeof(struct udphdr)); 5810 if (!err && !skb_partial_csum_set(skb, off, 5811 offsetof(struct udphdr, 5812 check))) 5813 err = -EPROTO; 5814 return err ? ERR_PTR(err) : &udp_hdr(skb)->check; 5815 } 5816 5817 return ERR_PTR(-EPROTO); 5818 } 5819 5820 /* This value should be large enough to cover a tagged ethernet header plus 5821 * maximally sized IP and TCP or UDP headers. 
5822 */ 5823 #define MAX_IP_HDR_LEN 128 5824 5825 static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) 5826 { 5827 unsigned int off; 5828 bool fragment; 5829 __sum16 *csum; 5830 int err; 5831 5832 fragment = false; 5833 5834 err = skb_maybe_pull_tail(skb, 5835 sizeof(struct iphdr), 5836 MAX_IP_HDR_LEN); 5837 if (err < 0) 5838 goto out; 5839 5840 if (ip_is_fragment(ip_hdr(skb))) 5841 fragment = true; 5842 5843 off = ip_hdrlen(skb); 5844 5845 err = -EPROTO; 5846 5847 if (fragment) 5848 goto out; 5849 5850 csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off); 5851 if (IS_ERR(csum)) 5852 return PTR_ERR(csum); 5853 5854 if (recalculate) 5855 *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr, 5856 ip_hdr(skb)->daddr, 5857 skb->len - off, 5858 ip_hdr(skb)->protocol, 0); 5859 err = 0; 5860 5861 out: 5862 return err; 5863 } 5864 5865 /* This value should be large enough to cover a tagged ethernet header plus 5866 * an IPv6 header, all options, and a maximal TCP or UDP header. 5867 */ 5868 #define MAX_IPV6_HDR_LEN 256 5869 5870 #define OPT_HDR(type, skb, off) \ 5871 (type *)(skb_network_header(skb) + (off)) 5872 5873 static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) 5874 { 5875 int err; 5876 u8 nexthdr; 5877 unsigned int off; 5878 unsigned int len; 5879 bool fragment; 5880 bool done; 5881 __sum16 *csum; 5882 5883 fragment = false; 5884 done = false; 5885 5886 off = sizeof(struct ipv6hdr); 5887 5888 err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN); 5889 if (err < 0) 5890 goto out; 5891 5892 nexthdr = ipv6_hdr(skb)->nexthdr; 5893 5894 len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); 5895 while (off <= len && !done) { 5896 switch (nexthdr) { 5897 case IPPROTO_DSTOPTS: 5898 case IPPROTO_HOPOPTS: 5899 case IPPROTO_ROUTING: { 5900 struct ipv6_opt_hdr *hp; 5901 5902 err = skb_maybe_pull_tail(skb, 5903 off + 5904 sizeof(struct ipv6_opt_hdr), 5905 MAX_IPV6_HDR_LEN); 5906 if (err < 0) 5907 goto out; 5908 5909 hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); 5910 nexthdr = hp->nexthdr; 5911 off += ipv6_optlen(hp); 5912 break; 5913 } 5914 case IPPROTO_AH: { 5915 struct ip_auth_hdr *hp; 5916 5917 err = skb_maybe_pull_tail(skb, 5918 off + 5919 sizeof(struct ip_auth_hdr), 5920 MAX_IPV6_HDR_LEN); 5921 if (err < 0) 5922 goto out; 5923 5924 hp = OPT_HDR(struct ip_auth_hdr, skb, off); 5925 nexthdr = hp->nexthdr; 5926 off += ipv6_authlen(hp); 5927 break; 5928 } 5929 case IPPROTO_FRAGMENT: { 5930 struct frag_hdr *hp; 5931 5932 err = skb_maybe_pull_tail(skb, 5933 off + 5934 sizeof(struct frag_hdr), 5935 MAX_IPV6_HDR_LEN); 5936 if (err < 0) 5937 goto out; 5938 5939 hp = OPT_HDR(struct frag_hdr, skb, off); 5940 5941 if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) 5942 fragment = true; 5943 5944 nexthdr = hp->nexthdr; 5945 off += sizeof(struct frag_hdr); 5946 break; 5947 } 5948 default: 5949 done = true; 5950 break; 5951 } 5952 } 5953 5954 err = -EPROTO; 5955 5956 if (!done || fragment) 5957 goto out; 5958 5959 csum = skb_checksum_setup_ip(skb, nexthdr, off); 5960 if (IS_ERR(csum)) 5961 return PTR_ERR(csum); 5962 5963 if (recalculate) 5964 *csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr, 5965 &ipv6_hdr(skb)->daddr, 5966 skb->len - off, nexthdr, 0); 5967 err = 0; 5968 5969 out: 5970 return err; 5971 } 5972 5973 /** 5974 * skb_checksum_setup - set up partial checksum offset 5975 * @skb: the skb to set up 5976 * @recalculate: if true the pseudo-header checksum will be recalculated 5977 */ 5978 int skb_checksum_setup(struct sk_buff *skb, bool recalculate) 5979 { 
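	/* Illustrative, hypothetical caller (e.g. a virtualisation driver
	 * handing an untrusted, partially checksummed packet to the stack):
	 *
	 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	 *	    skb_checksum_setup(skb, true))
	 *		goto drop;
	 */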
5980 int err; 5981 5982 switch (skb->protocol) { 5983 case htons(ETH_P_IP): 5984 err = skb_checksum_setup_ipv4(skb, recalculate); 5985 break; 5986 5987 case htons(ETH_P_IPV6): 5988 err = skb_checksum_setup_ipv6(skb, recalculate); 5989 break; 5990 5991 default: 5992 err = -EPROTO; 5993 break; 5994 } 5995 5996 return err; 5997 } 5998 EXPORT_SYMBOL(skb_checksum_setup); 5999 6000 /** 6001 * skb_checksum_maybe_trim - maybe trims the given skb 6002 * @skb: the skb to check 6003 * @transport_len: the data length beyond the network header 6004 * 6005 * Checks whether the given skb has data beyond the given transport length. 6006 * If so, returns a cloned skb trimmed to this transport length. 6007 * Otherwise returns the provided skb. Returns NULL in error cases 6008 * (e.g. transport_len exceeds skb length or out-of-memory). 6009 * 6010 * Caller needs to set the skb transport header and free any returned skb if it 6011 * differs from the provided skb. 6012 */ 6013 static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, 6014 unsigned int transport_len) 6015 { 6016 struct sk_buff *skb_chk; 6017 unsigned int len = skb_transport_offset(skb) + transport_len; 6018 int ret; 6019 6020 if (skb->len < len) 6021 return NULL; 6022 else if (skb->len == len) 6023 return skb; 6024 6025 skb_chk = skb_clone(skb, GFP_ATOMIC); 6026 if (!skb_chk) 6027 return NULL; 6028 6029 ret = pskb_trim_rcsum(skb_chk, len); 6030 if (ret) { 6031 kfree_skb(skb_chk); 6032 return NULL; 6033 } 6034 6035 return skb_chk; 6036 } 6037 6038 /** 6039 * skb_checksum_trimmed - validate checksum of an skb 6040 * @skb: the skb to check 6041 * @transport_len: the data length beyond the network header 6042 * @skb_chkf: checksum function to use 6043 * 6044 * Applies the given checksum function skb_chkf to the provided skb. 6045 * Returns a checked and maybe trimmed skb. Returns NULL on error. 6046 * 6047 * If the skb has data beyond the given transport length, then a 6048 * trimmed & cloned skb is checked and returned. 6049 * 6050 * Caller needs to set the skb transport header and free any returned skb if it 6051 * differs from the provided skb. 
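 *
 * A minimal usage sketch; my_proto_check() stands in for a protocol
 * specific checksum routine and is not defined here:
 *
 *	skb_set_transport_header(skb, offset);
 *	skb_chk = skb_checksum_trimmed(skb, transport_len, my_proto_check);
 *	if (!skb_chk)
 *		return -EINVAL;
 *	... use skb_chk ...
 *	if (skb_chk != skb)
 *		kfree_skb(skb_chk);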
6052 */ 6053 struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, 6054 unsigned int transport_len, 6055 __sum16(*skb_chkf)(struct sk_buff *skb)) 6056 { 6057 struct sk_buff *skb_chk; 6058 unsigned int offset = skb_transport_offset(skb); 6059 __sum16 ret; 6060 6061 skb_chk = skb_checksum_maybe_trim(skb, transport_len); 6062 if (!skb_chk) 6063 goto err; 6064 6065 if (!pskb_may_pull(skb_chk, offset)) 6066 goto err; 6067 6068 skb_pull_rcsum(skb_chk, offset); 6069 ret = skb_chkf(skb_chk); 6070 skb_push_rcsum(skb_chk, offset); 6071 6072 if (ret) 6073 goto err; 6074 6075 return skb_chk; 6076 6077 err: 6078 if (skb_chk && skb_chk != skb) 6079 kfree_skb(skb_chk); 6080 6081 return NULL; 6082 6083 } 6084 EXPORT_SYMBOL(skb_checksum_trimmed); 6085 6086 void __skb_warn_lro_forwarding(const struct sk_buff *skb) 6087 { 6088 net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", 6089 skb->dev->name); 6090 } 6091 EXPORT_SYMBOL(__skb_warn_lro_forwarding); 6092 6093 void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) 6094 { 6095 if (head_stolen) { 6096 skb_release_head_state(skb); 6097 kmem_cache_free(net_hotdata.skbuff_cache, skb); 6098 } else { 6099 __kfree_skb(skb); 6100 } 6101 } 6102 EXPORT_SYMBOL(kfree_skb_partial); 6103 6104 /** 6105 * skb_try_coalesce - try to merge skb to prior one 6106 * @to: prior buffer 6107 * @from: buffer to add 6108 * @fragstolen: pointer to boolean 6109 * @delta_truesize: how much more was allocated than was requested 6110 */ 6111 bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, 6112 bool *fragstolen, int *delta_truesize) 6113 { 6114 struct skb_shared_info *to_shinfo, *from_shinfo; 6115 int i, delta, len = from->len; 6116 6117 *fragstolen = false; 6118 6119 if (skb_cloned(to)) 6120 return false; 6121 6122 /* In general, avoid mixing page_pool and non-page_pool allocated 6123 * pages within the same SKB. In theory we could take full 6124 * references if @from is cloned and !@to->pp_recycle but its 6125 * tricky (due to potential race with the clone disappearing) and 6126 * rare, so not worth dealing with. 
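 * Hence simply refuse to coalesce two skbs whose pp_recycle flags differ.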
6127 */ 6128 if (to->pp_recycle != from->pp_recycle) 6129 return false; 6130 6131 if (skb_frags_readable(from) != skb_frags_readable(to)) 6132 return false; 6133 6134 if (len <= skb_tailroom(to) && skb_frags_readable(from)) { 6135 if (len) 6136 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); 6137 *delta_truesize = 0; 6138 return true; 6139 } 6140 6141 to_shinfo = skb_shinfo(to); 6142 from_shinfo = skb_shinfo(from); 6143 if (to_shinfo->frag_list || from_shinfo->frag_list) 6144 return false; 6145 if (skb_zcopy(to) || skb_zcopy(from)) 6146 return false; 6147 6148 if (skb_headlen(from) != 0) { 6149 struct page *page; 6150 unsigned int offset; 6151 6152 if (to_shinfo->nr_frags + 6153 from_shinfo->nr_frags >= MAX_SKB_FRAGS) 6154 return false; 6155 6156 if (skb_head_is_locked(from)) 6157 return false; 6158 6159 delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 6160 6161 page = virt_to_head_page(from->head); 6162 offset = from->data - (unsigned char *)page_address(page); 6163 6164 skb_fill_page_desc(to, to_shinfo->nr_frags, 6165 page, offset, skb_headlen(from)); 6166 *fragstolen = true; 6167 } else { 6168 if (to_shinfo->nr_frags + 6169 from_shinfo->nr_frags > MAX_SKB_FRAGS) 6170 return false; 6171 6172 delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); 6173 } 6174 6175 WARN_ON_ONCE(delta < len); 6176 6177 memcpy(to_shinfo->frags + to_shinfo->nr_frags, 6178 from_shinfo->frags, 6179 from_shinfo->nr_frags * sizeof(skb_frag_t)); 6180 to_shinfo->nr_frags += from_shinfo->nr_frags; 6181 6182 if (!skb_cloned(from)) 6183 from_shinfo->nr_frags = 0; 6184 6185 /* if the skb is not cloned this does nothing 6186 * since we set nr_frags to 0. 6187 */ 6188 if (skb_pp_frag_ref(from)) { 6189 for (i = 0; i < from_shinfo->nr_frags; i++) 6190 __skb_frag_ref(&from_shinfo->frags[i]); 6191 } 6192 6193 to->truesize += delta; 6194 to->len += len; 6195 to->data_len += len; 6196 6197 *delta_truesize = delta; 6198 return true; 6199 } 6200 EXPORT_SYMBOL(skb_try_coalesce); 6201 6202 /** 6203 * skb_scrub_packet - scrub an skb 6204 * 6205 * @skb: buffer to clean 6206 * @xnet: packet is crossing netns 6207 * 6208 * skb_scrub_packet can be used after encapsulating or decapsulating a packet 6209 * into/from a tunnel. Some information have to be cleared during these 6210 * operations. 6211 * skb_scrub_packet can also be used to clean a skb before injecting it in 6212 * another namespace (@xnet == true). We have to clear all information in the 6213 * skb that could impact namespace isolation. 
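 *
 * A minimal sketch of the tunnel-receive pattern (the device variables are
 * illustrative):
 *
 *	xnet = !net_eq(dev_net(rx_dev), dev_net(tx_dev));
 *	skb_scrub_packet(skb, xnet);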
6214 */ 6215 void skb_scrub_packet(struct sk_buff *skb, bool xnet) 6216 { 6217 skb->pkt_type = PACKET_HOST; 6218 skb->skb_iif = 0; 6219 skb->ignore_df = 0; 6220 skb_dst_drop(skb); 6221 skb_ext_reset(skb); 6222 nf_reset_ct(skb); 6223 nf_reset_trace(skb); 6224 6225 #ifdef CONFIG_NET_SWITCHDEV 6226 skb->offload_fwd_mark = 0; 6227 skb->offload_l3_fwd_mark = 0; 6228 #endif 6229 ipvs_reset(skb); 6230 6231 if (!xnet) 6232 return; 6233 6234 skb->mark = 0; 6235 skb_clear_tstamp(skb); 6236 } 6237 EXPORT_SYMBOL_GPL(skb_scrub_packet); 6238 6239 static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) 6240 { 6241 int mac_len, meta_len; 6242 void *meta; 6243 6244 if (skb_cow(skb, skb_headroom(skb)) < 0) { 6245 kfree_skb(skb); 6246 return NULL; 6247 } 6248 6249 mac_len = skb->data - skb_mac_header(skb); 6250 if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { 6251 memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), 6252 mac_len - VLAN_HLEN - ETH_TLEN); 6253 } 6254 6255 meta_len = skb_metadata_len(skb); 6256 if (meta_len) { 6257 meta = skb_metadata_end(skb) - meta_len; 6258 memmove(meta + VLAN_HLEN, meta, meta_len); 6259 } 6260 6261 skb->mac_header += VLAN_HLEN; 6262 return skb; 6263 } 6264 6265 struct sk_buff *skb_vlan_untag(struct sk_buff *skb) 6266 { 6267 struct vlan_hdr *vhdr; 6268 u16 vlan_tci; 6269 6270 if (unlikely(skb_vlan_tag_present(skb))) { 6271 /* vlan_tci is already set-up so leave this for another time */ 6272 return skb; 6273 } 6274 6275 skb = skb_share_check(skb, GFP_ATOMIC); 6276 if (unlikely(!skb)) 6277 goto err_free; 6278 /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ 6279 if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) 6280 goto err_free; 6281 6282 vhdr = (struct vlan_hdr *)skb->data; 6283 vlan_tci = ntohs(vhdr->h_vlan_TCI); 6284 __vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci); 6285 6286 skb_pull_rcsum(skb, VLAN_HLEN); 6287 vlan_set_encap_proto(skb, vhdr); 6288 6289 skb = skb_reorder_vlan_header(skb); 6290 if (unlikely(!skb)) 6291 goto err_free; 6292 6293 skb_reset_network_header(skb); 6294 if (!skb_transport_header_was_set(skb)) 6295 skb_reset_transport_header(skb); 6296 skb_reset_mac_len(skb); 6297 6298 return skb; 6299 6300 err_free: 6301 kfree_skb(skb); 6302 return NULL; 6303 } 6304 EXPORT_SYMBOL(skb_vlan_untag); 6305 6306 int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) 6307 { 6308 if (!pskb_may_pull(skb, write_len)) 6309 return -ENOMEM; 6310 6311 if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) 6312 return 0; 6313 6314 return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 6315 } 6316 EXPORT_SYMBOL(skb_ensure_writable); 6317 6318 int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev) 6319 { 6320 int needed_headroom = dev->needed_headroom; 6321 int needed_tailroom = dev->needed_tailroom; 6322 6323 /* For tail taggers, we need to pad short frames ourselves, to ensure 6324 * that the tail tag does not fail at its role of being at the end of 6325 * the packet, once the conduit interface pads the frame. Account for 6326 * that pad length here, and pad later. 6327 */ 6328 if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) 6329 needed_tailroom += ETH_ZLEN - skb->len; 6330 /* skb_headroom() returns unsigned int... 
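 * so compute the head/tail deficits as signed values and clamp them at zero
 * below instead of risking an unsigned underflow.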
*/ 6331 needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); 6332 needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); 6333 6334 if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) 6335 /* No reallocation needed, yay! */ 6336 return 0; 6337 6338 return pskb_expand_head(skb, needed_headroom, needed_tailroom, 6339 GFP_ATOMIC); 6340 } 6341 EXPORT_SYMBOL(skb_ensure_writable_head_tail); 6342 6343 /* remove VLAN header from packet and update csum accordingly. 6344 * expects a non skb_vlan_tag_present skb with a vlan tag payload 6345 */ 6346 int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) 6347 { 6348 int offset = skb->data - skb_mac_header(skb); 6349 int err; 6350 6351 if (WARN_ONCE(offset, 6352 "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", 6353 offset)) { 6354 return -EINVAL; 6355 } 6356 6357 err = skb_ensure_writable(skb, VLAN_ETH_HLEN); 6358 if (unlikely(err)) 6359 return err; 6360 6361 skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 6362 6363 vlan_remove_tag(skb, vlan_tci); 6364 6365 skb->mac_header += VLAN_HLEN; 6366 6367 if (skb_network_offset(skb) < ETH_HLEN) 6368 skb_set_network_header(skb, ETH_HLEN); 6369 6370 skb_reset_mac_len(skb); 6371 6372 return err; 6373 } 6374 EXPORT_SYMBOL(__skb_vlan_pop); 6375 6376 /* Pop a vlan tag either from hwaccel or from payload. 6377 * Expects skb->data at mac header. 6378 */ 6379 int skb_vlan_pop(struct sk_buff *skb) 6380 { 6381 u16 vlan_tci; 6382 __be16 vlan_proto; 6383 int err; 6384 6385 if (likely(skb_vlan_tag_present(skb))) { 6386 __vlan_hwaccel_clear_tag(skb); 6387 } else { 6388 if (unlikely(!eth_type_vlan(skb->protocol))) 6389 return 0; 6390 6391 err = __skb_vlan_pop(skb, &vlan_tci); 6392 if (err) 6393 return err; 6394 } 6395 /* move next vlan tag to hw accel tag */ 6396 if (likely(!eth_type_vlan(skb->protocol))) 6397 return 0; 6398 6399 vlan_proto = skb->protocol; 6400 err = __skb_vlan_pop(skb, &vlan_tci); 6401 if (unlikely(err)) 6402 return err; 6403 6404 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 6405 return 0; 6406 } 6407 EXPORT_SYMBOL(skb_vlan_pop); 6408 6409 /* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). 6410 * Expects skb->data at mac header. 6411 */ 6412 int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) 6413 { 6414 if (skb_vlan_tag_present(skb)) { 6415 int offset = skb->data - skb_mac_header(skb); 6416 int err; 6417 6418 if (WARN_ONCE(offset, 6419 "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", 6420 offset)) { 6421 return -EINVAL; 6422 } 6423 6424 err = __vlan_insert_tag(skb, skb->vlan_proto, 6425 skb_vlan_tag_get(skb)); 6426 if (err) 6427 return err; 6428 6429 skb->protocol = skb->vlan_proto; 6430 skb->network_header -= VLAN_HLEN; 6431 6432 skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); 6433 } 6434 __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); 6435 return 0; 6436 } 6437 EXPORT_SYMBOL(skb_vlan_push); 6438 6439 /** 6440 * skb_eth_pop() - Drop the Ethernet header at the head of a packet 6441 * 6442 * @skb: Socket buffer to modify 6443 * 6444 * Drop the Ethernet header of @skb. 6445 * 6446 * Expects that skb->data points to the mac header and that no VLAN tags are 6447 * present. 6448 * 6449 * Returns 0 on success, -errno otherwise. 
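 *
 * A minimal sketch (mirroring what an L2-decapsulation action would do):
 *
 *	err = skb_eth_pop(skb);
 *	if (err)
 *		goto drop;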
6450 */ 6451 int skb_eth_pop(struct sk_buff *skb) 6452 { 6453 if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || 6454 skb_network_offset(skb) < ETH_HLEN) 6455 return -EPROTO; 6456 6457 skb_pull_rcsum(skb, ETH_HLEN); 6458 skb_reset_mac_header(skb); 6459 skb_reset_mac_len(skb); 6460 6461 return 0; 6462 } 6463 EXPORT_SYMBOL(skb_eth_pop); 6464 6465 /** 6466 * skb_eth_push() - Add a new Ethernet header at the head of a packet 6467 * 6468 * @skb: Socket buffer to modify 6469 * @dst: Destination MAC address of the new header 6470 * @src: Source MAC address of the new header 6471 * 6472 * Prepend @skb with a new Ethernet header. 6473 * 6474 * Expects that skb->data points to the mac header, which must be empty. 6475 * 6476 * Returns 0 on success, -errno otherwise. 6477 */ 6478 int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, 6479 const unsigned char *src) 6480 { 6481 struct ethhdr *eth; 6482 int err; 6483 6484 if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) 6485 return -EPROTO; 6486 6487 err = skb_cow_head(skb, sizeof(*eth)); 6488 if (err < 0) 6489 return err; 6490 6491 skb_push(skb, sizeof(*eth)); 6492 skb_reset_mac_header(skb); 6493 skb_reset_mac_len(skb); 6494 6495 eth = eth_hdr(skb); 6496 ether_addr_copy(eth->h_dest, dst); 6497 ether_addr_copy(eth->h_source, src); 6498 eth->h_proto = skb->protocol; 6499 6500 skb_postpush_rcsum(skb, eth, sizeof(*eth)); 6501 6502 return 0; 6503 } 6504 EXPORT_SYMBOL(skb_eth_push); 6505 6506 /* Update the ethertype of hdr and the skb csum value if required. */ 6507 static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, 6508 __be16 ethertype) 6509 { 6510 if (skb->ip_summed == CHECKSUM_COMPLETE) { 6511 __be16 diff[] = { ~hdr->h_proto, ethertype }; 6512 6513 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 6514 } 6515 6516 hdr->h_proto = ethertype; 6517 } 6518 6519 /** 6520 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of 6521 * the packet 6522 * 6523 * @skb: buffer 6524 * @mpls_lse: MPLS label stack entry to push 6525 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) 6526 * @mac_len: length of the MAC header 6527 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is 6528 * ethernet 6529 * 6530 * Expects skb->data at mac header. 6531 * 6532 * Returns 0 on success, -errno otherwise. 6533 */ 6534 int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, 6535 int mac_len, bool ethernet) 6536 { 6537 struct mpls_shim_hdr *lse; 6538 int err; 6539 6540 if (unlikely(!eth_p_mpls(mpls_proto))) 6541 return -EINVAL; 6542 6543 /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. 
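 * Refuse to push an MPLS header onto an skb that is already marked as
 * encapsulated.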
*/ 6544 if (skb->encapsulation) 6545 return -EINVAL; 6546 6547 err = skb_cow_head(skb, MPLS_HLEN); 6548 if (unlikely(err)) 6549 return err; 6550 6551 if (!skb->inner_protocol) { 6552 skb_set_inner_network_header(skb, skb_network_offset(skb)); 6553 skb_set_inner_protocol(skb, skb->protocol); 6554 } 6555 6556 skb_push(skb, MPLS_HLEN); 6557 memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), 6558 mac_len); 6559 skb_reset_mac_header(skb); 6560 skb_set_network_header(skb, mac_len); 6561 skb_reset_mac_len(skb); 6562 6563 lse = mpls_hdr(skb); 6564 lse->label_stack_entry = mpls_lse; 6565 skb_postpush_rcsum(skb, lse, MPLS_HLEN); 6566 6567 if (ethernet && mac_len >= ETH_HLEN) 6568 skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto); 6569 skb->protocol = mpls_proto; 6570 6571 return 0; 6572 } 6573 EXPORT_SYMBOL_GPL(skb_mpls_push); 6574 6575 /** 6576 * skb_mpls_pop() - pop the outermost MPLS header 6577 * 6578 * @skb: buffer 6579 * @next_proto: ethertype of header after popped MPLS header 6580 * @mac_len: length of the MAC header 6581 * @ethernet: flag to indicate if the packet is ethernet 6582 * 6583 * Expects skb->data at mac header. 6584 * 6585 * Returns 0 on success, -errno otherwise. 6586 */ 6587 int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, 6588 bool ethernet) 6589 { 6590 int err; 6591 6592 if (unlikely(!eth_p_mpls(skb->protocol))) 6593 return 0; 6594 6595 err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); 6596 if (unlikely(err)) 6597 return err; 6598 6599 skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); 6600 memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), 6601 mac_len); 6602 6603 __skb_pull(skb, MPLS_HLEN); 6604 skb_reset_mac_header(skb); 6605 skb_set_network_header(skb, mac_len); 6606 6607 if (ethernet && mac_len >= ETH_HLEN) { 6608 struct ethhdr *hdr; 6609 6610 /* use mpls_hdr() to get ethertype to account for VLANs. */ 6611 hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); 6612 skb_mod_eth_type(skb, hdr, next_proto); 6613 } 6614 skb->protocol = next_proto; 6615 6616 return 0; 6617 } 6618 EXPORT_SYMBOL_GPL(skb_mpls_pop); 6619 6620 /** 6621 * skb_mpls_update_lse() - modify outermost MPLS header and update csum 6622 * 6623 * @skb: buffer 6624 * @mpls_lse: new MPLS label stack entry to update to 6625 * 6626 * Expects skb->data at mac header. 6627 * 6628 * Returns 0 on success, -errno otherwise. 6629 */ 6630 int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) 6631 { 6632 int err; 6633 6634 if (unlikely(!eth_p_mpls(skb->protocol))) 6635 return -EINVAL; 6636 6637 err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); 6638 if (unlikely(err)) 6639 return err; 6640 6641 if (skb->ip_summed == CHECKSUM_COMPLETE) { 6642 __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; 6643 6644 skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); 6645 } 6646 6647 mpls_hdr(skb)->label_stack_entry = mpls_lse; 6648 6649 return 0; 6650 } 6651 EXPORT_SYMBOL_GPL(skb_mpls_update_lse); 6652 6653 /** 6654 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header 6655 * 6656 * @skb: buffer 6657 * 6658 * Expects skb->data at mac header. 6659 * 6660 * Returns 0 on success, -errno otherwise. 
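 *
 * A TTL that would reach zero is reported as -EINVAL, so a forwarding-style
 * caller can simply do:
 *
 *	if (skb_mpls_dec_ttl(skb))
 *		goto drop;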
6661 */ 6662 int skb_mpls_dec_ttl(struct sk_buff *skb) 6663 { 6664 u32 lse; 6665 u8 ttl; 6666 6667 if (unlikely(!eth_p_mpls(skb->protocol))) 6668 return -EINVAL; 6669 6670 if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) 6671 return -ENOMEM; 6672 6673 lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); 6674 ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; 6675 if (!--ttl) 6676 return -EINVAL; 6677 6678 lse &= ~MPLS_LS_TTL_MASK; 6679 lse |= ttl << MPLS_LS_TTL_SHIFT; 6680 6681 return skb_mpls_update_lse(skb, cpu_to_be32(lse)); 6682 } 6683 EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); 6684 6685 /** 6686 * alloc_skb_with_frags - allocate skb with page frags 6687 * 6688 * @header_len: size of linear part 6689 * @data_len: needed length in frags 6690 * @order: max page order desired. 6691 * @errcode: pointer to error code if any 6692 * @gfp_mask: allocation mask 6693 * 6694 * This can be used to allocate a paged skb, given a maximal order for frags. 6695 */ 6696 struct sk_buff *alloc_skb_with_frags(unsigned long header_len, 6697 unsigned long data_len, 6698 int order, 6699 int *errcode, 6700 gfp_t gfp_mask) 6701 { 6702 unsigned long chunk; 6703 struct sk_buff *skb; 6704 struct page *page; 6705 int nr_frags = 0; 6706 6707 *errcode = -EMSGSIZE; 6708 if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) 6709 return NULL; 6710 6711 *errcode = -ENOBUFS; 6712 skb = alloc_skb(header_len, gfp_mask); 6713 if (!skb) 6714 return NULL; 6715 6716 while (data_len) { 6717 if (nr_frags == MAX_SKB_FRAGS) 6718 goto failure; 6719 while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) 6720 order--; 6721 6722 if (order) { 6723 page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | 6724 __GFP_COMP | 6725 __GFP_NOWARN, 6726 order); 6727 if (!page) { 6728 order--; 6729 continue; 6730 } 6731 } else { 6732 page = alloc_page(gfp_mask); 6733 if (!page) 6734 goto failure; 6735 } 6736 chunk = min_t(unsigned long, data_len, 6737 PAGE_SIZE << order); 6738 skb_fill_page_desc(skb, nr_frags, page, 0, chunk); 6739 nr_frags++; 6740 skb->truesize += (PAGE_SIZE << order); 6741 data_len -= chunk; 6742 } 6743 return skb; 6744 6745 failure: 6746 kfree_skb(skb); 6747 return NULL; 6748 } 6749 EXPORT_SYMBOL(alloc_skb_with_frags); 6750 6751 /* carve out the first off bytes from skb when off < headlen */ 6752 static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, 6753 const int headlen, gfp_t gfp_mask) 6754 { 6755 int i; 6756 unsigned int size = skb_end_offset(skb); 6757 int new_hlen = headlen - off; 6758 u8 *data; 6759 6760 if (skb_pfmemalloc(skb)) 6761 gfp_mask |= __GFP_MEMALLOC; 6762 6763 data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); 6764 if (!data) 6765 return -ENOMEM; 6766 size = SKB_WITH_OVERHEAD(size); 6767 6768 /* Copy real data, and all frags */ 6769 skb_copy_from_linear_data_offset(skb, off, data, new_hlen); 6770 skb->len -= off; 6771 6772 memcpy((struct skb_shared_info *)(data + size), 6773 skb_shinfo(skb), 6774 offsetof(struct skb_shared_info, 6775 frags[skb_shinfo(skb)->nr_frags])); 6776 if (skb_cloned(skb)) { 6777 /* drop the old head gracefully */ 6778 if (skb_orphan_frags(skb, gfp_mask)) { 6779 skb_kfree_head(data, size); 6780 return -ENOMEM; 6781 } 6782 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 6783 skb_frag_ref(skb, i); 6784 if (skb_has_frag_list(skb)) 6785 skb_clone_fraglist(skb); 6786 skb_release_data(skb, SKB_CONSUMED); 6787 } else { 6788 /* we can reuse existing recount- all we did was 6789 * relocate values 6790 */ 6791 skb_free_head(skb); 6792 } 6793 6794 
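	/* Install the freshly allocated head: the carved skb now starts at
	 * offset 0 of the new buffer and owns a single dataref.
	 */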
skb->head = data; 6795 skb->data = data; 6796 skb->head_frag = 0; 6797 skb_set_end_offset(skb, size); 6798 skb_set_tail_pointer(skb, skb_headlen(skb)); 6799 skb_headers_offset_update(skb, 0); 6800 skb->cloned = 0; 6801 skb->hdr_len = 0; 6802 skb->nohdr = 0; 6803 atomic_set(&skb_shinfo(skb)->dataref, 1); 6804 6805 return 0; 6806 } 6807 6808 static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); 6809 6810 /* carve out the first eat bytes from skb's frag_list. May recurse into 6811 * pskb_carve() 6812 */ 6813 static int pskb_carve_frag_list(struct skb_shared_info *shinfo, int eat, 6814 gfp_t gfp_mask) 6815 { 6816 struct sk_buff *list = shinfo->frag_list; 6817 struct sk_buff *clone = NULL; 6818 struct sk_buff *insp = NULL; 6819 6820 do { 6821 if (!list) { 6822 pr_err("Not enough bytes to eat. Want %d\n", eat); 6823 return -EFAULT; 6824 } 6825 if (list->len <= eat) { 6826 /* Eaten as whole. */ 6827 eat -= list->len; 6828 list = list->next; 6829 insp = list; 6830 } else { 6831 /* Eaten partially. */ 6832 if (skb_shared(list)) { 6833 clone = skb_clone(list, gfp_mask); 6834 if (!clone) 6835 return -ENOMEM; 6836 insp = list->next; 6837 list = clone; 6838 } else { 6839 /* This may be pulled without problems. */ 6840 insp = list; 6841 } 6842 if (pskb_carve(list, eat, gfp_mask) < 0) { 6843 kfree_skb(clone); 6844 return -ENOMEM; 6845 } 6846 break; 6847 } 6848 } while (eat); 6849 6850 /* Free pulled out fragments. */ 6851 while ((list = shinfo->frag_list) != insp) { 6852 shinfo->frag_list = list->next; 6853 consume_skb(list); 6854 } 6855 /* And insert new clone at head. */ 6856 if (clone) { 6857 clone->next = list; 6858 shinfo->frag_list = clone; 6859 } 6860 return 0; 6861 } 6862 6863 /* carve off first len bytes from skb. Split line (off) is in the 6864 * non-linear part of skb 6865 */ 6866 static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, 6867 int pos, gfp_t gfp_mask) 6868 { 6869 int i, k = 0; 6870 unsigned int size = skb_end_offset(skb); 6871 u8 *data; 6872 const int nfrags = skb_shinfo(skb)->nr_frags; 6873 struct skb_shared_info *shinfo; 6874 6875 if (skb_pfmemalloc(skb)) 6876 gfp_mask |= __GFP_MEMALLOC; 6877 6878 data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL); 6879 if (!data) 6880 return -ENOMEM; 6881 size = SKB_WITH_OVERHEAD(size); 6882 6883 memcpy((struct skb_shared_info *)(data + size), 6884 skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); 6885 if (skb_orphan_frags(skb, gfp_mask)) { 6886 skb_kfree_head(data, size); 6887 return -ENOMEM; 6888 } 6889 shinfo = (struct skb_shared_info *)(data + size); 6890 for (i = 0; i < nfrags; i++) { 6891 int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); 6892 6893 if (pos + fsize > off) { 6894 shinfo->frags[k] = skb_shinfo(skb)->frags[i]; 6895 6896 if (pos < off) { 6897 /* Split frag. 6898 * We have two variants in this case: 6899 * 1. Move all the frag to the second 6900 * part, if it is possible. F.e. 6901 * this approach is mandatory for TUX, 6902 * where splitting is expensive. 6903 * 2. Split is accurately. We make this. 6904 */ 6905 skb_frag_off_add(&shinfo->frags[0], off - pos); 6906 skb_frag_size_sub(&shinfo->frags[0], off - pos); 6907 } 6908 skb_frag_ref(skb, i); 6909 k++; 6910 } 6911 pos += fsize; 6912 } 6913 shinfo->nr_frags = k; 6914 if (skb_has_frag_list(skb)) 6915 skb_clone_fraglist(skb); 6916 6917 /* split line is in frag list */ 6918 if (k == 0 && pskb_carve_frag_list(shinfo, off - pos, gfp_mask)) { 6919 /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. 
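 * Only the cloned frag_list (if any) and the new head allocated above have
 * to be released on this error path.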
*/ 6920 if (skb_has_frag_list(skb)) 6921 kfree_skb_list(skb_shinfo(skb)->frag_list); 6922 skb_kfree_head(data, size); 6923 return -ENOMEM; 6924 } 6925 skb_release_data(skb, SKB_CONSUMED); 6926 6927 skb->head = data; 6928 skb->head_frag = 0; 6929 skb->data = data; 6930 skb_set_end_offset(skb, size); 6931 skb_reset_tail_pointer(skb); 6932 skb_headers_offset_update(skb, 0); 6933 skb->cloned = 0; 6934 skb->hdr_len = 0; 6935 skb->nohdr = 0; 6936 skb->len -= off; 6937 skb->data_len = skb->len; 6938 atomic_set(&skb_shinfo(skb)->dataref, 1); 6939 return 0; 6940 } 6941 6942 /* remove len bytes from the beginning of the skb */ 6943 static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) 6944 { 6945 int headlen = skb_headlen(skb); 6946 6947 if (len < headlen) 6948 return pskb_carve_inside_header(skb, len, headlen, gfp); 6949 else 6950 return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); 6951 } 6952 6953 /* Extract to_copy bytes starting at off from skb, and return this in 6954 * a new skb 6955 */ 6956 struct sk_buff *pskb_extract(struct sk_buff *skb, int off, 6957 int to_copy, gfp_t gfp) 6958 { 6959 struct sk_buff *clone = skb_clone(skb, gfp); 6960 6961 if (!clone) 6962 return NULL; 6963 6964 if (pskb_carve(clone, off, gfp) < 0 || 6965 pskb_trim(clone, to_copy)) { 6966 kfree_skb(clone); 6967 return NULL; 6968 } 6969 return clone; 6970 } 6971 EXPORT_SYMBOL(pskb_extract); 6972 6973 /** 6974 * skb_condense - try to get rid of fragments/frag_list if possible 6975 * @skb: buffer 6976 * 6977 * Can be used to save memory before skb is added to a busy queue. 6978 * If packet has bytes in frags and enough tail room in skb->head, 6979 * pull all of them, so that we can free the frags right now and adjust 6980 * truesize. 6981 * Notes: 6982 * We do not reallocate skb->head thus can not fail. 6983 * Caller must re-evaluate skb->truesize if needed. 6984 */ 6985 void skb_condense(struct sk_buff *skb) 6986 { 6987 if (skb->data_len) { 6988 if (skb->data_len > skb->end - skb->tail || 6989 skb_cloned(skb) || !skb_frags_readable(skb)) 6990 return; 6991 6992 /* Nice, we can free page frag(s) right now */ 6993 __pskb_pull_tail(skb, skb->data_len); 6994 } 6995 /* At this point, skb->truesize might be over estimated, 6996 * because skb had a fragment, and fragments do not tell 6997 * their truesize. 6998 * When we pulled its content into skb->head, fragment 6999 * was freed, but __pskb_pull_tail() could not possibly 7000 * adjust skb->truesize, not knowing the frag truesize. 7001 */ 7002 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 7003 } 7004 EXPORT_SYMBOL(skb_condense); 7005 7006 #ifdef CONFIG_SKB_EXTENSIONS 7007 static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) 7008 { 7009 return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); 7010 } 7011 7012 /** 7013 * __skb_ext_alloc - allocate a new skb extensions storage 7014 * 7015 * @flags: See kmalloc(). 7016 * 7017 * Returns the newly allocated pointer. The pointer can later attached to a 7018 * skb via __skb_ext_set(). 7019 * Note: caller must handle the skb_ext as an opaque data. 
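 *
 * A minimal sketch of the allocate-then-attach pattern (the extension id is
 * only an example):
 *
 *	ext = __skb_ext_alloc(GFP_ATOMIC);
 *	if (!ext)
 *		return -ENOMEM;
 *	sp = __skb_ext_set(skb, SKB_EXT_SEC_PATH, ext);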
7020 */ 7021 struct skb_ext *__skb_ext_alloc(gfp_t flags) 7022 { 7023 struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); 7024 7025 if (new) { 7026 memset(new->offset, 0, sizeof(new->offset)); 7027 refcount_set(&new->refcnt, 1); 7028 } 7029 7030 return new; 7031 } 7032 7033 static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, 7034 unsigned int old_active) 7035 { 7036 struct skb_ext *new; 7037 7038 if (refcount_read(&old->refcnt) == 1) 7039 return old; 7040 7041 new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); 7042 if (!new) 7043 return NULL; 7044 7045 memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); 7046 refcount_set(&new->refcnt, 1); 7047 7048 #ifdef CONFIG_XFRM 7049 if (old_active & (1 << SKB_EXT_SEC_PATH)) { 7050 struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH); 7051 unsigned int i; 7052 7053 for (i = 0; i < sp->len; i++) 7054 xfrm_state_hold(sp->xvec[i]); 7055 } 7056 #endif 7057 #ifdef CONFIG_MCTP_FLOWS 7058 if (old_active & (1 << SKB_EXT_MCTP)) { 7059 struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP); 7060 7061 if (flow->key) 7062 refcount_inc(&flow->key->refs); 7063 } 7064 #endif 7065 __skb_ext_put(old); 7066 return new; 7067 } 7068 7069 /** 7070 * __skb_ext_set - attach the specified extension storage to this skb 7071 * @skb: buffer 7072 * @id: extension id 7073 * @ext: extension storage previously allocated via __skb_ext_alloc() 7074 * 7075 * Existing extensions, if any, are cleared. 7076 * 7077 * Returns the pointer to the extension. 7078 */ 7079 void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, 7080 struct skb_ext *ext) 7081 { 7082 unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); 7083 7084 skb_ext_put(skb); 7085 newlen = newoff + skb_ext_type_len[id]; 7086 ext->chunks = newlen; 7087 ext->offset[id] = newoff; 7088 skb->extensions = ext; 7089 skb->active_extensions = 1 << id; 7090 return skb_ext_get_ptr(ext, id); 7091 } 7092 EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL"); 7093 7094 /** 7095 * skb_ext_add - allocate space for given extension, COW if needed 7096 * @skb: buffer 7097 * @id: extension to allocate space for 7098 * 7099 * Allocates enough space for the given extension. 7100 * If the extension is already present, a pointer to that extension 7101 * is returned. 7102 * 7103 * If the skb was cloned, COW applies and the returned memory can be 7104 * modified without changing the extension space of clones buffers. 7105 * 7106 * Returns pointer to the extension or NULL on allocation failure. 
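 *
 * A minimal sketch (roughly what secpath_set() does):
 *
 *	struct sec_path *sp = skb_ext_add(skb, SKB_EXT_SEC_PATH);
 *	if (!sp)
 *		return NULL;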
7107 */ 7108 void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) 7109 { 7110 struct skb_ext *new, *old = NULL; 7111 unsigned int newlen, newoff; 7112 7113 if (skb->active_extensions) { 7114 old = skb->extensions; 7115 7116 new = skb_ext_maybe_cow(old, skb->active_extensions); 7117 if (!new) 7118 return NULL; 7119 7120 if (__skb_ext_exist(new, id)) 7121 goto set_active; 7122 7123 newoff = new->chunks; 7124 } else { 7125 newoff = SKB_EXT_CHUNKSIZEOF(*new); 7126 7127 new = __skb_ext_alloc(GFP_ATOMIC); 7128 if (!new) 7129 return NULL; 7130 } 7131 7132 newlen = newoff + skb_ext_type_len[id]; 7133 new->chunks = newlen; 7134 new->offset[id] = newoff; 7135 set_active: 7136 skb->slow_gro = 1; 7137 skb->extensions = new; 7138 skb->active_extensions |= 1 << id; 7139 return skb_ext_get_ptr(new, id); 7140 } 7141 EXPORT_SYMBOL(skb_ext_add); 7142 7143 #ifdef CONFIG_XFRM 7144 static void skb_ext_put_sp(struct sec_path *sp) 7145 { 7146 unsigned int i; 7147 7148 for (i = 0; i < sp->len; i++) 7149 xfrm_state_put(sp->xvec[i]); 7150 } 7151 #endif 7152 7153 #ifdef CONFIG_MCTP_FLOWS 7154 static void skb_ext_put_mctp(struct mctp_flow *flow) 7155 { 7156 if (flow->key) 7157 mctp_key_unref(flow->key); 7158 } 7159 #endif 7160 7161 void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) 7162 { 7163 struct skb_ext *ext = skb->extensions; 7164 7165 skb->active_extensions &= ~(1 << id); 7166 if (skb->active_extensions == 0) { 7167 skb->extensions = NULL; 7168 __skb_ext_put(ext); 7169 #ifdef CONFIG_XFRM 7170 } else if (id == SKB_EXT_SEC_PATH && 7171 refcount_read(&ext->refcnt) == 1) { 7172 struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH); 7173 7174 skb_ext_put_sp(sp); 7175 sp->len = 0; 7176 #endif 7177 } 7178 } 7179 EXPORT_SYMBOL(__skb_ext_del); 7180 7181 void __skb_ext_put(struct skb_ext *ext) 7182 { 7183 /* If this is last clone, nothing can increment 7184 * it after check passes. Avoids one atomic op. 7185 */ 7186 if (refcount_read(&ext->refcnt) == 1) 7187 goto free_now; 7188 7189 if (!refcount_dec_and_test(&ext->refcnt)) 7190 return; 7191 free_now: 7192 #ifdef CONFIG_XFRM 7193 if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) 7194 skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); 7195 #endif 7196 #ifdef CONFIG_MCTP_FLOWS 7197 if (__skb_ext_exist(ext, SKB_EXT_MCTP)) 7198 skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP)); 7199 #endif 7200 7201 kmem_cache_free(skbuff_ext_cache, ext); 7202 } 7203 EXPORT_SYMBOL(__skb_ext_put); 7204 #endif /* CONFIG_SKB_EXTENSIONS */ 7205 7206 static void kfree_skb_napi_cache(struct sk_buff *skb) 7207 { 7208 /* if SKB is a clone, don't handle this case */ 7209 if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { 7210 __kfree_skb(skb); 7211 return; 7212 } 7213 7214 local_bh_disable(); 7215 __napi_kfree_skb(skb, SKB_CONSUMED); 7216 local_bh_enable(); 7217 } 7218 7219 /** 7220 * skb_attempt_defer_free - queue skb for remote freeing 7221 * @skb: buffer 7222 * 7223 * Put @skb in a per-cpu list, using the cpu which 7224 * allocated the skb/pages to reduce false sharing 7225 * and memory zone spinlock contention. 
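 *
 * If the skb was allocated on the local CPU, the target CPU is offline, or
 * the per-node deferral queue is already full, the skb is freed immediately
 * through the NAPI cache instead of being deferred.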
7226 */ 7227 void skb_attempt_defer_free(struct sk_buff *skb) 7228 { 7229 struct skb_defer_node *sdn; 7230 unsigned long defer_count; 7231 int cpu = skb->alloc_cpu; 7232 unsigned int defer_max; 7233 bool kick; 7234 7235 if (cpu == raw_smp_processor_id() || 7236 WARN_ON_ONCE(cpu >= nr_cpu_ids) || 7237 !cpu_online(cpu)) { 7238 nodefer: kfree_skb_napi_cache(skb); 7239 return; 7240 } 7241 7242 DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); 7243 DEBUG_NET_WARN_ON_ONCE(skb->destructor); 7244 DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb)); 7245 7246 sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); 7247 7248 defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); 7249 defer_count = atomic_long_inc_return(&sdn->defer_count); 7250 7251 if (defer_count >= defer_max) 7252 goto nodefer; 7253 7254 llist_add(&skb->ll_node, &sdn->defer_list); 7255 7256 /* Send an IPI every time queue reaches half capacity. */ 7257 kick = (defer_count - 1) == (defer_max >> 1); 7258 7259 /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU 7260 * if we are unlucky enough (this seems very unlikely). 7261 */ 7262 if (unlikely(kick)) 7263 kick_defer_list_purge(cpu); 7264 } 7265 7266 static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, 7267 size_t offset, size_t len) 7268 { 7269 const char *kaddr; 7270 __wsum csum; 7271 7272 kaddr = kmap_local_page(page); 7273 csum = csum_partial(kaddr + offset, len, 0); 7274 kunmap_local(kaddr); 7275 skb->csum = csum_block_add(skb->csum, csum, skb->len); 7276 } 7277 7278 /** 7279 * skb_splice_from_iter - Splice (or copy) pages to skbuff 7280 * @skb: The buffer to add pages to 7281 * @iter: Iterator representing the pages to be added 7282 * @maxsize: Maximum amount of pages to be added 7283 * 7284 * This is a common helper function for supporting MSG_SPLICE_PAGES. It 7285 * extracts pages from an iterator and adds them to the socket buffer if 7286 * possible, copying them to fragments if not possible (such as if they're slab 7287 * pages). 7288 * 7289 * Returns the amount of data spliced/copied or -EMSGSIZE if there's 7290 * insufficient space in the buffer to transfer anything. 
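 *
 * A minimal sketch of the sendmsg-side pattern (socket locking and space
 * accounting of a real protocol are omitted):
 *
 *	if (msg->msg_flags & MSG_SPLICE_PAGES) {
 *		copied = skb_splice_from_iter(skb, &msg->msg_iter, space);
 *		if (copied < 0)
 *			goto out_err;
 *	}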
7291 */ 7292 ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, 7293 ssize_t maxsize) 7294 { 7295 size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags); 7296 struct page *pages[8], **ppages = pages; 7297 ssize_t spliced = 0, ret = 0; 7298 unsigned int i; 7299 7300 while (iter->count > 0) { 7301 ssize_t space, nr, len; 7302 size_t off; 7303 7304 ret = -EMSGSIZE; 7305 space = frag_limit - skb_shinfo(skb)->nr_frags; 7306 if (space < 0) 7307 break; 7308 7309 /* We might be able to coalesce without increasing nr_frags */ 7310 nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); 7311 7312 len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off); 7313 if (len <= 0) { 7314 ret = len ?: -EIO; 7315 break; 7316 } 7317 7318 i = 0; 7319 do { 7320 struct page *page = pages[i++]; 7321 size_t part = min_t(size_t, PAGE_SIZE - off, len); 7322 7323 ret = -EIO; 7324 if (WARN_ON_ONCE(!sendpage_ok(page))) 7325 goto out; 7326 7327 ret = skb_append_pagefrags(skb, page, off, part, 7328 frag_limit); 7329 if (ret < 0) { 7330 iov_iter_revert(iter, len); 7331 goto out; 7332 } 7333 7334 if (skb->ip_summed == CHECKSUM_NONE) 7335 skb_splice_csum_page(skb, page, off, part); 7336 7337 off = 0; 7338 spliced += part; 7339 maxsize -= part; 7340 len -= part; 7341 } while (len > 0); 7342 7343 if (maxsize <= 0) 7344 break; 7345 } 7346 7347 out: 7348 skb_len_add(skb, spliced); 7349 return spliced ?: ret; 7350 } 7351 EXPORT_SYMBOL(skb_splice_from_iter); 7352 7353 static __always_inline 7354 size_t memcpy_from_iter_csum(void *iter_from, size_t progress, 7355 size_t len, void *to, void *priv2) 7356 { 7357 __wsum *csum = priv2; 7358 __wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len); 7359 7360 *csum = csum_block_add(*csum, next, progress); 7361 return 0; 7362 } 7363 7364 static __always_inline 7365 size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress, 7366 size_t len, void *to, void *priv2) 7367 { 7368 __wsum next, *csum = priv2; 7369 7370 next = csum_and_copy_from_user(iter_from, to + progress, len); 7371 *csum = csum_block_add(*csum, next, progress); 7372 return next ? 0 : len; 7373 } 7374 7375 bool csum_and_copy_from_iter_full(void *addr, size_t bytes, 7376 __wsum *csum, struct iov_iter *i) 7377 { 7378 size_t copied; 7379 7380 if (WARN_ON_ONCE(!i->data_source)) 7381 return false; 7382 copied = iterate_and_advance2(i, bytes, addr, csum, 7383 copy_from_user_iter_csum, 7384 memcpy_from_iter_csum); 7385 if (likely(copied == bytes)) 7386 return true; 7387 iov_iter_revert(i, copied); 7388 return false; 7389 } 7390 EXPORT_SYMBOL(csum_and_copy_from_iter_full); 7391 7392 void get_netmem(netmem_ref netmem) 7393 { 7394 struct net_iov *niov; 7395 7396 if (netmem_is_net_iov(netmem)) { 7397 niov = netmem_to_net_iov(netmem); 7398 if (net_is_devmem_iov(niov)) 7399 net_devmem_get_net_iov(netmem_to_net_iov(netmem)); 7400 return; 7401 } 7402 get_page(netmem_to_page(netmem)); 7403 } 7404 EXPORT_SYMBOL(get_netmem); 7405 7406 void put_netmem(netmem_ref netmem) 7407 { 7408 struct net_iov *niov; 7409 7410 if (netmem_is_net_iov(netmem)) { 7411 niov = netmem_to_net_iov(netmem); 7412 if (net_is_devmem_iov(niov)) 7413 net_devmem_put_net_iov(netmem_to_net_iov(netmem)); 7414 return; 7415 } 7416 7417 put_page(netmem_to_page(netmem)); 7418 } 7419 EXPORT_SYMBOL(put_netmem); 7420
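/* A minimal sketch of how csum_and_copy_from_iter_full() is typically used
 * when filling an skb from a user iterator while accumulating the packet
 * checksum (the wrapper below is illustrative, not part of this file):
 *
 *	static int example_copy_and_csum(struct sk_buff *skb, void *to,
 *					 int len, struct iov_iter *from)
 *	{
 *		__wsum csum = 0;
 *
 *		if (!csum_and_copy_from_iter_full(to, len, &csum, from))
 *			return -EFAULT;
 *
 *		skb->csum = csum_block_add(skb->csum, csum, skb->len);
 *		return 0;
 *	}
 */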