1 /* 2 * Routines having to do with the 'struct sk_buff' memory handlers. 3 * 4 * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> 5 * Florian La Roche <rzsfl@rz.uni-sb.de> 6 * 7 * Fixes: 8 * Alan Cox : Fixed the worst of the load 9 * balancer bugs. 10 * Dave Platt : Interrupt stacking fix. 11 * Richard Kooijman : Timestamp fixes. 12 * Alan Cox : Changed buffer format. 13 * Alan Cox : destructor hook for AF_UNIX etc. 14 * Linus Torvalds : Better skb_clone. 15 * Alan Cox : Added skb_copy. 16 * Alan Cox : Added all the changed routines Linus 17 * only put in the headers 18 * Ray VanTassle : Fixed --skb->lock in free 19 * Alan Cox : skb_copy copy arp field 20 * Andi Kleen : slabified it. 21 * Robert Olsson : Removed skb_head_pool 22 * 23 * NOTE: 24 * The __skb_ routines should be called with interrupts 25 * disabled, or you better be *real* sure that the operation is atomic 26 * with respect to whatever list is being frobbed (e.g. via lock_sock() 27 * or via disabling bottom half handlers, etc). 28 * 29 * This program is free software; you can redistribute it and/or 30 * modify it under the terms of the GNU General Public License 31 * as published by the Free Software Foundation; either version 32 * 2 of the License, or (at your option) any later version. 33 */ 34 35 /* 36 * The functions in this file will not compile correctly with gcc 2.4.x 37 */ 38 39 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 40 41 #include <linux/module.h> 42 #include <linux/types.h> 43 #include <linux/kernel.h> 44 #include <linux/kmemcheck.h> 45 #include <linux/mm.h> 46 #include <linux/interrupt.h> 47 #include <linux/in.h> 48 #include <linux/inet.h> 49 #include <linux/slab.h> 50 #include <linux/netdevice.h> 51 #ifdef CONFIG_NET_CLS_ACT 52 #include <net/pkt_sched.h> 53 #endif 54 #include <linux/string.h> 55 #include <linux/skbuff.h> 56 #include <linux/splice.h> 57 #include <linux/cache.h> 58 #include <linux/rtnetlink.h> 59 #include <linux/init.h> 60 #include <linux/scatterlist.h> 61 #include <linux/errqueue.h> 62 #include <linux/prefetch.h> 63 64 #include <net/protocol.h> 65 #include <net/dst.h> 66 #include <net/sock.h> 67 #include <net/checksum.h> 68 #include <net/xfrm.h> 69 70 #include <asm/uaccess.h> 71 #include <trace/events/skb.h> 72 #include <linux/highmem.h> 73 74 struct kmem_cache *skbuff_head_cache __read_mostly; 75 static struct kmem_cache *skbuff_fclone_cache __read_mostly; 76 77 static void sock_pipe_buf_release(struct pipe_inode_info *pipe, 78 struct pipe_buffer *buf) 79 { 80 put_page(buf->page); 81 } 82 83 static void sock_pipe_buf_get(struct pipe_inode_info *pipe, 84 struct pipe_buffer *buf) 85 { 86 get_page(buf->page); 87 } 88 89 static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, 90 struct pipe_buffer *buf) 91 { 92 return 1; 93 } 94 95 96 /* Pipe buffer operations for a socket. */ 97 static const struct pipe_buf_operations sock_pipe_buf_ops = { 98 .can_merge = 0, 99 .map = generic_pipe_buf_map, 100 .unmap = generic_pipe_buf_unmap, 101 .confirm = generic_pipe_buf_confirm, 102 .release = sock_pipe_buf_release, 103 .steal = sock_pipe_buf_steal, 104 .get = sock_pipe_buf_get, 105 }; 106 107 /* 108 * Keep out-of-line to prevent kernel bloat. 109 * __builtin_return_address is not used because it is not always 110 * reliable. 111 */ 112 113 /** 114 * skb_over_panic - private function 115 * @skb: buffer 116 * @sz: size 117 * @here: address 118 * 119 * Out of line support code for skb_put(). Not user callable. 
120 */ 121 static void skb_over_panic(struct sk_buff *skb, int sz, void *here) 122 { 123 pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", 124 __func__, here, skb->len, sz, skb->head, skb->data, 125 (unsigned long)skb->tail, (unsigned long)skb->end, 126 skb->dev ? skb->dev->name : "<NULL>"); 127 BUG(); 128 } 129 130 /** 131 * skb_under_panic - private function 132 * @skb: buffer 133 * @sz: size 134 * @here: address 135 * 136 * Out of line support code for skb_push(). Not user callable. 137 */ 138 139 static void skb_under_panic(struct sk_buff *skb, int sz, void *here) 140 { 141 pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", 142 __func__, here, skb->len, sz, skb->head, skb->data, 143 (unsigned long)skb->tail, (unsigned long)skb->end, 144 skb->dev ? skb->dev->name : "<NULL>"); 145 BUG(); 146 } 147 148 /* Allocate a new skbuff. We do this ourselves so we can fill in a few 149 * 'private' fields and also do memory statistics to find all the 150 * [BEEP] leaks. 151 * 152 */ 153 154 /** 155 * __alloc_skb - allocate a network buffer 156 * @size: size to allocate 157 * @gfp_mask: allocation mask 158 * @fclone: allocate from fclone cache instead of head cache 159 * and allocate a cloned (child) skb 160 * @node: numa node to allocate memory on 161 * 162 * Allocate a new &sk_buff. The returned buffer has no headroom and a 163 * tail room of size bytes. The object has a reference count of one. 164 * The return is the buffer. On a failure the return is %NULL. 165 * 166 * Buffers may only be allocated from interrupts using a @gfp_mask of 167 * %GFP_ATOMIC. 168 */ 169 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, 170 int fclone, int node) 171 { 172 struct kmem_cache *cache; 173 struct skb_shared_info *shinfo; 174 struct sk_buff *skb; 175 u8 *data; 176 177 cache = fclone ? skbuff_fclone_cache : skbuff_head_cache; 178 179 /* Get the HEAD */ 180 skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); 181 if (!skb) 182 goto out; 183 prefetchw(skb); 184 185 /* We do our best to align skb_shared_info on a separate cache 186 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives 187 * aligned memory blocks, unless SLUB/SLAB debug is enabled. 188 * Both skb->head and skb_shared_info are cache line aligned. 189 */ 190 size = SKB_DATA_ALIGN(size); 191 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 192 data = kmalloc_node_track_caller(size, gfp_mask, node); 193 if (!data) 194 goto nodata; 195 /* kmalloc(size) might give us more room than requested. 196 * Put skb_shared_info exactly at the end of allocated zone, 197 * to allow max possible filling before reallocation. 198 */ 199 size = SKB_WITH_OVERHEAD(ksize(data)); 200 prefetchw(data + size); 201 202 /* 203 * Only clear those fields we need to clear, not those that we will 204 * actually initialise below. Hence, don't put any more fields after 205 * the tail pointer in struct sk_buff! 
206 */ 207 memset(skb, 0, offsetof(struct sk_buff, tail)); 208 /* Account for allocated memory : skb + skb->head */ 209 skb->truesize = SKB_TRUESIZE(size); 210 atomic_set(&skb->users, 1); 211 skb->head = data; 212 skb->data = data; 213 skb_reset_tail_pointer(skb); 214 skb->end = skb->tail + size; 215 #ifdef NET_SKBUFF_DATA_USES_OFFSET 216 skb->mac_header = ~0U; 217 #endif 218 219 /* make sure we initialize shinfo sequentially */ 220 shinfo = skb_shinfo(skb); 221 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); 222 atomic_set(&shinfo->dataref, 1); 223 kmemcheck_annotate_variable(shinfo->destructor_arg); 224 225 if (fclone) { 226 struct sk_buff *child = skb + 1; 227 atomic_t *fclone_ref = (atomic_t *) (child + 1); 228 229 kmemcheck_annotate_bitfield(child, flags1); 230 kmemcheck_annotate_bitfield(child, flags2); 231 skb->fclone = SKB_FCLONE_ORIG; 232 atomic_set(fclone_ref, 1); 233 234 child->fclone = SKB_FCLONE_UNAVAILABLE; 235 } 236 out: 237 return skb; 238 nodata: 239 kmem_cache_free(cache, skb); 240 skb = NULL; 241 goto out; 242 } 243 EXPORT_SYMBOL(__alloc_skb); 244 245 /** 246 * build_skb - build a network buffer 247 * @data: data buffer provided by caller 248 * @frag_size: size of fragment, or 0 if head was kmalloced 249 * 250 * Allocate a new &sk_buff. Caller provides space holding head and 251 * skb_shared_info. @data must have been allocated by kmalloc() 252 * The return is the new skb buffer. 253 * On a failure the return is %NULL, and @data is not freed. 254 * Notes : 255 * Before IO, driver allocates only data buffer where NIC put incoming frame 256 * Driver should add room at head (NET_SKB_PAD) and 257 * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) 258 * After IO, driver calls build_skb(), to allocate sk_buff and populate it 259 * before giving packet to stack. 260 * RX rings only contains data buffers, not full skbs. 261 */ 262 struct sk_buff *build_skb(void *data, unsigned int frag_size) 263 { 264 struct skb_shared_info *shinfo; 265 struct sk_buff *skb; 266 unsigned int size = frag_size ? : ksize(data); 267 268 skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); 269 if (!skb) 270 return NULL; 271 272 size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 273 274 memset(skb, 0, offsetof(struct sk_buff, tail)); 275 skb->truesize = SKB_TRUESIZE(size); 276 skb->head_frag = frag_size != 0; 277 atomic_set(&skb->users, 1); 278 skb->head = data; 279 skb->data = data; 280 skb_reset_tail_pointer(skb); 281 skb->end = skb->tail + size; 282 #ifdef NET_SKBUFF_DATA_USES_OFFSET 283 skb->mac_header = ~0U; 284 #endif 285 286 /* make sure we initialize shinfo sequentially */ 287 shinfo = skb_shinfo(skb); 288 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); 289 atomic_set(&shinfo->dataref, 1); 290 kmemcheck_annotate_variable(shinfo->destructor_arg); 291 292 return skb; 293 } 294 EXPORT_SYMBOL(build_skb); 295 296 struct netdev_alloc_cache { 297 struct page *page; 298 unsigned int offset; 299 }; 300 static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); 301 302 /** 303 * netdev_alloc_frag - allocate a page fragment 304 * @fragsz: fragment size 305 * 306 * Allocates a frag from a page for receive buffer. 307 * Uses GFP_ATOMIC allocations. 
308 */ 309 void *netdev_alloc_frag(unsigned int fragsz) 310 { 311 struct netdev_alloc_cache *nc; 312 void *data = NULL; 313 unsigned long flags; 314 315 local_irq_save(flags); 316 nc = &__get_cpu_var(netdev_alloc_cache); 317 if (unlikely(!nc->page)) { 318 refill: 319 nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD); 320 nc->offset = 0; 321 } 322 if (likely(nc->page)) { 323 if (nc->offset + fragsz > PAGE_SIZE) { 324 put_page(nc->page); 325 goto refill; 326 } 327 data = page_address(nc->page) + nc->offset; 328 nc->offset += fragsz; 329 get_page(nc->page); 330 } 331 local_irq_restore(flags); 332 return data; 333 } 334 EXPORT_SYMBOL(netdev_alloc_frag); 335 336 /** 337 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device 338 * @dev: network device to receive on 339 * @length: length to allocate 340 * @gfp_mask: get_free_pages mask, passed to alloc_skb 341 * 342 * Allocate a new &sk_buff and assign it a usage count of one. The 343 * buffer has unspecified headroom built in. Users should allocate 344 * the headroom they think they need without accounting for the 345 * built in space. The built in space is used for optimisations. 346 * 347 * %NULL is returned if there is no free memory. 348 */ 349 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, 350 unsigned int length, gfp_t gfp_mask) 351 { 352 struct sk_buff *skb = NULL; 353 unsigned int fragsz = SKB_DATA_ALIGN(length + NET_SKB_PAD) + 354 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 355 356 if (fragsz <= PAGE_SIZE && !(gfp_mask & __GFP_WAIT)) { 357 void *data = netdev_alloc_frag(fragsz); 358 359 if (likely(data)) { 360 skb = build_skb(data, fragsz); 361 if (unlikely(!skb)) 362 put_page(virt_to_head_page(data)); 363 } 364 } else { 365 skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); 366 } 367 if (likely(skb)) { 368 skb_reserve(skb, NET_SKB_PAD); 369 skb->dev = dev; 370 } 371 return skb; 372 } 373 EXPORT_SYMBOL(__netdev_alloc_skb); 374 375 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, 376 int size, unsigned int truesize) 377 { 378 skb_fill_page_desc(skb, i, page, off, size); 379 skb->len += size; 380 skb->data_len += size; 381 skb->truesize += truesize; 382 } 383 EXPORT_SYMBOL(skb_add_rx_frag); 384 385 static void skb_drop_list(struct sk_buff **listp) 386 { 387 struct sk_buff *list = *listp; 388 389 *listp = NULL; 390 391 do { 392 struct sk_buff *this = list; 393 list = list->next; 394 kfree_skb(this); 395 } while (list); 396 } 397 398 static inline void skb_drop_fraglist(struct sk_buff *skb) 399 { 400 skb_drop_list(&skb_shinfo(skb)->frag_list); 401 } 402 403 static void skb_clone_fraglist(struct sk_buff *skb) 404 { 405 struct sk_buff *list; 406 407 skb_walk_frags(skb, list) 408 skb_get(list); 409 } 410 411 static void skb_free_head(struct sk_buff *skb) 412 { 413 if (skb->head_frag) 414 put_page(virt_to_head_page(skb->head)); 415 else 416 kfree(skb->head); 417 } 418 419 static void skb_release_data(struct sk_buff *skb) 420 { 421 if (!skb->cloned || 422 !atomic_sub_return(skb->nohdr ? 
(1 << SKB_DATAREF_SHIFT) + 1 : 1, 423 &skb_shinfo(skb)->dataref)) { 424 if (skb_shinfo(skb)->nr_frags) { 425 int i; 426 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 427 skb_frag_unref(skb, i); 428 } 429 430 /* 431 * If skb buf is from userspace, we need to notify the caller 432 * the lower device DMA has done; 433 */ 434 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { 435 struct ubuf_info *uarg; 436 437 uarg = skb_shinfo(skb)->destructor_arg; 438 if (uarg->callback) 439 uarg->callback(uarg); 440 } 441 442 if (skb_has_frag_list(skb)) 443 skb_drop_fraglist(skb); 444 445 skb_free_head(skb); 446 } 447 } 448 449 /* 450 * Free an skbuff by memory without cleaning the state. 451 */ 452 static void kfree_skbmem(struct sk_buff *skb) 453 { 454 struct sk_buff *other; 455 atomic_t *fclone_ref; 456 457 switch (skb->fclone) { 458 case SKB_FCLONE_UNAVAILABLE: 459 kmem_cache_free(skbuff_head_cache, skb); 460 break; 461 462 case SKB_FCLONE_ORIG: 463 fclone_ref = (atomic_t *) (skb + 2); 464 if (atomic_dec_and_test(fclone_ref)) 465 kmem_cache_free(skbuff_fclone_cache, skb); 466 break; 467 468 case SKB_FCLONE_CLONE: 469 fclone_ref = (atomic_t *) (skb + 1); 470 other = skb - 1; 471 472 /* The clone portion is available for 473 * fast-cloning again. 474 */ 475 skb->fclone = SKB_FCLONE_UNAVAILABLE; 476 477 if (atomic_dec_and_test(fclone_ref)) 478 kmem_cache_free(skbuff_fclone_cache, other); 479 break; 480 } 481 } 482 483 static void skb_release_head_state(struct sk_buff *skb) 484 { 485 skb_dst_drop(skb); 486 #ifdef CONFIG_XFRM 487 secpath_put(skb->sp); 488 #endif 489 if (skb->destructor) { 490 WARN_ON(in_irq()); 491 skb->destructor(skb); 492 } 493 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 494 nf_conntrack_put(skb->nfct); 495 #endif 496 #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED 497 nf_conntrack_put_reasm(skb->nfct_reasm); 498 #endif 499 #ifdef CONFIG_BRIDGE_NETFILTER 500 nf_bridge_put(skb->nf_bridge); 501 #endif 502 /* XXX: IS this still necessary? - JHS */ 503 #ifdef CONFIG_NET_SCHED 504 skb->tc_index = 0; 505 #ifdef CONFIG_NET_CLS_ACT 506 skb->tc_verd = 0; 507 #endif 508 #endif 509 } 510 511 /* Free everything but the sk_buff shell. */ 512 static void skb_release_all(struct sk_buff *skb) 513 { 514 skb_release_head_state(skb); 515 skb_release_data(skb); 516 } 517 518 /** 519 * __kfree_skb - private function 520 * @skb: buffer 521 * 522 * Free an sk_buff. Release anything attached to the buffer. 523 * Clean the state. This is an internal helper function. Users should 524 * always call kfree_skb 525 */ 526 527 void __kfree_skb(struct sk_buff *skb) 528 { 529 skb_release_all(skb); 530 kfree_skbmem(skb); 531 } 532 EXPORT_SYMBOL(__kfree_skb); 533 534 /** 535 * kfree_skb - free an sk_buff 536 * @skb: buffer to free 537 * 538 * Drop a reference to the buffer and free it if the usage count has 539 * hit zero. 
540 */ 541 void kfree_skb(struct sk_buff *skb) 542 { 543 if (unlikely(!skb)) 544 return; 545 if (likely(atomic_read(&skb->users) == 1)) 546 smp_rmb(); 547 else if (likely(!atomic_dec_and_test(&skb->users))) 548 return; 549 trace_kfree_skb(skb, __builtin_return_address(0)); 550 __kfree_skb(skb); 551 } 552 EXPORT_SYMBOL(kfree_skb); 553 554 /** 555 * consume_skb - free an skbuff 556 * @skb: buffer to free 557 * 558 * Drop a ref to the buffer and free it if the usage count has hit zero 559 * Functions identically to kfree_skb, but kfree_skb assumes that the frame 560 * is being dropped after a failure and notes that 561 */ 562 void consume_skb(struct sk_buff *skb) 563 { 564 if (unlikely(!skb)) 565 return; 566 if (likely(atomic_read(&skb->users) == 1)) 567 smp_rmb(); 568 else if (likely(!atomic_dec_and_test(&skb->users))) 569 return; 570 trace_consume_skb(skb); 571 __kfree_skb(skb); 572 } 573 EXPORT_SYMBOL(consume_skb); 574 575 /** 576 * skb_recycle - clean up an skb for reuse 577 * @skb: buffer 578 * 579 * Recycles the skb to be reused as a receive buffer. This 580 * function does any necessary reference count dropping, and 581 * cleans up the skbuff as if it just came from __alloc_skb(). 582 */ 583 void skb_recycle(struct sk_buff *skb) 584 { 585 struct skb_shared_info *shinfo; 586 587 skb_release_head_state(skb); 588 589 shinfo = skb_shinfo(skb); 590 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); 591 atomic_set(&shinfo->dataref, 1); 592 593 memset(skb, 0, offsetof(struct sk_buff, tail)); 594 skb->data = skb->head + NET_SKB_PAD; 595 skb_reset_tail_pointer(skb); 596 } 597 EXPORT_SYMBOL(skb_recycle); 598 599 /** 600 * skb_recycle_check - check if skb can be reused for receive 601 * @skb: buffer 602 * @skb_size: minimum receive buffer size 603 * 604 * Checks that the skb passed in is not shared or cloned, and 605 * that it is linear and its head portion at least as large as 606 * skb_size so that it can be recycled as a receive buffer. 607 * If these conditions are met, this function does any necessary 608 * reference count dropping and cleans up the skbuff as if it 609 * just came from __alloc_skb(). 
610 */ 611 bool skb_recycle_check(struct sk_buff *skb, int skb_size) 612 { 613 if (!skb_is_recycleable(skb, skb_size)) 614 return false; 615 616 skb_recycle(skb); 617 618 return true; 619 } 620 EXPORT_SYMBOL(skb_recycle_check); 621 622 static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) 623 { 624 new->tstamp = old->tstamp; 625 new->dev = old->dev; 626 new->transport_header = old->transport_header; 627 new->network_header = old->network_header; 628 new->mac_header = old->mac_header; 629 skb_dst_copy(new, old); 630 new->rxhash = old->rxhash; 631 new->ooo_okay = old->ooo_okay; 632 new->l4_rxhash = old->l4_rxhash; 633 new->no_fcs = old->no_fcs; 634 #ifdef CONFIG_XFRM 635 new->sp = secpath_get(old->sp); 636 #endif 637 memcpy(new->cb, old->cb, sizeof(old->cb)); 638 new->csum = old->csum; 639 new->local_df = old->local_df; 640 new->pkt_type = old->pkt_type; 641 new->ip_summed = old->ip_summed; 642 skb_copy_queue_mapping(new, old); 643 new->priority = old->priority; 644 #if IS_ENABLED(CONFIG_IP_VS) 645 new->ipvs_property = old->ipvs_property; 646 #endif 647 new->protocol = old->protocol; 648 new->mark = old->mark; 649 new->skb_iif = old->skb_iif; 650 __nf_copy(new, old); 651 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) 652 new->nf_trace = old->nf_trace; 653 #endif 654 #ifdef CONFIG_NET_SCHED 655 new->tc_index = old->tc_index; 656 #ifdef CONFIG_NET_CLS_ACT 657 new->tc_verd = old->tc_verd; 658 #endif 659 #endif 660 new->vlan_tci = old->vlan_tci; 661 662 skb_copy_secmark(new, old); 663 } 664 665 /* 666 * You should not add any new code to this function. Add it to 667 * __copy_skb_header above instead. 668 */ 669 static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) 670 { 671 #define C(x) n->x = skb->x 672 673 n->next = n->prev = NULL; 674 n->sk = NULL; 675 __copy_skb_header(n, skb); 676 677 C(len); 678 C(data_len); 679 C(mac_len); 680 n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; 681 n->cloned = 1; 682 n->nohdr = 0; 683 n->destructor = NULL; 684 C(tail); 685 C(end); 686 C(head); 687 C(head_frag); 688 C(data); 689 C(truesize); 690 atomic_set(&n->users, 1); 691 692 atomic_inc(&(skb_shinfo(skb)->dataref)); 693 skb->cloned = 1; 694 695 return n; 696 #undef C 697 } 698 699 /** 700 * skb_morph - morph one skb into another 701 * @dst: the skb to receive the contents 702 * @src: the skb to supply the contents 703 * 704 * This is identical to skb_clone except that the target skb is 705 * supplied by the user. 706 * 707 * The target skb is returned upon exit. 708 */ 709 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) 710 { 711 skb_release_all(dst); 712 return __skb_clone(dst, src); 713 } 714 EXPORT_SYMBOL_GPL(skb_morph); 715 716 /* skb_copy_ubufs - copy userspace skb frags buffers to kernel 717 * @skb: the skb to modify 718 * @gfp_mask: allocation priority 719 * 720 * This must be called on SKBTX_DEV_ZEROCOPY skb. 721 * It will copy all frags into kernel and drop the reference 722 * to userspace pages. 723 * 724 * If this function is called from an interrupt gfp_mask() must be 725 * %GFP_ATOMIC. 726 * 727 * Returns 0 on success or a negative error code on failure 728 * to allocate kernel memory to copy to. 
729 */ 730 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) 731 { 732 int i; 733 int num_frags = skb_shinfo(skb)->nr_frags; 734 struct page *page, *head = NULL; 735 struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; 736 737 for (i = 0; i < num_frags; i++) { 738 u8 *vaddr; 739 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 740 741 page = alloc_page(GFP_ATOMIC); 742 if (!page) { 743 while (head) { 744 struct page *next = (struct page *)head->private; 745 put_page(head); 746 head = next; 747 } 748 return -ENOMEM; 749 } 750 vaddr = kmap_atomic(skb_frag_page(f)); 751 memcpy(page_address(page), 752 vaddr + f->page_offset, skb_frag_size(f)); 753 kunmap_atomic(vaddr); 754 page->private = (unsigned long)head; 755 head = page; 756 } 757 758 /* skb frags release userspace buffers */ 759 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 760 skb_frag_unref(skb, i); 761 762 uarg->callback(uarg); 763 764 /* skb frags point to kernel buffers */ 765 for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) { 766 __skb_fill_page_desc(skb, i-1, head, 0, 767 skb_shinfo(skb)->frags[i - 1].size); 768 head = (struct page *)head->private; 769 } 770 771 skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; 772 return 0; 773 } 774 775 776 /** 777 * skb_clone - duplicate an sk_buff 778 * @skb: buffer to clone 779 * @gfp_mask: allocation priority 780 * 781 * Duplicate an &sk_buff. The new one is not owned by a socket. Both 782 * copies share the same packet data but not structure. The new 783 * buffer has a reference count of 1. If the allocation fails the 784 * function returns %NULL otherwise the new buffer is returned. 785 * 786 * If this function is called from an interrupt gfp_mask() must be 787 * %GFP_ATOMIC. 788 */ 789 790 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) 791 { 792 struct sk_buff *n; 793 794 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { 795 if (skb_copy_ubufs(skb, gfp_mask)) 796 return NULL; 797 } 798 799 n = skb + 1; 800 if (skb->fclone == SKB_FCLONE_ORIG && 801 n->fclone == SKB_FCLONE_UNAVAILABLE) { 802 atomic_t *fclone_ref = (atomic_t *) (n + 1); 803 n->fclone = SKB_FCLONE_CLONE; 804 atomic_inc(fclone_ref); 805 } else { 806 n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); 807 if (!n) 808 return NULL; 809 810 kmemcheck_annotate_bitfield(n, flags1); 811 kmemcheck_annotate_bitfield(n, flags2); 812 n->fclone = SKB_FCLONE_UNAVAILABLE; 813 } 814 815 return __skb_clone(n, skb); 816 } 817 EXPORT_SYMBOL(skb_clone); 818 819 static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) 820 { 821 #ifndef NET_SKBUFF_DATA_USES_OFFSET 822 /* 823 * Shift between the two data areas in bytes 824 */ 825 unsigned long offset = new->data - old->data; 826 #endif 827 828 __copy_skb_header(new, old); 829 830 #ifndef NET_SKBUFF_DATA_USES_OFFSET 831 /* {transport,network,mac}_header are relative to skb->head */ 832 new->transport_header += offset; 833 new->network_header += offset; 834 if (skb_mac_header_was_set(new)) 835 new->mac_header += offset; 836 #endif 837 skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; 838 skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; 839 skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; 840 } 841 842 /** 843 * skb_copy - create private copy of an sk_buff 844 * @skb: buffer to copy 845 * @gfp_mask: allocation priority 846 * 847 * Make a copy of both an &sk_buff and its data. This is used when the 848 * caller wishes to modify the data and needs a private copy of the 849 * data to alter. 
Returns %NULL on failure or the pointer to the buffer 850 * on success. The returned buffer has a reference count of 1. 851 * 852 * As by-product this function converts non-linear &sk_buff to linear 853 * one, so that &sk_buff becomes completely private and caller is allowed 854 * to modify all the data of returned buffer. This means that this 855 * function is not recommended for use in circumstances when only 856 * header is going to be modified. Use pskb_copy() instead. 857 */ 858 859 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) 860 { 861 int headerlen = skb_headroom(skb); 862 unsigned int size = skb_end_offset(skb) + skb->data_len; 863 struct sk_buff *n = alloc_skb(size, gfp_mask); 864 865 if (!n) 866 return NULL; 867 868 /* Set the data pointer */ 869 skb_reserve(n, headerlen); 870 /* Set the tail pointer and length */ 871 skb_put(n, skb->len); 872 873 if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) 874 BUG(); 875 876 copy_skb_header(n, skb); 877 return n; 878 } 879 EXPORT_SYMBOL(skb_copy); 880 881 /** 882 * __pskb_copy - create copy of an sk_buff with private head. 883 * @skb: buffer to copy 884 * @headroom: headroom of new skb 885 * @gfp_mask: allocation priority 886 * 887 * Make a copy of both an &sk_buff and part of its data, located 888 * in header. Fragmented data remain shared. This is used when 889 * the caller wishes to modify only header of &sk_buff and needs 890 * private copy of the header to alter. Returns %NULL on failure 891 * or the pointer to the buffer on success. 892 * The returned buffer has a reference count of 1. 893 */ 894 895 struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) 896 { 897 unsigned int size = skb_headlen(skb) + headroom; 898 struct sk_buff *n = alloc_skb(size, gfp_mask); 899 900 if (!n) 901 goto out; 902 903 /* Set the data pointer */ 904 skb_reserve(n, headroom); 905 /* Set the tail pointer and length */ 906 skb_put(n, skb_headlen(skb)); 907 /* Copy the bytes */ 908 skb_copy_from_linear_data(skb, n->data, n->len); 909 910 n->truesize += skb->data_len; 911 n->data_len = skb->data_len; 912 n->len = skb->len; 913 914 if (skb_shinfo(skb)->nr_frags) { 915 int i; 916 917 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { 918 if (skb_copy_ubufs(skb, gfp_mask)) { 919 kfree_skb(n); 920 n = NULL; 921 goto out; 922 } 923 } 924 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 925 skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; 926 skb_frag_ref(skb, i); 927 } 928 skb_shinfo(n)->nr_frags = i; 929 } 930 931 if (skb_has_frag_list(skb)) { 932 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; 933 skb_clone_fraglist(n); 934 } 935 936 copy_skb_header(n, skb); 937 out: 938 return n; 939 } 940 EXPORT_SYMBOL(__pskb_copy); 941 942 /** 943 * pskb_expand_head - reallocate header of &sk_buff 944 * @skb: buffer to reallocate 945 * @nhead: room to add at head 946 * @ntail: room to add at tail 947 * @gfp_mask: allocation priority 948 * 949 * Expands (or creates identical copy, if &nhead and &ntail are zero) 950 * header of skb. &sk_buff itself is not changed. &sk_buff MUST have 951 * reference count of 1. Returns zero in the case of success or error, 952 * if expansion failed. In the last case, &sk_buff is not changed. 953 * 954 * All the pointers pointing into skb header may change and must be 955 * reloaded after call to this function. 
956 */ 957 958 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, 959 gfp_t gfp_mask) 960 { 961 int i; 962 u8 *data; 963 int size = nhead + skb_end_offset(skb) + ntail; 964 long off; 965 966 BUG_ON(nhead < 0); 967 968 if (skb_shared(skb)) 969 BUG(); 970 971 size = SKB_DATA_ALIGN(size); 972 973 data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), 974 gfp_mask); 975 if (!data) 976 goto nodata; 977 size = SKB_WITH_OVERHEAD(ksize(data)); 978 979 /* Copy only real data... and, alas, header. This should be 980 * optimized for the cases when header is void. 981 */ 982 memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); 983 984 memcpy((struct skb_shared_info *)(data + size), 985 skb_shinfo(skb), 986 offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); 987 988 /* 989 * if shinfo is shared we must drop the old head gracefully, but if it 990 * is not we can just drop the old head and let the existing refcount 991 * be since all we did is relocate the values 992 */ 993 if (skb_cloned(skb)) { 994 /* copy this zero copy skb frags */ 995 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { 996 if (skb_copy_ubufs(skb, gfp_mask)) 997 goto nofrags; 998 } 999 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 1000 skb_frag_ref(skb, i); 1001 1002 if (skb_has_frag_list(skb)) 1003 skb_clone_fraglist(skb); 1004 1005 skb_release_data(skb); 1006 } else { 1007 skb_free_head(skb); 1008 } 1009 off = (data + nhead) - skb->head; 1010 1011 skb->head = data; 1012 skb->head_frag = 0; 1013 skb->data += off; 1014 #ifdef NET_SKBUFF_DATA_USES_OFFSET 1015 skb->end = size; 1016 off = nhead; 1017 #else 1018 skb->end = skb->head + size; 1019 #endif 1020 /* {transport,network,mac}_header and tail are relative to skb->head */ 1021 skb->tail += off; 1022 skb->transport_header += off; 1023 skb->network_header += off; 1024 if (skb_mac_header_was_set(skb)) 1025 skb->mac_header += off; 1026 /* Only adjust this if it actually is csum_start rather than csum */ 1027 if (skb->ip_summed == CHECKSUM_PARTIAL) 1028 skb->csum_start += nhead; 1029 skb->cloned = 0; 1030 skb->hdr_len = 0; 1031 skb->nohdr = 0; 1032 atomic_set(&skb_shinfo(skb)->dataref, 1); 1033 return 0; 1034 1035 nofrags: 1036 kfree(data); 1037 nodata: 1038 return -ENOMEM; 1039 } 1040 EXPORT_SYMBOL(pskb_expand_head); 1041 1042 /* Make private copy of skb with writable head and some headroom */ 1043 1044 struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) 1045 { 1046 struct sk_buff *skb2; 1047 int delta = headroom - skb_headroom(skb); 1048 1049 if (delta <= 0) 1050 skb2 = pskb_copy(skb, GFP_ATOMIC); 1051 else { 1052 skb2 = skb_clone(skb, GFP_ATOMIC); 1053 if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, 1054 GFP_ATOMIC)) { 1055 kfree_skb(skb2); 1056 skb2 = NULL; 1057 } 1058 } 1059 return skb2; 1060 } 1061 EXPORT_SYMBOL(skb_realloc_headroom); 1062 1063 /** 1064 * skb_copy_expand - copy and expand sk_buff 1065 * @skb: buffer to copy 1066 * @newheadroom: new free bytes at head 1067 * @newtailroom: new free bytes at tail 1068 * @gfp_mask: allocation priority 1069 * 1070 * Make a copy of both an &sk_buff and its data and while doing so 1071 * allocate additional space. 1072 * 1073 * This is used when the caller wishes to modify the data and needs a 1074 * private copy of the data to alter as well as more space for new fields. 1075 * Returns %NULL on failure or the pointer to the buffer 1076 * on success. The returned buffer has a reference count of 1. 
1077 * 1078 * You must pass %GFP_ATOMIC as the allocation priority if this function 1079 * is called from an interrupt. 1080 */ 1081 struct sk_buff *skb_copy_expand(const struct sk_buff *skb, 1082 int newheadroom, int newtailroom, 1083 gfp_t gfp_mask) 1084 { 1085 /* 1086 * Allocate the copy buffer 1087 */ 1088 struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, 1089 gfp_mask); 1090 int oldheadroom = skb_headroom(skb); 1091 int head_copy_len, head_copy_off; 1092 int off; 1093 1094 if (!n) 1095 return NULL; 1096 1097 skb_reserve(n, newheadroom); 1098 1099 /* Set the tail pointer and length */ 1100 skb_put(n, skb->len); 1101 1102 head_copy_len = oldheadroom; 1103 head_copy_off = 0; 1104 if (newheadroom <= head_copy_len) 1105 head_copy_len = newheadroom; 1106 else 1107 head_copy_off = newheadroom - head_copy_len; 1108 1109 /* Copy the linear header and data. */ 1110 if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, 1111 skb->len + head_copy_len)) 1112 BUG(); 1113 1114 copy_skb_header(n, skb); 1115 1116 off = newheadroom - oldheadroom; 1117 if (n->ip_summed == CHECKSUM_PARTIAL) 1118 n->csum_start += off; 1119 #ifdef NET_SKBUFF_DATA_USES_OFFSET 1120 n->transport_header += off; 1121 n->network_header += off; 1122 if (skb_mac_header_was_set(skb)) 1123 n->mac_header += off; 1124 #endif 1125 1126 return n; 1127 } 1128 EXPORT_SYMBOL(skb_copy_expand); 1129 1130 /** 1131 * skb_pad - zero pad the tail of an skb 1132 * @skb: buffer to pad 1133 * @pad: space to pad 1134 * 1135 * Ensure that a buffer is followed by a padding area that is zero 1136 * filled. Used by network drivers which may DMA or transfer data 1137 * beyond the buffer end onto the wire. 1138 * 1139 * May return error in out of memory cases. The skb is freed on error. 1140 */ 1141 1142 int skb_pad(struct sk_buff *skb, int pad) 1143 { 1144 int err; 1145 int ntail; 1146 1147 /* If the skbuff is non linear tailroom is always zero.. */ 1148 if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { 1149 memset(skb->data+skb->len, 0, pad); 1150 return 0; 1151 } 1152 1153 ntail = skb->data_len + pad - (skb->end - skb->tail); 1154 if (likely(skb_cloned(skb) || ntail > 0)) { 1155 err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); 1156 if (unlikely(err)) 1157 goto free_skb; 1158 } 1159 1160 /* FIXME: The use of this function with non-linear skb's really needs 1161 * to be audited. 1162 */ 1163 err = skb_linearize(skb); 1164 if (unlikely(err)) 1165 goto free_skb; 1166 1167 memset(skb->data + skb->len, 0, pad); 1168 return 0; 1169 1170 free_skb: 1171 kfree_skb(skb); 1172 return err; 1173 } 1174 EXPORT_SYMBOL(skb_pad); 1175 1176 /** 1177 * skb_put - add data to a buffer 1178 * @skb: buffer to use 1179 * @len: amount of data to add 1180 * 1181 * This function extends the used data area of the buffer. If this would 1182 * exceed the total buffer size the kernel will panic. A pointer to the 1183 * first byte of the extra data is returned. 1184 */ 1185 unsigned char *skb_put(struct sk_buff *skb, unsigned int len) 1186 { 1187 unsigned char *tmp = skb_tail_pointer(skb); 1188 SKB_LINEAR_ASSERT(skb); 1189 skb->tail += len; 1190 skb->len += len; 1191 if (unlikely(skb->tail > skb->end)) 1192 skb_over_panic(skb, len, __builtin_return_address(0)); 1193 return tmp; 1194 } 1195 EXPORT_SYMBOL(skb_put); 1196 1197 /** 1198 * skb_push - add data to the start of a buffer 1199 * @skb: buffer to use 1200 * @len: amount of data to add 1201 * 1202 * This function extends the used data area of the buffer at the buffer 1203 * start. 
If this would exceed the total buffer headroom the kernel will 1204 * panic. A pointer to the first byte of the extra data is returned. 1205 */ 1206 unsigned char *skb_push(struct sk_buff *skb, unsigned int len) 1207 { 1208 skb->data -= len; 1209 skb->len += len; 1210 if (unlikely(skb->data<skb->head)) 1211 skb_under_panic(skb, len, __builtin_return_address(0)); 1212 return skb->data; 1213 } 1214 EXPORT_SYMBOL(skb_push); 1215 1216 /** 1217 * skb_pull - remove data from the start of a buffer 1218 * @skb: buffer to use 1219 * @len: amount of data to remove 1220 * 1221 * This function removes data from the start of a buffer, returning 1222 * the memory to the headroom. A pointer to the next data in the buffer 1223 * is returned. Once the data has been pulled future pushes will overwrite 1224 * the old data. 1225 */ 1226 unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) 1227 { 1228 return skb_pull_inline(skb, len); 1229 } 1230 EXPORT_SYMBOL(skb_pull); 1231 1232 /** 1233 * skb_trim - remove end from a buffer 1234 * @skb: buffer to alter 1235 * @len: new length 1236 * 1237 * Cut the length of a buffer down by removing data from the tail. If 1238 * the buffer is already under the length specified it is not modified. 1239 * The skb must be linear. 1240 */ 1241 void skb_trim(struct sk_buff *skb, unsigned int len) 1242 { 1243 if (skb->len > len) 1244 __skb_trim(skb, len); 1245 } 1246 EXPORT_SYMBOL(skb_trim); 1247 1248 /* Trims skb to length len. It can change skb pointers. 1249 */ 1250 1251 int ___pskb_trim(struct sk_buff *skb, unsigned int len) 1252 { 1253 struct sk_buff **fragp; 1254 struct sk_buff *frag; 1255 int offset = skb_headlen(skb); 1256 int nfrags = skb_shinfo(skb)->nr_frags; 1257 int i; 1258 int err; 1259 1260 if (skb_cloned(skb) && 1261 unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) 1262 return err; 1263 1264 i = 0; 1265 if (offset >= len) 1266 goto drop_pages; 1267 1268 for (; i < nfrags; i++) { 1269 int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]); 1270 1271 if (end < len) { 1272 offset = end; 1273 continue; 1274 } 1275 1276 skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset); 1277 1278 drop_pages: 1279 skb_shinfo(skb)->nr_frags = i; 1280 1281 for (; i < nfrags; i++) 1282 skb_frag_unref(skb, i); 1283 1284 if (skb_has_frag_list(skb)) 1285 skb_drop_fraglist(skb); 1286 goto done; 1287 } 1288 1289 for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); 1290 fragp = &frag->next) { 1291 int end = offset + frag->len; 1292 1293 if (skb_shared(frag)) { 1294 struct sk_buff *nfrag; 1295 1296 nfrag = skb_clone(frag, GFP_ATOMIC); 1297 if (unlikely(!nfrag)) 1298 return -ENOMEM; 1299 1300 nfrag->next = frag->next; 1301 consume_skb(frag); 1302 frag = nfrag; 1303 *fragp = frag; 1304 } 1305 1306 if (end < len) { 1307 offset = end; 1308 continue; 1309 } 1310 1311 if (end > len && 1312 unlikely((err = pskb_trim(frag, len - offset)))) 1313 return err; 1314 1315 if (frag->next) 1316 skb_drop_list(&frag->next); 1317 break; 1318 } 1319 1320 done: 1321 if (len > skb_headlen(skb)) { 1322 skb->data_len -= skb->len - len; 1323 skb->len = len; 1324 } else { 1325 skb->len = len; 1326 skb->data_len = 0; 1327 skb_set_tail_pointer(skb, len); 1328 } 1329 1330 return 0; 1331 } 1332 EXPORT_SYMBOL(___pskb_trim); 1333 1334 /** 1335 * __pskb_pull_tail - advance tail of skb header 1336 * @skb: buffer to reallocate 1337 * @delta: number of bytes to advance tail 1338 * 1339 * The function makes a sense only on a fragmented &sk_buff, 1340 * it expands header moving its tail 
forward and copying necessary 1341 * data from fragmented part. 1342 * 1343 * &sk_buff MUST have reference count of 1. 1344 * 1345 * Returns %NULL (and &sk_buff does not change) if pull failed 1346 * or value of new tail of skb in the case of success. 1347 * 1348 * All the pointers pointing into skb header may change and must be 1349 * reloaded after call to this function. 1350 */ 1351 1352 /* Moves tail of skb head forward, copying data from fragmented part, 1353 * when it is necessary. 1354 * 1. It may fail due to malloc failure. 1355 * 2. It may change skb pointers. 1356 * 1357 * It is pretty complicated. Luckily, it is called only in exceptional cases. 1358 */ 1359 unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) 1360 { 1361 /* If skb has not enough free space at tail, get new one 1362 * plus 128 bytes for future expansions. If we have enough 1363 * room at tail, reallocate without expansion only if skb is cloned. 1364 */ 1365 int i, k, eat = (skb->tail + delta) - skb->end; 1366 1367 if (eat > 0 || skb_cloned(skb)) { 1368 if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, 1369 GFP_ATOMIC)) 1370 return NULL; 1371 } 1372 1373 if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)) 1374 BUG(); 1375 1376 /* Optimization: no fragments, no reasons to preestimate 1377 * size of pulled pages. Superb. 1378 */ 1379 if (!skb_has_frag_list(skb)) 1380 goto pull_pages; 1381 1382 /* Estimate size of pulled pages. */ 1383 eat = delta; 1384 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 1385 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 1386 1387 if (size >= eat) 1388 goto pull_pages; 1389 eat -= size; 1390 } 1391 1392 /* If we need update frag list, we are in troubles. 1393 * Certainly, it possible to add an offset to skb data, 1394 * but taking into account that pulling is expected to 1395 * be very rare operation, it is worth to fight against 1396 * further bloating skb head and crucify ourselves here instead. 1397 * Pure masohism, indeed. 8)8) 1398 */ 1399 if (eat) { 1400 struct sk_buff *list = skb_shinfo(skb)->frag_list; 1401 struct sk_buff *clone = NULL; 1402 struct sk_buff *insp = NULL; 1403 1404 do { 1405 BUG_ON(!list); 1406 1407 if (list->len <= eat) { 1408 /* Eaten as whole. */ 1409 eat -= list->len; 1410 list = list->next; 1411 insp = list; 1412 } else { 1413 /* Eaten partially. */ 1414 1415 if (skb_shared(list)) { 1416 /* Sucks! We need to fork list. :-( */ 1417 clone = skb_clone(list, GFP_ATOMIC); 1418 if (!clone) 1419 return NULL; 1420 insp = list->next; 1421 list = clone; 1422 } else { 1423 /* This may be pulled without 1424 * problems. */ 1425 insp = list; 1426 } 1427 if (!pskb_pull(list, eat)) { 1428 kfree_skb(clone); 1429 return NULL; 1430 } 1431 break; 1432 } 1433 } while (eat); 1434 1435 /* Free pulled out fragments. */ 1436 while ((list = skb_shinfo(skb)->frag_list) != insp) { 1437 skb_shinfo(skb)->frag_list = list->next; 1438 kfree_skb(list); 1439 } 1440 /* And insert new clone at head. */ 1441 if (clone) { 1442 clone->next = list; 1443 skb_shinfo(skb)->frag_list = clone; 1444 } 1445 } 1446 /* Success! Now we may commit changes to skb data. 
*/ 1447 1448 pull_pages: 1449 eat = delta; 1450 k = 0; 1451 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 1452 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 1453 1454 if (size <= eat) { 1455 skb_frag_unref(skb, i); 1456 eat -= size; 1457 } else { 1458 skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; 1459 if (eat) { 1460 skb_shinfo(skb)->frags[k].page_offset += eat; 1461 skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat); 1462 eat = 0; 1463 } 1464 k++; 1465 } 1466 } 1467 skb_shinfo(skb)->nr_frags = k; 1468 1469 skb->tail += delta; 1470 skb->data_len -= delta; 1471 1472 return skb_tail_pointer(skb); 1473 } 1474 EXPORT_SYMBOL(__pskb_pull_tail); 1475 1476 /** 1477 * skb_copy_bits - copy bits from skb to kernel buffer 1478 * @skb: source skb 1479 * @offset: offset in source 1480 * @to: destination buffer 1481 * @len: number of bytes to copy 1482 * 1483 * Copy the specified number of bytes from the source skb to the 1484 * destination buffer. 1485 * 1486 * CAUTION ! : 1487 * If its prototype is ever changed, 1488 * check arch/{*}/net/{*}.S files, 1489 * since it is called from BPF assembly code. 1490 */ 1491 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) 1492 { 1493 int start = skb_headlen(skb); 1494 struct sk_buff *frag_iter; 1495 int i, copy; 1496 1497 if (offset > (int)skb->len - len) 1498 goto fault; 1499 1500 /* Copy header. */ 1501 if ((copy = start - offset) > 0) { 1502 if (copy > len) 1503 copy = len; 1504 skb_copy_from_linear_data_offset(skb, offset, to, copy); 1505 if ((len -= copy) == 0) 1506 return 0; 1507 offset += copy; 1508 to += copy; 1509 } 1510 1511 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 1512 int end; 1513 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 1514 1515 WARN_ON(start > offset + len); 1516 1517 end = start + skb_frag_size(f); 1518 if ((copy = end - offset) > 0) { 1519 u8 *vaddr; 1520 1521 if (copy > len) 1522 copy = len; 1523 1524 vaddr = kmap_atomic(skb_frag_page(f)); 1525 memcpy(to, 1526 vaddr + f->page_offset + offset - start, 1527 copy); 1528 kunmap_atomic(vaddr); 1529 1530 if ((len -= copy) == 0) 1531 return 0; 1532 offset += copy; 1533 to += copy; 1534 } 1535 start = end; 1536 } 1537 1538 skb_walk_frags(skb, frag_iter) { 1539 int end; 1540 1541 WARN_ON(start > offset + len); 1542 1543 end = start + frag_iter->len; 1544 if ((copy = end - offset) > 0) { 1545 if (copy > len) 1546 copy = len; 1547 if (skb_copy_bits(frag_iter, offset - start, to, copy)) 1548 goto fault; 1549 if ((len -= copy) == 0) 1550 return 0; 1551 offset += copy; 1552 to += copy; 1553 } 1554 start = end; 1555 } 1556 1557 if (!len) 1558 return 0; 1559 1560 fault: 1561 return -EFAULT; 1562 } 1563 EXPORT_SYMBOL(skb_copy_bits); 1564 1565 /* 1566 * Callback from splice_to_pipe(), if we need to release some pages 1567 * at the end of the spd in case we error'ed out in filling the pipe. 
1568 */ 1569 static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) 1570 { 1571 put_page(spd->pages[i]); 1572 } 1573 1574 static struct page *linear_to_page(struct page *page, unsigned int *len, 1575 unsigned int *offset, 1576 struct sk_buff *skb, struct sock *sk) 1577 { 1578 struct page *p = sk->sk_sndmsg_page; 1579 unsigned int off; 1580 1581 if (!p) { 1582 new_page: 1583 p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0); 1584 if (!p) 1585 return NULL; 1586 1587 off = sk->sk_sndmsg_off = 0; 1588 /* hold one ref to this page until it's full */ 1589 } else { 1590 unsigned int mlen; 1591 1592 /* If we are the only user of the page, we can reset offset */ 1593 if (page_count(p) == 1) 1594 sk->sk_sndmsg_off = 0; 1595 off = sk->sk_sndmsg_off; 1596 mlen = PAGE_SIZE - off; 1597 if (mlen < 64 && mlen < *len) { 1598 put_page(p); 1599 goto new_page; 1600 } 1601 1602 *len = min_t(unsigned int, *len, mlen); 1603 } 1604 1605 memcpy(page_address(p) + off, page_address(page) + *offset, *len); 1606 sk->sk_sndmsg_off += *len; 1607 *offset = off; 1608 1609 return p; 1610 } 1611 1612 static bool spd_can_coalesce(const struct splice_pipe_desc *spd, 1613 struct page *page, 1614 unsigned int offset) 1615 { 1616 return spd->nr_pages && 1617 spd->pages[spd->nr_pages - 1] == page && 1618 (spd->partial[spd->nr_pages - 1].offset + 1619 spd->partial[spd->nr_pages - 1].len == offset); 1620 } 1621 1622 /* 1623 * Fill page/offset/length into spd, if it can hold more pages. 1624 */ 1625 static bool spd_fill_page(struct splice_pipe_desc *spd, 1626 struct pipe_inode_info *pipe, struct page *page, 1627 unsigned int *len, unsigned int offset, 1628 struct sk_buff *skb, bool linear, 1629 struct sock *sk) 1630 { 1631 if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) 1632 return true; 1633 1634 if (linear) { 1635 page = linear_to_page(page, len, &offset, skb, sk); 1636 if (!page) 1637 return true; 1638 } 1639 if (spd_can_coalesce(spd, page, offset)) { 1640 spd->partial[spd->nr_pages - 1].len += *len; 1641 return false; 1642 } 1643 get_page(page); 1644 spd->pages[spd->nr_pages] = page; 1645 spd->partial[spd->nr_pages].len = *len; 1646 spd->partial[spd->nr_pages].offset = offset; 1647 spd->nr_pages++; 1648 1649 return false; 1650 } 1651 1652 static inline void __segment_seek(struct page **page, unsigned int *poff, 1653 unsigned int *plen, unsigned int off) 1654 { 1655 unsigned long n; 1656 1657 *poff += off; 1658 n = *poff / PAGE_SIZE; 1659 if (n) 1660 *page = nth_page(*page, n); 1661 1662 *poff = *poff % PAGE_SIZE; 1663 *plen -= off; 1664 } 1665 1666 static bool __splice_segment(struct page *page, unsigned int poff, 1667 unsigned int plen, unsigned int *off, 1668 unsigned int *len, struct sk_buff *skb, 1669 struct splice_pipe_desc *spd, bool linear, 1670 struct sock *sk, 1671 struct pipe_inode_info *pipe) 1672 { 1673 if (!*len) 1674 return true; 1675 1676 /* skip this segment if already processed */ 1677 if (*off >= plen) { 1678 *off -= plen; 1679 return false; 1680 } 1681 1682 /* ignore any bits we already processed */ 1683 if (*off) { 1684 __segment_seek(&page, &poff, &plen, *off); 1685 *off = 0; 1686 } 1687 1688 do { 1689 unsigned int flen = min(*len, plen); 1690 1691 /* the linear region may spread across several pages */ 1692 flen = min_t(unsigned int, flen, PAGE_SIZE - poff); 1693 1694 if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk)) 1695 return true; 1696 1697 __segment_seek(&page, &poff, &plen, flen); 1698 *len -= flen; 1699 1700 } while (*len && plen); 1701 1702 return false; 
1703 } 1704 1705 /* 1706 * Map linear and fragment data from the skb to spd. It reports true if the 1707 * pipe is full or if we already spliced the requested length. 1708 */ 1709 static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, 1710 unsigned int *offset, unsigned int *len, 1711 struct splice_pipe_desc *spd, struct sock *sk) 1712 { 1713 int seg; 1714 1715 /* map the linear part : 1716 * If skb->head_frag is set, this 'linear' part is backed by a 1717 * fragment, and if the head is not shared with any clones then 1718 * we can avoid a copy since we own the head portion of this page. 1719 */ 1720 if (__splice_segment(virt_to_page(skb->data), 1721 (unsigned long) skb->data & (PAGE_SIZE - 1), 1722 skb_headlen(skb), 1723 offset, len, skb, spd, 1724 skb_head_is_locked(skb), 1725 sk, pipe)) 1726 return true; 1727 1728 /* 1729 * then map the fragments 1730 */ 1731 for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { 1732 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; 1733 1734 if (__splice_segment(skb_frag_page(f), 1735 f->page_offset, skb_frag_size(f), 1736 offset, len, skb, spd, false, sk, pipe)) 1737 return true; 1738 } 1739 1740 return false; 1741 } 1742 1743 /* 1744 * Map data from the skb to a pipe. Should handle both the linear part, 1745 * the fragments, and the frag list. It does NOT handle frag lists within 1746 * the frag list, if such a thing exists. We'd probably need to recurse to 1747 * handle that cleanly. 1748 */ 1749 int skb_splice_bits(struct sk_buff *skb, unsigned int offset, 1750 struct pipe_inode_info *pipe, unsigned int tlen, 1751 unsigned int flags) 1752 { 1753 struct partial_page partial[MAX_SKB_FRAGS]; 1754 struct page *pages[MAX_SKB_FRAGS]; 1755 struct splice_pipe_desc spd = { 1756 .pages = pages, 1757 .partial = partial, 1758 .nr_pages_max = MAX_SKB_FRAGS, 1759 .flags = flags, 1760 .ops = &sock_pipe_buf_ops, 1761 .spd_release = sock_spd_release, 1762 }; 1763 struct sk_buff *frag_iter; 1764 struct sock *sk = skb->sk; 1765 int ret = 0; 1766 1767 /* 1768 * __skb_splice_bits() only fails if the output has no room left, 1769 * so no point in going over the frag_list for the error case. 1770 */ 1771 if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) 1772 goto done; 1773 else if (!tlen) 1774 goto done; 1775 1776 /* 1777 * now see if we have a frag_list to map 1778 */ 1779 skb_walk_frags(skb, frag_iter) { 1780 if (!tlen) 1781 break; 1782 if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) 1783 break; 1784 } 1785 1786 done: 1787 if (spd.nr_pages) { 1788 /* 1789 * Drop the socket lock, otherwise we have reverse 1790 * locking dependencies between sk_lock and i_mutex 1791 * here as compared to sendfile(). We enter here 1792 * with the socket lock held, and splice_to_pipe() will 1793 * grab the pipe inode lock. For sendfile() emulation, 1794 * we call into ->sendpage() with the i_mutex lock held 1795 * and networking will grab the socket lock. 1796 */ 1797 release_sock(sk); 1798 ret = splice_to_pipe(pipe, &spd); 1799 lock_sock(sk); 1800 } 1801 1802 return ret; 1803 } 1804 1805 /** 1806 * skb_store_bits - store bits from kernel buffer to skb 1807 * @skb: destination buffer 1808 * @offset: offset in destination 1809 * @from: source buffer 1810 * @len: number of bytes to copy 1811 * 1812 * Copy the specified number of bytes from the source buffer to the 1813 * destination skb. This function handles all the messy bits of 1814 * traversing fragment lists and such. 
1815 */ 1816 1817 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) 1818 { 1819 int start = skb_headlen(skb); 1820 struct sk_buff *frag_iter; 1821 int i, copy; 1822 1823 if (offset > (int)skb->len - len) 1824 goto fault; 1825 1826 if ((copy = start - offset) > 0) { 1827 if (copy > len) 1828 copy = len; 1829 skb_copy_to_linear_data_offset(skb, offset, from, copy); 1830 if ((len -= copy) == 0) 1831 return 0; 1832 offset += copy; 1833 from += copy; 1834 } 1835 1836 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 1837 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 1838 int end; 1839 1840 WARN_ON(start > offset + len); 1841 1842 end = start + skb_frag_size(frag); 1843 if ((copy = end - offset) > 0) { 1844 u8 *vaddr; 1845 1846 if (copy > len) 1847 copy = len; 1848 1849 vaddr = kmap_atomic(skb_frag_page(frag)); 1850 memcpy(vaddr + frag->page_offset + offset - start, 1851 from, copy); 1852 kunmap_atomic(vaddr); 1853 1854 if ((len -= copy) == 0) 1855 return 0; 1856 offset += copy; 1857 from += copy; 1858 } 1859 start = end; 1860 } 1861 1862 skb_walk_frags(skb, frag_iter) { 1863 int end; 1864 1865 WARN_ON(start > offset + len); 1866 1867 end = start + frag_iter->len; 1868 if ((copy = end - offset) > 0) { 1869 if (copy > len) 1870 copy = len; 1871 if (skb_store_bits(frag_iter, offset - start, 1872 from, copy)) 1873 goto fault; 1874 if ((len -= copy) == 0) 1875 return 0; 1876 offset += copy; 1877 from += copy; 1878 } 1879 start = end; 1880 } 1881 if (!len) 1882 return 0; 1883 1884 fault: 1885 return -EFAULT; 1886 } 1887 EXPORT_SYMBOL(skb_store_bits); 1888 1889 /* Checksum skb data. */ 1890 1891 __wsum skb_checksum(const struct sk_buff *skb, int offset, 1892 int len, __wsum csum) 1893 { 1894 int start = skb_headlen(skb); 1895 int i, copy = start - offset; 1896 struct sk_buff *frag_iter; 1897 int pos = 0; 1898 1899 /* Checksum header. */ 1900 if (copy > 0) { 1901 if (copy > len) 1902 copy = len; 1903 csum = csum_partial(skb->data + offset, copy, csum); 1904 if ((len -= copy) == 0) 1905 return csum; 1906 offset += copy; 1907 pos = copy; 1908 } 1909 1910 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 1911 int end; 1912 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 1913 1914 WARN_ON(start > offset + len); 1915 1916 end = start + skb_frag_size(frag); 1917 if ((copy = end - offset) > 0) { 1918 __wsum csum2; 1919 u8 *vaddr; 1920 1921 if (copy > len) 1922 copy = len; 1923 vaddr = kmap_atomic(skb_frag_page(frag)); 1924 csum2 = csum_partial(vaddr + frag->page_offset + 1925 offset - start, copy, 0); 1926 kunmap_atomic(vaddr); 1927 csum = csum_block_add(csum, csum2, pos); 1928 if (!(len -= copy)) 1929 return csum; 1930 offset += copy; 1931 pos += copy; 1932 } 1933 start = end; 1934 } 1935 1936 skb_walk_frags(skb, frag_iter) { 1937 int end; 1938 1939 WARN_ON(start > offset + len); 1940 1941 end = start + frag_iter->len; 1942 if ((copy = end - offset) > 0) { 1943 __wsum csum2; 1944 if (copy > len) 1945 copy = len; 1946 csum2 = skb_checksum(frag_iter, offset - start, 1947 copy, 0); 1948 csum = csum_block_add(csum, csum2, pos); 1949 if ((len -= copy) == 0) 1950 return csum; 1951 offset += copy; 1952 pos += copy; 1953 } 1954 start = end; 1955 } 1956 BUG_ON(len); 1957 1958 return csum; 1959 } 1960 EXPORT_SYMBOL(skb_checksum); 1961 1962 /* Both of above in one bottle. 
*/ 1963 1964 __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, 1965 u8 *to, int len, __wsum csum) 1966 { 1967 int start = skb_headlen(skb); 1968 int i, copy = start - offset; 1969 struct sk_buff *frag_iter; 1970 int pos = 0; 1971 1972 /* Copy header. */ 1973 if (copy > 0) { 1974 if (copy > len) 1975 copy = len; 1976 csum = csum_partial_copy_nocheck(skb->data + offset, to, 1977 copy, csum); 1978 if ((len -= copy) == 0) 1979 return csum; 1980 offset += copy; 1981 to += copy; 1982 pos = copy; 1983 } 1984 1985 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 1986 int end; 1987 1988 WARN_ON(start > offset + len); 1989 1990 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 1991 if ((copy = end - offset) > 0) { 1992 __wsum csum2; 1993 u8 *vaddr; 1994 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 1995 1996 if (copy > len) 1997 copy = len; 1998 vaddr = kmap_atomic(skb_frag_page(frag)); 1999 csum2 = csum_partial_copy_nocheck(vaddr + 2000 frag->page_offset + 2001 offset - start, to, 2002 copy, 0); 2003 kunmap_atomic(vaddr); 2004 csum = csum_block_add(csum, csum2, pos); 2005 if (!(len -= copy)) 2006 return csum; 2007 offset += copy; 2008 to += copy; 2009 pos += copy; 2010 } 2011 start = end; 2012 } 2013 2014 skb_walk_frags(skb, frag_iter) { 2015 __wsum csum2; 2016 int end; 2017 2018 WARN_ON(start > offset + len); 2019 2020 end = start + frag_iter->len; 2021 if ((copy = end - offset) > 0) { 2022 if (copy > len) 2023 copy = len; 2024 csum2 = skb_copy_and_csum_bits(frag_iter, 2025 offset - start, 2026 to, copy, 0); 2027 csum = csum_block_add(csum, csum2, pos); 2028 if ((len -= copy) == 0) 2029 return csum; 2030 offset += copy; 2031 to += copy; 2032 pos += copy; 2033 } 2034 start = end; 2035 } 2036 BUG_ON(len); 2037 return csum; 2038 } 2039 EXPORT_SYMBOL(skb_copy_and_csum_bits); 2040 2041 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) 2042 { 2043 __wsum csum; 2044 long csstart; 2045 2046 if (skb->ip_summed == CHECKSUM_PARTIAL) 2047 csstart = skb_checksum_start_offset(skb); 2048 else 2049 csstart = skb_headlen(skb); 2050 2051 BUG_ON(csstart > skb_headlen(skb)); 2052 2053 skb_copy_from_linear_data(skb, to, csstart); 2054 2055 csum = 0; 2056 if (csstart != skb->len) 2057 csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, 2058 skb->len - csstart, 0); 2059 2060 if (skb->ip_summed == CHECKSUM_PARTIAL) { 2061 long csstuff = csstart + skb->csum_offset; 2062 2063 *((__sum16 *)(to + csstuff)) = csum_fold(csum); 2064 } 2065 } 2066 EXPORT_SYMBOL(skb_copy_and_csum_dev); 2067 2068 /** 2069 * skb_dequeue - remove from the head of the queue 2070 * @list: list to dequeue from 2071 * 2072 * Remove the head of the list. The list lock is taken so the function 2073 * may be used safely with other locking list functions. The head item is 2074 * returned or %NULL if the list is empty. 2075 */ 2076 2077 struct sk_buff *skb_dequeue(struct sk_buff_head *list) 2078 { 2079 unsigned long flags; 2080 struct sk_buff *result; 2081 2082 spin_lock_irqsave(&list->lock, flags); 2083 result = __skb_dequeue(list); 2084 spin_unlock_irqrestore(&list->lock, flags); 2085 return result; 2086 } 2087 EXPORT_SYMBOL(skb_dequeue); 2088 2089 /** 2090 * skb_dequeue_tail - remove from the tail of the queue 2091 * @list: list to dequeue from 2092 * 2093 * Remove the tail of the list. The list lock is taken so the function 2094 * may be used safely with other locking list functions. The tail item is 2095 * returned or %NULL if the list is empty. 
 */
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
{
	unsigned long flags;
	struct sk_buff *result;

	spin_lock_irqsave(&list->lock, flags);
	result = __skb_dequeue_tail(list);
	spin_unlock_irqrestore(&list->lock, flags);
	return result;
}
EXPORT_SYMBOL(skb_dequeue_tail);

/**
 *	skb_queue_purge - empty a list
 *	@list: list to empty
 *
 *	Delete all buffers on an &sk_buff list. Each buffer is removed from
 *	the list and one reference dropped. This function takes the list
 *	lock and is atomic with respect to other list locking functions.
 */
void skb_queue_purge(struct sk_buff_head *list)
{
	struct sk_buff *skb;
	while ((skb = skb_dequeue(list)) != NULL)
		kfree_skb(skb);
}
EXPORT_SYMBOL(skb_queue_purge);

/**
 *	skb_queue_head - queue a buffer at the list head
 *	@list: list to use
 *	@newsk: buffer to queue
 *
 *	Queue a buffer at the start of the list. This function takes the
 *	list lock and can be used safely with other locking &sk_buff
 *	functions.
 *
 *	A buffer cannot be placed on two lists at the same time.
 */
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
{
	unsigned long flags;

	spin_lock_irqsave(&list->lock, flags);
	__skb_queue_head(list, newsk);
	spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_queue_head);

/**
 *	skb_queue_tail - queue a buffer at the list tail
 *	@list: list to use
 *	@newsk: buffer to queue
 *
 *	Queue a buffer at the tail of the list. This function takes the
 *	list lock and can be used safely with other locking &sk_buff
 *	functions.
 *
 *	A buffer cannot be placed on two lists at the same time.
 */
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
{
	unsigned long flags;

	spin_lock_irqsave(&list->lock, flags);
	__skb_queue_tail(list, newsk);
	spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_queue_tail);

/**
 *	skb_unlink - remove a buffer from a list
 *	@skb: buffer to remove
 *	@list: list to use
 *
 *	Remove a packet from a list. The list locks are taken and this
 *	function is atomic with respect to other list locked calls.
 *
 *	You must know what list the SKB is on.
 */
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
	unsigned long flags;

	spin_lock_irqsave(&list->lock, flags);
	__skb_unlink(skb, list);
	spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_unlink);

/**
 *	skb_append - append a buffer
 *	@old: buffer to insert after
 *	@newsk: buffer to insert
 *	@list: list to use
 *
 *	Place a packet after a given packet in a list. The list locks are
 *	taken and this function is atomic with respect to other list locked
 *	calls.
 *	A buffer cannot be placed on two lists at the same time.
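 *
 *	Illustrative sketch only (prev_skb, new_skb, queue and other_queue
 *	are hypothetical names); since a buffer may live on only one list,
 *	unlink it first if it is already queued elsewhere:
 *
 *		skb_unlink(new_skb, &other_queue);
 *		skb_append(prev_skb, new_skb, &queue);
 *
 *	Callers that already hold the list lock can use __skb_queue_after()
 *	directly.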
2196 */ 2197 void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) 2198 { 2199 unsigned long flags; 2200 2201 spin_lock_irqsave(&list->lock, flags); 2202 __skb_queue_after(list, old, newsk); 2203 spin_unlock_irqrestore(&list->lock, flags); 2204 } 2205 EXPORT_SYMBOL(skb_append); 2206 2207 /** 2208 * skb_insert - insert a buffer 2209 * @old: buffer to insert before 2210 * @newsk: buffer to insert 2211 * @list: list to use 2212 * 2213 * Place a packet before a given packet in a list. The list locks are 2214 * taken and this function is atomic with respect to other list locked 2215 * calls. 2216 * 2217 * A buffer cannot be placed on two lists at the same time. 2218 */ 2219 void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) 2220 { 2221 unsigned long flags; 2222 2223 spin_lock_irqsave(&list->lock, flags); 2224 __skb_insert(newsk, old->prev, old, list); 2225 spin_unlock_irqrestore(&list->lock, flags); 2226 } 2227 EXPORT_SYMBOL(skb_insert); 2228 2229 static inline void skb_split_inside_header(struct sk_buff *skb, 2230 struct sk_buff* skb1, 2231 const u32 len, const int pos) 2232 { 2233 int i; 2234 2235 skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len), 2236 pos - len); 2237 /* And move data appendix as is. */ 2238 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 2239 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; 2240 2241 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; 2242 skb_shinfo(skb)->nr_frags = 0; 2243 skb1->data_len = skb->data_len; 2244 skb1->len += skb1->data_len; 2245 skb->data_len = 0; 2246 skb->len = len; 2247 skb_set_tail_pointer(skb, len); 2248 } 2249 2250 static inline void skb_split_no_header(struct sk_buff *skb, 2251 struct sk_buff* skb1, 2252 const u32 len, int pos) 2253 { 2254 int i, k = 0; 2255 const int nfrags = skb_shinfo(skb)->nr_frags; 2256 2257 skb_shinfo(skb)->nr_frags = 0; 2258 skb1->len = skb1->data_len = skb->len - len; 2259 skb->len = len; 2260 skb->data_len = len - pos; 2261 2262 for (i = 0; i < nfrags; i++) { 2263 int size = skb_frag_size(&skb_shinfo(skb)->frags[i]); 2264 2265 if (pos + size > len) { 2266 skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; 2267 2268 if (pos < len) { 2269 /* Split frag. 2270 * We have two variants in this case: 2271 * 1. Move all the frag to the second 2272 * part, if it is possible. F.e. 2273 * this approach is mandatory for TUX, 2274 * where splitting is expensive. 2275 * 2. Split is accurately. We make this. 2276 */ 2277 skb_frag_ref(skb, i); 2278 skb_shinfo(skb1)->frags[0].page_offset += len - pos; 2279 skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos); 2280 skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos); 2281 skb_shinfo(skb)->nr_frags++; 2282 } 2283 k++; 2284 } else 2285 skb_shinfo(skb)->nr_frags++; 2286 pos += size; 2287 } 2288 skb_shinfo(skb1)->nr_frags = k; 2289 } 2290 2291 /** 2292 * skb_split - Split fragmented skb to two parts at length len. 2293 * @skb: the buffer to split 2294 * @skb1: the buffer to receive the second part 2295 * @len: new length for skb 2296 */ 2297 void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) 2298 { 2299 int pos = skb_headlen(skb); 2300 2301 if (len < pos) /* Split line is inside header. */ 2302 skb_split_inside_header(skb, skb1, len, pos); 2303 else /* Second chunk has no header, nothing to copy. */ 2304 skb_split_no_header(skb, skb1, len, pos); 2305 } 2306 EXPORT_SYMBOL(skb_split); 2307 2308 /* Shifting from/to a cloned skb is a no-go. 
 *
 * Caller cannot keep skb_shinfo related pointers past calling here!
 */
static int skb_prepare_for_shift(struct sk_buff *skb)
{
	return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
}

/**
 * skb_shift - Shifts paged data partially from skb to another
 * @tgt: buffer into which tail data gets added
 * @skb: buffer from which the paged data comes from
 * @shiftlen: shift up to this many bytes
 *
 * Attempts to shift up to shiftlen worth of bytes, which may be less than
 * the length of the skb, from skb to tgt. Returns the number of bytes
 * shifted. It's up to the caller to free skb if everything was shifted.
 *
 * If @tgt runs out of frags, the whole operation is aborted.
 *
 * The skb may not contain anything other than paged data, while tgt is
 * allowed to have non-paged data as well.
 *
 * TODO: full sized shift could be optimized but that would need
 * specialized skb free'er to handle frags without up-to-date nr_frags.
 */
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
	int from, to, merge, todo;
	struct skb_frag_struct *fragfrom, *fragto;

	BUG_ON(shiftlen > skb->len);
	BUG_ON(skb_headlen(skb));	/* Would corrupt stream */

	todo = shiftlen;
	from = 0;
	to = skb_shinfo(tgt)->nr_frags;
	fragfrom = &skb_shinfo(skb)->frags[from];

	/* Actual merge is delayed until the point when we know we can
	 * commit all, so that we don't have to undo partial changes
	 */
	if (!to ||
	    !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
			      fragfrom->page_offset)) {
		merge = -1;
	} else {
		merge = to - 1;

		todo -= skb_frag_size(fragfrom);
		if (todo < 0) {
			if (skb_prepare_for_shift(skb) ||
			    skb_prepare_for_shift(tgt))
				return 0;

			/* All previous frag pointers might be stale!
*/ 2365 fragfrom = &skb_shinfo(skb)->frags[from]; 2366 fragto = &skb_shinfo(tgt)->frags[merge]; 2367 2368 skb_frag_size_add(fragto, shiftlen); 2369 skb_frag_size_sub(fragfrom, shiftlen); 2370 fragfrom->page_offset += shiftlen; 2371 2372 goto onlymerged; 2373 } 2374 2375 from++; 2376 } 2377 2378 /* Skip full, not-fitting skb to avoid expensive operations */ 2379 if ((shiftlen == skb->len) && 2380 (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) 2381 return 0; 2382 2383 if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt)) 2384 return 0; 2385 2386 while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { 2387 if (to == MAX_SKB_FRAGS) 2388 return 0; 2389 2390 fragfrom = &skb_shinfo(skb)->frags[from]; 2391 fragto = &skb_shinfo(tgt)->frags[to]; 2392 2393 if (todo >= skb_frag_size(fragfrom)) { 2394 *fragto = *fragfrom; 2395 todo -= skb_frag_size(fragfrom); 2396 from++; 2397 to++; 2398 2399 } else { 2400 __skb_frag_ref(fragfrom); 2401 fragto->page = fragfrom->page; 2402 fragto->page_offset = fragfrom->page_offset; 2403 skb_frag_size_set(fragto, todo); 2404 2405 fragfrom->page_offset += todo; 2406 skb_frag_size_sub(fragfrom, todo); 2407 todo = 0; 2408 2409 to++; 2410 break; 2411 } 2412 } 2413 2414 /* Ready to "commit" this state change to tgt */ 2415 skb_shinfo(tgt)->nr_frags = to; 2416 2417 if (merge >= 0) { 2418 fragfrom = &skb_shinfo(skb)->frags[0]; 2419 fragto = &skb_shinfo(tgt)->frags[merge]; 2420 2421 skb_frag_size_add(fragto, skb_frag_size(fragfrom)); 2422 __skb_frag_unref(fragfrom); 2423 } 2424 2425 /* Reposition in the original skb */ 2426 to = 0; 2427 while (from < skb_shinfo(skb)->nr_frags) 2428 skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; 2429 skb_shinfo(skb)->nr_frags = to; 2430 2431 BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); 2432 2433 onlymerged: 2434 /* Most likely the tgt won't ever need its checksum anymore, skb on 2435 * the other hand might need it if it needs to be resent 2436 */ 2437 tgt->ip_summed = CHECKSUM_PARTIAL; 2438 skb->ip_summed = CHECKSUM_PARTIAL; 2439 2440 /* Yak, is it really working this way? Some helper please? */ 2441 skb->len -= shiftlen; 2442 skb->data_len -= shiftlen; 2443 skb->truesize -= shiftlen; 2444 tgt->len += shiftlen; 2445 tgt->data_len += shiftlen; 2446 tgt->truesize += shiftlen; 2447 2448 return shiftlen; 2449 } 2450 2451 /** 2452 * skb_prepare_seq_read - Prepare a sequential read of skb data 2453 * @skb: the buffer to read 2454 * @from: lower offset of data to be read 2455 * @to: upper offset of data to be read 2456 * @st: state variable 2457 * 2458 * Initializes the specified state variable. Must be called before 2459 * invoking skb_seq_read() for the first time. 2460 */ 2461 void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, 2462 unsigned int to, struct skb_seq_state *st) 2463 { 2464 st->lower_offset = from; 2465 st->upper_offset = to; 2466 st->root_skb = st->cur_skb = skb; 2467 st->frag_idx = st->stepped_offset = 0; 2468 st->frag_data = NULL; 2469 } 2470 EXPORT_SYMBOL(skb_prepare_seq_read); 2471 2472 /** 2473 * skb_seq_read - Sequentially read skb data 2474 * @consumed: number of bytes consumed by the caller so far 2475 * @data: destination pointer for data to be returned 2476 * @st: state variable 2477 * 2478 * Reads a block of skb data at &consumed relative to the 2479 * lower offset specified to skb_prepare_seq_read(). Assigns 2480 * the head of the data block to &data and returns the length 2481 * of the block or 0 if the end of the skb data or the upper 2482 * offset has been reached. 
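 *
 * A typical zero-copy read loop (illustrative sketch, error handling
 * omitted; skb and its length are assumed valid here) is:
 *
 *	struct skb_seq_state st;
 *	unsigned int consumed = 0, len;
 *	const u8 *data;
 *
 *	skb_prepare_seq_read(skb, 0, skb->len, &st);
 *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
 *		(inspect len bytes starting at data)
 *		consumed += len;
 *	}
 *
 * If the loop is left before skb_seq_read() returns 0, the caller must
 * invoke skb_abort_seq_read() to release any mapped fragment.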
 *
 * The caller is not required to consume all of the data
 * returned, i.e. &consumed is typically set to the number
 * of bytes already consumed and the next call to
 * skb_seq_read() will return the remaining part of the block.
 *
 * Note 1: The size of each block of data returned can be arbitrary,
 *         this limitation is the cost for zerocopy sequential
 *         reads of potentially non-linear data.
 *
 * Note 2: Fragment lists within fragments are not implemented
 *         at the moment, state->root_skb could be replaced with
 *         a stack for this purpose.
 */
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
			  struct skb_seq_state *st)
{
	unsigned int block_limit, abs_offset = consumed + st->lower_offset;
	skb_frag_t *frag;

	if (unlikely(abs_offset >= st->upper_offset))
		return 0;

next_skb:
	block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;

	if (abs_offset < block_limit && !st->frag_data) {
		*data = st->cur_skb->data + (abs_offset - st->stepped_offset);
		return block_limit - abs_offset;
	}

	if (st->frag_idx == 0 && !st->frag_data)
		st->stepped_offset += skb_headlen(st->cur_skb);

	while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
		frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
		block_limit = skb_frag_size(frag) + st->stepped_offset;

		if (abs_offset < block_limit) {
			if (!st->frag_data)
				st->frag_data = kmap_atomic(skb_frag_page(frag));

			*data = (u8 *) st->frag_data + frag->page_offset +
				(abs_offset - st->stepped_offset);

			return block_limit - abs_offset;
		}

		if (st->frag_data) {
			kunmap_atomic(st->frag_data);
			st->frag_data = NULL;
		}

		st->frag_idx++;
		st->stepped_offset += skb_frag_size(frag);
	}

	if (st->frag_data) {
		kunmap_atomic(st->frag_data);
		st->frag_data = NULL;
	}

	if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
		st->frag_idx = 0;
		goto next_skb;
	} else if (st->cur_skb->next) {
		st->cur_skb = st->cur_skb->next;
		st->frag_idx = 0;
		goto next_skb;
	}

	return 0;
}
EXPORT_SYMBOL(skb_seq_read);

/**
 * skb_abort_seq_read - Abort a sequential read of skb data
 * @st: state variable
 *
 * Must be called if the sequential read was abandoned before
 * skb_seq_read() returned 0.
 */
void skb_abort_seq_read(struct skb_seq_state *st)
{
	if (st->frag_data)
		kunmap_atomic(st->frag_data);
}
EXPORT_SYMBOL(skb_abort_seq_read);

#define TS_SKB_CB(state)	((struct skb_seq_state *) &((state)->cb))

static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
					  struct ts_config *conf,
					  struct ts_state *state)
{
	return skb_seq_read(offset, text, TS_SKB_CB(state));
}

static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
{
	skb_abort_seq_read(TS_SKB_CB(state));
}

/**
 * skb_find_text - Find a text pattern in skb data
 * @skb: the buffer to look in
 * @from: search offset
 * @to: search limit
 * @config: textsearch configuration
 * @state: uninitialized textsearch state variable
 *
 * Finds a pattern in the skb data according to the specified
 * textsearch configuration.
 * Use textsearch_next() to retrieve subsequent occurrences of the
 * pattern. Returns the offset to the first occurrence or
 * UINT_MAX if no match was found.
 */
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
			   unsigned int to, struct ts_config *config,
			   struct ts_state *state)
{
	unsigned int ret;

	config->get_next_block = skb_ts_get_next_block;
	config->finish = skb_ts_finish;

	skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));

	ret = textsearch_find(config, state);
	return (ret <= to - from ? ret : UINT_MAX);
}
EXPORT_SYMBOL(skb_find_text);

/**
 * skb_append_datato_frags - append the user data to a skb
 * @sk: sock structure
 * @skb: skb structure to which the user data is appended
 * @getfrag: callback function used for getting the user data
 * @from: pointer to user message iov
 * @length: length of the iov message
 *
 * Description: This procedure appends the user data to the fragment part
 * of the skb. If any page allocation fails, -ENOMEM is returned.
 */
int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
			    int (*getfrag)(void *from, char *to, int offset,
					   int len, int odd, struct sk_buff *skb),
			    void *from, int length)
{
	int frg_cnt = 0;
	skb_frag_t *frag = NULL;
	struct page *page = NULL;
	int copy, left;
	int offset = 0;
	int ret;

	do {
		/* Return error if we don't have space for new frag */
		frg_cnt = skb_shinfo(skb)->nr_frags;
		if (frg_cnt >= MAX_SKB_FRAGS)
			return -EFAULT;

		/* allocate a new page for next frag */
		page = alloc_pages(sk->sk_allocation, 0);

		/* If alloc_page fails just return failure and caller will
		 * free previous allocated pages by doing kfree_skb()
		 */
		if (page == NULL)
			return -ENOMEM;

		/* initialize the next frag */
		skb_fill_page_desc(skb, frg_cnt, page, 0, 0);
		skb->truesize += PAGE_SIZE;
		atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);

		/* get the new initialized frag */
		frg_cnt = skb_shinfo(skb)->nr_frags;
		frag = &skb_shinfo(skb)->frags[frg_cnt - 1];

		/* copy the user data to page */
		left = PAGE_SIZE - frag->page_offset;
		copy = (length > left) ? left : length;

		ret = getfrag(from, skb_frag_address(frag) + skb_frag_size(frag),
			      offset, copy, 0, skb);
		if (ret < 0)
			return -EFAULT;

		/* copy was successful so update the size parameters */
		skb_frag_size_add(frag, copy);
		skb->len += copy;
		skb->data_len += copy;
		offset += copy;
		length -= copy;

	} while (length > 0);

	return 0;
}
EXPORT_SYMBOL(skb_append_datato_frags);

/**
 * skb_pull_rcsum - pull skb and update receive checksum
 * @skb: buffer to update
 * @len: length of data pulled
 *
 * This function performs an skb_pull on the packet and updates
 * the CHECKSUM_COMPLETE checksum. It should be used on
 * receive path processing instead of skb_pull unless you know
 * that the checksum difference is zero (e.g., a valid IP header)
 * or you are setting ip_summed to CHECKSUM_NONE.
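 *
 * Illustrative sketch (hdr_len is a hypothetical header length pulled by
 * a tunnel- or VLAN-style receive handler):
 *
 *	if (!pskb_may_pull(skb, hdr_len))
 *		goto drop;
 *	skb_pull_rcsum(skb, hdr_len);
 *
 * Unlike a bare skb_pull(), this keeps skb->csum valid when ip_summed
 * is CHECKSUM_COMPLETE.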
2695 */ 2696 unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) 2697 { 2698 BUG_ON(len > skb->len); 2699 skb->len -= len; 2700 BUG_ON(skb->len < skb->data_len); 2701 skb_postpull_rcsum(skb, skb->data, len); 2702 return skb->data += len; 2703 } 2704 EXPORT_SYMBOL_GPL(skb_pull_rcsum); 2705 2706 /** 2707 * skb_segment - Perform protocol segmentation on skb. 2708 * @skb: buffer to segment 2709 * @features: features for the output path (see dev->features) 2710 * 2711 * This function performs segmentation on the given skb. It returns 2712 * a pointer to the first in a list of new skbs for the segments. 2713 * In case of error it returns ERR_PTR(err). 2714 */ 2715 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) 2716 { 2717 struct sk_buff *segs = NULL; 2718 struct sk_buff *tail = NULL; 2719 struct sk_buff *fskb = skb_shinfo(skb)->frag_list; 2720 unsigned int mss = skb_shinfo(skb)->gso_size; 2721 unsigned int doffset = skb->data - skb_mac_header(skb); 2722 unsigned int offset = doffset; 2723 unsigned int headroom; 2724 unsigned int len; 2725 int sg = !!(features & NETIF_F_SG); 2726 int nfrags = skb_shinfo(skb)->nr_frags; 2727 int err = -ENOMEM; 2728 int i = 0; 2729 int pos; 2730 2731 __skb_push(skb, doffset); 2732 headroom = skb_headroom(skb); 2733 pos = skb_headlen(skb); 2734 2735 do { 2736 struct sk_buff *nskb; 2737 skb_frag_t *frag; 2738 int hsize; 2739 int size; 2740 2741 len = skb->len - offset; 2742 if (len > mss) 2743 len = mss; 2744 2745 hsize = skb_headlen(skb) - offset; 2746 if (hsize < 0) 2747 hsize = 0; 2748 if (hsize > len || !sg) 2749 hsize = len; 2750 2751 if (!hsize && i >= nfrags) { 2752 BUG_ON(fskb->len != len); 2753 2754 pos += len; 2755 nskb = skb_clone(fskb, GFP_ATOMIC); 2756 fskb = fskb->next; 2757 2758 if (unlikely(!nskb)) 2759 goto err; 2760 2761 hsize = skb_end_offset(nskb); 2762 if (skb_cow_head(nskb, doffset + headroom)) { 2763 kfree_skb(nskb); 2764 goto err; 2765 } 2766 2767 nskb->truesize += skb_end_offset(nskb) - hsize; 2768 skb_release_head_state(nskb); 2769 __skb_push(nskb, doffset); 2770 } else { 2771 nskb = alloc_skb(hsize + doffset + headroom, 2772 GFP_ATOMIC); 2773 2774 if (unlikely(!nskb)) 2775 goto err; 2776 2777 skb_reserve(nskb, headroom); 2778 __skb_put(nskb, doffset); 2779 } 2780 2781 if (segs) 2782 tail->next = nskb; 2783 else 2784 segs = nskb; 2785 tail = nskb; 2786 2787 __copy_skb_header(nskb, skb); 2788 nskb->mac_len = skb->mac_len; 2789 2790 /* nskb and skb might have different headroom */ 2791 if (nskb->ip_summed == CHECKSUM_PARTIAL) 2792 nskb->csum_start += skb_headroom(nskb) - headroom; 2793 2794 skb_reset_mac_header(nskb); 2795 skb_set_network_header(nskb, skb->mac_len); 2796 nskb->transport_header = (nskb->network_header + 2797 skb_network_header_len(skb)); 2798 skb_copy_from_linear_data(skb, nskb->data, doffset); 2799 2800 if (fskb != skb_shinfo(skb)->frag_list) 2801 continue; 2802 2803 if (!sg) { 2804 nskb->ip_summed = CHECKSUM_NONE; 2805 nskb->csum = skb_copy_and_csum_bits(skb, offset, 2806 skb_put(nskb, len), 2807 len, 0); 2808 continue; 2809 } 2810 2811 frag = skb_shinfo(nskb)->frags; 2812 2813 skb_copy_from_linear_data_offset(skb, offset, 2814 skb_put(nskb, hsize), hsize); 2815 2816 while (pos < offset + len && i < nfrags) { 2817 *frag = skb_shinfo(skb)->frags[i]; 2818 __skb_frag_ref(frag); 2819 size = skb_frag_size(frag); 2820 2821 if (pos < offset) { 2822 frag->page_offset += offset - pos; 2823 skb_frag_size_sub(frag, offset - pos); 2824 } 2825 2826 skb_shinfo(nskb)->nr_frags++; 2827 2828 if 
(pos + size <= offset + len) { 2829 i++; 2830 pos += size; 2831 } else { 2832 skb_frag_size_sub(frag, pos + size - (offset + len)); 2833 goto skip_fraglist; 2834 } 2835 2836 frag++; 2837 } 2838 2839 if (pos < offset + len) { 2840 struct sk_buff *fskb2 = fskb; 2841 2842 BUG_ON(pos + fskb->len != offset + len); 2843 2844 pos += fskb->len; 2845 fskb = fskb->next; 2846 2847 if (fskb2->next) { 2848 fskb2 = skb_clone(fskb2, GFP_ATOMIC); 2849 if (!fskb2) 2850 goto err; 2851 } else 2852 skb_get(fskb2); 2853 2854 SKB_FRAG_ASSERT(nskb); 2855 skb_shinfo(nskb)->frag_list = fskb2; 2856 } 2857 2858 skip_fraglist: 2859 nskb->data_len = len - hsize; 2860 nskb->len += nskb->data_len; 2861 nskb->truesize += nskb->data_len; 2862 } while ((offset += len) < skb->len); 2863 2864 return segs; 2865 2866 err: 2867 while ((skb = segs)) { 2868 segs = skb->next; 2869 kfree_skb(skb); 2870 } 2871 return ERR_PTR(err); 2872 } 2873 EXPORT_SYMBOL_GPL(skb_segment); 2874 2875 int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) 2876 { 2877 struct sk_buff *p = *head; 2878 struct sk_buff *nskb; 2879 struct skb_shared_info *skbinfo = skb_shinfo(skb); 2880 struct skb_shared_info *pinfo = skb_shinfo(p); 2881 unsigned int headroom; 2882 unsigned int len = skb_gro_len(skb); 2883 unsigned int offset = skb_gro_offset(skb); 2884 unsigned int headlen = skb_headlen(skb); 2885 unsigned int delta_truesize; 2886 2887 if (p->len + len >= 65536) 2888 return -E2BIG; 2889 2890 if (pinfo->frag_list) 2891 goto merge; 2892 else if (headlen <= offset) { 2893 skb_frag_t *frag; 2894 skb_frag_t *frag2; 2895 int i = skbinfo->nr_frags; 2896 int nr_frags = pinfo->nr_frags + i; 2897 2898 offset -= headlen; 2899 2900 if (nr_frags > MAX_SKB_FRAGS) 2901 return -E2BIG; 2902 2903 pinfo->nr_frags = nr_frags; 2904 skbinfo->nr_frags = 0; 2905 2906 frag = pinfo->frags + nr_frags; 2907 frag2 = skbinfo->frags + i; 2908 do { 2909 *--frag = *--frag2; 2910 } while (--i); 2911 2912 frag->page_offset += offset; 2913 skb_frag_size_sub(frag, offset); 2914 2915 /* all fragments truesize : remove (head size + sk_buff) */ 2916 delta_truesize = skb->truesize - 2917 SKB_TRUESIZE(skb_end_offset(skb)); 2918 2919 skb->truesize -= skb->data_len; 2920 skb->len -= skb->data_len; 2921 skb->data_len = 0; 2922 2923 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; 2924 goto done; 2925 } else if (skb->head_frag) { 2926 int nr_frags = pinfo->nr_frags; 2927 skb_frag_t *frag = pinfo->frags + nr_frags; 2928 struct page *page = virt_to_head_page(skb->head); 2929 unsigned int first_size = headlen - offset; 2930 unsigned int first_offset; 2931 2932 if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) 2933 return -E2BIG; 2934 2935 first_offset = skb->data - 2936 (unsigned char *)page_address(page) + 2937 offset; 2938 2939 pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; 2940 2941 frag->page.p = page; 2942 frag->page_offset = first_offset; 2943 skb_frag_size_set(frag, first_size); 2944 2945 memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); 2946 /* We dont need to clear skbinfo->nr_frags here */ 2947 2948 delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 2949 NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; 2950 goto done; 2951 } else if (skb_gro_len(p) != pinfo->gso_size) 2952 return -E2BIG; 2953 2954 headroom = skb_headroom(p); 2955 nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); 2956 if (unlikely(!nskb)) 2957 return -ENOMEM; 2958 2959 __copy_skb_header(nskb, p); 2960 nskb->mac_len = p->mac_len; 2961 2962 skb_reserve(nskb, headroom); 
2963 __skb_put(nskb, skb_gro_offset(p)); 2964 2965 skb_set_mac_header(nskb, skb_mac_header(p) - p->data); 2966 skb_set_network_header(nskb, skb_network_offset(p)); 2967 skb_set_transport_header(nskb, skb_transport_offset(p)); 2968 2969 __skb_pull(p, skb_gro_offset(p)); 2970 memcpy(skb_mac_header(nskb), skb_mac_header(p), 2971 p->data - skb_mac_header(p)); 2972 2973 *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); 2974 skb_shinfo(nskb)->frag_list = p; 2975 skb_shinfo(nskb)->gso_size = pinfo->gso_size; 2976 pinfo->gso_size = 0; 2977 skb_header_release(p); 2978 nskb->prev = p; 2979 2980 nskb->data_len += p->len; 2981 nskb->truesize += p->truesize; 2982 nskb->len += p->len; 2983 2984 *head = nskb; 2985 nskb->next = p->next; 2986 p->next = NULL; 2987 2988 p = nskb; 2989 2990 merge: 2991 delta_truesize = skb->truesize; 2992 if (offset > headlen) { 2993 unsigned int eat = offset - headlen; 2994 2995 skbinfo->frags[0].page_offset += eat; 2996 skb_frag_size_sub(&skbinfo->frags[0], eat); 2997 skb->data_len -= eat; 2998 skb->len -= eat; 2999 offset = headlen; 3000 } 3001 3002 __skb_pull(skb, offset); 3003 3004 p->prev->next = skb; 3005 p->prev = skb; 3006 skb_header_release(skb); 3007 3008 done: 3009 NAPI_GRO_CB(p)->count++; 3010 p->data_len += len; 3011 p->truesize += delta_truesize; 3012 p->len += len; 3013 3014 NAPI_GRO_CB(skb)->same_flow = 1; 3015 return 0; 3016 } 3017 EXPORT_SYMBOL_GPL(skb_gro_receive); 3018 3019 void __init skb_init(void) 3020 { 3021 skbuff_head_cache = kmem_cache_create("skbuff_head_cache", 3022 sizeof(struct sk_buff), 3023 0, 3024 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 3025 NULL); 3026 skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", 3027 (2*sizeof(struct sk_buff)) + 3028 sizeof(atomic_t), 3029 0, 3030 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 3031 NULL); 3032 } 3033 3034 /** 3035 * skb_to_sgvec - Fill a scatter-gather list from a socket buffer 3036 * @skb: Socket buffer containing the buffers to be mapped 3037 * @sg: The scatter-gather list to map into 3038 * @offset: The offset into the buffer's contents to start mapping 3039 * @len: Length of buffer space to be mapped 3040 * 3041 * Fill the specified scatter-gather list with mappings/pointers into a 3042 * region of the buffer space attached to a socket buffer. 
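 *
 * A minimal usage sketch (illustrative only; assumes the scatterlist has
 * been sized for the worst case, e.g. with the help of skb_cow_data()
 * below):
 *
 *	struct scatterlist sg[MAX_SKB_FRAGS + 1];
 *	int nents;
 *
 *	sg_init_table(sg, ARRAY_SIZE(sg));
 *	nents = skb_to_sgvec(skb, sg, 0, skb->len);
 *
 * The entry that ends the mapping is already terminated with
 * sg_mark_end(), so the result can be handed to helpers that expect a
 * well-formed scatterlist.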
3043 */ 3044 static int 3045 __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) 3046 { 3047 int start = skb_headlen(skb); 3048 int i, copy = start - offset; 3049 struct sk_buff *frag_iter; 3050 int elt = 0; 3051 3052 if (copy > 0) { 3053 if (copy > len) 3054 copy = len; 3055 sg_set_buf(sg, skb->data + offset, copy); 3056 elt++; 3057 if ((len -= copy) == 0) 3058 return elt; 3059 offset += copy; 3060 } 3061 3062 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 3063 int end; 3064 3065 WARN_ON(start > offset + len); 3066 3067 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 3068 if ((copy = end - offset) > 0) { 3069 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 3070 3071 if (copy > len) 3072 copy = len; 3073 sg_set_page(&sg[elt], skb_frag_page(frag), copy, 3074 frag->page_offset+offset-start); 3075 elt++; 3076 if (!(len -= copy)) 3077 return elt; 3078 offset += copy; 3079 } 3080 start = end; 3081 } 3082 3083 skb_walk_frags(skb, frag_iter) { 3084 int end; 3085 3086 WARN_ON(start > offset + len); 3087 3088 end = start + frag_iter->len; 3089 if ((copy = end - offset) > 0) { 3090 if (copy > len) 3091 copy = len; 3092 elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start, 3093 copy); 3094 if ((len -= copy) == 0) 3095 return elt; 3096 offset += copy; 3097 } 3098 start = end; 3099 } 3100 BUG_ON(len); 3101 return elt; 3102 } 3103 3104 int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) 3105 { 3106 int nsg = __skb_to_sgvec(skb, sg, offset, len); 3107 3108 sg_mark_end(&sg[nsg - 1]); 3109 3110 return nsg; 3111 } 3112 EXPORT_SYMBOL_GPL(skb_to_sgvec); 3113 3114 /** 3115 * skb_cow_data - Check that a socket buffer's data buffers are writable 3116 * @skb: The socket buffer to check. 3117 * @tailbits: Amount of trailing space to be added 3118 * @trailer: Returned pointer to the skb where the @tailbits space begins 3119 * 3120 * Make sure that the data buffers attached to a socket buffer are 3121 * writable. If they are not, private copies are made of the data buffers 3122 * and the socket buffer is set to use these instead. 3123 * 3124 * If @tailbits is given, make sure that there is space to write @tailbits 3125 * bytes of data beyond current end of socket buffer. @trailer will be 3126 * set to point to the skb in which this space begins. 3127 * 3128 * The number of scatterlist elements required to completely map the 3129 * COW'd and extended socket buffer will be returned. 3130 */ 3131 int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) 3132 { 3133 int copyflag; 3134 int elt; 3135 struct sk_buff *skb1, **skb_p; 3136 3137 /* If skb is cloned or its head is paged, reallocate 3138 * head pulling out all the pages (pages are considered not writable 3139 * at the moment even if they are anonymous). 3140 */ 3141 if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && 3142 __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) 3143 return -ENOMEM; 3144 3145 /* Easy case. Most of packets will go this way. */ 3146 if (!skb_has_frag_list(skb)) { 3147 /* A little of trouble, not enough of space for trailer. 3148 * This should not happen, when stack is tuned to generate 3149 * good frames. OK, on miss we reallocate and reserve even more 3150 * space, 128 bytes is fair. */ 3151 3152 if (skb_tailroom(skb) < tailbits && 3153 pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) 3154 return -ENOMEM; 3155 3156 /* Voila! */ 3157 *trailer = skb; 3158 return 1; 3159 } 3160 3161 /* Misery. 
We are in troubles, going to mincer fragments... */ 3162 3163 elt = 1; 3164 skb_p = &skb_shinfo(skb)->frag_list; 3165 copyflag = 0; 3166 3167 while ((skb1 = *skb_p) != NULL) { 3168 int ntail = 0; 3169 3170 /* The fragment is partially pulled by someone, 3171 * this can happen on input. Copy it and everything 3172 * after it. */ 3173 3174 if (skb_shared(skb1)) 3175 copyflag = 1; 3176 3177 /* If the skb is the last, worry about trailer. */ 3178 3179 if (skb1->next == NULL && tailbits) { 3180 if (skb_shinfo(skb1)->nr_frags || 3181 skb_has_frag_list(skb1) || 3182 skb_tailroom(skb1) < tailbits) 3183 ntail = tailbits + 128; 3184 } 3185 3186 if (copyflag || 3187 skb_cloned(skb1) || 3188 ntail || 3189 skb_shinfo(skb1)->nr_frags || 3190 skb_has_frag_list(skb1)) { 3191 struct sk_buff *skb2; 3192 3193 /* Fuck, we are miserable poor guys... */ 3194 if (ntail == 0) 3195 skb2 = skb_copy(skb1, GFP_ATOMIC); 3196 else 3197 skb2 = skb_copy_expand(skb1, 3198 skb_headroom(skb1), 3199 ntail, 3200 GFP_ATOMIC); 3201 if (unlikely(skb2 == NULL)) 3202 return -ENOMEM; 3203 3204 if (skb1->sk) 3205 skb_set_owner_w(skb2, skb1->sk); 3206 3207 /* Looking around. Are we still alive? 3208 * OK, link new skb, drop old one */ 3209 3210 skb2->next = skb1->next; 3211 *skb_p = skb2; 3212 kfree_skb(skb1); 3213 skb1 = skb2; 3214 } 3215 elt++; 3216 *trailer = skb1; 3217 skb_p = &skb1->next; 3218 } 3219 3220 return elt; 3221 } 3222 EXPORT_SYMBOL_GPL(skb_cow_data); 3223 3224 static void sock_rmem_free(struct sk_buff *skb) 3225 { 3226 struct sock *sk = skb->sk; 3227 3228 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 3229 } 3230 3231 /* 3232 * Note: We dont mem charge error packets (no sk_forward_alloc changes) 3233 */ 3234 int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) 3235 { 3236 int len = skb->len; 3237 3238 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= 3239 (unsigned int)sk->sk_rcvbuf) 3240 return -ENOMEM; 3241 3242 skb_orphan(skb); 3243 skb->sk = sk; 3244 skb->destructor = sock_rmem_free; 3245 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 3246 3247 /* before exiting rcu section, make sure dst is refcounted */ 3248 skb_dst_force(skb); 3249 3250 skb_queue_tail(&sk->sk_error_queue, skb); 3251 if (!sock_flag(sk, SOCK_DEAD)) 3252 sk->sk_data_ready(sk, len); 3253 return 0; 3254 } 3255 EXPORT_SYMBOL(sock_queue_err_skb); 3256 3257 void skb_tstamp_tx(struct sk_buff *orig_skb, 3258 struct skb_shared_hwtstamps *hwtstamps) 3259 { 3260 struct sock *sk = orig_skb->sk; 3261 struct sock_exterr_skb *serr; 3262 struct sk_buff *skb; 3263 int err; 3264 3265 if (!sk) 3266 return; 3267 3268 skb = skb_clone(orig_skb, GFP_ATOMIC); 3269 if (!skb) 3270 return; 3271 3272 if (hwtstamps) { 3273 *skb_hwtstamps(skb) = 3274 *hwtstamps; 3275 } else { 3276 /* 3277 * no hardware time stamps available, 3278 * so keep the shared tx_flags and only 3279 * store software time stamp 3280 */ 3281 skb->tstamp = ktime_get_real(); 3282 } 3283 3284 serr = SKB_EXT_ERR(skb); 3285 memset(serr, 0, sizeof(*serr)); 3286 serr->ee.ee_errno = ENOMSG; 3287 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; 3288 3289 err = sock_queue_err_skb(sk, skb); 3290 3291 if (err) 3292 kfree_skb(skb); 3293 } 3294 EXPORT_SYMBOL_GPL(skb_tstamp_tx); 3295 3296 void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) 3297 { 3298 struct sock *sk = skb->sk; 3299 struct sock_exterr_skb *serr; 3300 int err; 3301 3302 skb->wifi_acked_valid = 1; 3303 skb->wifi_acked = acked; 3304 3305 serr = SKB_EXT_ERR(skb); 3306 memset(serr, 0, sizeof(*serr)); 3307 serr->ee.ee_errno = ENOMSG; 3308 
serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; 3309 3310 err = sock_queue_err_skb(sk, skb); 3311 if (err) 3312 kfree_skb(skb); 3313 } 3314 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); 3315 3316 3317 /** 3318 * skb_partial_csum_set - set up and verify partial csum values for packet 3319 * @skb: the skb to set 3320 * @start: the number of bytes after skb->data to start checksumming. 3321 * @off: the offset from start to place the checksum. 3322 * 3323 * For untrusted partially-checksummed packets, we need to make sure the values 3324 * for skb->csum_start and skb->csum_offset are valid so we don't oops. 3325 * 3326 * This function checks and sets those values and skb->ip_summed: if this 3327 * returns false you should drop the packet. 3328 */ 3329 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) 3330 { 3331 if (unlikely(start > skb_headlen(skb)) || 3332 unlikely((int)start + off > skb_headlen(skb) - 2)) { 3333 net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n", 3334 start, off, skb_headlen(skb)); 3335 return false; 3336 } 3337 skb->ip_summed = CHECKSUM_PARTIAL; 3338 skb->csum_start = skb_headroom(skb) + start; 3339 skb->csum_offset = off; 3340 return true; 3341 } 3342 EXPORT_SYMBOL_GPL(skb_partial_csum_set); 3343 3344 void __skb_warn_lro_forwarding(const struct sk_buff *skb) 3345 { 3346 net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", 3347 skb->dev->name); 3348 } 3349 EXPORT_SYMBOL(__skb_warn_lro_forwarding); 3350 3351 void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) 3352 { 3353 if (head_stolen) 3354 kmem_cache_free(skbuff_head_cache, skb); 3355 else 3356 __kfree_skb(skb); 3357 } 3358 EXPORT_SYMBOL(kfree_skb_partial); 3359 3360 /** 3361 * skb_try_coalesce - try to merge skb to prior one 3362 * @to: prior buffer 3363 * @from: buffer to add 3364 * @fragstolen: pointer to boolean 3365 * @delta_truesize: how much more was allocated than was requested 3366 */ 3367 bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, 3368 bool *fragstolen, int *delta_truesize) 3369 { 3370 int i, delta, len = from->len; 3371 3372 *fragstolen = false; 3373 3374 if (skb_cloned(to)) 3375 return false; 3376 3377 if (len <= skb_tailroom(to)) { 3378 BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); 3379 *delta_truesize = 0; 3380 return true; 3381 } 3382 3383 if (skb_has_frag_list(to) || skb_has_frag_list(from)) 3384 return false; 3385 3386 if (skb_headlen(from) != 0) { 3387 struct page *page; 3388 unsigned int offset; 3389 3390 if (skb_shinfo(to)->nr_frags + 3391 skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) 3392 return false; 3393 3394 if (skb_head_is_locked(from)) 3395 return false; 3396 3397 delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); 3398 3399 page = virt_to_head_page(from->head); 3400 offset = from->data - (unsigned char *)page_address(page); 3401 3402 skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, 3403 page, offset, skb_headlen(from)); 3404 *fragstolen = true; 3405 } else { 3406 if (skb_shinfo(to)->nr_frags + 3407 skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) 3408 return false; 3409 3410 delta = from->truesize - 3411 SKB_TRUESIZE(skb_end_pointer(from) - from->head); 3412 } 3413 3414 WARN_ON_ONCE(delta < len); 3415 3416 memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, 3417 skb_shinfo(from)->frags, 3418 skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); 3419 skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; 3420 3421 if (!skb_cloned(from)) 3422 skb_shinfo(from)->nr_frags = 0; 3423 3424 /* if the 
skb is cloned this does nothing since we set nr_frags to 0 */
	for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
		skb_frag_ref(from, i);

	to->truesize += delta;
	to->len += len;
	to->data_len += len;

	*delta_truesize = delta;
	return true;
}
EXPORT_SYMBOL(skb_try_coalesce);
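
/*
 * Illustrative sketch of a skb_try_coalesce() caller (tail and sk are
 * hypothetical names; socket truesize accounting is only hinted at):
 *
 *	bool fragstolen;
 *	int delta;
 *
 *	if (tail && skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
 *		kfree_skb_partial(skb, fragstolen);
 *		(charge delta bytes of truesize to sk)
 *	} else {
 *		__skb_queue_tail(&sk->sk_receive_queue, skb);
 *	}
 *
 * On success the payload of @skb now lives in @tail (its tail room or its
 * frags) and only the sk_buff head, possibly with a stolen page fragment,
 * remains to be released via kfree_skb_partial().
 */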