1 /* 2 * Routines having to do with the 'struct sk_buff' memory handlers. 3 * 4 * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> 5 * Florian La Roche <rzsfl@rz.uni-sb.de> 6 * 7 * Fixes: 8 * Alan Cox : Fixed the worst of the load 9 * balancer bugs. 10 * Dave Platt : Interrupt stacking fix. 11 * Richard Kooijman : Timestamp fixes. 12 * Alan Cox : Changed buffer format. 13 * Alan Cox : destructor hook for AF_UNIX etc. 14 * Linus Torvalds : Better skb_clone. 15 * Alan Cox : Added skb_copy. 16 * Alan Cox : Added all the changed routines Linus 17 * only put in the headers 18 * Ray VanTassle : Fixed --skb->lock in free 19 * Alan Cox : skb_copy copy arp field 20 * Andi Kleen : slabified it. 21 * Robert Olsson : Removed skb_head_pool 22 * 23 * NOTE: 24 * The __skb_ routines should be called with interrupts 25 * disabled, or you better be *real* sure that the operation is atomic 26 * with respect to whatever list is being frobbed (e.g. via lock_sock() 27 * or via disabling bottom half handlers, etc). 28 * 29 * This program is free software; you can redistribute it and/or 30 * modify it under the terms of the GNU General Public License 31 * as published by the Free Software Foundation; either version 32 * 2 of the License, or (at your option) any later version. 
33 */ 34 35 /* 36 * The functions in this file will not compile correctly with gcc 2.4.x 37 */ 38 39 #include <linux/module.h> 40 #include <linux/types.h> 41 #include <linux/kernel.h> 42 #include <linux/kmemcheck.h> 43 #include <linux/mm.h> 44 #include <linux/interrupt.h> 45 #include <linux/in.h> 46 #include <linux/inet.h> 47 #include <linux/slab.h> 48 #include <linux/netdevice.h> 49 #ifdef CONFIG_NET_CLS_ACT 50 #include <net/pkt_sched.h> 51 #endif 52 #include <linux/string.h> 53 #include <linux/skbuff.h> 54 #include <linux/splice.h> 55 #include <linux/cache.h> 56 #include <linux/rtnetlink.h> 57 #include <linux/init.h> 58 #include <linux/scatterlist.h> 59 #include <linux/errqueue.h> 60 #include <linux/prefetch.h> 61 62 #include <net/protocol.h> 63 #include <net/dst.h> 64 #include <net/sock.h> 65 #include <net/checksum.h> 66 #include <net/xfrm.h> 67 68 #include <asm/uaccess.h> 69 #include <asm/system.h> 70 #include <trace/events/skb.h> 71 72 #include "kmap_skb.h" 73 74 static struct kmem_cache *skbuff_head_cache __read_mostly; 75 static struct kmem_cache *skbuff_fclone_cache __read_mostly; 76 77 static void sock_pipe_buf_release(struct pipe_inode_info *pipe, 78 struct pipe_buffer *buf) 79 { 80 put_page(buf->page); 81 } 82 83 static void sock_pipe_buf_get(struct pipe_inode_info *pipe, 84 struct pipe_buffer *buf) 85 { 86 get_page(buf->page); 87 } 88 89 static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, 90 struct pipe_buffer *buf) 91 { 92 return 1; 93 } 94 95 96 /* Pipe buffer operations for a socket. */ 97 static const struct pipe_buf_operations sock_pipe_buf_ops = { 98 .can_merge = 0, 99 .map = generic_pipe_buf_map, 100 .unmap = generic_pipe_buf_unmap, 101 .confirm = generic_pipe_buf_confirm, 102 .release = sock_pipe_buf_release, 103 .steal = sock_pipe_buf_steal, 104 .get = sock_pipe_buf_get, 105 }; 106 107 /* 108 * Keep out-of-line to prevent kernel bloat. 109 * __builtin_return_address is not used because it is not always 110 * reliable. 
111 */ 112 113 /** 114 * skb_over_panic - private function 115 * @skb: buffer 116 * @sz: size 117 * @here: address 118 * 119 * Out of line support code for skb_put(). Not user callable. 120 */ 121 static void skb_over_panic(struct sk_buff *skb, int sz, void *here) 122 { 123 printk(KERN_EMERG "skb_over_panic: text:%p len:%d put:%d head:%p " 124 "data:%p tail:%#lx end:%#lx dev:%s\n", 125 here, skb->len, sz, skb->head, skb->data, 126 (unsigned long)skb->tail, (unsigned long)skb->end, 127 skb->dev ? skb->dev->name : "<NULL>"); 128 BUG(); 129 } 130 131 /** 132 * skb_under_panic - private function 133 * @skb: buffer 134 * @sz: size 135 * @here: address 136 * 137 * Out of line support code for skb_push(). Not user callable. 138 */ 139 140 static void skb_under_panic(struct sk_buff *skb, int sz, void *here) 141 { 142 printk(KERN_EMERG "skb_under_panic: text:%p len:%d put:%d head:%p " 143 "data:%p tail:%#lx end:%#lx dev:%s\n", 144 here, skb->len, sz, skb->head, skb->data, 145 (unsigned long)skb->tail, (unsigned long)skb->end, 146 skb->dev ? skb->dev->name : "<NULL>"); 147 BUG(); 148 } 149 150 /* Allocate a new skbuff. We do this ourselves so we can fill in a few 151 * 'private' fields and also do memory statistics to find all the 152 * [BEEP] leaks. 153 * 154 */ 155 156 /** 157 * __alloc_skb - allocate a network buffer 158 * @size: size to allocate 159 * @gfp_mask: allocation mask 160 * @fclone: allocate from fclone cache instead of head cache 161 * and allocate a cloned (child) skb 162 * @node: numa node to allocate memory on 163 * 164 * Allocate a new &sk_buff. The returned buffer has no headroom and a 165 * tail room of size bytes. The object has a reference count of one. 166 * The return is the buffer. On a failure the return is %NULL. 167 * 168 * Buffers may only be allocated from interrupts using a @gfp_mask of 169 * %GFP_ATOMIC. 
170 */ 171 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, 172 int fclone, int node) 173 { 174 struct kmem_cache *cache; 175 struct skb_shared_info *shinfo; 176 struct sk_buff *skb; 177 u8 *data; 178 179 cache = fclone ? skbuff_fclone_cache : skbuff_head_cache; 180 181 /* Get the HEAD */ 182 skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); 183 if (!skb) 184 goto out; 185 prefetchw(skb); 186 187 /* We do our best to align skb_shared_info on a separate cache 188 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives 189 * aligned memory blocks, unless SLUB/SLAB debug is enabled. 190 * Both skb->head and skb_shared_info are cache line aligned. 191 */ 192 size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 193 data = kmalloc_node_track_caller(size, gfp_mask, node); 194 if (!data) 195 goto nodata; 196 /* kmalloc(size) might give us more room than requested. 197 * Put skb_shared_info exactly at the end of allocated zone, 198 * to allow max possible filling before reallocation. 199 */ 200 size = SKB_WITH_OVERHEAD(ksize(data)); 201 prefetchw(data + size); 202 203 /* 204 * Only clear those fields we need to clear, not those that we will 205 * actually initialise below. Hence, don't put any more fields after 206 * the tail pointer in struct sk_buff! 
207 */ 208 memset(skb, 0, offsetof(struct sk_buff, tail)); 209 /* Account for allocated memory : skb + skb->head */ 210 skb->truesize = SKB_TRUESIZE(size); 211 atomic_set(&skb->users, 1); 212 skb->head = data; 213 skb->data = data; 214 skb_reset_tail_pointer(skb); 215 skb->end = skb->tail + size; 216 #ifdef NET_SKBUFF_DATA_USES_OFFSET 217 skb->mac_header = ~0U; 218 #endif 219 220 /* make sure we initialize shinfo sequentially */ 221 shinfo = skb_shinfo(skb); 222 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); 223 atomic_set(&shinfo->dataref, 1); 224 kmemcheck_annotate_variable(shinfo->destructor_arg); 225 226 if (fclone) { 227 struct sk_buff *child = skb + 1; 228 atomic_t *fclone_ref = (atomic_t *) (child + 1); 229 230 kmemcheck_annotate_bitfield(child, flags1); 231 kmemcheck_annotate_bitfield(child, flags2); 232 skb->fclone = SKB_FCLONE_ORIG; 233 atomic_set(fclone_ref, 1); 234 235 child->fclone = SKB_FCLONE_UNAVAILABLE; 236 } 237 out: 238 return skb; 239 nodata: 240 kmem_cache_free(cache, skb); 241 skb = NULL; 242 goto out; 243 } 244 EXPORT_SYMBOL(__alloc_skb); 245 246 /** 247 * __netdev_alloc_skb - allocate an skbuff for rx on a specific device 248 * @dev: network device to receive on 249 * @length: length to allocate 250 * @gfp_mask: get_free_pages mask, passed to alloc_skb 251 * 252 * Allocate a new &sk_buff and assign it a usage count of one. The 253 * buffer has unspecified headroom built in. Users should allocate 254 * the headroom they think they need without accounting for the 255 * built in space. The built in space is used for optimisations. 256 * 257 * %NULL is returned if there is no free memory. 
258 */ 259 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, 260 unsigned int length, gfp_t gfp_mask) 261 { 262 struct sk_buff *skb; 263 264 skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); 265 if (likely(skb)) { 266 skb_reserve(skb, NET_SKB_PAD); 267 skb->dev = dev; 268 } 269 return skb; 270 } 271 EXPORT_SYMBOL(__netdev_alloc_skb); 272 273 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, 274 int size) 275 { 276 skb_fill_page_desc(skb, i, page, off, size); 277 skb->len += size; 278 skb->data_len += size; 279 skb->truesize += size; 280 } 281 EXPORT_SYMBOL(skb_add_rx_frag); 282 283 /** 284 * dev_alloc_skb - allocate an skbuff for receiving 285 * @length: length to allocate 286 * 287 * Allocate a new &sk_buff and assign it a usage count of one. The 288 * buffer has unspecified headroom built in. Users should allocate 289 * the headroom they think they need without accounting for the 290 * built in space. The built in space is used for optimisations. 291 * 292 * %NULL is returned if there is no free memory. Although this function 293 * allocates memory it can be called from an interrupt. 
294 */ 295 struct sk_buff *dev_alloc_skb(unsigned int length) 296 { 297 /* 298 * There is more code here than it seems: 299 * __dev_alloc_skb is an inline 300 */ 301 return __dev_alloc_skb(length, GFP_ATOMIC); 302 } 303 EXPORT_SYMBOL(dev_alloc_skb); 304 305 static void skb_drop_list(struct sk_buff **listp) 306 { 307 struct sk_buff *list = *listp; 308 309 *listp = NULL; 310 311 do { 312 struct sk_buff *this = list; 313 list = list->next; 314 kfree_skb(this); 315 } while (list); 316 } 317 318 static inline void skb_drop_fraglist(struct sk_buff *skb) 319 { 320 skb_drop_list(&skb_shinfo(skb)->frag_list); 321 } 322 323 static void skb_clone_fraglist(struct sk_buff *skb) 324 { 325 struct sk_buff *list; 326 327 skb_walk_frags(skb, list) 328 skb_get(list); 329 } 330 331 static void skb_release_data(struct sk_buff *skb) 332 { 333 if (!skb->cloned || 334 !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, 335 &skb_shinfo(skb)->dataref)) { 336 if (skb_shinfo(skb)->nr_frags) { 337 int i; 338 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 339 skb_frag_unref(skb, i); 340 } 341 342 /* 343 * If skb buf is from userspace, we need to notify the caller 344 * the lower device DMA has done; 345 */ 346 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { 347 struct ubuf_info *uarg; 348 349 uarg = skb_shinfo(skb)->destructor_arg; 350 if (uarg->callback) 351 uarg->callback(uarg); 352 } 353 354 if (skb_has_frag_list(skb)) 355 skb_drop_fraglist(skb); 356 357 kfree(skb->head); 358 } 359 } 360 361 /* 362 * Free an skbuff by memory without cleaning the state. 
363 */ 364 static void kfree_skbmem(struct sk_buff *skb) 365 { 366 struct sk_buff *other; 367 atomic_t *fclone_ref; 368 369 switch (skb->fclone) { 370 case SKB_FCLONE_UNAVAILABLE: 371 kmem_cache_free(skbuff_head_cache, skb); 372 break; 373 374 case SKB_FCLONE_ORIG: 375 fclone_ref = (atomic_t *) (skb + 2); 376 if (atomic_dec_and_test(fclone_ref)) 377 kmem_cache_free(skbuff_fclone_cache, skb); 378 break; 379 380 case SKB_FCLONE_CLONE: 381 fclone_ref = (atomic_t *) (skb + 1); 382 other = skb - 1; 383 384 /* The clone portion is available for 385 * fast-cloning again. 386 */ 387 skb->fclone = SKB_FCLONE_UNAVAILABLE; 388 389 if (atomic_dec_and_test(fclone_ref)) 390 kmem_cache_free(skbuff_fclone_cache, other); 391 break; 392 } 393 } 394 395 static void skb_release_head_state(struct sk_buff *skb) 396 { 397 skb_dst_drop(skb); 398 #ifdef CONFIG_XFRM 399 secpath_put(skb->sp); 400 #endif 401 if (skb->destructor) { 402 WARN_ON(in_irq()); 403 skb->destructor(skb); 404 } 405 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) 406 nf_conntrack_put(skb->nfct); 407 #endif 408 #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED 409 nf_conntrack_put_reasm(skb->nfct_reasm); 410 #endif 411 #ifdef CONFIG_BRIDGE_NETFILTER 412 nf_bridge_put(skb->nf_bridge); 413 #endif 414 /* XXX: IS this still necessary? - JHS */ 415 #ifdef CONFIG_NET_SCHED 416 skb->tc_index = 0; 417 #ifdef CONFIG_NET_CLS_ACT 418 skb->tc_verd = 0; 419 #endif 420 #endif 421 } 422 423 /* Free everything but the sk_buff shell. */ 424 static void skb_release_all(struct sk_buff *skb) 425 { 426 skb_release_head_state(skb); 427 skb_release_data(skb); 428 } 429 430 /** 431 * __kfree_skb - private function 432 * @skb: buffer 433 * 434 * Free an sk_buff. Release anything attached to the buffer. 435 * Clean the state. This is an internal helper function. 
Users should 436 * always call kfree_skb 437 */ 438 439 void __kfree_skb(struct sk_buff *skb) 440 { 441 skb_release_all(skb); 442 kfree_skbmem(skb); 443 } 444 EXPORT_SYMBOL(__kfree_skb); 445 446 /** 447 * kfree_skb - free an sk_buff 448 * @skb: buffer to free 449 * 450 * Drop a reference to the buffer and free it if the usage count has 451 * hit zero. 452 */ 453 void kfree_skb(struct sk_buff *skb) 454 { 455 if (unlikely(!skb)) 456 return; 457 if (likely(atomic_read(&skb->users) == 1)) 458 smp_rmb(); 459 else if (likely(!atomic_dec_and_test(&skb->users))) 460 return; 461 trace_kfree_skb(skb, __builtin_return_address(0)); 462 __kfree_skb(skb); 463 } 464 EXPORT_SYMBOL(kfree_skb); 465 466 /** 467 * consume_skb - free an skbuff 468 * @skb: buffer to free 469 * 470 * Drop a ref to the buffer and free it if the usage count has hit zero 471 * Functions identically to kfree_skb, but kfree_skb assumes that the frame 472 * is being dropped after a failure and notes that 473 */ 474 void consume_skb(struct sk_buff *skb) 475 { 476 if (unlikely(!skb)) 477 return; 478 if (likely(atomic_read(&skb->users) == 1)) 479 smp_rmb(); 480 else if (likely(!atomic_dec_and_test(&skb->users))) 481 return; 482 trace_consume_skb(skb); 483 __kfree_skb(skb); 484 } 485 EXPORT_SYMBOL(consume_skb); 486 487 /** 488 * skb_recycle - clean up an skb for reuse 489 * @skb: buffer 490 * 491 * Recycles the skb to be reused as a receive buffer. This 492 * function does any necessary reference count dropping, and 493 * cleans up the skbuff as if it just came from __alloc_skb(). 
494 */ 495 void skb_recycle(struct sk_buff *skb) 496 { 497 struct skb_shared_info *shinfo; 498 499 skb_release_head_state(skb); 500 501 shinfo = skb_shinfo(skb); 502 memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); 503 atomic_set(&shinfo->dataref, 1); 504 505 memset(skb, 0, offsetof(struct sk_buff, tail)); 506 skb->data = skb->head + NET_SKB_PAD; 507 skb_reset_tail_pointer(skb); 508 } 509 EXPORT_SYMBOL(skb_recycle); 510 511 /** 512 * skb_recycle_check - check if skb can be reused for receive 513 * @skb: buffer 514 * @skb_size: minimum receive buffer size 515 * 516 * Checks that the skb passed in is not shared or cloned, and 517 * that it is linear and its head portion at least as large as 518 * skb_size so that it can be recycled as a receive buffer. 519 * If these conditions are met, this function does any necessary 520 * reference count dropping and cleans up the skbuff as if it 521 * just came from __alloc_skb(). 522 */ 523 bool skb_recycle_check(struct sk_buff *skb, int skb_size) 524 { 525 if (!skb_is_recycleable(skb, skb_size)) 526 return false; 527 528 skb_recycle(skb); 529 530 return true; 531 } 532 EXPORT_SYMBOL(skb_recycle_check); 533 534 static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) 535 { 536 new->tstamp = old->tstamp; 537 new->dev = old->dev; 538 new->transport_header = old->transport_header; 539 new->network_header = old->network_header; 540 new->mac_header = old->mac_header; 541 skb_dst_copy(new, old); 542 new->rxhash = old->rxhash; 543 new->ooo_okay = old->ooo_okay; 544 new->l4_rxhash = old->l4_rxhash; 545 #ifdef CONFIG_XFRM 546 new->sp = secpath_get(old->sp); 547 #endif 548 memcpy(new->cb, old->cb, sizeof(old->cb)); 549 new->csum = old->csum; 550 new->local_df = old->local_df; 551 new->pkt_type = old->pkt_type; 552 new->ip_summed = old->ip_summed; 553 skb_copy_queue_mapping(new, old); 554 new->priority = old->priority; 555 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) 556 new->ipvs_property = 
old->ipvs_property; 557 #endif 558 new->protocol = old->protocol; 559 new->mark = old->mark; 560 new->skb_iif = old->skb_iif; 561 __nf_copy(new, old); 562 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ 563 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) 564 new->nf_trace = old->nf_trace; 565 #endif 566 #ifdef CONFIG_NET_SCHED 567 new->tc_index = old->tc_index; 568 #ifdef CONFIG_NET_CLS_ACT 569 new->tc_verd = old->tc_verd; 570 #endif 571 #endif 572 new->vlan_tci = old->vlan_tci; 573 574 skb_copy_secmark(new, old); 575 } 576 577 /* 578 * You should not add any new code to this function. Add it to 579 * __copy_skb_header above instead. 580 */ 581 static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) 582 { 583 #define C(x) n->x = skb->x 584 585 n->next = n->prev = NULL; 586 n->sk = NULL; 587 __copy_skb_header(n, skb); 588 589 C(len); 590 C(data_len); 591 C(mac_len); 592 n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; 593 n->cloned = 1; 594 n->nohdr = 0; 595 n->destructor = NULL; 596 C(tail); 597 C(end); 598 C(head); 599 C(data); 600 C(truesize); 601 atomic_set(&n->users, 1); 602 603 atomic_inc(&(skb_shinfo(skb)->dataref)); 604 skb->cloned = 1; 605 606 return n; 607 #undef C 608 } 609 610 /** 611 * skb_morph - morph one skb into another 612 * @dst: the skb to receive the contents 613 * @src: the skb to supply the contents 614 * 615 * This is identical to skb_clone except that the target skb is 616 * supplied by the user. 617 * 618 * The target skb is returned upon exit. 619 */ 620 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) 621 { 622 skb_release_all(dst); 623 return __skb_clone(dst, src); 624 } 625 EXPORT_SYMBOL_GPL(skb_morph); 626 627 /* skb_copy_ubufs - copy userspace skb frags buffers to kernel 628 * @skb: the skb to modify 629 * @gfp_mask: allocation priority 630 * 631 * This must be called on SKBTX_DEV_ZEROCOPY skb. 632 * It will copy all frags into kernel and drop the reference 633 * to userspace pages. 
634 * 635 * If this function is called from an interrupt gfp_mask() must be 636 * %GFP_ATOMIC. 637 * 638 * Returns 0 on success or a negative error code on failure 639 * to allocate kernel memory to copy to. 640 */ 641 int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) 642 { 643 int i; 644 int num_frags = skb_shinfo(skb)->nr_frags; 645 struct page *page, *head = NULL; 646 struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; 647 648 for (i = 0; i < num_frags; i++) { 649 u8 *vaddr; 650 skb_frag_t *f = &skb_shinfo(skb)->frags[i]; 651 652 page = alloc_page(GFP_ATOMIC); 653 if (!page) { 654 while (head) { 655 struct page *next = (struct page *)head->private; 656 put_page(head); 657 head = next; 658 } 659 return -ENOMEM; 660 } 661 vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); 662 memcpy(page_address(page), 663 vaddr + f->page_offset, skb_frag_size(f)); 664 kunmap_skb_frag(vaddr); 665 page->private = (unsigned long)head; 666 head = page; 667 } 668 669 /* skb frags release userspace buffers */ 670 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 671 skb_frag_unref(skb, i); 672 673 uarg->callback(uarg); 674 675 /* skb frags point to kernel buffers */ 676 for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) { 677 __skb_fill_page_desc(skb, i-1, head, 0, 678 skb_shinfo(skb)->frags[i - 1].size); 679 head = (struct page *)head->private; 680 } 681 682 skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; 683 return 0; 684 } 685 686 687 /** 688 * skb_clone - duplicate an sk_buff 689 * @skb: buffer to clone 690 * @gfp_mask: allocation priority 691 * 692 * Duplicate an &sk_buff. The new one is not owned by a socket. Both 693 * copies share the same packet data but not structure. The new 694 * buffer has a reference count of 1. If the allocation fails the 695 * function returns %NULL otherwise the new buffer is returned. 696 * 697 * If this function is called from an interrupt gfp_mask() must be 698 * %GFP_ATOMIC. 
699 */ 700 701 struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) 702 { 703 struct sk_buff *n; 704 705 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { 706 if (skb_copy_ubufs(skb, gfp_mask)) 707 return NULL; 708 } 709 710 n = skb + 1; 711 if (skb->fclone == SKB_FCLONE_ORIG && 712 n->fclone == SKB_FCLONE_UNAVAILABLE) { 713 atomic_t *fclone_ref = (atomic_t *) (n + 1); 714 n->fclone = SKB_FCLONE_CLONE; 715 atomic_inc(fclone_ref); 716 } else { 717 n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); 718 if (!n) 719 return NULL; 720 721 kmemcheck_annotate_bitfield(n, flags1); 722 kmemcheck_annotate_bitfield(n, flags2); 723 n->fclone = SKB_FCLONE_UNAVAILABLE; 724 } 725 726 return __skb_clone(n, skb); 727 } 728 EXPORT_SYMBOL(skb_clone); 729 730 static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) 731 { 732 #ifndef NET_SKBUFF_DATA_USES_OFFSET 733 /* 734 * Shift between the two data areas in bytes 735 */ 736 unsigned long offset = new->data - old->data; 737 #endif 738 739 __copy_skb_header(new, old); 740 741 #ifndef NET_SKBUFF_DATA_USES_OFFSET 742 /* {transport,network,mac}_header are relative to skb->head */ 743 new->transport_header += offset; 744 new->network_header += offset; 745 if (skb_mac_header_was_set(new)) 746 new->mac_header += offset; 747 #endif 748 skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; 749 skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; 750 skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; 751 } 752 753 /** 754 * skb_copy - create private copy of an sk_buff 755 * @skb: buffer to copy 756 * @gfp_mask: allocation priority 757 * 758 * Make a copy of both an &sk_buff and its data. This is used when the 759 * caller wishes to modify the data and needs a private copy of the 760 * data to alter. Returns %NULL on failure or the pointer to the buffer 761 * on success. The returned buffer has a reference count of 1. 
762 * 763 * As by-product this function converts non-linear &sk_buff to linear 764 * one, so that &sk_buff becomes completely private and caller is allowed 765 * to modify all the data of returned buffer. This means that this 766 * function is not recommended for use in circumstances when only 767 * header is going to be modified. Use pskb_copy() instead. 768 */ 769 770 struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) 771 { 772 int headerlen = skb_headroom(skb); 773 unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len; 774 struct sk_buff *n = alloc_skb(size, gfp_mask); 775 776 if (!n) 777 return NULL; 778 779 /* Set the data pointer */ 780 skb_reserve(n, headerlen); 781 /* Set the tail pointer and length */ 782 skb_put(n, skb->len); 783 784 if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) 785 BUG(); 786 787 copy_skb_header(n, skb); 788 return n; 789 } 790 EXPORT_SYMBOL(skb_copy); 791 792 /** 793 * pskb_copy - create copy of an sk_buff with private head. 794 * @skb: buffer to copy 795 * @gfp_mask: allocation priority 796 * 797 * Make a copy of both an &sk_buff and part of its data, located 798 * in header. Fragmented data remain shared. This is used when 799 * the caller wishes to modify only header of &sk_buff and needs 800 * private copy of the header to alter. Returns %NULL on failure 801 * or the pointer to the buffer on success. 802 * The returned buffer has a reference count of 1. 
803 */ 804 805 struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) 806 { 807 unsigned int size = skb_end_pointer(skb) - skb->head; 808 struct sk_buff *n = alloc_skb(size, gfp_mask); 809 810 if (!n) 811 goto out; 812 813 /* Set the data pointer */ 814 skb_reserve(n, skb_headroom(skb)); 815 /* Set the tail pointer and length */ 816 skb_put(n, skb_headlen(skb)); 817 /* Copy the bytes */ 818 skb_copy_from_linear_data(skb, n->data, n->len); 819 820 n->truesize += skb->data_len; 821 n->data_len = skb->data_len; 822 n->len = skb->len; 823 824 if (skb_shinfo(skb)->nr_frags) { 825 int i; 826 827 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { 828 if (skb_copy_ubufs(skb, gfp_mask)) { 829 kfree_skb(n); 830 n = NULL; 831 goto out; 832 } 833 } 834 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 835 skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; 836 skb_frag_ref(skb, i); 837 } 838 skb_shinfo(n)->nr_frags = i; 839 } 840 841 if (skb_has_frag_list(skb)) { 842 skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; 843 skb_clone_fraglist(n); 844 } 845 846 copy_skb_header(n, skb); 847 out: 848 return n; 849 } 850 EXPORT_SYMBOL(pskb_copy); 851 852 /** 853 * pskb_expand_head - reallocate header of &sk_buff 854 * @skb: buffer to reallocate 855 * @nhead: room to add at head 856 * @ntail: room to add at tail 857 * @gfp_mask: allocation priority 858 * 859 * Expands (or creates identical copy, if &nhead and &ntail are zero) 860 * header of skb. &sk_buff itself is not changed. &sk_buff MUST have 861 * reference count of 1. Returns zero in the case of success or error, 862 * if expansion failed. In the last case, &sk_buff is not changed. 863 * 864 * All the pointers pointing into skb header may change and must be 865 * reloaded after call to this function. 
866 */ 867 868 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, 869 gfp_t gfp_mask) 870 { 871 int i; 872 u8 *data; 873 int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail; 874 long off; 875 bool fastpath; 876 877 BUG_ON(nhead < 0); 878 879 if (skb_shared(skb)) 880 BUG(); 881 882 size = SKB_DATA_ALIGN(size); 883 884 /* Check if we can avoid taking references on fragments if we own 885 * the last reference on skb->head. (see skb_release_data()) 886 */ 887 if (!skb->cloned) 888 fastpath = true; 889 else { 890 int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1; 891 fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta; 892 } 893 894 if (fastpath && 895 size + sizeof(struct skb_shared_info) <= ksize(skb->head)) { 896 memmove(skb->head + size, skb_shinfo(skb), 897 offsetof(struct skb_shared_info, 898 frags[skb_shinfo(skb)->nr_frags])); 899 memmove(skb->head + nhead, skb->head, 900 skb_tail_pointer(skb) - skb->head); 901 off = nhead; 902 goto adjust_others; 903 } 904 905 data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); 906 if (!data) 907 goto nodata; 908 909 /* Copy only real data... and, alas, header. This should be 910 * optimized for the cases when header is void. 
911 */ 912 memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); 913 914 memcpy((struct skb_shared_info *)(data + size), 915 skb_shinfo(skb), 916 offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); 917 918 if (fastpath) { 919 kfree(skb->head); 920 } else { 921 /* copy this zero copy skb frags */ 922 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { 923 if (skb_copy_ubufs(skb, gfp_mask)) 924 goto nofrags; 925 } 926 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 927 skb_frag_ref(skb, i); 928 929 if (skb_has_frag_list(skb)) 930 skb_clone_fraglist(skb); 931 932 skb_release_data(skb); 933 } 934 off = (data + nhead) - skb->head; 935 936 skb->head = data; 937 adjust_others: 938 skb->data += off; 939 #ifdef NET_SKBUFF_DATA_USES_OFFSET 940 skb->end = size; 941 off = nhead; 942 #else 943 skb->end = skb->head + size; 944 #endif 945 /* {transport,network,mac}_header and tail are relative to skb->head */ 946 skb->tail += off; 947 skb->transport_header += off; 948 skb->network_header += off; 949 if (skb_mac_header_was_set(skb)) 950 skb->mac_header += off; 951 /* Only adjust this if it actually is csum_start rather than csum */ 952 if (skb->ip_summed == CHECKSUM_PARTIAL) 953 skb->csum_start += nhead; 954 skb->cloned = 0; 955 skb->hdr_len = 0; 956 skb->nohdr = 0; 957 atomic_set(&skb_shinfo(skb)->dataref, 1); 958 return 0; 959 960 nofrags: 961 kfree(data); 962 nodata: 963 return -ENOMEM; 964 } 965 EXPORT_SYMBOL(pskb_expand_head); 966 967 /* Make private copy of skb with writable head and some headroom */ 968 969 struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) 970 { 971 struct sk_buff *skb2; 972 int delta = headroom - skb_headroom(skb); 973 974 if (delta <= 0) 975 skb2 = pskb_copy(skb, GFP_ATOMIC); 976 else { 977 skb2 = skb_clone(skb, GFP_ATOMIC); 978 if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, 979 GFP_ATOMIC)) { 980 kfree_skb(skb2); 981 skb2 = NULL; 982 } 983 } 984 return skb2; 985 } 986 
EXPORT_SYMBOL(skb_realloc_headroom); 987 988 /** 989 * skb_copy_expand - copy and expand sk_buff 990 * @skb: buffer to copy 991 * @newheadroom: new free bytes at head 992 * @newtailroom: new free bytes at tail 993 * @gfp_mask: allocation priority 994 * 995 * Make a copy of both an &sk_buff and its data and while doing so 996 * allocate additional space. 997 * 998 * This is used when the caller wishes to modify the data and needs a 999 * private copy of the data to alter as well as more space for new fields. 1000 * Returns %NULL on failure or the pointer to the buffer 1001 * on success. The returned buffer has a reference count of 1. 1002 * 1003 * You must pass %GFP_ATOMIC as the allocation priority if this function 1004 * is called from an interrupt. 1005 */ 1006 struct sk_buff *skb_copy_expand(const struct sk_buff *skb, 1007 int newheadroom, int newtailroom, 1008 gfp_t gfp_mask) 1009 { 1010 /* 1011 * Allocate the copy buffer 1012 */ 1013 struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, 1014 gfp_mask); 1015 int oldheadroom = skb_headroom(skb); 1016 int head_copy_len, head_copy_off; 1017 int off; 1018 1019 if (!n) 1020 return NULL; 1021 1022 skb_reserve(n, newheadroom); 1023 1024 /* Set the tail pointer and length */ 1025 skb_put(n, skb->len); 1026 1027 head_copy_len = oldheadroom; 1028 head_copy_off = 0; 1029 if (newheadroom <= head_copy_len) 1030 head_copy_len = newheadroom; 1031 else 1032 head_copy_off = newheadroom - head_copy_len; 1033 1034 /* Copy the linear header and data. 
*/ 1035 if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, 1036 skb->len + head_copy_len)) 1037 BUG(); 1038 1039 copy_skb_header(n, skb); 1040 1041 off = newheadroom - oldheadroom; 1042 if (n->ip_summed == CHECKSUM_PARTIAL) 1043 n->csum_start += off; 1044 #ifdef NET_SKBUFF_DATA_USES_OFFSET 1045 n->transport_header += off; 1046 n->network_header += off; 1047 if (skb_mac_header_was_set(skb)) 1048 n->mac_header += off; 1049 #endif 1050 1051 return n; 1052 } 1053 EXPORT_SYMBOL(skb_copy_expand); 1054 1055 /** 1056 * skb_pad - zero pad the tail of an skb 1057 * @skb: buffer to pad 1058 * @pad: space to pad 1059 * 1060 * Ensure that a buffer is followed by a padding area that is zero 1061 * filled. Used by network drivers which may DMA or transfer data 1062 * beyond the buffer end onto the wire. 1063 * 1064 * May return error in out of memory cases. The skb is freed on error. 1065 */ 1066 1067 int skb_pad(struct sk_buff *skb, int pad) 1068 { 1069 int err; 1070 int ntail; 1071 1072 /* If the skbuff is non linear tailroom is always zero.. */ 1073 if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { 1074 memset(skb->data+skb->len, 0, pad); 1075 return 0; 1076 } 1077 1078 ntail = skb->data_len + pad - (skb->end - skb->tail); 1079 if (likely(skb_cloned(skb) || ntail > 0)) { 1080 err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); 1081 if (unlikely(err)) 1082 goto free_skb; 1083 } 1084 1085 /* FIXME: The use of this function with non-linear skb's really needs 1086 * to be audited. 1087 */ 1088 err = skb_linearize(skb); 1089 if (unlikely(err)) 1090 goto free_skb; 1091 1092 memset(skb->data + skb->len, 0, pad); 1093 return 0; 1094 1095 free_skb: 1096 kfree_skb(skb); 1097 return err; 1098 } 1099 EXPORT_SYMBOL(skb_pad); 1100 1101 /** 1102 * skb_put - add data to a buffer 1103 * @skb: buffer to use 1104 * @len: amount of data to add 1105 * 1106 * This function extends the used data area of the buffer. 
If this would 1107 * exceed the total buffer size the kernel will panic. A pointer to the 1108 * first byte of the extra data is returned. 1109 */ 1110 unsigned char *skb_put(struct sk_buff *skb, unsigned int len) 1111 { 1112 unsigned char *tmp = skb_tail_pointer(skb); 1113 SKB_LINEAR_ASSERT(skb); 1114 skb->tail += len; 1115 skb->len += len; 1116 if (unlikely(skb->tail > skb->end)) 1117 skb_over_panic(skb, len, __builtin_return_address(0)); 1118 return tmp; 1119 } 1120 EXPORT_SYMBOL(skb_put); 1121 1122 /** 1123 * skb_push - add data to the start of a buffer 1124 * @skb: buffer to use 1125 * @len: amount of data to add 1126 * 1127 * This function extends the used data area of the buffer at the buffer 1128 * start. If this would exceed the total buffer headroom the kernel will 1129 * panic. A pointer to the first byte of the extra data is returned. 1130 */ 1131 unsigned char *skb_push(struct sk_buff *skb, unsigned int len) 1132 { 1133 skb->data -= len; 1134 skb->len += len; 1135 if (unlikely(skb->data<skb->head)) 1136 skb_under_panic(skb, len, __builtin_return_address(0)); 1137 return skb->data; 1138 } 1139 EXPORT_SYMBOL(skb_push); 1140 1141 /** 1142 * skb_pull - remove data from the start of a buffer 1143 * @skb: buffer to use 1144 * @len: amount of data to remove 1145 * 1146 * This function removes data from the start of a buffer, returning 1147 * the memory to the headroom. A pointer to the next data in the buffer 1148 * is returned. Once the data has been pulled future pushes will overwrite 1149 * the old data. 1150 */ 1151 unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) 1152 { 1153 return skb_pull_inline(skb, len); 1154 } 1155 EXPORT_SYMBOL(skb_pull); 1156 1157 /** 1158 * skb_trim - remove end from a buffer 1159 * @skb: buffer to alter 1160 * @len: new length 1161 * 1162 * Cut the length of a buffer down by removing data from the tail. If 1163 * the buffer is already under the length specified it is not modified. 1164 * The skb must be linear. 
/**
 *	skb_trim - remove end from a buffer
 *	@skb: buffer to alter
 *	@len: new length
 *
 *	Cut the length of a buffer down by removing data from the tail. If
 *	the buffer is already under the length specified it is not modified.
 *	The skb must be linear.
 */
void skb_trim(struct sk_buff *skb, unsigned int len)
{
	/* Only ever shrink; a buffer that is already short enough is left alone. */
	if (skb->len > len)
		__skb_trim(skb, len);
}
EXPORT_SYMBOL(skb_trim);

/* Trims skb to length len. It can change skb pointers.
 *
 * Walks the linear area, then the page fragments, then the frag_list,
 * keeping everything below @len and releasing everything above it.
 * Returns 0 on success, the pskb_expand_head()/pskb_trim() error code,
 * or -ENOMEM if a shared frag_list member cannot be cloned.
 */

int ___pskb_trim(struct sk_buff *skb, unsigned int len)
{
	struct sk_buff **fragp;
	struct sk_buff *frag;
	/* Number of bytes accounted for so far; starts after the linear part. */
	int offset = skb_headlen(skb);
	int nfrags = skb_shinfo(skb)->nr_frags;
	int i;
	int err;

	/* A cloned skb gets a private copy of its head before being edited. */
	if (skb_cloned(skb) &&
	    unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
		return err;

	i = 0;
	/* New length ends inside the linear area: every fragment goes. */
	if (offset >= len)
		goto drop_pages;

	for (; i < nfrags; i++) {
		int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);

		/* Fragment lies entirely below the cut: keep it untouched. */
		if (end < len) {
			offset = end;
			continue;
		}

		/* Fragment i holds the cut point: shorten it, keep it (i++). */
		skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);

drop_pages:
		/* Entered either by falling through or by the goto above;
		 * i is the number of fragments that survive.
		 */
		skb_shinfo(skb)->nr_frags = i;

		for (; i < nfrags; i++)
			skb_frag_unref(skb, i);

		if (skb_has_frag_list(skb))
			skb_drop_fraglist(skb);
		goto done;
	}

	/* The cut point is beyond all page fragments: search the frag_list. */
	for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
	     fragp = &frag->next) {
		int end = offset + frag->len;

		/* Replace a shared list member by a private clone before
		 * it may be modified.
		 */
		if (skb_shared(frag)) {
			struct sk_buff *nfrag;

			nfrag = skb_clone(frag, GFP_ATOMIC);
			if (unlikely(!nfrag))
				return -ENOMEM;

			nfrag->next = frag->next;
			kfree_skb(frag);
			frag = nfrag;
			*fragp = frag;
		}

		if (end < len) {
			offset = end;
			continue;
		}

		/* This member holds the cut point; trim it if it overshoots. */
		if (end > len &&
		    unlikely((err = pskb_trim(frag, len - offset))))
			return err;

		/* Everything after the cut point is dropped. */
		if (frag->next)
			skb_drop_list(&frag->next);
		break;
	}

done:
	if (len > skb_headlen(skb)) {
		/* Some non-linear data remains: shrink data_len by the
		 * number of bytes removed.
		 */
		skb->data_len -= skb->len - len;
		skb->len       = len;
	} else {
		/* Only linear data remains: reset the tail accordingly. */
		skb->len       = len;
		skb->data_len  = 0;
		skb_set_tail_pointer(skb, len);
	}

	return 0;
}
EXPORT_SYMBOL(___pskb_trim);

/**
 *	__pskb_pull_tail - advance tail of skb header
 *	@skb: buffer to reallocate
 *	@delta: number of bytes to advance tail
 *
 *	The function makes sense only on a fragmented &sk_buff,
 *	it expands header moving its tail forward and copying necessary
 *	data from fragmented part.
 *
 *	&sk_buff MUST have reference count of 1.
 *
 *	Returns %NULL (and &sk_buff does not change) if pull failed
 *	or value of new tail of skb in the case of success.
 *
 *	All the pointers pointing into skb header may change and must be
 *	reloaded after call to this function.
 */

/* Moves tail of skb head forward, copying data from fragmented part,
 * when it is necessary.
 * 1. It may fail due to malloc failure.
 * 2. It may change skb pointers.
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
	/* If skb has not enough free space at tail, get new one
	 * plus 128 bytes for future expansions. If we have enough
	 * room at tail, reallocate without expansion only if skb is cloned.
	 */
	int i, k, eat = (skb->tail + delta) - skb->end;

	if (eat > 0 || skb_cloned(skb)) {
		if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
				     GFP_ATOMIC))
			return NULL;
	}

	/* Copy the first @delta non-linear bytes to just behind the tail. */
	if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
		BUG();

	/* Optimization: no fragments, no reasons to preestimate
	 * size of pulled pages. Superb.
	 */
	if (!skb_has_frag_list(skb))
		goto pull_pages;

	/* Estimate size of pulled pages. */
	eat = delta;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

		/* The page fragments alone cover @delta: frag_list untouched. */
		if (size >= eat)
			goto pull_pages;
		eat -= size;
	}

	/* If we need update frag list, we are in troubles.
	 * Certainly, it is possible to add an offset to skb data,
	 * but taking into account that pulling is expected to
	 * be very rare operation, it is worth to fight against
	 * further bloating skb head and crucify ourselves here instead.
	 * Pure masochism, indeed. 8)8)
	 */
	if (eat) {
		struct sk_buff *list = skb_shinfo(skb)->frag_list;
		struct sk_buff *clone = NULL;
		/* First list member that survives the pull. */
		struct sk_buff *insp = NULL;

		do {
			BUG_ON(!list);

			if (list->len <= eat) {
				/* Eaten as whole. */
				eat -= list->len;
				list = list->next;
				insp = list;
			} else {
				/* Eaten partially. */

				if (skb_shared(list)) {
					/* Sucks! We need to fork list. :-( */
					clone = skb_clone(list, GFP_ATOMIC);
					if (!clone)
						return NULL;
					insp = list->next;
					list = clone;
				} else {
					/* This may be pulled without
					 * problems. */
					insp = list;
				}
				if (!pskb_pull(list, eat)) {
					kfree_skb(clone);
					return NULL;
				}
				break;
			}
		} while (eat);

		/* Free pulled out fragments. */
		while ((list = skb_shinfo(skb)->frag_list) != insp) {
			skb_shinfo(skb)->frag_list = list->next;
			kfree_skb(list);
		}
		/* And insert new clone at head. */
		if (clone) {
			clone->next = list;
			skb_shinfo(skb)->frag_list = clone;
		}
	}
	/* Success! Now we may commit changes to skb data. */

pull_pages:
	/* Drop fully consumed page fragments and compact the survivors
	 * down to index k; the first survivor may lose its leading bytes.
	 */
	eat = delta;
	k = 0;
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (size <= eat) {
			skb_frag_unref(skb, i);
			eat -= size;
		} else {
			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
			if (eat) {
				skb_shinfo(skb)->frags[k].page_offset += eat;
				skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
				eat = 0;
			}
			k++;
		}
	}
	skb_shinfo(skb)->nr_frags = k;

	/* The pulled bytes now belong to the linear area. */
	skb->tail     += delta;
	skb->data_len -= delta;

	return skb_tail_pointer(skb);
}
EXPORT_SYMBOL(__pskb_pull_tail);

/**
 *	skb_copy_bits - copy bits from skb to kernel buffer
 *	@skb: source skb
 *	@offset: offset in source
 *	@to: destination buffer
 *	@len: number of bytes to copy
 *
 *	Copy the specified number of bytes from the source skb to the
 *	destination buffer. The linear area, the page fragments and the
 *	frag_list are visited in that order.
 *
 *	Returns 0 on success or -EFAULT if @offset + @len reaches beyond
 *	the data of the skb.
 *
 *	CAUTION ! :
 *		If its prototype is ever changed,
 *		check arch/{*}/net/{*}.S files,
 *		since it is called from BPF assembly code.
 */
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
	/* Offset at which the region currently examined begins. */
	int start = skb_headlen(skb);
	struct sk_buff *frag_iter;
	int i, copy;

	if (offset > (int)skb->len - len)
		goto fault;

	/* Copy header. */
	if ((copy = start - offset) > 0) {
		if (copy > len)
			copy = len;
		skb_copy_from_linear_data_offset(skb, offset, to, copy);
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to     += copy;
	}

	/* Copy from the page fragments; [start, end) is the range of
	 * offsets covered by fragment i.
	 */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
		if ((copy = end - offset) > 0) {
			u8 *vaddr;

			if (copy > len)
				copy = len;

			/* The fragment page is mapped only for the copy. */
			vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
			memcpy(to,
			       vaddr + skb_shinfo(skb)->frags[i].page_offset+
			       offset - start, copy);
			kunmap_skb_frag(vaddr);

			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to     += copy;
		}
		start = end;
	}

	/* Copy from the frag_list members by recursing with an offset
	 * relative to each member.
	 */
	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_bits(frag_iter, offset - start, to, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			to     += copy;
		}
		start = end;
	}

	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_bits);
1492 */ 1493 static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) 1494 { 1495 put_page(spd->pages[i]); 1496 } 1497 1498 static inline struct page *linear_to_page(struct page *page, unsigned int *len, 1499 unsigned int *offset, 1500 struct sk_buff *skb, struct sock *sk) 1501 { 1502 struct page *p = sk->sk_sndmsg_page; 1503 unsigned int off; 1504 1505 if (!p) { 1506 new_page: 1507 p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0); 1508 if (!p) 1509 return NULL; 1510 1511 off = sk->sk_sndmsg_off = 0; 1512 /* hold one ref to this page until it's full */ 1513 } else { 1514 unsigned int mlen; 1515 1516 off = sk->sk_sndmsg_off; 1517 mlen = PAGE_SIZE - off; 1518 if (mlen < 64 && mlen < *len) { 1519 put_page(p); 1520 goto new_page; 1521 } 1522 1523 *len = min_t(unsigned int, *len, mlen); 1524 } 1525 1526 memcpy(page_address(p) + off, page_address(page) + *offset, *len); 1527 sk->sk_sndmsg_off += *len; 1528 *offset = off; 1529 get_page(p); 1530 1531 return p; 1532 } 1533 1534 /* 1535 * Fill page/offset/length into spd, if it can hold more pages. 
1536 */ 1537 static inline int spd_fill_page(struct splice_pipe_desc *spd, 1538 struct pipe_inode_info *pipe, struct page *page, 1539 unsigned int *len, unsigned int offset, 1540 struct sk_buff *skb, int linear, 1541 struct sock *sk) 1542 { 1543 if (unlikely(spd->nr_pages == pipe->buffers)) 1544 return 1; 1545 1546 if (linear) { 1547 page = linear_to_page(page, len, &offset, skb, sk); 1548 if (!page) 1549 return 1; 1550 } else 1551 get_page(page); 1552 1553 spd->pages[spd->nr_pages] = page; 1554 spd->partial[spd->nr_pages].len = *len; 1555 spd->partial[spd->nr_pages].offset = offset; 1556 spd->nr_pages++; 1557 1558 return 0; 1559 } 1560 1561 static inline void __segment_seek(struct page **page, unsigned int *poff, 1562 unsigned int *plen, unsigned int off) 1563 { 1564 unsigned long n; 1565 1566 *poff += off; 1567 n = *poff / PAGE_SIZE; 1568 if (n) 1569 *page = nth_page(*page, n); 1570 1571 *poff = *poff % PAGE_SIZE; 1572 *plen -= off; 1573 } 1574 1575 static inline int __splice_segment(struct page *page, unsigned int poff, 1576 unsigned int plen, unsigned int *off, 1577 unsigned int *len, struct sk_buff *skb, 1578 struct splice_pipe_desc *spd, int linear, 1579 struct sock *sk, 1580 struct pipe_inode_info *pipe) 1581 { 1582 if (!*len) 1583 return 1; 1584 1585 /* skip this segment if already processed */ 1586 if (*off >= plen) { 1587 *off -= plen; 1588 return 0; 1589 } 1590 1591 /* ignore any bits we already processed */ 1592 if (*off) { 1593 __segment_seek(&page, &poff, &plen, *off); 1594 *off = 0; 1595 } 1596 1597 do { 1598 unsigned int flen = min(*len, plen); 1599 1600 /* the linear region may spread across several pages */ 1601 flen = min_t(unsigned int, flen, PAGE_SIZE - poff); 1602 1603 if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk)) 1604 return 1; 1605 1606 __segment_seek(&page, &poff, &plen, flen); 1607 *len -= flen; 1608 1609 } while (*len && plen); 1610 1611 return 0; 1612 } 1613 1614 /* 1615 * Map linear and fragment data from the skb to 
spd. It reports failure if the 1616 * pipe is full or if we already spliced the requested length. 1617 */ 1618 static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, 1619 unsigned int *offset, unsigned int *len, 1620 struct splice_pipe_desc *spd, struct sock *sk) 1621 { 1622 int seg; 1623 1624 /* 1625 * map the linear part 1626 */ 1627 if (__splice_segment(virt_to_page(skb->data), 1628 (unsigned long) skb->data & (PAGE_SIZE - 1), 1629 skb_headlen(skb), 1630 offset, len, skb, spd, 1, sk, pipe)) 1631 return 1; 1632 1633 /* 1634 * then map the fragments 1635 */ 1636 for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { 1637 const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; 1638 1639 if (__splice_segment(skb_frag_page(f), 1640 f->page_offset, skb_frag_size(f), 1641 offset, len, skb, spd, 0, sk, pipe)) 1642 return 1; 1643 } 1644 1645 return 0; 1646 } 1647 1648 /* 1649 * Map data from the skb to a pipe. Should handle both the linear part, 1650 * the fragments, and the frag list. It does NOT handle frag lists within 1651 * the frag list, if such a thing exists. We'd probably need to recurse to 1652 * handle that cleanly. 1653 */ 1654 int skb_splice_bits(struct sk_buff *skb, unsigned int offset, 1655 struct pipe_inode_info *pipe, unsigned int tlen, 1656 unsigned int flags) 1657 { 1658 struct partial_page partial[PIPE_DEF_BUFFERS]; 1659 struct page *pages[PIPE_DEF_BUFFERS]; 1660 struct splice_pipe_desc spd = { 1661 .pages = pages, 1662 .partial = partial, 1663 .flags = flags, 1664 .ops = &sock_pipe_buf_ops, 1665 .spd_release = sock_spd_release, 1666 }; 1667 struct sk_buff *frag_iter; 1668 struct sock *sk = skb->sk; 1669 int ret = 0; 1670 1671 if (splice_grow_spd(pipe, &spd)) 1672 return -ENOMEM; 1673 1674 /* 1675 * __skb_splice_bits() only fails if the output has no room left, 1676 * so no point in going over the frag_list for the error case. 
1677 */ 1678 if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) 1679 goto done; 1680 else if (!tlen) 1681 goto done; 1682 1683 /* 1684 * now see if we have a frag_list to map 1685 */ 1686 skb_walk_frags(skb, frag_iter) { 1687 if (!tlen) 1688 break; 1689 if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) 1690 break; 1691 } 1692 1693 done: 1694 if (spd.nr_pages) { 1695 /* 1696 * Drop the socket lock, otherwise we have reverse 1697 * locking dependencies between sk_lock and i_mutex 1698 * here as compared to sendfile(). We enter here 1699 * with the socket lock held, and splice_to_pipe() will 1700 * grab the pipe inode lock. For sendfile() emulation, 1701 * we call into ->sendpage() with the i_mutex lock held 1702 * and networking will grab the socket lock. 1703 */ 1704 release_sock(sk); 1705 ret = splice_to_pipe(pipe, &spd); 1706 lock_sock(sk); 1707 } 1708 1709 splice_shrink_spd(pipe, &spd); 1710 return ret; 1711 } 1712 1713 /** 1714 * skb_store_bits - store bits from kernel buffer to skb 1715 * @skb: destination buffer 1716 * @offset: offset in destination 1717 * @from: source buffer 1718 * @len: number of bytes to copy 1719 * 1720 * Copy the specified number of bytes from the source buffer to the 1721 * destination skb. This function handles all the messy bits of 1722 * traversing fragment lists and such. 
/**
 *	skb_store_bits - store bits from kernel buffer to skb
 *	@skb: destination buffer
 *	@offset: offset in destination
 *	@from: source buffer
 *	@len: number of bytes to copy
 *
 *	Copy the specified number of bytes from the source buffer to the
 *	destination skb.  This function handles all the messy bits of
 *	traversing fragment lists and such.
 *
 *	Returns 0 on success or -EFAULT if @offset + @len reaches beyond
 *	the data of the skb.
 */

int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
{
	/* Offset at which the region currently examined begins. */
	int start = skb_headlen(skb);
	struct sk_buff *frag_iter;
	int i, copy;

	if (offset > (int)skb->len - len)
		goto fault;

	/* Store into the linear area. */
	if ((copy = start - offset) > 0) {
		if (copy > len)
			copy = len;
		skb_copy_to_linear_data_offset(skb, offset, from, copy);
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		from += copy;
	}

	/* Store into the page fragments; [start, end) is the range of
	 * offsets covered by fragment i.
	 */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			u8 *vaddr;

			if (copy > len)
				copy = len;

			/* The fragment page is mapped only for the copy. */
			vaddr = kmap_skb_frag(frag);
			memcpy(vaddr + frag->page_offset + offset - start,
			       from, copy);
			kunmap_skb_frag(vaddr);

			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			from += copy;
		}
		start = end;
	}

	/* Store into the frag_list members by recursing with an offset
	 * relative to each member.
	 */
	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_store_bits(frag_iter, offset - start,
					   from, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
			from += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_store_bits);

/* Checksum skb data.
 *
 * Folds @len bytes starting at @offset into @csum and returns the
 * result. The linear area, the page fragments and the frag_list are
 * visited in that order; BUG()s if fewer than @len bytes are available.
 */

__wsum skb_checksum(const struct sk_buff *skb, int offset,
			  int len, __wsum csum)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;
	/* Bytes summed so far; passed to csum_block_add() as the block offset. */
	int pos = 0;

	/* Checksum header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		csum = csum_partial(skb->data + offset, copy, csum);
		if ((len -= copy) == 0)
			return csum;
		offset += copy;
		pos	= copy;
	}

	/* Checksum the page fragments, each as a block of its own. */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			u8 *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (copy > len)
				copy = len;
			vaddr = kmap_skb_frag(frag);
			csum2 = csum_partial(vaddr + frag->page_offset +
					     offset - start, copy, 0);
			kunmap_skb_frag(vaddr);
			csum = csum_block_add(csum, csum2, pos);
			if (!(len -= copy))
				return csum;
			offset += copy;
			pos    += copy;
		}
		start = end;
	}

	/* Checksum the frag_list members recursively. */
	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			if (copy > len)
				copy = len;
			csum2 = skb_checksum(frag_iter, offset - start,
					     copy, 0);
			csum = csum_block_add(csum, csum2, pos);
			if ((len -= copy) == 0)
				return csum;
			offset += copy;
			pos    += copy;
		}
		start = end;
	}
	BUG_ON(len);

	return csum;
}
EXPORT_SYMBOL(skb_checksum);

/* Both of above in one bottle.
 *
 * Copies @len bytes starting at @offset to @to while folding them into
 * @csum, and returns the resulting checksum. BUG()s if fewer than @len
 * bytes are available.
 */

__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
				    u8 *to, int len, __wsum csum)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;
	/* Bytes handled so far; passed to csum_block_add() as the block offset. */
	int pos = 0;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		csum = csum_partial_copy_nocheck(skb->data + offset, to,
						 copy, csum);
		if ((len -= copy) == 0)
			return csum;
		offset += copy;
		to     += copy;
		pos	= copy;
	}

	/* Copy and checksum the page fragments. */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
		if ((copy = end - offset) > 0) {
			__wsum csum2;
			u8 *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (copy > len)
				copy = len;
			vaddr = kmap_skb_frag(frag);
			csum2 = csum_partial_copy_nocheck(vaddr +
							  frag->page_offset +
							  offset - start, to,
							  copy, 0);
			kunmap_skb_frag(vaddr);
			csum = csum_block_add(csum, csum2, pos);
			if (!(len -= copy))
				return csum;
			offset += copy;
			to     += copy;
			pos    += copy;
		}
		start = end;
	}

	/* Copy and checksum the frag_list members recursively. */
	skb_walk_frags(skb, frag_iter) {
		__wsum csum2;
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			csum2 = skb_copy_and_csum_bits(frag_iter,
						       offset - start,
						       to, copy, 0);
			csum = csum_block_add(csum, csum2, pos);
			if ((len -= copy) == 0)
				return csum;
			offset += copy;
			to     += copy;
			pos    += copy;
		}
		start = end;
	}
	BUG_ON(len);
	return csum;
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);

/*
 * Copy the whole of @skb to @to. When the skb is CHECKSUM_PARTIAL the
 * bytes from the checksum start onwards are summed while being copied
 * and the folded result is written into the copy at csum_offset.
 */
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
	__wsum csum;
	long csstart;

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		csstart = skb_checksum_start_offset(skb);
	else
		csstart = skb_headlen(skb);

	/* The part copied without checksumming must be linear. */
	BUG_ON(csstart > skb_headlen(skb));

	skb_copy_from_linear_data(skb, to, csstart);

	csum = 0;
	if (csstart != skb->len)
		csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
					      skb->len - csstart, 0);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* Position of the checksum field inside the copy. */
		long csstuff = csstart + skb->csum_offset;

		*((__sum16 *)(to + csstuff)) = csum_fold(csum);
	}
}
EXPORT_SYMBOL(skb_copy_and_csum_dev);

/**
 *	skb_dequeue - remove from the head of the queue
 *	@list: list to dequeue from
 *
 *	Remove the head of the list. The list lock is taken so the function
 *	may be used safely with other locking list functions. The head item is
 *	returned or %NULL if the list is empty.
 */

struct sk_buff *skb_dequeue(struct sk_buff_head *list)
{
	unsigned long flags;
	struct sk_buff *result;

	/* __skb_dequeue() runs under the list lock with interrupts saved. */
	spin_lock_irqsave(&list->lock, flags);
	result = __skb_dequeue(list);
	spin_unlock_irqrestore(&list->lock, flags);
	return result;
}
EXPORT_SYMBOL(skb_dequeue);

/**
 *	skb_dequeue_tail - remove from the tail of the queue
 *	@list: list to dequeue from
 *
 *	Remove the tail of the list. The list lock is taken so the function
 *	may be used safely with other locking list functions. The tail item is
 *	returned or %NULL if the list is empty.
 */
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
{
	unsigned long flags;
	struct sk_buff *result;

	/* __skb_dequeue_tail() runs under the list lock with interrupts saved. */
	spin_lock_irqsave(&list->lock, flags);
	result = __skb_dequeue_tail(list);
	spin_unlock_irqrestore(&list->lock, flags);
	return result;
}
EXPORT_SYMBOL(skb_dequeue_tail);
2024 */ 2025 void skb_queue_purge(struct sk_buff_head *list) 2026 { 2027 struct sk_buff *skb; 2028 while ((skb = skb_dequeue(list)) != NULL) 2029 kfree_skb(skb); 2030 } 2031 EXPORT_SYMBOL(skb_queue_purge); 2032 2033 /** 2034 * skb_queue_head - queue a buffer at the list head 2035 * @list: list to use 2036 * @newsk: buffer to queue 2037 * 2038 * Queue a buffer at the start of the list. This function takes the 2039 * list lock and can be used safely with other locking &sk_buff functions 2040 * safely. 2041 * 2042 * A buffer cannot be placed on two lists at the same time. 2043 */ 2044 void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) 2045 { 2046 unsigned long flags; 2047 2048 spin_lock_irqsave(&list->lock, flags); 2049 __skb_queue_head(list, newsk); 2050 spin_unlock_irqrestore(&list->lock, flags); 2051 } 2052 EXPORT_SYMBOL(skb_queue_head); 2053 2054 /** 2055 * skb_queue_tail - queue a buffer at the list tail 2056 * @list: list to use 2057 * @newsk: buffer to queue 2058 * 2059 * Queue a buffer at the tail of the list. This function takes the 2060 * list lock and can be used safely with other locking &sk_buff functions 2061 * safely. 2062 * 2063 * A buffer cannot be placed on two lists at the same time. 2064 */ 2065 void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) 2066 { 2067 unsigned long flags; 2068 2069 spin_lock_irqsave(&list->lock, flags); 2070 __skb_queue_tail(list, newsk); 2071 spin_unlock_irqrestore(&list->lock, flags); 2072 } 2073 EXPORT_SYMBOL(skb_queue_tail); 2074 2075 /** 2076 * skb_unlink - remove a buffer from a list 2077 * @skb: buffer to remove 2078 * @list: list to use 2079 * 2080 * Remove a packet from a list. The list locks are taken and this 2081 * function is atomic with respect to other list locked calls 2082 * 2083 * You must know what list the SKB is on. 
/**
 *	skb_unlink	-	remove a buffer from a list
 *	@skb: buffer to remove
 *	@list: list to use
 *
 *	Remove a packet from a list. The list locks are taken and this
 *	function is atomic with respect to other list locked calls
 *
 *	You must know what list the SKB is on.
 */
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
	unsigned long flags;

	spin_lock_irqsave(&list->lock, flags);
	__skb_unlink(skb, list);
	spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_unlink);

/**
 *	skb_append	-	append a buffer
 *	@old: buffer to insert after
 *	@newsk: buffer to insert
 *	@list: list to use
 *
 *	Place a packet after a given packet in a list. The list locks are taken
 *	and this function is atomic with respect to other list locked calls.
 *	A buffer cannot be placed on two lists at the same time.
 */
void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
{
	unsigned long flags;

	spin_lock_irqsave(&list->lock, flags);
	__skb_queue_after(list, old, newsk);
	spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_append);

/**
 *	skb_insert	-	insert a buffer
 *	@old: buffer to insert before
 *	@newsk: buffer to insert
 *	@list: list to use
 *
 *	Place a packet before a given packet in a list. The list locks are
 * 	taken and this function is atomic with respect to other list locked
 *	calls.
 *
 *	A buffer cannot be placed on two lists at the same time.
 */
void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
{
	unsigned long flags;

	spin_lock_irqsave(&list->lock, flags);
	/* Link @newsk between @old's predecessor and @old. */
	__skb_insert(newsk, old->prev, old, list);
	spin_unlock_irqrestore(&list->lock, flags);
}
EXPORT_SYMBOL(skb_insert);

/*
 * Split point @len lies inside the linear area (which is @pos bytes
 * long): the linear bytes behind @len are copied to the tail of @skb1
 * and all page fragments move over to @skb1 unchanged.
 */
static inline void skb_split_inside_header(struct sk_buff *skb,
					   struct sk_buff* skb1,
					   const u32 len, const int pos)
{
	int i;

	skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
					 pos - len);
	/* And move data appendix as is. */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
		skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];

	skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
	skb_shinfo(skb)->nr_frags  = 0;
	skb1->data_len		   = skb->data_len;
	skb1->len		   += skb1->data_len;
	skb->data_len		   = 0;
	skb->len		   = len;
	/* @skb keeps only the first @len linear bytes. */
	skb_set_tail_pointer(skb, len);
}

/*
 * Split point @len lies at or behind the end of the linear area (@pos
 * is the linear length on entry): nothing is copied, the page fragments
 * are distributed between @skb and @skb1, and the fragment holding the
 * split point is shared by both.
 */
static inline void skb_split_no_header(struct sk_buff *skb,
				       struct sk_buff* skb1,
				       const u32 len, int pos)
{
	int i, k = 0;
	const int nfrags = skb_shinfo(skb)->nr_frags;

	/* nr_frags of @skb is rebuilt while walking the old fragments. */
	skb_shinfo(skb)->nr_frags  = 0;
	skb1->len		   = skb1->data_len = skb->len - len;
	skb->len		   = len;
	skb->data_len		   = len - pos;

	for (i = 0; i < nfrags; i++) {
		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

		if (pos + size > len) {
			/* Fragment reaches past the split point: it goes,
			 * wholly or partly, to @skb1.
			 */
			skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];

			if (pos < len) {
				/* Split frag.
				 * We have two variants in this case:
				 * 1. Move all the frag to the second
				 *    part, if it is possible. F.e.
				 *    this approach is mandatory for TUX,
				 *    where splitting is expensive.
				 * 2. Split is accurately. We make this.
				 */
				/* Both skbs now reference the page. */
				skb_frag_ref(skb, i);
				skb_shinfo(skb1)->frags[0].page_offset += len - pos;
				skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
				skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
				skb_shinfo(skb)->nr_frags++;
			}
			k++;
		} else
			/* Fragment ends before the split point: stays in @skb. */
			skb_shinfo(skb)->nr_frags++;
		pos += size;
	}
	skb_shinfo(skb1)->nr_frags = k;
}

/**
 * skb_split - Split fragmented skb to two parts at length len.
 * @skb: the buffer to split
 * @skb1: the buffer to receive the second part
 * @len: new length for skb
 */
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
	/* Length of the linear area decides which helper applies. */
	int pos = skb_headlen(skb);

	if (len < pos)	/* Split line is inside header. */
		skb_split_inside_header(skb, skb1, len, pos);
	else		/* Second chunk has no header, nothing to copy. */
		skb_split_no_header(skb, skb1, len, pos);
}
EXPORT_SYMBOL(skb_split);

/* Shifting from/to a cloned skb is a no-go.
 *
 * Caller cannot keep skb_shinfo related pointers past calling here!
 *
 * Returns non-zero only if @skb is cloned and pskb_expand_head() failed.
 */
static int skb_prepare_for_shift(struct sk_buff *skb)
{
	return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
}
/**
 * skb_shift - Shifts paged data partially from skb to another
 * @tgt: buffer into which tail data gets added
 * @skb: buffer from which the paged data comes from
 * @shiftlen: shift up to this many bytes
 *
 * Attempts to shift up to shiftlen worth of bytes, which may be less than
 * the length of the skb, from skb to tgt. Returns the number of bytes
 * shifted: @shiftlen on success, 0 if nothing was moved.
 * It's up to caller to free skb if everything was shifted.
 *
 * If @tgt runs out of frags, the whole operation is aborted.
 *
 * Skb cannot include anything else but paged data while tgt is allowed
 * to have non-paged data as well.
 *
 * TODO: full sized shift could be optimized but that would need
 * specialized skb free'er to handle frags without up-to-date nr_frags.
 */
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
	/* from/to: next fragment index in skb/tgt; merge: index of the tgt
	 * fragment the first skb fragment is coalesced into, or -1;
	 * todo: bytes still to be moved.
	 */
	int from, to, merge, todo;
	struct skb_frag_struct *fragfrom, *fragto;

	BUG_ON(shiftlen > skb->len);
	BUG_ON(skb_headlen(skb));	/* Would corrupt stream */

	todo = shiftlen;
	from = 0;
	to = skb_shinfo(tgt)->nr_frags;
	fragfrom = &skb_shinfo(skb)->frags[from];

	/* Actual merge is delayed until the point when we know we can
	 * commit all, so that we don't have to undo partial changes
	 */
	if (!to ||
	    !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
			      fragfrom->page_offset)) {
		merge = -1;
	} else {
		merge = to - 1;

		todo -= skb_frag_size(fragfrom);
		if (todo < 0) {
			/* The request ends inside the first fragment, which
			 * can be coalesced: only fragment sizes change.
			 */
			if (skb_prepare_for_shift(skb) ||
			    skb_prepare_for_shift(tgt))
				return 0;

			/* All previous frag pointers might be stale! */
			fragfrom = &skb_shinfo(skb)->frags[from];
			fragto = &skb_shinfo(tgt)->frags[merge];

			skb_frag_size_add(fragto, shiftlen);
			skb_frag_size_sub(fragfrom, shiftlen);
			fragfrom->page_offset += shiftlen;

			goto onlymerged;
		}

		from++;
	}

	/* Skip full, not-fitting skb to avoid expensive operations */
	if ((shiftlen == skb->len) &&
	    (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
		return 0;

	if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
		return 0;

	while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
		/* tgt is out of fragment slots: abort, nr_frags of tgt has
		 * not been updated yet.
		 */
		if (to == MAX_SKB_FRAGS)
			return 0;

		fragfrom = &skb_shinfo(skb)->frags[from];
		fragto = &skb_shinfo(tgt)->frags[to];

		if (todo >= skb_frag_size(fragfrom)) {
			/* Whole fragment moves over. */
			*fragto = *fragfrom;
			todo -= skb_frag_size(fragfrom);
			from++;
			to++;

		} else {
			/* Fragment is split: both skbs reference the page. */
			__skb_frag_ref(fragfrom);
			fragto->page = fragfrom->page;
			fragto->page_offset = fragfrom->page_offset;
			skb_frag_size_set(fragto, todo);

			fragfrom->page_offset += todo;
			skb_frag_size_sub(fragfrom, todo);
			todo = 0;

			to++;
			break;
		}
	}

	/* Ready to "commit" this state change to tgt */
	skb_shinfo(tgt)->nr_frags = to;

	if (merge >= 0) {
		/* Perform the delayed merge of the first skb fragment. */
		fragfrom = &skb_shinfo(skb)->frags[0];
		fragto = &skb_shinfo(tgt)->frags[merge];

		skb_frag_size_add(fragto, skb_frag_size(fragfrom));
		__skb_frag_unref(fragfrom);
	}

	/* Reposition in the original skb */
	to = 0;
	while (from < skb_shinfo(skb)->nr_frags)
		skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
	skb_shinfo(skb)->nr_frags = to;

	BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);

onlymerged:
	/* Most likely the tgt won't ever need its checksum anymore, skb on
	 * the other hand might need it if it needs to be resent
	 */
	tgt->ip_summed = CHECKSUM_PARTIAL;
	skb->ip_summed = CHECKSUM_PARTIAL;

	/* Yak, is it really working this way? Some helper please? */
	skb->len -= shiftlen;
	skb->data_len -= shiftlen;
	skb->truesize -= shiftlen;
	tgt->len += shiftlen;
	tgt->data_len += shiftlen;
	tgt->truesize += shiftlen;

	return shiftlen;
}
2368 */ 2369 void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, 2370 unsigned int to, struct skb_seq_state *st) 2371 { 2372 st->lower_offset = from; 2373 st->upper_offset = to; 2374 st->root_skb = st->cur_skb = skb; 2375 st->frag_idx = st->stepped_offset = 0; 2376 st->frag_data = NULL; 2377 } 2378 EXPORT_SYMBOL(skb_prepare_seq_read); 2379 2380 /** 2381 * skb_seq_read - Sequentially read skb data 2382 * @consumed: number of bytes consumed by the caller so far 2383 * @data: destination pointer for data to be returned 2384 * @st: state variable 2385 * 2386 * Reads a block of skb data at &consumed relative to the 2387 * lower offset specified to skb_prepare_seq_read(). Assigns 2388 * the head of the data block to &data and returns the length 2389 * of the block or 0 if the end of the skb data or the upper 2390 * offset has been reached. 2391 * 2392 * The caller is not required to consume all of the data 2393 * returned, i.e. &consumed is typically set to the number 2394 * of bytes already consumed and the next call to 2395 * skb_seq_read() will return the remaining part of the block. 2396 * 2397 * Note 1: The size of each block of data returned can be arbitrary, 2398 * this limitation is the cost for zerocopy seqeuental 2399 * reads of potentially non linear data. 2400 * 2401 * Note 2: Fragment lists within fragments are not implemented 2402 * at the moment, state->root_skb could be replaced with 2403 * a stack for this purpose. 
2404 */ 2405 unsigned int skb_seq_read(unsigned int consumed, const u8 **data, 2406 struct skb_seq_state *st) 2407 { 2408 unsigned int block_limit, abs_offset = consumed + st->lower_offset; 2409 skb_frag_t *frag; 2410 2411 if (unlikely(abs_offset >= st->upper_offset)) 2412 return 0; 2413 2414 next_skb: 2415 block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; 2416 2417 if (abs_offset < block_limit && !st->frag_data) { 2418 *data = st->cur_skb->data + (abs_offset - st->stepped_offset); 2419 return block_limit - abs_offset; 2420 } 2421 2422 if (st->frag_idx == 0 && !st->frag_data) 2423 st->stepped_offset += skb_headlen(st->cur_skb); 2424 2425 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { 2426 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; 2427 block_limit = skb_frag_size(frag) + st->stepped_offset; 2428 2429 if (abs_offset < block_limit) { 2430 if (!st->frag_data) 2431 st->frag_data = kmap_skb_frag(frag); 2432 2433 *data = (u8 *) st->frag_data + frag->page_offset + 2434 (abs_offset - st->stepped_offset); 2435 2436 return block_limit - abs_offset; 2437 } 2438 2439 if (st->frag_data) { 2440 kunmap_skb_frag(st->frag_data); 2441 st->frag_data = NULL; 2442 } 2443 2444 st->frag_idx++; 2445 st->stepped_offset += skb_frag_size(frag); 2446 } 2447 2448 if (st->frag_data) { 2449 kunmap_skb_frag(st->frag_data); 2450 st->frag_data = NULL; 2451 } 2452 2453 if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { 2454 st->cur_skb = skb_shinfo(st->root_skb)->frag_list; 2455 st->frag_idx = 0; 2456 goto next_skb; 2457 } else if (st->cur_skb->next) { 2458 st->cur_skb = st->cur_skb->next; 2459 st->frag_idx = 0; 2460 goto next_skb; 2461 } 2462 2463 return 0; 2464 } 2465 EXPORT_SYMBOL(skb_seq_read); 2466 2467 /** 2468 * skb_abort_seq_read - Abort a sequential read of skb data 2469 * @st: state variable 2470 * 2471 * Must be called if skb_seq_read() was not called until it 2472 * returned 0. 
2473 */ 2474 void skb_abort_seq_read(struct skb_seq_state *st) 2475 { 2476 if (st->frag_data) 2477 kunmap_skb_frag(st->frag_data); 2478 } 2479 EXPORT_SYMBOL(skb_abort_seq_read); 2480 2481 #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) 2482 2483 static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, 2484 struct ts_config *conf, 2485 struct ts_state *state) 2486 { 2487 return skb_seq_read(offset, text, TS_SKB_CB(state)); 2488 } 2489 2490 static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) 2491 { 2492 skb_abort_seq_read(TS_SKB_CB(state)); 2493 } 2494 2495 /** 2496 * skb_find_text - Find a text pattern in skb data 2497 * @skb: the buffer to look in 2498 * @from: search offset 2499 * @to: search limit 2500 * @config: textsearch configuration 2501 * @state: uninitialized textsearch state variable 2502 * 2503 * Finds a pattern in the skb data according to the specified 2504 * textsearch configuration. Use textsearch_next() to retrieve 2505 * subsequent occurrences of the pattern. Returns the offset 2506 * to the first occurrence or UINT_MAX if no match was found. 2507 */ 2508 unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, 2509 unsigned int to, struct ts_config *config, 2510 struct ts_state *state) 2511 { 2512 unsigned int ret; 2513 2514 config->get_next_block = skb_ts_get_next_block; 2515 config->finish = skb_ts_finish; 2516 2517 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); 2518 2519 ret = textsearch_find(config, state); 2520 return (ret <= to - from ? ret : UINT_MAX); 2521 } 2522 EXPORT_SYMBOL(skb_find_text); 2523 2524 /** 2525 * skb_append_datato_frags: - append the user data to a skb 2526 * @sk: sock structure 2527 * @skb: skb structure to be appened with user data. 
2528 * @getfrag: call back function to be used for getting the user data 2529 * @from: pointer to user message iov 2530 * @length: length of the iov message 2531 * 2532 * Description: This procedure append the user data in the fragment part 2533 * of the skb if any page alloc fails user this procedure returns -ENOMEM 2534 */ 2535 int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, 2536 int (*getfrag)(void *from, char *to, int offset, 2537 int len, int odd, struct sk_buff *skb), 2538 void *from, int length) 2539 { 2540 int frg_cnt = 0; 2541 skb_frag_t *frag = NULL; 2542 struct page *page = NULL; 2543 int copy, left; 2544 int offset = 0; 2545 int ret; 2546 2547 do { 2548 /* Return error if we don't have space for new frag */ 2549 frg_cnt = skb_shinfo(skb)->nr_frags; 2550 if (frg_cnt >= MAX_SKB_FRAGS) 2551 return -EFAULT; 2552 2553 /* allocate a new page for next frag */ 2554 page = alloc_pages(sk->sk_allocation, 0); 2555 2556 /* If alloc_page fails just return failure and caller will 2557 * free previous allocated pages by doing kfree_skb() 2558 */ 2559 if (page == NULL) 2560 return -ENOMEM; 2561 2562 /* initialize the next frag */ 2563 skb_fill_page_desc(skb, frg_cnt, page, 0, 0); 2564 skb->truesize += PAGE_SIZE; 2565 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); 2566 2567 /* get the new initialized frag */ 2568 frg_cnt = skb_shinfo(skb)->nr_frags; 2569 frag = &skb_shinfo(skb)->frags[frg_cnt - 1]; 2570 2571 /* copy the user data to page */ 2572 left = PAGE_SIZE - frag->page_offset; 2573 copy = (length > left)? 
left : length; 2574 2575 ret = getfrag(from, skb_frag_address(frag) + skb_frag_size(frag), 2576 offset, copy, 0, skb); 2577 if (ret < 0) 2578 return -EFAULT; 2579 2580 /* copy was successful so update the size parameters */ 2581 skb_frag_size_add(frag, copy); 2582 skb->len += copy; 2583 skb->data_len += copy; 2584 offset += copy; 2585 length -= copy; 2586 2587 } while (length > 0); 2588 2589 return 0; 2590 } 2591 EXPORT_SYMBOL(skb_append_datato_frags); 2592 2593 /** 2594 * skb_pull_rcsum - pull skb and update receive checksum 2595 * @skb: buffer to update 2596 * @len: length of data pulled 2597 * 2598 * This function performs an skb_pull on the packet and updates 2599 * the CHECKSUM_COMPLETE checksum. It should be used on 2600 * receive path processing instead of skb_pull unless you know 2601 * that the checksum difference is zero (e.g., a valid IP header) 2602 * or you are setting ip_summed to CHECKSUM_NONE. 2603 */ 2604 unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) 2605 { 2606 BUG_ON(len > skb->len); 2607 skb->len -= len; 2608 BUG_ON(skb->len < skb->data_len); 2609 skb_postpull_rcsum(skb, skb->data, len); 2610 return skb->data += len; 2611 } 2612 EXPORT_SYMBOL_GPL(skb_pull_rcsum); 2613 2614 /** 2615 * skb_segment - Perform protocol segmentation on skb. 2616 * @skb: buffer to segment 2617 * @features: features for the output path (see dev->features) 2618 * 2619 * This function performs segmentation on the given skb. It returns 2620 * a pointer to the first in a list of new skbs for the segments. 2621 * In case of error it returns ERR_PTR(err). 
2622 */ 2623 struct sk_buff *skb_segment(struct sk_buff *skb, u32 features) 2624 { 2625 struct sk_buff *segs = NULL; 2626 struct sk_buff *tail = NULL; 2627 struct sk_buff *fskb = skb_shinfo(skb)->frag_list; 2628 unsigned int mss = skb_shinfo(skb)->gso_size; 2629 unsigned int doffset = skb->data - skb_mac_header(skb); 2630 unsigned int offset = doffset; 2631 unsigned int headroom; 2632 unsigned int len; 2633 int sg = !!(features & NETIF_F_SG); 2634 int nfrags = skb_shinfo(skb)->nr_frags; 2635 int err = -ENOMEM; 2636 int i = 0; 2637 int pos; 2638 2639 __skb_push(skb, doffset); 2640 headroom = skb_headroom(skb); 2641 pos = skb_headlen(skb); 2642 2643 do { 2644 struct sk_buff *nskb; 2645 skb_frag_t *frag; 2646 int hsize; 2647 int size; 2648 2649 len = skb->len - offset; 2650 if (len > mss) 2651 len = mss; 2652 2653 hsize = skb_headlen(skb) - offset; 2654 if (hsize < 0) 2655 hsize = 0; 2656 if (hsize > len || !sg) 2657 hsize = len; 2658 2659 if (!hsize && i >= nfrags) { 2660 BUG_ON(fskb->len != len); 2661 2662 pos += len; 2663 nskb = skb_clone(fskb, GFP_ATOMIC); 2664 fskb = fskb->next; 2665 2666 if (unlikely(!nskb)) 2667 goto err; 2668 2669 hsize = skb_end_pointer(nskb) - nskb->head; 2670 if (skb_cow_head(nskb, doffset + headroom)) { 2671 kfree_skb(nskb); 2672 goto err; 2673 } 2674 2675 nskb->truesize += skb_end_pointer(nskb) - nskb->head - 2676 hsize; 2677 skb_release_head_state(nskb); 2678 __skb_push(nskb, doffset); 2679 } else { 2680 nskb = alloc_skb(hsize + doffset + headroom, 2681 GFP_ATOMIC); 2682 2683 if (unlikely(!nskb)) 2684 goto err; 2685 2686 skb_reserve(nskb, headroom); 2687 __skb_put(nskb, doffset); 2688 } 2689 2690 if (segs) 2691 tail->next = nskb; 2692 else 2693 segs = nskb; 2694 tail = nskb; 2695 2696 __copy_skb_header(nskb, skb); 2697 nskb->mac_len = skb->mac_len; 2698 2699 /* nskb and skb might have different headroom */ 2700 if (nskb->ip_summed == CHECKSUM_PARTIAL) 2701 nskb->csum_start += skb_headroom(nskb) - headroom; 2702 2703 
skb_reset_mac_header(nskb); 2704 skb_set_network_header(nskb, skb->mac_len); 2705 nskb->transport_header = (nskb->network_header + 2706 skb_network_header_len(skb)); 2707 skb_copy_from_linear_data(skb, nskb->data, doffset); 2708 2709 if (fskb != skb_shinfo(skb)->frag_list) 2710 continue; 2711 2712 if (!sg) { 2713 nskb->ip_summed = CHECKSUM_NONE; 2714 nskb->csum = skb_copy_and_csum_bits(skb, offset, 2715 skb_put(nskb, len), 2716 len, 0); 2717 continue; 2718 } 2719 2720 frag = skb_shinfo(nskb)->frags; 2721 2722 skb_copy_from_linear_data_offset(skb, offset, 2723 skb_put(nskb, hsize), hsize); 2724 2725 while (pos < offset + len && i < nfrags) { 2726 *frag = skb_shinfo(skb)->frags[i]; 2727 __skb_frag_ref(frag); 2728 size = skb_frag_size(frag); 2729 2730 if (pos < offset) { 2731 frag->page_offset += offset - pos; 2732 skb_frag_size_sub(frag, offset - pos); 2733 } 2734 2735 skb_shinfo(nskb)->nr_frags++; 2736 2737 if (pos + size <= offset + len) { 2738 i++; 2739 pos += size; 2740 } else { 2741 skb_frag_size_sub(frag, pos + size - (offset + len)); 2742 goto skip_fraglist; 2743 } 2744 2745 frag++; 2746 } 2747 2748 if (pos < offset + len) { 2749 struct sk_buff *fskb2 = fskb; 2750 2751 BUG_ON(pos + fskb->len != offset + len); 2752 2753 pos += fskb->len; 2754 fskb = fskb->next; 2755 2756 if (fskb2->next) { 2757 fskb2 = skb_clone(fskb2, GFP_ATOMIC); 2758 if (!fskb2) 2759 goto err; 2760 } else 2761 skb_get(fskb2); 2762 2763 SKB_FRAG_ASSERT(nskb); 2764 skb_shinfo(nskb)->frag_list = fskb2; 2765 } 2766 2767 skip_fraglist: 2768 nskb->data_len = len - hsize; 2769 nskb->len += nskb->data_len; 2770 nskb->truesize += nskb->data_len; 2771 } while ((offset += len) < skb->len); 2772 2773 return segs; 2774 2775 err: 2776 while ((skb = segs)) { 2777 segs = skb->next; 2778 kfree_skb(skb); 2779 } 2780 return ERR_PTR(err); 2781 } 2782 EXPORT_SYMBOL_GPL(skb_segment); 2783 2784 int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) 2785 { 2786 struct sk_buff *p = *head; 2787 struct 
sk_buff *nskb; 2788 struct skb_shared_info *skbinfo = skb_shinfo(skb); 2789 struct skb_shared_info *pinfo = skb_shinfo(p); 2790 unsigned int headroom; 2791 unsigned int len = skb_gro_len(skb); 2792 unsigned int offset = skb_gro_offset(skb); 2793 unsigned int headlen = skb_headlen(skb); 2794 2795 if (p->len + len >= 65536) 2796 return -E2BIG; 2797 2798 if (pinfo->frag_list) 2799 goto merge; 2800 else if (headlen <= offset) { 2801 skb_frag_t *frag; 2802 skb_frag_t *frag2; 2803 int i = skbinfo->nr_frags; 2804 int nr_frags = pinfo->nr_frags + i; 2805 2806 offset -= headlen; 2807 2808 if (nr_frags > MAX_SKB_FRAGS) 2809 return -E2BIG; 2810 2811 pinfo->nr_frags = nr_frags; 2812 skbinfo->nr_frags = 0; 2813 2814 frag = pinfo->frags + nr_frags; 2815 frag2 = skbinfo->frags + i; 2816 do { 2817 *--frag = *--frag2; 2818 } while (--i); 2819 2820 frag->page_offset += offset; 2821 skb_frag_size_sub(frag, offset); 2822 2823 skb->truesize -= skb->data_len; 2824 skb->len -= skb->data_len; 2825 skb->data_len = 0; 2826 2827 NAPI_GRO_CB(skb)->free = 1; 2828 goto done; 2829 } else if (skb_gro_len(p) != pinfo->gso_size) 2830 return -E2BIG; 2831 2832 headroom = skb_headroom(p); 2833 nskb = alloc_skb(headroom + skb_gro_offset(p), GFP_ATOMIC); 2834 if (unlikely(!nskb)) 2835 return -ENOMEM; 2836 2837 __copy_skb_header(nskb, p); 2838 nskb->mac_len = p->mac_len; 2839 2840 skb_reserve(nskb, headroom); 2841 __skb_put(nskb, skb_gro_offset(p)); 2842 2843 skb_set_mac_header(nskb, skb_mac_header(p) - p->data); 2844 skb_set_network_header(nskb, skb_network_offset(p)); 2845 skb_set_transport_header(nskb, skb_transport_offset(p)); 2846 2847 __skb_pull(p, skb_gro_offset(p)); 2848 memcpy(skb_mac_header(nskb), skb_mac_header(p), 2849 p->data - skb_mac_header(p)); 2850 2851 *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); 2852 skb_shinfo(nskb)->frag_list = p; 2853 skb_shinfo(nskb)->gso_size = pinfo->gso_size; 2854 pinfo->gso_size = 0; 2855 skb_header_release(p); 2856 nskb->prev = p; 2857 2858 nskb->data_len += p->len; 
2859 nskb->truesize += p->len; 2860 nskb->len += p->len; 2861 2862 *head = nskb; 2863 nskb->next = p->next; 2864 p->next = NULL; 2865 2866 p = nskb; 2867 2868 merge: 2869 if (offset > headlen) { 2870 unsigned int eat = offset - headlen; 2871 2872 skbinfo->frags[0].page_offset += eat; 2873 skb_frag_size_sub(&skbinfo->frags[0], eat); 2874 skb->data_len -= eat; 2875 skb->len -= eat; 2876 offset = headlen; 2877 } 2878 2879 __skb_pull(skb, offset); 2880 2881 p->prev->next = skb; 2882 p->prev = skb; 2883 skb_header_release(skb); 2884 2885 done: 2886 NAPI_GRO_CB(p)->count++; 2887 p->data_len += len; 2888 p->truesize += len; 2889 p->len += len; 2890 2891 NAPI_GRO_CB(skb)->same_flow = 1; 2892 return 0; 2893 } 2894 EXPORT_SYMBOL_GPL(skb_gro_receive); 2895 2896 void __init skb_init(void) 2897 { 2898 skbuff_head_cache = kmem_cache_create("skbuff_head_cache", 2899 sizeof(struct sk_buff), 2900 0, 2901 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 2902 NULL); 2903 skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", 2904 (2*sizeof(struct sk_buff)) + 2905 sizeof(atomic_t), 2906 0, 2907 SLAB_HWCACHE_ALIGN|SLAB_PANIC, 2908 NULL); 2909 } 2910 2911 /** 2912 * skb_to_sgvec - Fill a scatter-gather list from a socket buffer 2913 * @skb: Socket buffer containing the buffers to be mapped 2914 * @sg: The scatter-gather list to map into 2915 * @offset: The offset into the buffer's contents to start mapping 2916 * @len: Length of buffer space to be mapped 2917 * 2918 * Fill the specified scatter-gather list with mappings/pointers into a 2919 * region of the buffer space attached to a socket buffer. 
2920 */ 2921 static int 2922 __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) 2923 { 2924 int start = skb_headlen(skb); 2925 int i, copy = start - offset; 2926 struct sk_buff *frag_iter; 2927 int elt = 0; 2928 2929 if (copy > 0) { 2930 if (copy > len) 2931 copy = len; 2932 sg_set_buf(sg, skb->data + offset, copy); 2933 elt++; 2934 if ((len -= copy) == 0) 2935 return elt; 2936 offset += copy; 2937 } 2938 2939 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2940 int end; 2941 2942 WARN_ON(start > offset + len); 2943 2944 end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); 2945 if ((copy = end - offset) > 0) { 2946 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2947 2948 if (copy > len) 2949 copy = len; 2950 sg_set_page(&sg[elt], skb_frag_page(frag), copy, 2951 frag->page_offset+offset-start); 2952 elt++; 2953 if (!(len -= copy)) 2954 return elt; 2955 offset += copy; 2956 } 2957 start = end; 2958 } 2959 2960 skb_walk_frags(skb, frag_iter) { 2961 int end; 2962 2963 WARN_ON(start > offset + len); 2964 2965 end = start + frag_iter->len; 2966 if ((copy = end - offset) > 0) { 2967 if (copy > len) 2968 copy = len; 2969 elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start, 2970 copy); 2971 if ((len -= copy) == 0) 2972 return elt; 2973 offset += copy; 2974 } 2975 start = end; 2976 } 2977 BUG_ON(len); 2978 return elt; 2979 } 2980 2981 int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) 2982 { 2983 int nsg = __skb_to_sgvec(skb, sg, offset, len); 2984 2985 sg_mark_end(&sg[nsg - 1]); 2986 2987 return nsg; 2988 } 2989 EXPORT_SYMBOL_GPL(skb_to_sgvec); 2990 2991 /** 2992 * skb_cow_data - Check that a socket buffer's data buffers are writable 2993 * @skb: The socket buffer to check. 2994 * @tailbits: Amount of trailing space to be added 2995 * @trailer: Returned pointer to the skb where the @tailbits space begins 2996 * 2997 * Make sure that the data buffers attached to a socket buffer are 2998 * writable. 
If they are not, private copies are made of the data buffers 2999 * and the socket buffer is set to use these instead. 3000 * 3001 * If @tailbits is given, make sure that there is space to write @tailbits 3002 * bytes of data beyond current end of socket buffer. @trailer will be 3003 * set to point to the skb in which this space begins. 3004 * 3005 * The number of scatterlist elements required to completely map the 3006 * COW'd and extended socket buffer will be returned. 3007 */ 3008 int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) 3009 { 3010 int copyflag; 3011 int elt; 3012 struct sk_buff *skb1, **skb_p; 3013 3014 /* If skb is cloned or its head is paged, reallocate 3015 * head pulling out all the pages (pages are considered not writable 3016 * at the moment even if they are anonymous). 3017 */ 3018 if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && 3019 __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) 3020 return -ENOMEM; 3021 3022 /* Easy case. Most of packets will go this way. */ 3023 if (!skb_has_frag_list(skb)) { 3024 /* A little of trouble, not enough of space for trailer. 3025 * This should not happen, when stack is tuned to generate 3026 * good frames. OK, on miss we reallocate and reserve even more 3027 * space, 128 bytes is fair. */ 3028 3029 if (skb_tailroom(skb) < tailbits && 3030 pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) 3031 return -ENOMEM; 3032 3033 /* Voila! */ 3034 *trailer = skb; 3035 return 1; 3036 } 3037 3038 /* Misery. We are in troubles, going to mincer fragments... */ 3039 3040 elt = 1; 3041 skb_p = &skb_shinfo(skb)->frag_list; 3042 copyflag = 0; 3043 3044 while ((skb1 = *skb_p) != NULL) { 3045 int ntail = 0; 3046 3047 /* The fragment is partially pulled by someone, 3048 * this can happen on input. Copy it and everything 3049 * after it. */ 3050 3051 if (skb_shared(skb1)) 3052 copyflag = 1; 3053 3054 /* If the skb is the last, worry about trailer. 
*/ 3055 3056 if (skb1->next == NULL && tailbits) { 3057 if (skb_shinfo(skb1)->nr_frags || 3058 skb_has_frag_list(skb1) || 3059 skb_tailroom(skb1) < tailbits) 3060 ntail = tailbits + 128; 3061 } 3062 3063 if (copyflag || 3064 skb_cloned(skb1) || 3065 ntail || 3066 skb_shinfo(skb1)->nr_frags || 3067 skb_has_frag_list(skb1)) { 3068 struct sk_buff *skb2; 3069 3070 /* Fuck, we are miserable poor guys... */ 3071 if (ntail == 0) 3072 skb2 = skb_copy(skb1, GFP_ATOMIC); 3073 else 3074 skb2 = skb_copy_expand(skb1, 3075 skb_headroom(skb1), 3076 ntail, 3077 GFP_ATOMIC); 3078 if (unlikely(skb2 == NULL)) 3079 return -ENOMEM; 3080 3081 if (skb1->sk) 3082 skb_set_owner_w(skb2, skb1->sk); 3083 3084 /* Looking around. Are we still alive? 3085 * OK, link new skb, drop old one */ 3086 3087 skb2->next = skb1->next; 3088 *skb_p = skb2; 3089 kfree_skb(skb1); 3090 skb1 = skb2; 3091 } 3092 elt++; 3093 *trailer = skb1; 3094 skb_p = &skb1->next; 3095 } 3096 3097 return elt; 3098 } 3099 EXPORT_SYMBOL_GPL(skb_cow_data); 3100 3101 static void sock_rmem_free(struct sk_buff *skb) 3102 { 3103 struct sock *sk = skb->sk; 3104 3105 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 3106 } 3107 3108 /* 3109 * Note: We dont mem charge error packets (no sk_forward_alloc changes) 3110 */ 3111 int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) 3112 { 3113 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= 3114 (unsigned)sk->sk_rcvbuf) 3115 return -ENOMEM; 3116 3117 skb_orphan(skb); 3118 skb->sk = sk; 3119 skb->destructor = sock_rmem_free; 3120 atomic_add(skb->truesize, &sk->sk_rmem_alloc); 3121 3122 /* before exiting rcu section, make sure dst is refcounted */ 3123 skb_dst_force(skb); 3124 3125 skb_queue_tail(&sk->sk_error_queue, skb); 3126 if (!sock_flag(sk, SOCK_DEAD)) 3127 sk->sk_data_ready(sk, skb->len); 3128 return 0; 3129 } 3130 EXPORT_SYMBOL(sock_queue_err_skb); 3131 3132 void skb_tstamp_tx(struct sk_buff *orig_skb, 3133 struct skb_shared_hwtstamps *hwtstamps) 3134 { 3135 struct sock *sk 
= orig_skb->sk; 3136 struct sock_exterr_skb *serr; 3137 struct sk_buff *skb; 3138 int err; 3139 3140 if (!sk) 3141 return; 3142 3143 skb = skb_clone(orig_skb, GFP_ATOMIC); 3144 if (!skb) 3145 return; 3146 3147 if (hwtstamps) { 3148 *skb_hwtstamps(skb) = 3149 *hwtstamps; 3150 } else { 3151 /* 3152 * no hardware time stamps available, 3153 * so keep the shared tx_flags and only 3154 * store software time stamp 3155 */ 3156 skb->tstamp = ktime_get_real(); 3157 } 3158 3159 serr = SKB_EXT_ERR(skb); 3160 memset(serr, 0, sizeof(*serr)); 3161 serr->ee.ee_errno = ENOMSG; 3162 serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; 3163 3164 err = sock_queue_err_skb(sk, skb); 3165 3166 if (err) 3167 kfree_skb(skb); 3168 } 3169 EXPORT_SYMBOL_GPL(skb_tstamp_tx); 3170 3171 3172 /** 3173 * skb_partial_csum_set - set up and verify partial csum values for packet 3174 * @skb: the skb to set 3175 * @start: the number of bytes after skb->data to start checksumming. 3176 * @off: the offset from start to place the checksum. 3177 * 3178 * For untrusted partially-checksummed packets, we need to make sure the values 3179 * for skb->csum_start and skb->csum_offset are valid so we don't oops. 3180 * 3181 * This function checks and sets those values and skb->ip_summed: if this 3182 * returns false you should drop the packet. 
3183 */ 3184 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) 3185 { 3186 if (unlikely(start > skb_headlen(skb)) || 3187 unlikely((int)start + off > skb_headlen(skb) - 2)) { 3188 if (net_ratelimit()) 3189 printk(KERN_WARNING 3190 "bad partial csum: csum=%u/%u len=%u\n", 3191 start, off, skb_headlen(skb)); 3192 return false; 3193 } 3194 skb->ip_summed = CHECKSUM_PARTIAL; 3195 skb->csum_start = skb_headroom(skb) + start; 3196 skb->csum_offset = off; 3197 return true; 3198 } 3199 EXPORT_SYMBOL_GPL(skb_partial_csum_set); 3200 3201 void __skb_warn_lro_forwarding(const struct sk_buff *skb) 3202 { 3203 if (net_ratelimit()) 3204 pr_warning("%s: received packets cannot be forwarded" 3205 " while LRO is enabled\n", skb->dev->name); 3206 } 3207 EXPORT_SYMBOL(__skb_warn_lro_forwarding); 3208