// SPDX-License-Identifier: GPL-2.0
/*
 *	SUCS NET3:
 *
 *	Generic datagram handling routines. These are generic for all
 *	protocols. Possibly a generic IP version on top of these would
 *	make sense. Not tonight however 8-).
 *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
 *	NetROM layer all have identical poll code and mostly
 *	identical recvmsg() code. So we share it here. The poll was
 *	shared before but buried in udp.c so I moved it.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
 *						     udp.c code)
 *
 *	Fixes:
 *		Alan Cox	:	NULL return from skb_peek_copy()
 *					understood
 *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
 *					skb_peek_copy stuff.
 *		Alan Cox	:	Added support for SOCK_SEQPACKET.
 *					IPX can no longer use the SO_TYPE hack
 *					but AX.25 now works right, and SPX is
 *					feasible.
 *		Alan Cox	:	Fixed write poll of non IP protocol
 *					crash.
 *		Florian La Roche:	Changed for my new skbuff handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
 *		Linus Torvalds	:	BSD semantic fixes.
 *		Alan Cox	:	Datagram iovec handling
 *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
 *		Alan Cox	:	POSIXisms
 *		Pete Wyckoff	:	Unconnected accept() fix.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/iov_iter.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/crc32.h>

#include <net/protocol.h>
#include <linux/skbuff.h>

#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>

#include "devmem.h"

/*
 *	Is a socket 'connection oriented' ?
 */
static inline int connection_based(struct sock *sk)
{
	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}

static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
				  void *key)
{
	/*
	 * Avoid a wakeup if event not interesting for us
	 */
	if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR)))
		return 0;
	return autoremove_wake_function(wait, mode, sync, key);
}
/*
 * Wait for the last received packet to be different from skb
 */
int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
				int *err, long *timeo_p,
				const struct sk_buff *skb)
{
	int error;
	DEFINE_WAIT_FUNC(wait, receiver_wake_function);

	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

	/* Socket errors? */
	error = sock_error(sk);
	if (error)
		goto out_err;

	if (READ_ONCE(queue->prev) != skb)
		goto out;

	/* Socket shut down? */
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		goto out_noerr;

	/* Sequenced packets can come disconnected.
	 * If so we report the problem
	 */
	error = -ENOTCONN;
	if (connection_based(sk) &&
	    !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
		goto out_err;

	/* handle signals */
	if (signal_pending(current))
		goto interrupted;

	error = 0;
	*timeo_p = schedule_timeout(*timeo_p);
out:
	finish_wait(sk_sleep(sk), &wait);
	return error;
interrupted:
	error = sock_intr_errno(*timeo_p);
out_err:
	*err = error;
	goto out;
out_noerr:
	*err = 0;
	error = 1;
	goto out;
}
EXPORT_SYMBOL(__skb_wait_for_more_packets);

static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
{
	struct sk_buff *nskb;

	if (skb->peeked)
		return skb;

	/* We have to unshare an skb before modifying it. */
	if (!skb_shared(skb))
		goto done;

	nskb = skb_clone(skb, GFP_ATOMIC);
	if (!nskb)
		return ERR_PTR(-ENOMEM);

	skb->prev->next = nskb;
	skb->next->prev = nskb;
	nskb->prev = skb->prev;
	nskb->next = skb->next;

	consume_skb(skb);
	skb = nskb;

done:
	skb->peeked = 1;

	return skb;
}

struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue,
					  unsigned int flags,
					  int *off, int *err,
					  struct sk_buff **last)
{
	bool peek_at_off = false;
	struct sk_buff *skb;
	int _off = 0;

	if (unlikely(flags & MSG_PEEK && *off >= 0)) {
		peek_at_off = true;
		_off = *off;
	}

	*last = queue->prev;
	skb_queue_walk(queue, skb) {
		if (flags & MSG_PEEK) {
			if (peek_at_off && _off >= skb->len &&
			    (_off || skb->peeked)) {
				_off -= skb->len;
				continue;
			}
			if (!skb->len) {
				skb = skb_set_peeked(skb);
				if (IS_ERR(skb)) {
					*err = PTR_ERR(skb);
					return NULL;
				}
			}
			refcount_inc(&skb->users);
		} else {
			__skb_unlink(skb, queue);
		}
		*off = _off;
		return skb;
	}
	return NULL;
}

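/* Peek-offset sketch (illustrative only, not part of the kernel API
 * documentation): the *off bookkeeping above is what implements
 * SO_PEEK_OFF. A protocol that honours it would, roughly, feed the
 * socket's saved offset in and advance it after a successful peek
 * ("copied", "err" and "last" are the caller's locals here):
 *
 *	int off = sk_peek_offset(sk, flags);
 *
 *	skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
 *				      &off, &err, &last);
 *	if (skb && (flags & MSG_PEEK))
 *		sk_peek_offset_fwd(sk, copied);
 *
 * On return, off has been rewound to the position inside the returned
 * skb at which copying should start; a consuming (non-peek) read winds
 * the saved offset back with sk_peek_offset_bwd() instead.
 */
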
/**
 *	__skb_try_recv_datagram - Receive a datagram skbuff
 *	@sk: socket
 *	@queue: socket queue from which to receive
 *	@flags: MSG\_ flags
 *	@off: an offset in bytes to peek skb from. Returns an offset
 *	      within an skb where data actually starts
 *	@err: error code returned
 *	@last: set to last peeked message to inform the wait function
 *	       what to look for when peeking
 *
 *	Get a datagram skbuff, understands the peeking, nonblocking wakeups
 *	and possible races. This replaces identical code in packet, raw and
 *	udp, as well as IPX, AX.25 and AppleTalk. It also finally fixes
 *	the long standing peek and read race for datagram sockets. If you
 *	alter this routine remember it must be re-entrant.
 *
 *	This function does not take the socket lock; it only grabs the
 *	receive queue's spinlock while dequeueing. The returned skb is
 *	owned by the caller, who releases it with skb_free_datagram()
 *	when done. Returns NULL with @err set to -EAGAIN if no data was
 *	available or to some other value if an error was detected.
 *
 *	* It does not lock socket since today. This function is
 *	* free of race conditions. This measure should/can improve
 *	* significantly datagram socket latencies at high loads,
 *	* when data copying to user space takes lots of time.
 *	* (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 *	* 8) Great win.)
 *	*					--ANK (980729)
 *
 *	The order of the tests when we find no data waiting are specified
 *	quite explicitly by POSIX 1003.1g, don't change them without having
 *	the standard around please.
 */
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
					struct sk_buff_head *queue,
					unsigned int flags, int *off, int *err,
					struct sk_buff **last)
{
	struct sk_buff *skb;
	unsigned long cpu_flags;
	/*
	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
	 */
	int error = sock_error(sk);

	if (error)
		goto no_packet;

	do {
		/* Again only user level code calls this function, so nothing
		 * interrupt level will suddenly eat the receive_queue.
		 *
		 * Look at current nfs client by the way...
		 * However, this function was correct in any case. 8)
		 */
		spin_lock_irqsave(&queue->lock, cpu_flags);
		skb = __skb_try_recv_from_queue(queue, flags, off, &error,
						last);
		spin_unlock_irqrestore(&queue->lock, cpu_flags);
		if (error)
			goto no_packet;
		if (skb)
			return skb;

		if (!sk_can_busy_loop(sk))
			break;

		sk_busy_loop(sk, flags & MSG_DONTWAIT);
	} while (READ_ONCE(queue->prev) != *last);

	error = -EAGAIN;

no_packet:
	*err = error;
	return NULL;
}
EXPORT_SYMBOL(__skb_try_recv_datagram);

struct sk_buff *__skb_recv_datagram(struct sock *sk,
				    struct sk_buff_head *sk_queue,
				    unsigned int flags, int *off, int *err)
{
	struct sk_buff *skb, *last;
	long timeo;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err,
					      &last);
		if (skb)
			return skb;

		if (*err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, sk_queue, err,
					      &timeo, last));

	return NULL;
}
EXPORT_SYMBOL(__skb_recv_datagram);

struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
				  int *err)
{
	int off = 0;

	return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags,
				   &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);

void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	consume_skb(skb);
}
EXPORT_SYMBOL(skb_free_datagram);

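/* Usage sketch (illustrative only): a typical datagram recvmsg()
 * implementation pairs the helpers above roughly as follows, where
 * "msg" and "len" are the recvmsg() arguments:
 *
 *	struct sk_buff *skb;
 *	int err, copied;
 *
 *	skb = skb_recv_datagram(sk, flags, &err);
 *	if (!skb)
 *		return err;
 *
 *	copied = skb->len;
 *	if (copied > len) {
 *		copied = len;
 *		msg->msg_flags |= MSG_TRUNC;
 *	}
 *
 *	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 *	skb_free_datagram(sk, skb);
 *	return err ? err : copied;
 */
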
int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
			struct sk_buff *skb, unsigned int flags,
			void (*destructor)(struct sock *sk,
					   struct sk_buff *skb))
{
	int err = 0;

	if (flags & MSG_PEEK) {
		err = -ENOENT;
		spin_lock_bh(&sk_queue->lock);
		if (skb->next) {
			__skb_unlink(skb, sk_queue);
			refcount_dec(&skb->users);
			if (destructor)
				destructor(sk, skb);
			err = 0;
		}
		spin_unlock_bh(&sk_queue->lock);
	}

	sk_drops_inc(sk);
	return err;
}
EXPORT_SYMBOL(__sk_queue_drop_skb);

/**
 *	skb_kill_datagram - Free a datagram skbuff forcibly
 *	@sk: socket
 *	@skb: datagram skbuff
 *	@flags: MSG\_ flags
 *
 *	This function frees a datagram skbuff that was received by
 *	skb_recv_datagram. The flags argument must match the one
 *	used for skb_recv_datagram.
 *
 *	If the MSG_PEEK flag is set, and the packet is still on the
 *	receive queue of the socket, it will be taken off the queue
 *	before it is freed.
 *
 *	This function currently only disables BH when acquiring the
 *	sk_receive_queue lock. Therefore it must not be used in a
 *	context where that lock is acquired in an IRQ context.
 *
 *	It returns 0 if the packet was removed by us.
 */

int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
	int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags,
				      NULL);

	kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL(skb_kill_datagram);

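/* Usage sketch (illustrative only): the classic consumer of
 * skb_kill_datagram() is a recvmsg() path that notices a bad checksum
 * while copying a (possibly peeked) datagram and wants to drop it and
 * wait for the next one, roughly:
 *
 *	err = skb_copy_and_csum_datagram_msg(skb, off, msg);
 *	if (err == -EINVAL) {
 *		skb_kill_datagram(sk, skb, flags);
 *		goto try_again;
 *	}
 *
 * The return value (0 if the packet was still queued and has now been
 * removed) is typically only used for statistics.
 */
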
INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr,
						size_t bytes,
						void *data __always_unused,
						struct iov_iter *i));

static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
			       struct iov_iter *to, int len, bool fault_short,
			       size_t (*cb)(const void *, size_t, void *,
					    struct iov_iter *), void *data)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset, start_off = offset, n;
	struct sk_buff *frag_iter;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
				    skb->data + offset, copy, data, to);
		offset += n;
		if (n != copy)
			goto short_copy;
		if ((len -= copy) == 0)
			return 0;
	}

	if (!skb_frags_readable(skb))
		goto short_copy;

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			u32 p_off, p_len, copied;
			struct page *p;
			u8 *vaddr;

			if (copy > len)
				copy = len;

			n = 0;
			skb_frag_foreach_page(frag,
					      skb_frag_off(frag) + offset - start,
					      copy, p, p_off, p_len, copied) {
				vaddr = kmap_local_page(p);
				n += INDIRECT_CALL_1(cb, simple_copy_to_iter,
					vaddr + p_off, p_len, data, to);
				kunmap_local(vaddr);
			}

			offset += n;
			if (n != copy)
				goto short_copy;
			if (!(len -= copy))
				return 0;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (__skb_datagram_iter(frag_iter, offset - start,
						to, copy, fault_short, cb, data))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

	/* This is not really a user copy fault, but rather someone
	 * gave us a bogus length on the skb. We should probably
	 * print a warning here as it may indicate a kernel bug.
	 */

fault:
	iov_iter_revert(to, offset - start_off);
	return -EFAULT;

short_copy:
	if (fault_short || iov_iter_count(to))
		goto fault;

	return 0;
}

#ifdef CONFIG_NET_CRC32C
static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes,
				      void *_crcp, struct iov_iter *i)
{
	u32 *crcp = _crcp;
	size_t copied;

	copied = copy_to_iter(addr, bytes, i);
	*crcp = crc32c(*crcp, addr, copied);
	return copied;
}

/**
 *	skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator
 *	and update a CRC32C value.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying from
 *	@to: iovec iterator to copy to
 *	@len: amount of data to copy from buffer to iovec
 *	@crcp: pointer to CRC32C value to update
 *
 *	Return: 0 on success, -EFAULT if there was a fault during copy.
 */
int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset,
				      struct iov_iter *to, int len, u32 *crcp)
{
	return __skb_datagram_iter(skb, offset, to, len, true,
				   crc32c_and_copy_to_iter, crcp);
}
EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter);
#endif /* CONFIG_NET_CRC32C */

static size_t simple_copy_to_iter(const void *addr, size_t bytes,
				  void *data __always_unused, struct iov_iter *i)
{
	return copy_to_iter(addr, bytes, i);
}

/**
 *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying from
 *	@to: iovec iterator to copy to
 *	@len: amount of data to copy from buffer to iovec
 */
int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
			   struct iov_iter *to, int len)
{
	trace_skb_copy_datagram_iovec(skb, len);
	return __skb_datagram_iter(skb, offset, to, len, false,
				   simple_copy_to_iter, NULL);
}
EXPORT_SYMBOL(skb_copy_datagram_iter);

/**
 *	skb_copy_datagram_from_iter - Copy a datagram from an iov_iter.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying to
 *	@from: the copy source
 *	@len: amount of data to copy to buffer from iovec
 *
 *	Returns 0 or -EFAULT.
 */
int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
				struct iov_iter *from,
				int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (copy_from_iter(skb->data + offset, copy, from) != copy)
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			size_t copied;

			if (copy > len)
				copy = len;
			copied = copy_page_from_iter(skb_frag_page(frag),
					skb_frag_off(frag) + offset - start,
					copy, from);
			if (copied != copy)
				goto fault;

			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_from_iter(frag_iter,
							offset - start,
							from, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);

int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset,
				     struct iov_iter *from, int len)
{
	struct iov_iter_state state;
	int ret;

	iov_iter_save_state(from, &state);
	ret = skb_copy_datagram_from_iter(skb, offset, from, len);
	if (ret)
		iov_iter_restore(from, &state);
	return ret;
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter_full);

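/* Usage sketch (illustrative only): a simple sendmsg() path that builds
 * a linear skb from the user's iovec might do roughly the following,
 * with "len" being the payload size requested by the caller:
 *
 *	skb = sock_alloc_send_skb(sk, len, msg->msg_flags & MSG_DONTWAIT,
 *				  &err);
 *	if (!skb)
 *		return err;
 *	skb_put(skb, len);
 *	err = skb_copy_datagram_from_iter_full(skb, 0, &msg->msg_iter, len);
 *	if (err) {
 *		kfree_skb(skb);
 *		return err;
 *	}
 *
 * The _full variant restores the iterator on failure, so the caller
 * does not have to rewind msg->msg_iter itself.
 */
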
int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
				struct iov_iter *from, size_t length)
{
	int frag = skb_shinfo(skb)->nr_frags;

	if (!skb_frags_readable(skb))
		return -EFAULT;

	while (length && iov_iter_count(from)) {
		struct page *head, *last_head = NULL;
		struct page *pages[MAX_SKB_FRAGS];
		int refs, order, n = 0;
		size_t start;
		ssize_t copied;

		if (frag == MAX_SKB_FRAGS)
			return -EMSGSIZE;

		copied = iov_iter_get_pages2(from, pages, length,
					     MAX_SKB_FRAGS - frag, &start);
		if (copied < 0)
			return -EFAULT;

		length -= copied;

		skb->data_len += copied;
		skb->len += copied;
		skb->truesize += PAGE_ALIGN(copied + start);

		head = compound_head(pages[n]);
		order = compound_order(head);

		for (refs = 0; copied != 0; start = 0) {
			int size = min_t(int, copied, PAGE_SIZE - start);

			if (pages[n] - head > (1UL << order) - 1) {
				head = compound_head(pages[n]);
				order = compound_order(head);
			}

			start += (pages[n] - head) << PAGE_SHIFT;
			copied -= size;
			n++;
			if (frag) {
				skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1];

				if (head == skb_frag_page(last) &&
				    start == skb_frag_off(last) + skb_frag_size(last)) {
					skb_frag_size_add(last, size);
					/* We combined this page, we need to release
					 * a reference. Since compound pages refcount
					 * is shared among many pages, batch the refcount
					 * adjustments to limit false sharing.
					 */
					last_head = head;
					refs++;
					continue;
				}
			}
			if (refs) {
				page_ref_sub(last_head, refs);
				refs = 0;
			}
			skb_fill_page_desc_noacc(skb, frag++, head, start, size);
		}
		if (refs)
			page_ref_sub(last_head, refs);
	}
	return 0;
}

static int
zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
			      int length,
			      struct net_devmem_dmabuf_binding *binding)
{
	int i = skb_shinfo(skb)->nr_frags;
	size_t virt_addr, size, off;
	struct net_iov *niov;

	/* Devmem filling works by taking an IOVEC from the user where the
	 * iov_addrs are interpreted as an offset in bytes into the dma-buf to
	 * send from. We do not support other iter types.
	 */
	if (iov_iter_type(from) != ITER_IOVEC &&
	    iov_iter_type(from) != ITER_UBUF)
		return -EFAULT;

	while (length && iov_iter_count(from)) {
		if (i == MAX_SKB_FRAGS)
			return -EMSGSIZE;

		virt_addr = (size_t)iter_iov_addr(from);
		niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size);
		if (!niov)
			return -EFAULT;

		size = min_t(size_t, size, length);
		size = min_t(size_t, size, iter_iov_len(from));

		get_netmem(net_iov_to_netmem(niov));
		skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off,
				       size, PAGE_SIZE);
		iov_iter_advance(from, size);
		length -= size;
		i++;
	}

	return 0;
}

int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
			    struct sk_buff *skb, struct iov_iter *from,
			    size_t length,
			    struct net_devmem_dmabuf_binding *binding)
{
	unsigned long orig_size = skb->truesize;
	unsigned long truesize;
	int ret;

	if (msg && msg->msg_ubuf && msg->sg_from_iter)
		ret = msg->sg_from_iter(skb, from, length);
	else if (binding)
		ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
	else
		ret = zerocopy_fill_skb_from_iter(skb, from, length);

	truesize = skb->truesize - orig_size;
	if (sk && sk->sk_type == SOCK_STREAM) {
		sk_wmem_queued_add(sk, truesize);
		if (!skb_zcopy_pure(skb))
			sk_mem_charge(sk, truesize);
	} else {
		refcount_add(truesize, &skb->sk->sk_wmem_alloc);
	}
	return ret;
}
EXPORT_SYMBOL(__zerocopy_sg_from_iter);

/**
 *	zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
 *	@skb: buffer to copy
 *	@from: the source to copy from
 *
 *	The function will first copy up to headlen, and then pin the userspace
 *	pages and build frags through them.
 *
 *	Returns 0, -EFAULT or -EMSGSIZE.
 */
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
{
	int copy = min_t(int, skb_headlen(skb), iov_iter_count(from));

	/* copy up to skb headlen */
	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
		return -EFAULT;

	return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);

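/* Usage sketch (illustrative only): zerocopy_sg_from_iter() is meant
 * for transmit paths (tun/tap-style) that want to avoid copying bulk
 * payload. A caller whose skb linear area already holds the headers
 * might choose between the copying and the page-pinning fill roughly
 * like this:
 *
 *	if (zerocopy)
 *		err = zerocopy_sg_from_iter(skb, from);
 *	else
 *		err = skb_copy_datagram_from_iter(skb, 0, from, len);
 *
 * In the zerocopy case only up to skb_headlen(skb) bytes are copied;
 * the rest of the iterator is attached as referenced user pages in skb
 * frags, so the user buffer must not be recycled until the skb has
 * been released.
 */
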
static __always_inline
size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress,
			      size_t len, void *from, void *priv2)
{
	__wsum next, *csum = priv2;

	next = csum_and_copy_to_user(from + progress, iter_to, len);
	*csum = csum_block_add(*csum, next, progress);
	return next ? 0 : len;
}

static __always_inline
size_t memcpy_to_iter_csum(void *iter_to, size_t progress,
			   size_t len, void *from, void *priv2)
{
	__wsum *csum = priv2;
	__wsum next = csum_partial_copy_nocheck(from + progress, iter_to, len);

	*csum = csum_block_add(*csum, next, progress);
	return 0;
}

struct csum_state {
	__wsum csum;
	size_t off;
};

static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
				    struct iov_iter *i)
{
	struct csum_state *csstate = _csstate;
	__wsum sum;

	if (WARN_ON_ONCE(i->data_source))
		return 0;
	if (unlikely(iov_iter_is_discard(i))) {
		// can't use csum_memcpy() for that one - data is not copied
		csstate->csum = csum_block_add(csstate->csum,
					       csum_partial(addr, bytes, 0),
					       csstate->off);
		csstate->off += bytes;
		return bytes;
	}

	sum = csum_shift(csstate->csum, csstate->off);

	bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum,
				     copy_to_user_iter_csum,
				     memcpy_to_iter_csum);
	csstate->csum = csum_shift(sum, csstate->off);
	csstate->off += bytes;
	return bytes;
}

/**
 *	skb_copy_and_csum_datagram - Copy datagram to an iovec iterator
 *	and update a checksum.
 *	@skb: buffer to copy
 *	@offset: offset in the buffer to start copying from
 *	@to: iovec iterator to copy to
 *	@len: amount of data to copy from buffer to iovec
 *	@csump: checksum pointer
 */
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
				      struct iov_iter *to, int len,
				      __wsum *csump)
{
	struct csum_state csdata = { .csum = *csump };
	int ret;

	ret = __skb_datagram_iter(skb, offset, to, len, true,
				  csum_and_copy_to_iter, &csdata);
	if (ret)
		return ret;

	*csump = csdata.csum;
	return 0;
}

/**
 *	skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
 *	@skb: skbuff
 *	@hlen: hardware length
 *	@msg: destination
 *
 *	Caller _must_ check that skb will fit to this iovec.
 *
 *	Returns: 0 - success.
 *		 -EINVAL - checksum failure.
 *		 -EFAULT - fault during copy.
 */
int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
				   int hlen, struct msghdr *msg)
{
	__wsum csum;
	int chunk = skb->len - hlen;

	if (!chunk)
		return 0;

	if (msg_data_left(msg) < chunk) {
		if (__skb_checksum_complete(skb))
			return -EINVAL;
		if (skb_copy_datagram_msg(skb, hlen, msg, chunk))
			goto fault;
	} else {
		csum = csum_partial(skb->data, hlen, skb->csum);
		if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter,
					       chunk, &csum))
			goto fault;

		if (csum_fold(csum)) {
			iov_iter_revert(&msg->msg_iter, chunk);
			return -EINVAL;
		}

		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
			netdev_rx_csum_fault(NULL, skb);
	}
	return 0;
fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);

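/* Usage sketch (illustrative only): a UDP-like recvmsg() picks between
 * the plain copy and the checksumming copy depending on whether the
 * checksum is already known to be good, roughly ("off" being the
 * transport header length):
 *
 *	if (skb_csum_unnecessary(skb))
 *		err = skb_copy_datagram_msg(skb, off, msg, copied);
 *	else
 *		err = skb_copy_and_csum_datagram_msg(skb, off, msg);
 *
 * A return of -EINVAL from the checksumming variant means the packet
 * was corrupt; see the skb_kill_datagram() sketch above for one way to
 * handle that.
 */
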
/**
 *	datagram_poll - generic datagram poll
 *	@file: file struct
 *	@sock: socket
 *	@wait: poll table
 *
 *	Datagram poll: Again totally generic. This also handles
 *	sequenced packet sockets providing the socket receive queue
 *	is only ever holding data ready to receive.
 *
 *	Note: when you *don't* use this routine for this protocol,
 *	and you use a different write policy from sock_writeable()
 *	then please supply your own write_space callback.
 */
__poll_t datagram_poll(struct file *file, struct socket *sock,
		       poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	shutdown = READ_ONCE(sk->sk_shutdown);
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (connection_based(sk)) {
		int state = READ_ONCE(sk->sk_state);

		if (state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (state == TCP_SYN_SENT)
			return mask;
	}

	/* writable? */
	if (sock_writeable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
EXPORT_SYMBOL(datagram_poll);

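/* Usage sketch (illustrative only): datagram protocols normally wire
 * datagram_poll() straight into their proto_ops table; PF_EXAMPLE and
 * the my_* handlers below are placeholders:
 *
 *	static const struct proto_ops my_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.poll		= datagram_poll,
 *		.recvmsg	= my_recvmsg,
 *		.sendmsg	= my_sendmsg,
 *		...
 *	};
 *
 * Protocols such as AX.25 and AppleTalk use it directly, while others
 * (e.g. UDP) wrap it to add protocol-specific readiness checks.
 */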