// SPDX-License-Identifier: GPL-2.0
/*
 *	SUCS NET3:
 *
 *	Generic datagram handling routines. These are generic for all
 *	protocols. Possibly a generic IP version on top of these would
 *	make sense. Not tonight however 8-).
 *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
 *	NetROM layer all have identical poll code and mostly
 *	identical recvmsg() code. So we share it here. The poll was
 *	shared before but buried in udp.c so I moved it.
 *
 *	Authors:	Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
 *			udp.c code)
 *
 *	Fixes:
 *		Alan Cox	:	NULL return from skb_peek_copy()
 *					understood
 *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
 *					skb_peek_copy stuff.
 *		Alan Cox	:	Added support for SOCK_SEQPACKET.
 *					IPX can no longer use the SO_TYPE hack
 *					but AX.25 now works right, and SPX is
 *					feasible.
 *		Alan Cox	:	Fixed write poll of non IP protocol
 *					crash.
 *		Florian La Roche:	Changed for my new skbuff handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
 *		Linus Torvalds	:	BSD semantic fixes.
 *		Alan Cox	:	Datagram iovec handling
 *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
 *		Alan Cox	:	POSIXisms
 *		Pete Wyckoff	:	Unconnected accept() fix.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/iov_iter.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/crc32.h>

#include <net/protocol.h>
#include <linux/skbuff.h>

#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>

#include "devmem.h"

/*
 *	Is a socket 'connection oriented' ?
 */
static inline int connection_based(struct sock *sk)
{
	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}

static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
				  void *key)
{
	/*
	 * Avoid a wakeup if event not interesting for us
	 */
	if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR)))
		return 0;
	return autoremove_wake_function(wait, mode, sync, key);
}
/*
 * Wait for the last received packet to be different from skb
 */
int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
				int *err, long *timeo_p,
				const struct sk_buff *skb)
{
	int error;
	DEFINE_WAIT_FUNC(wait, receiver_wake_function);

	prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

	/* Socket errors? */
	error = sock_error(sk);
	if (error)
		goto out_err;

	if (READ_ONCE(queue->prev) != skb)
		goto out;

	/* Socket shut down? */
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		goto out_noerr;

	/* Sequenced packets can come disconnected.
	 * If so we report the problem
	 */
	error = -ENOTCONN;
	if (connection_based(sk) &&
	    !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
		goto out_err;

	/* handle signals */
	if (signal_pending(current))
		goto interrupted;

	error = 0;
	*timeo_p = schedule_timeout(*timeo_p);
out:
	finish_wait(sk_sleep(sk), &wait);
	return error;
interrupted:
	error = sock_intr_errno(*timeo_p);
out_err:
	*err = error;
	goto out;
out_noerr:
	*err = 0;
	error = 1;
	goto out;
}
EXPORT_SYMBOL(__skb_wait_for_more_packets);
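
/*
 * Editorial note: __skb_wait_for_more_packets() returns 0 after the sleep so
 * the caller can retry its non-blocking receive attempt, a negative errno
 * (also stored in *err) on socket error, disconnect or pending signal, and 1
 * with *err == 0 once the socket has been shut down for receive.  The
 * canonical retry loop is __skb_recv_datagram() below; a sketch (not
 * kernel-provided code, with caller-owned "queue", "off", "last", "timeo"):
 *
 *	do {
 *		skb = __skb_try_recv_datagram(sk, queue, flags, &off, &err,
 *					      &last);
 *		if (skb)
 *			return skb;
 *	} while (err == -EAGAIN && timeo &&
 *		 !__skb_wait_for_more_packets(sk, queue, &err, &timeo, last));
 */
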
static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
{
	struct sk_buff *nskb;

	if (skb->peeked)
		return skb;

	/* We have to unshare an skb before modifying it. */
	if (!skb_shared(skb))
		goto done;

	nskb = skb_clone(skb, GFP_ATOMIC);
	if (!nskb)
		return ERR_PTR(-ENOMEM);

	skb->prev->next = nskb;
	skb->next->prev = nskb;
	nskb->prev = skb->prev;
	nskb->next = skb->next;

	consume_skb(skb);
	skb = nskb;

done:
	skb->peeked = 1;

	return skb;
}

struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue,
					  unsigned int flags,
					  int *off, int *err,
					  struct sk_buff **last)
{
	bool peek_at_off = false;
	struct sk_buff *skb;
	int _off = 0;

	if (unlikely(flags & MSG_PEEK && *off >= 0)) {
		peek_at_off = true;
		_off = *off;
	}

	*last = queue->prev;
	skb_queue_walk(queue, skb) {
		if (flags & MSG_PEEK) {
			if (peek_at_off && _off >= skb->len &&
			    (_off || skb->peeked)) {
				_off -= skb->len;
				continue;
			}
			if (!skb->len) {
				skb = skb_set_peeked(skb);
				if (IS_ERR(skb)) {
					*err = PTR_ERR(skb);
					return NULL;
				}
			}
			refcount_inc(&skb->users);
		} else {
			__skb_unlink(skb, queue);
		}
		*off = _off;
		return skb;
	}
	return NULL;
}

/**
 * __skb_try_recv_datagram - Receive a datagram skbuff
 * @sk: socket
 * @queue: socket queue from which to receive
 * @flags: MSG\_ flags
 * @off: an offset in bytes to peek skb from. Returns an offset
 *       within an skb where data actually starts
 * @err: error code returned
 * @last: set to last peeked message to inform the wait function
 *        what to look for when peeking
 *
 * Get a datagram skbuff, understands the peeking, nonblocking wakeups
 * and possible races. This replaces identical code in packet, raw and
 * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
 * the long standing peek and read race for datagram sockets. If you
 * alter this routine remember it must be re-entrant.
 *
 * This function will lock the socket if a skb is returned, so
 * the caller needs to unlock the socket in that case (usually by
 * calling skb_free_datagram). Returns NULL with @err set to
 * -EAGAIN if no data was available or to some other value if an
 * error was detected.
 *
 * * It does not lock socket since today. This function is
 * * free of race conditions. This measure should/can improve
 * * significantly datagram socket latencies at high loads,
 * * when data copying to user space takes lots of time.
 * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 * * 8) Great win.)
 * *			--ANK (980729)
 *
 * The order of the tests when we find no data waiting are specified
 * quite explicitly by POSIX 1003.1g, don't change them without having
 * the standard around please.
 */
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
					struct sk_buff_head *queue,
					unsigned int flags, int *off, int *err,
					struct sk_buff **last)
{
	struct sk_buff *skb;
	unsigned long cpu_flags;
	/*
	 * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
	 */
	int error = sock_error(sk);

	if (error)
		goto no_packet;

	do {
		/* Again only user level code calls this function, so nothing
		 * interrupt level will suddenly eat the receive_queue.
		 *
		 * Look at current nfs client by the way...
		 * However, this function was correct in any case. 8)
		 */
		spin_lock_irqsave(&queue->lock, cpu_flags);
		skb = __skb_try_recv_from_queue(queue, flags, off, &error,
						last);
		spin_unlock_irqrestore(&queue->lock, cpu_flags);
		if (error)
			goto no_packet;
		if (skb)
			return skb;

		if (!sk_can_busy_loop(sk))
			break;

		sk_busy_loop(sk, flags & MSG_DONTWAIT);
	} while (READ_ONCE(queue->prev) != *last);

	error = -EAGAIN;

no_packet:
	*err = error;
	return NULL;
}
EXPORT_SYMBOL(__skb_try_recv_datagram);
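
/*
 * Editorial sketch (not kernel-provided code): a protocol supporting
 * SO_PEEK_OFF would typically seed @off from the socket's peek offset and
 * advance it after a successful peek, roughly:
 *
 *	int off = sk_peek_offset(sk, flags);
 *
 *	skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
 *				      &off, &err, &last);
 *	if (skb && (flags & MSG_PEEK))
 *		sk_peek_offset_fwd(sk, peeked_len);	// hypothetical length
 *
 * On return @off is the offset inside the returned skb at which the peeked
 * data starts, which is exactly what the MSG_PEEK branch above computes.
 */
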
struct sk_buff *__skb_recv_datagram(struct sock *sk,
				    struct sk_buff_head *sk_queue,
				    unsigned int flags, int *off, int *err)
{
	struct sk_buff *skb, *last;
	long timeo;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err,
					      &last);
		if (skb)
			return skb;

		if (*err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, sk_queue, err,
					      &timeo, last));

	return NULL;
}
EXPORT_SYMBOL(__skb_recv_datagram);

struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
				  int *err)
{
	int off = 0;

	return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags,
				   &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);

void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	consume_skb(skb);
}
EXPORT_SYMBOL(skb_free_datagram);

int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
			struct sk_buff *skb, unsigned int flags,
			void (*destructor)(struct sock *sk,
					   struct sk_buff *skb))
{
	int err = 0;

	if (flags & MSG_PEEK) {
		err = -ENOENT;
		spin_lock_bh(&sk_queue->lock);
		if (skb->next) {
			__skb_unlink(skb, sk_queue);
			refcount_dec(&skb->users);
			if (destructor)
				destructor(sk, skb);
			err = 0;
		}
		spin_unlock_bh(&sk_queue->lock);
	}

	atomic_inc(&sk->sk_drops);
	return err;
}
EXPORT_SYMBOL(__sk_queue_drop_skb);

/**
 * skb_kill_datagram - Free a datagram skbuff forcibly
 * @sk: socket
 * @skb: datagram skbuff
 * @flags: MSG\_ flags
 *
 * This function frees a datagram skbuff that was received by
 * skb_recv_datagram. The flags argument must match the one
 * used for skb_recv_datagram.
 *
 * If the MSG_PEEK flag is set, and the packet is still on the
 * receive queue of the socket, it will be taken off the queue
 * before it is freed.
 *
 * This function currently only disables BH when acquiring the
 * sk_receive_queue lock. Therefore it must not be used in a
 * context where that lock is acquired in an IRQ context.
 *
 * It returns 0 if the packet was removed by us.
 */

int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
	int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags,
				      NULL);

	kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
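
/*
 * Editorial sketch (not kernel-provided code): a minimal datagram recvmsg()
 * built on the helpers above, for a hypothetical protocol, would look
 * roughly like this:
 *
 *	skb = skb_recv_datagram(sk, flags, &err);
 *	if (!skb)
 *		return err;
 *	copied = min_t(size_t, len, skb->len);
 *	if (copied < skb->len)
 *		msg->msg_flags |= MSG_TRUNC;
 *	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 *	skb_free_datagram(sk, skb);
 *	return err ? err : copied;
 *
 * A receive path that detects a bad packet after dequeueing it (e.g. a
 * checksum failure) would instead call skb_kill_datagram(sk, skb, flags) so
 * that a merely peeked packet is also unlinked from the receive queue.
 */
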
INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr,
						size_t bytes,
						void *data __always_unused,
						struct iov_iter *i));

static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
			       struct iov_iter *to, int len, bool fault_short,
			       size_t (*cb)(const void *, size_t, void *,
					    struct iov_iter *), void *data)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset, start_off = offset, n;
	struct sk_buff *frag_iter;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
				    skb->data + offset, copy, data, to);
		offset += n;
		if (n != copy)
			goto short_copy;
		if ((len -= copy) == 0)
			return 0;
	}

	if (!skb_frags_readable(skb))
		goto short_copy;

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			u32 p_off, p_len, copied;
			struct page *p;
			u8 *vaddr;

			if (copy > len)
				copy = len;

			n = 0;
			skb_frag_foreach_page(frag,
					      skb_frag_off(frag) + offset - start,
					      copy, p, p_off, p_len, copied) {
				vaddr = kmap_local_page(p);
				n += INDIRECT_CALL_1(cb, simple_copy_to_iter,
					vaddr + p_off, p_len, data, to);
				kunmap_local(vaddr);
			}

			offset += n;
			if (n != copy)
				goto short_copy;
			if (!(len -= copy))
				return 0;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (__skb_datagram_iter(frag_iter, offset - start,
						to, copy, fault_short, cb, data))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

	/* This is not really a user copy fault, but rather someone
	 * gave us a bogus length on the skb.  We should probably
	 * print a warning here as it may indicate a kernel bug.
	 */

fault:
	iov_iter_revert(to, offset - start_off);
	return -EFAULT;

short_copy:
	if (fault_short || iov_iter_count(to))
		goto fault;

	return 0;
}

#ifdef CONFIG_NET_CRC32C
static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes,
				      void *_crcp, struct iov_iter *i)
{
	u32 *crcp = _crcp;
	size_t copied;

	copied = copy_to_iter(addr, bytes, i);
	*crcp = crc32c(*crcp, addr, copied);
	return copied;
}

/**
 * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator
 *	and update a CRC32C value.
 * @skb: buffer to copy
 * @offset: offset in the buffer to start copying from
 * @to: iovec iterator to copy to
 * @len: amount of data to copy from buffer to iovec
 * @crcp: pointer to CRC32C value to update
 *
 * Return: 0 on success, -EFAULT if there was a fault during copy.
 */
int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset,
				      struct iov_iter *to, int len, u32 *crcp)
{
	return __skb_datagram_iter(skb, offset, to, len, true,
				   crc32c_and_copy_to_iter, crcp);
}
EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter);
#endif /* CONFIG_NET_CRC32C */
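
/*
 * Editorial sketch (not kernel-provided code): __skb_datagram_iter() is the
 * single skb walker behind the copy helpers in this file; each caller
 * supplies a callback that copies a chunk into the iterator and may fold it
 * into caller state passed via @data, returning the number of bytes actually
 * copied.  A hypothetical digest-while-copying helper would follow the same
 * shape as crc32c_and_copy_to_iter() above:
 *
 *	static size_t my_digest_and_copy_to_iter(const void *addr, size_t bytes,
 *						 void *priv, struct iov_iter *i)
 *	{
 *		size_t copied = copy_to_iter(addr, bytes, i);
 *
 *		my_digest_update(priv, addr, copied);	// hypothetical helper
 *		return copied;
 *	}
 *
 * and would be passed as the @cb/@data pair of __skb_datagram_iter().
 */
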
static size_t simple_copy_to_iter(const void *addr, size_t bytes,
				  void *data __always_unused, struct iov_iter *i)
{
	return copy_to_iter(addr, bytes, i);
}

/**
 * skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
 * @skb: buffer to copy
 * @offset: offset in the buffer to start copying from
 * @to: iovec iterator to copy to
 * @len: amount of data to copy from buffer to iovec
 */
int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
			   struct iov_iter *to, int len)
{
	trace_skb_copy_datagram_iovec(skb, len);
	return __skb_datagram_iter(skb, offset, to, len, false,
				   simple_copy_to_iter, NULL);
}
EXPORT_SYMBOL(skb_copy_datagram_iter);

/**
 * skb_copy_datagram_from_iter - Copy a datagram from an iov_iter.
 * @skb: buffer to copy
 * @offset: offset in the buffer to start copying to
 * @from: the copy source
 * @len: amount of data to copy to buffer from iovec
 *
 * Returns 0 or -EFAULT.
 */
int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
				struct iov_iter *from,
				int len)
{
	int start = skb_headlen(skb);
	int i, copy = start - offset;
	struct sk_buff *frag_iter;

	/* Copy header. */
	if (copy > 0) {
		if (copy > len)
			copy = len;
		if (copy_from_iter(skb->data + offset, copy, from) != copy)
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		WARN_ON(start > offset + len);

		end = start + skb_frag_size(frag);
		if ((copy = end - offset) > 0) {
			size_t copied;

			if (copy > len)
				copy = len;
			copied = copy_page_from_iter(skb_frag_page(frag),
						     skb_frag_off(frag) + offset - start,
						     copy, from);
			if (copied != copy)
				goto fault;

			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		int end;

		WARN_ON(start > offset + len);

		end = start + frag_iter->len;
		if ((copy = end - offset) > 0) {
			if (copy > len)
				copy = len;
			if (skb_copy_datagram_from_iter(frag_iter,
							offset - start,
							from, copy))
				goto fault;
			if ((len -= copy) == 0)
				return 0;
			offset += copy;
		}
		start = end;
	}
	if (!len)
		return 0;

fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
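
/*
 * Editorial sketch (not kernel-provided code): on the transmit side a
 * datagram protocol typically allocates an skb for the message and copies the
 * payload in from msg->msg_iter; "hlen" below is a hypothetical headroom
 * budget for protocol headers:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 *	skb_reserve(skb, hlen);
 *	skb_put(skb, len);
 *	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
 *	if (err) {
 *		kfree_skb(skb);
 *		return err;
 *	}
 */
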
int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
				struct iov_iter *from, size_t length)
{
	int frag = skb_shinfo(skb)->nr_frags;

	if (!skb_frags_readable(skb))
		return -EFAULT;

	while (length && iov_iter_count(from)) {
		struct page *head, *last_head = NULL;
		struct page *pages[MAX_SKB_FRAGS];
		int refs, order, n = 0;
		size_t start;
		ssize_t copied;

		if (frag == MAX_SKB_FRAGS)
			return -EMSGSIZE;

		copied = iov_iter_get_pages2(from, pages, length,
					     MAX_SKB_FRAGS - frag, &start);
		if (copied < 0)
			return -EFAULT;

		length -= copied;

		skb->data_len += copied;
		skb->len += copied;
		skb->truesize += PAGE_ALIGN(copied + start);

		head = compound_head(pages[n]);
		order = compound_order(head);

		for (refs = 0; copied != 0; start = 0) {
			int size = min_t(int, copied, PAGE_SIZE - start);

			if (pages[n] - head > (1UL << order) - 1) {
				head = compound_head(pages[n]);
				order = compound_order(head);
			}

			start += (pages[n] - head) << PAGE_SHIFT;
			copied -= size;
			n++;
			if (frag) {
				skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1];

				if (head == skb_frag_page(last) &&
				    start == skb_frag_off(last) + skb_frag_size(last)) {
					skb_frag_size_add(last, size);
					/* We combined this page, we need to release
					 * a reference. Since compound pages refcount
					 * is shared among many pages, batch the refcount
					 * adjustments to limit false sharing.
					 */
					last_head = head;
					refs++;
					continue;
				}
			}
			if (refs) {
				page_ref_sub(last_head, refs);
				refs = 0;
			}
			skb_fill_page_desc_noacc(skb, frag++, head, start, size);
		}
		if (refs)
			page_ref_sub(last_head, refs);
	}
	return 0;
}
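
/*
 * Editorial note: the loop above coalesces physically contiguous pages into a
 * single frag where possible.  For example, if iov_iter_get_pages2() returns
 * four consecutive tail pages of one compound page that extend the previous
 * frag, skb_frag_size_add() grows that frag four times and a single
 * page_ref_sub(last_head, 4) then drops the now-unneeded per-page references
 * in one batch, instead of four separate atomic operations on the shared
 * compound-page refcount.
 */
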
static int
zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
			      int length,
			      struct net_devmem_dmabuf_binding *binding)
{
	int i = skb_shinfo(skb)->nr_frags;
	size_t virt_addr, size, off;
	struct net_iov *niov;

	/* Devmem filling works by taking an IOVEC from the user where the
	 * iov_addrs are interpreted as an offset in bytes into the dma-buf to
	 * send from. We do not support other iter types.
	 */
	if (iov_iter_type(from) != ITER_IOVEC &&
	    iov_iter_type(from) != ITER_UBUF)
		return -EFAULT;

	while (length && iov_iter_count(from)) {
		if (i == MAX_SKB_FRAGS)
			return -EMSGSIZE;

		virt_addr = (size_t)iter_iov_addr(from);
		niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size);
		if (!niov)
			return -EFAULT;

		size = min_t(size_t, size, length);
		size = min_t(size_t, size, iter_iov_len(from));

		get_netmem(net_iov_to_netmem(niov));
		skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off,
				       size, PAGE_SIZE);
		iov_iter_advance(from, size);
		length -= size;
		i++;
	}

	return 0;
}

int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
			    struct sk_buff *skb, struct iov_iter *from,
			    size_t length,
			    struct net_devmem_dmabuf_binding *binding)
{
	unsigned long orig_size = skb->truesize;
	unsigned long truesize;
	int ret;

	if (msg && msg->msg_ubuf && msg->sg_from_iter)
		ret = msg->sg_from_iter(skb, from, length);
	else if (binding)
		ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
	else
		ret = zerocopy_fill_skb_from_iter(skb, from, length);

	truesize = skb->truesize - orig_size;
	if (sk && sk->sk_type == SOCK_STREAM) {
		sk_wmem_queued_add(sk, truesize);
		if (!skb_zcopy_pure(skb))
			sk_mem_charge(sk, truesize);
	} else {
		refcount_add(truesize, &skb->sk->sk_wmem_alloc);
	}
	return ret;
}
EXPORT_SYMBOL(__zerocopy_sg_from_iter);

/**
 * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
 * @skb: buffer to copy
 * @from: the source to copy from
 *
 * The function will first copy up to headlen, and then pin the userspace
 * pages and build frags through them.
 *
 * Returns 0, -EFAULT or -EMSGSIZE.
 */
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
{
	int copy = min_t(int, skb_headlen(skb), iov_iter_count(from));

	/* copy up to skb headlen */
	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
		return -EFAULT;

	return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
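
/*
 * Editorial sketch (not kernel-provided code): a typical zerocopy transmit
 * user allocates the skb first (reserving and skb_put()ing whatever linear
 * portion it wants copied) and then lets the helper pin the user pages:
 *
 *	err = zerocopy_sg_from_iter(skb, &msg->msg_iter);
 *	if (err)
 *		kfree_skb(skb);
 *
 * Bytes covered by the existing linear area (skb_headlen()) are copied; the
 * remainder of the iterator is attached as page frags, and those page
 * references are only dropped once the skb is freed.
 */
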
static __always_inline
size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress,
			      size_t len, void *from, void *priv2)
{
	__wsum next, *csum = priv2;

	next = csum_and_copy_to_user(from + progress, iter_to, len);
	*csum = csum_block_add(*csum, next, progress);
	return next ? 0 : len;
}

static __always_inline
size_t memcpy_to_iter_csum(void *iter_to, size_t progress,
			   size_t len, void *from, void *priv2)
{
	__wsum *csum = priv2;
	__wsum next = csum_partial_copy_nocheck(from + progress, iter_to, len);

	*csum = csum_block_add(*csum, next, progress);
	return 0;
}

struct csum_state {
	__wsum csum;
	size_t off;
};

static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
				    struct iov_iter *i)
{
	struct csum_state *csstate = _csstate;
	__wsum sum;

	if (WARN_ON_ONCE(i->data_source))
		return 0;
	if (unlikely(iov_iter_is_discard(i))) {
		// can't use csum_memcpy() for that one - data is not copied
		csstate->csum = csum_block_add(csstate->csum,
					       csum_partial(addr, bytes, 0),
					       csstate->off);
		csstate->off += bytes;
		return bytes;
	}

	sum = csum_shift(csstate->csum, csstate->off);

	bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum,
				     copy_to_user_iter_csum,
				     memcpy_to_iter_csum);
	csstate->csum = csum_shift(sum, csstate->off);
	csstate->off += bytes;
	return bytes;
}

/**
 * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator
 *	and update a checksum.
 * @skb: buffer to copy
 * @offset: offset in the buffer to start copying from
 * @to: iovec iterator to copy to
 * @len: amount of data to copy from buffer to iovec
 * @csump: checksum pointer
 */
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
				      struct iov_iter *to, int len,
				      __wsum *csump)
{
	struct csum_state csdata = { .csum = *csump };
	int ret;

	ret = __skb_datagram_iter(skb, offset, to, len, true,
				  csum_and_copy_to_iter, &csdata);
	if (ret)
		return ret;

	*csump = csdata.csum;
	return 0;
}

/**
 * skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
 * @skb: skbuff
 * @hlen: hardware length
 * @msg: destination
 *
 * Caller _must_ check that skb will fit to this iovec.
 *
 * Returns: 0 - success.
 *	    -EINVAL - checksum failure.
 *	    -EFAULT - fault during copy.
 */
int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
				   int hlen, struct msghdr *msg)
{
	__wsum csum;
	int chunk = skb->len - hlen;

	if (!chunk)
		return 0;

	if (msg_data_left(msg) < chunk) {
		if (__skb_checksum_complete(skb))
			return -EINVAL;
		if (skb_copy_datagram_msg(skb, hlen, msg, chunk))
			goto fault;
	} else {
		csum = csum_partial(skb->data, hlen, skb->csum);
		if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter,
					       chunk, &csum))
			goto fault;

		if (csum_fold(csum)) {
			iov_iter_revert(&msg->msg_iter, chunk);
			return -EINVAL;
		}

		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
			netdev_rx_csum_fault(NULL, skb);
	}
	return 0;
fault:
	return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
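
/*
 * Editorial sketch (not kernel-provided code): a UDP-style recvmsg() that may
 * still need to verify a checksum typically chooses between the plain and the
 * checksumming copy, roughly:
 *
 *	if (skb_csum_unnecessary(skb))
 *		err = skb_copy_datagram_msg(skb, off, msg, copied);
 *	else {
 *		err = skb_copy_and_csum_datagram_msg(skb, off, msg);
 *		if (err == -EINVAL)
 *			goto csum_error;	// e.g. skb_kill_datagram()
 *	}
 *
 * where "off" is the transport header length already consumed and "copied"
 * is the amount of payload being returned to user space.
 */
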
/**
 * datagram_poll - generic datagram poll
 * @file: file struct
 * @sock: socket
 * @wait: poll table
 *
 * Datagram poll: Again totally generic. This also handles
 * sequenced packet sockets providing the socket receive queue
 * is only ever holding data ready to receive.
 *
 * Note: when you *don't* use this routine for this protocol,
 * and you use a different write policy from sock_writeable()
 * then please supply your own write_space callback.
 */
__poll_t datagram_poll(struct file *file, struct socket *sock,
		       poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	shutdown = READ_ONCE(sk->sk_shutdown);
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (connection_based(sk)) {
		int state = READ_ONCE(sk->sk_state);

		if (state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (state == TCP_SYN_SENT)
			return mask;
	}

	/* writable? */
	if (sock_writeable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
EXPORT_SYMBOL(datagram_poll);
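
/*
 * Editorial sketch (not kernel-provided code): protocols opt in to this
 * generic poll simply by pointing their proto_ops at it, e.g. for a
 * hypothetical datagram family:
 *
 *	static const struct proto_ops my_dgram_ops = {
 *		.family		= PF_MYPROTO,		// hypothetical
 *		.owner		= THIS_MODULE,
 *		.poll		= datagram_poll,
 *		.recvmsg	= my_recvmsg,		// hypothetical
 *	};
 *
 * As the kernel-doc comment above notes, a protocol with a non-default write
 * policy should supply its own sk->sk_write_space callback rather than
 * relying on sock_writeable().
 */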