// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "alloc_cache.h"
#include "net.h"
#include "notif.h"
#include "rsrc.h"

#if defined(CONFIG_NET)
struct io_shutdown {
	struct file *file;
	int how;
};

struct io_accept {
	struct file *file;
	struct sockaddr __user *addr;
	int __user *addr_len;
	int flags;
	int iou_flags;
	u32 file_slot;
	unsigned long nofile;
};

struct io_socket {
	struct file *file;
	int domain;
	int type;
	int protocol;
	int flags;
	u32 file_slot;
	unsigned long nofile;
};

struct io_connect {
	struct file *file;
	struct sockaddr __user *addr;
	int addr_len;
	bool in_progress;
	bool seen_econnaborted;
};

struct io_sr_msg {
	struct file *file;
	union {
		struct compat_msghdr __user *umsg_compat;
		struct user_msghdr __user *umsg;
		void __user *buf;
	};
	int len;
	unsigned done_io;
	unsigned msg_flags;
	unsigned nr_multishot_loops;
	u16 flags;
	/* initialised and used only by !msg send variants */
	u16 addr_len;
	u16 buf_group;
	void __user *addr;
	void __user *msg_control;
	/* used only for send zerocopy */
	struct io_kiocb *notif;
};

/*
 * Number of times we'll try and do receives if there's more data. If we
 * exceed this limit, then add us to the back of the queue and retry from
 * there. This helps fairness between flooding clients.
 */
#define MULTISHOT_MAX_RETRY	32

int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);

	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
		     sqe->buf_index || sqe->splice_fd_in))
		return -EINVAL;

	shutdown->how = READ_ONCE(sqe->len);
	req->flags |= REQ_F_FORCE_ASYNC;
	return 0;
}

int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
	struct socket *sock;
	int ret;

	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_shutdown_sock(sock, shutdown->how);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static bool io_net_retry(struct socket *sock, int flags)
{
	if (!(flags & MSG_WAITALL))
		return false;
	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}

static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
{
	if (kmsg->free_iov) {
		kfree(kmsg->free_iov);
		kmsg->free_iov_nr = 0;
		kmsg->free_iov = NULL;
	}
}

static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_msghdr *hdr = req->async_data;
	struct iovec *iov;

	/* can't recycle, ensure we free the iovec if we have one */
	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
		io_netmsg_iovec_free(hdr);
		return;
	}

	/* Let normal cleanup path reap it if we fail adding to the cache */
	iov = hdr->free_iov;
	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
		if (iov)
			kasan_mempool_poison_object(iov);
		req->async_data = NULL;
		req->flags &= ~REQ_F_ASYNC_DATA;
	}
}

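/*
 * Grab a cached io_async_msghdr from the per-ring netmsg cache if one is
 * available (unpoisoning any cached iovec for KASAN), otherwise fall back
 * to a fresh allocation via io_alloc_async_data().
 */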
static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_async_msghdr *hdr;

	hdr = io_alloc_cache_get(&ctx->netmsg_cache);
	if (hdr) {
		if (hdr->free_iov) {
			kasan_mempool_unpoison_object(hdr->free_iov,
				hdr->free_iov_nr * sizeof(struct iovec));
			req->flags |= REQ_F_NEED_CLEANUP;
		}
		req->flags |= REQ_F_ASYNC_DATA;
		req->async_data = hdr;
		return hdr;
	}

	if (!io_alloc_async_data(req)) {
		hdr = req->async_data;
		hdr->free_iov_nr = 0;
		hdr->free_iov = NULL;
		return hdr;
	}
	return NULL;
}

/* assign new iovec to kmsg, if we need to */
static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg,
			     struct iovec *iov)
{
	if (iov) {
		req->flags |= REQ_F_NEED_CLEANUP;
		kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs;
		if (kmsg->free_iov)
			kfree(kmsg->free_iov);
		kmsg->free_iov = iov;
	}
	return 0;
}

static inline void io_mshot_prep_retry(struct io_kiocb *req,
				       struct io_async_msghdr *kmsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	req->flags &= ~REQ_F_BL_EMPTY;
	sr->done_io = 0;
	sr->len = 0; /* get from the provided buffer */
	req->buf_index = sr->buf_group;
}

#ifdef CONFIG_COMPAT
static int io_compat_msg_copy_hdr(struct io_kiocb *req,
				  struct io_async_msghdr *iomsg,
				  struct compat_msghdr *msg, int ddir)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct compat_iovec __user *uiov;
	struct iovec *iov;
	int ret, nr_segs;

	if (iomsg->free_iov) {
		nr_segs = iomsg->free_iov_nr;
		iov = iomsg->free_iov;
	} else {
		iov = &iomsg->fast_iov;
		nr_segs = 1;
	}

	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
		return -EFAULT;

	uiov = compat_ptr(msg->msg_iov);
	if (req->flags & REQ_F_BUFFER_SELECT) {
		compat_ssize_t clen;

		if (msg->msg_iovlen == 0) {
			sr->len = iov->iov_len = 0;
			iov->iov_base = NULL;
		} else if (msg->msg_iovlen > 1) {
			return -EINVAL;
		} else {
			if (!access_ok(uiov, sizeof(*uiov)))
				return -EFAULT;
			if (__get_user(clen, &uiov->iov_len))
				return -EFAULT;
			if (clen < 0)
				return -EINVAL;
			sr->len = clen;
		}

		return 0;
	}

	ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen,
			     nr_segs, &iov, &iomsg->msg.msg_iter, true);
	if (unlikely(ret < 0))
		return ret;

	return io_net_vec_assign(req, iomsg, iov);
}
#endif

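/*
 * Copy the user_msghdr in with user_access_begin()/unsafe_get_user() so the
 * fields are fetched under a single user access window. For provided-buffer
 * requests only the first iovec length is needed; otherwise the full iovec
 * array is imported into the msg iterator.
 */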
static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			   struct user_msghdr *msg, int ddir)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct iovec *iov;
	int ret, nr_segs;

	if (iomsg->free_iov) {
		nr_segs = iomsg->free_iov_nr;
		iov = iomsg->free_iov;
	} else {
		iov = &iomsg->fast_iov;
		nr_segs = 1;
	}

	if (!user_access_begin(sr->umsg, sizeof(*sr->umsg)))
		return -EFAULT;

	ret = -EFAULT;
	unsafe_get_user(msg->msg_name, &sr->umsg->msg_name, ua_end);
	unsafe_get_user(msg->msg_namelen, &sr->umsg->msg_namelen, ua_end);
	unsafe_get_user(msg->msg_iov, &sr->umsg->msg_iov, ua_end);
	unsafe_get_user(msg->msg_iovlen, &sr->umsg->msg_iovlen, ua_end);
	unsafe_get_user(msg->msg_control, &sr->umsg->msg_control, ua_end);
	unsafe_get_user(msg->msg_controllen, &sr->umsg->msg_controllen, ua_end);
	msg->msg_flags = 0;

	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (msg->msg_iovlen == 0) {
			sr->len = iov->iov_len = 0;
			iov->iov_base = NULL;
		} else if (msg->msg_iovlen > 1) {
			ret = -EINVAL;
			goto ua_end;
		} else {
			/* we only need the length for provided buffers */
			if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t)))
				goto ua_end;
			unsafe_get_user(iov->iov_len, &msg->msg_iov[0].iov_len,
					ua_end);
			sr->len = iov->iov_len;
		}
		ret = 0;
ua_end:
		user_access_end();
		return ret;
	}

	user_access_end();
	ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs,
			     &iov, &iomsg->msg.msg_iter, false);
	if (unlikely(ret < 0))
		return ret;

	return io_net_vec_assign(req, iomsg, iov);
}

static int io_sendmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct user_msghdr msg;
	int ret;

	iomsg->msg.msg_name = &iomsg->addr;
	iomsg->msg.msg_iter.nr_segs = 0;

#ifdef CONFIG_COMPAT
	if (unlikely(req->ctx->compat)) {
		struct compat_msghdr cmsg;

		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE);
		if (unlikely(ret))
			return ret;

		return __get_compat_msghdr(&iomsg->msg, &cmsg, NULL);
	}
#endif

	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE);
	if (unlikely(ret))
		return ret;

	ret = __copy_msghdr(&iomsg->msg, &msg, NULL);

	/* save msg_control as sys_sendmsg() overwrites it */
	sr->msg_control = iomsg->msg.msg_control_user;
	return ret;
}

void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
{
	struct io_async_msghdr *io = req->async_data;

	io_netmsg_iovec_free(io);
}

static int io_send_setup(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	int ret;

	kmsg->msg.msg_name = NULL;
	kmsg->msg.msg_namelen = 0;
	kmsg->msg.msg_control = NULL;
	kmsg->msg.msg_controllen = 0;
	kmsg->msg.msg_ubuf = NULL;

	if (sr->addr) {
		ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr);
		if (unlikely(ret < 0))
			return ret;
		kmsg->msg.msg_name = &kmsg->addr;
		kmsg->msg.msg_namelen = sr->addr_len;
	}
	if (!io_do_buffer_select(req)) {
		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret < 0))
			return ret;
	}
	return 0;
}

static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg)
{
	struct io_async_msghdr *kmsg;
	int ret;

	kmsg = io_msg_alloc_async(req);
	if (unlikely(!kmsg))
		return -ENOMEM;
	if (!is_msg)
		return io_send_setup(req);
	ret = io_sendmsg_copy_hdr(req, kmsg);
	if (!ret)
		req->flags |= REQ_F_NEED_CLEANUP;
	return ret;
}

#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)

int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;

	if (req->opcode == IORING_OP_SEND) {
		if (READ_ONCE(sqe->__pad3[0]))
			return -EINVAL;
		sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
		sr->addr_len = READ_ONCE(sqe->addr_len);
	} else if (sqe->addr2 || sqe->file_index) {
		return -EINVAL;
	}

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~SENDMSG_FLAGS)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		if (req->opcode == IORING_OP_SENDMSG)
			return -EINVAL;
		if (!(req->flags & REQ_F_BUFFER_SELECT))
			return -EINVAL;
		sr->msg_flags |= MSG_WAITALL;
		sr->buf_group = req->buf_index;
		req->buf_list = NULL;
	}
	if (req->flags & REQ_F_BUFFER_SELECT && sr->len)
		return -EINVAL;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
	return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG);
}

static void io_req_msg_cleanup(struct io_kiocb *req,
			       unsigned int issue_flags)
{
	req->flags &= ~REQ_F_NEED_CLEANUP;
	io_netmsg_recycle(req, issue_flags);
}

/*
 * For bundle completions, we need to figure out how many segments we consumed.
 * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
 * could be using an ITER_IOVEC. If the latter, then if we consumed all of
 * the segments, then it's a trivial question to answer. If we have residual
 * data in the iter, then loop the segments to figure out how much we
 * transferred.
 */
static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
{
	struct iovec *iov;
	int nbufs;

	/* no data is always zero segments, and a ubuf is always 1 segment */
	if (ret <= 0)
		return 0;
	if (iter_is_ubuf(&kmsg->msg.msg_iter))
		return 1;

	iov = kmsg->free_iov;
	if (!iov)
		iov = &kmsg->fast_iov;

	/* if all data was transferred, it's basic pointer math */
	if (!iov_iter_count(&kmsg->msg.msg_iter))
		return iter_iov(&kmsg->msg.msg_iter) - iov;

	/* short transfer, count segments */
	nbufs = 0;
	do {
		int this_len = min_t(int, iov[nbufs].iov_len, ret);

		nbufs++;
		ret -= this_len;
	} while (ret);

	return nbufs;
}

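/*
 * Post the completion for a (bundle) send. For a bundle that transferred
 * data and still has buffers left, a CQE is posted with IORING_CQE_F_MORE
 * and the caller is told to retry; otherwise the final result is set and
 * the request is completed.
 */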
static inline bool io_send_finish(struct io_kiocb *req, int *ret,
				  struct io_async_msghdr *kmsg,
				  unsigned issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	bool bundle_finished = *ret <= 0;
	unsigned int cflags;

	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
		cflags = io_put_kbuf(req, issue_flags);
		goto finish;
	}

	cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags);

	if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
		goto finish;

	/*
	 * Fill CQE for this send and see if we should keep trying to
	 * send to this socket.
	 */
	if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
		io_mshot_prep_retry(req, kmsg);
		return false;
	}

	/* Otherwise stop bundle and use the current result. */
finish:
	io_req_set_res(req, *ret, cflags);
	*ret = IOU_OK;
	return true;
}

int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_control_user = sr->msg_control;

	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret > 0 && io_net_retry(sock, flags)) {
			kmsg->msg.msg_controllen = 0;
			kmsg->msg.msg_control = NULL;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	io_req_msg_cleanup(req, issue_flags);
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;

retry_bundle:
	if (io_do_buffer_select(req)) {
		struct buf_sel_arg arg = {
			.iovs = &kmsg->fast_iov,
			.max_len = INT_MAX,
			.nr_iovs = 1,
			.mode = KBUF_MODE_EXPAND,
		};

		if (kmsg->free_iov) {
			arg.nr_iovs = kmsg->free_iov_nr;
			arg.iovs = kmsg->free_iov;
			arg.mode |= KBUF_MODE_FREE;
		}

		if (!(sr->flags & IORING_RECVSEND_BUNDLE))
			arg.nr_iovs = 1;

		ret = io_buffers_select(req, &arg, issue_flags);
		if (unlikely(ret < 0))
			return ret;

		sr->len = arg.out_len;
		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret,
			      arg.out_len);
		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
			kmsg->free_iov_nr = ret;
			kmsg->free_iov = arg.iovs;
		}
	}

	/*
	 * If MSG_WAITALL is set, or this is a bundle send, then we need
	 * the full amount. If just bundle is set, if we do a short send
	 * then we complete the bundle sequence rather than continue on.
	 */
	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
	kmsg->msg.msg_flags = flags;
	ret = sock_sendmsg(sock, &kmsg->msg);
	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	if (!io_send_finish(req, &ret, kmsg, issue_flags))
		goto retry_bundle;

	io_req_msg_cleanup(req, issue_flags);
	return ret;
}

static int io_recvmsg_mshot_prep(struct io_kiocb *req,
				 struct io_async_msghdr *iomsg,
				 int namelen, size_t controllen)
{
	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
	    (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
		int hdr;

		if (unlikely(namelen < 0))
			return -EOVERFLOW;
		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
				       namelen, &hdr))
			return -EOVERFLOW;
		if (check_add_overflow(hdr, controllen, &hdr))
			return -EOVERFLOW;

		iomsg->namelen = namelen;
		iomsg->controllen = controllen;
		return 0;
	}

	return 0;
}

static int io_recvmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	struct user_msghdr msg;
	int ret;

	iomsg->msg.msg_name = &iomsg->addr;
	iomsg->msg.msg_iter.nr_segs = 0;

#ifdef CONFIG_COMPAT
	if (unlikely(req->ctx->compat)) {
		struct compat_msghdr cmsg;

		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST);
		if (unlikely(ret))
			return ret;

		ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr);
		if (unlikely(ret))
			return ret;

		return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen,
					     cmsg.msg_controllen);
	}
#endif

	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST);
	if (unlikely(ret))
		return ret;

	ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
	if (unlikely(ret))
		return ret;

	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
				     msg.msg_controllen);
}

static int io_recvmsg_prep_setup(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg;
	int ret;

	kmsg = io_msg_alloc_async(req);
	if (unlikely(!kmsg))
		return -ENOMEM;

	if (req->opcode == IORING_OP_RECV) {
		kmsg->msg.msg_name = NULL;
		kmsg->msg.msg_namelen = 0;
		kmsg->msg.msg_control = NULL;
		kmsg->msg.msg_get_inq = 1;
		kmsg->msg.msg_controllen = 0;
		kmsg->msg.msg_iocb = NULL;
		kmsg->msg.msg_ubuf = NULL;

		if (!io_do_buffer_select(req)) {
			ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
					  &kmsg->msg.msg_iter);
			if (unlikely(ret))
				return ret;
		}
		return 0;
	}

	ret = io_recvmsg_copy_hdr(req, kmsg);
	if (!ret)
		req->flags |= REQ_F_NEED_CLEANUP;
	return ret;
}

#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
			IORING_RECVSEND_BUNDLE)

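/*
 * Prep for IORING_OP_RECV and IORING_OP_RECVMSG. Multishot receive requires
 * provided buffers and is incompatible with MSG_WAITALL; bundle mode is
 * only allowed for the non-msghdr variant.
 */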
int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;

	if (unlikely(sqe->file_index || sqe->addr2))
		return -EINVAL;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~RECVMSG_FLAGS)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags);
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (sr->msg_flags & MSG_ERRQUEUE)
		req->flags |= REQ_F_CLEAR_POLLIN;
	if (req->flags & REQ_F_BUFFER_SELECT) {
		/*
		 * Store the buffer group for this multishot receive separately,
		 * as if we end up doing an io-wq based issue that selects a
		 * buffer, it has to be committed immediately and that will
		 * clear ->buf_list. This means we lose the link to the buffer
		 * list, and the eventual buffer put on completion then cannot
		 * restore it.
		 */
		sr->buf_group = req->buf_index;
		req->buf_list = NULL;
	}
	if (sr->flags & IORING_RECV_MULTISHOT) {
		if (!(req->flags & REQ_F_BUFFER_SELECT))
			return -EINVAL;
		if (sr->msg_flags & MSG_WAITALL)
			return -EINVAL;
		if (req->opcode == IORING_OP_RECV && sr->len)
			return -EINVAL;
		req->flags |= REQ_F_APOLL_MULTISHOT;
	}
	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		if (req->opcode == IORING_OP_RECVMSG)
			return -EINVAL;
	}

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
	sr->nr_multishot_loops = 0;
	return io_recvmsg_prep_setup(req);
}

/*
 * Finishes io_recv and io_recvmsg.
 *
 * Returns true if it is actually finished, or false if it should run
 * again (for multishot).
 */
static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
				  struct io_async_msghdr *kmsg,
				  bool mshot_finished, unsigned issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	unsigned int cflags;

	if (sr->flags & IORING_RECVSEND_BUNDLE)
		cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
				      issue_flags);
	else
		cflags = io_put_kbuf(req, issue_flags);

	if (kmsg->msg.msg_inq > 0)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	/* bundle with no more immediate buffers, we're done */
	if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY)
		goto finish;

	/*
	 * Fill CQE for this receive and see if we should keep trying to
	 * receive from this socket.
	 */
	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
	    io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
		int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;

		io_mshot_prep_retry(req, kmsg);
		/* Known not-empty or unknown state, retry */
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
				return false;
			/* mshot retries exceeded, force a requeue */
			sr->nr_multishot_loops = 0;
			mshot_retry_ret = IOU_REQUEUE;
		}
		if (issue_flags & IO_URING_F_MULTISHOT)
			*ret = mshot_retry_ret;
		else
			*ret = -EAGAIN;
		return true;
	}

	/* Finish the request / stop multishot. */
finish:
	io_req_set_res(req, *ret, cflags);

	if (issue_flags & IO_URING_F_MULTISHOT)
		*ret = IOU_STOP_MULTISHOT;
	else
		*ret = IOU_OK;
	io_req_msg_cleanup(req, issue_flags);
	return true;
}

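/*
 * For multishot recvmsg, the selected provided buffer is laid out as a
 * struct io_uring_recvmsg_out header, followed by the name and control
 * data, with the payload placed after that. Carve the buffer up
 * accordingly and stash the original buffer pointer for the final copy.
 */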
static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
				     struct io_sr_msg *sr, void __user **buf,
				     size_t *len)
{
	unsigned long ubuf = (unsigned long) *buf;
	unsigned long hdr;

	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
		kmsg->controllen;
	if (*len < hdr)
		return -EFAULT;

	if (kmsg->controllen) {
		unsigned long control = ubuf + hdr - kmsg->controllen;

		kmsg->msg.msg_control_user = (void __user *) control;
		kmsg->msg.msg_controllen = kmsg->controllen;
	}

	sr->buf = *buf; /* stash for later copy */
	*buf = (void __user *) (ubuf + hdr);
	kmsg->payloadlen = *len = *len - hdr;
	return 0;
}

struct io_recvmsg_multishot_hdr {
	struct io_uring_recvmsg_out msg;
	struct sockaddr_storage addr;
};

static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
				struct io_async_msghdr *kmsg,
				unsigned int flags, bool *finished)
{
	int err;
	int copy_len;
	struct io_recvmsg_multishot_hdr hdr;

	if (kmsg->namelen)
		kmsg->msg.msg_name = &hdr.addr;
	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
	kmsg->msg.msg_namelen = 0;

	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;

	err = sock_recvmsg(sock, &kmsg->msg, flags);
	*finished = err <= 0;
	if (err < 0)
		return err;

	hdr.msg = (struct io_uring_recvmsg_out) {
		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
	};

	hdr.msg.payloadlen = err;
	if (err > kmsg->payloadlen)
		err = kmsg->payloadlen;

	copy_len = sizeof(struct io_uring_recvmsg_out);
	if (kmsg->msg.msg_namelen > kmsg->namelen)
		copy_len += kmsg->namelen;
	else
		copy_len += kmsg->msg.msg_namelen;

	/*
	 * "fromlen shall refer to the value before truncation.."
	 *			1003.1g
	 */
	hdr.msg.namelen = kmsg->msg.msg_namelen;

	/* ensure that there is no gap between hdr and sockaddr_storage */
	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
		     sizeof(struct io_uring_recvmsg_out));
	if (copy_to_user(io->buf, &hdr, copy_len)) {
		*finished = true;
		return -EFAULT;
	}

	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
			kmsg->controllen + err;
}

int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool mshot_finished = true;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

retry_multishot:
	if (io_do_buffer_select(req)) {
		void __user *buf;
		size_t len = sr->len;

		buf = io_buffer_select(req, &len, issue_flags);
		if (!buf)
			return -ENOBUFS;

		if (req->flags & REQ_F_APOLL_MULTISHOT) {
			ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
			if (ret) {
				io_kbuf_recycle(req, issue_flags);
				return ret;
			}
		}

		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
	}

	kmsg->msg.msg_get_inq = 1;
	kmsg->msg.msg_inq = -1;
	if (req->flags & REQ_F_APOLL_MULTISHOT) {
		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
					   &mshot_finished);
	} else {
		/* disable partial retry for recvmsg with cmsg attached */
		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
			min_ret = iov_iter_count(&kmsg->msg.msg_iter);

		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
					 kmsg->uaddr, flags);
	}

	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT) {
				io_kbuf_recycle(req, issue_flags);
				return IOU_ISSUE_SKIP_COMPLETE;
			}
			return -EAGAIN;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		req_set_fail(req);
	}

	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
		goto retry_multishot;

	return ret;
}

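/*
 * Pick the buffer(s) to receive into. With a locked ring and bundle mode,
 * multiple provided buffers can be peeked at once and mapped as an iovec;
 * otherwise a single buffer is selected and mapped as a ubuf iterator.
 */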
static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
			      size_t *len, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	int ret;

	/*
	 * If the ring isn't locked, then don't use the peek interface
	 * to grab multiple buffers as we will lock/unlock between
	 * this selection and posting the buffers.
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
	    sr->flags & IORING_RECVSEND_BUNDLE) {
		struct buf_sel_arg arg = {
			.iovs = &kmsg->fast_iov,
			.nr_iovs = 1,
			.mode = KBUF_MODE_EXPAND,
		};

		if (kmsg->free_iov) {
			arg.nr_iovs = kmsg->free_iov_nr;
			arg.iovs = kmsg->free_iov;
			arg.mode |= KBUF_MODE_FREE;
		}

		if (kmsg->msg.msg_inq > 0)
			arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);

		ret = io_buffers_peek(req, &arg);
		if (unlikely(ret < 0))
			return ret;

		/* special case 1 vec, can be a fast path */
		if (ret == 1) {
			sr->buf = arg.iovs[0].iov_base;
			sr->len = arg.iovs[0].iov_len;
			goto map_ubuf;
		}
		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
			      arg.out_len);
		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
			kmsg->free_iov_nr = ret;
			kmsg->free_iov = arg.iovs;
		}
	} else {
		void __user *buf;

		*len = sr->len;
		buf = io_buffer_select(req, len, issue_flags);
		if (!buf)
			return -ENOBUFS;
		sr->buf = buf;
		sr->len = *len;
map_ubuf:
		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret))
			return ret;
	}

	return 0;
}

int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	size_t len = sr->len;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

retry_multishot:
	if (io_do_buffer_select(req)) {
		ret = io_recv_buf_select(req, kmsg, &len, issue_flags);
		if (unlikely(ret))
			goto out_free;
		sr->buf = NULL;
	}

	kmsg->msg.msg_inq = -1;
	kmsg->msg.msg_flags = 0;

	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	ret = sock_recvmsg(sock, &kmsg->msg, flags);
	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT) {
				io_kbuf_recycle(req, issue_flags);
				return IOU_ISSUE_SKIP_COMPLETE;
			}

			return -EAGAIN;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
		req_set_fail(req);
	}

	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, kmsg, ret <= 0, issue_flags))
		goto retry_multishot;

	return ret;
}

void io_send_zc_cleanup(struct io_kiocb *req)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *io = req->async_data;

	if (req_has_async_data(req))
		io_netmsg_iovec_free(io);
	if (zc->notif) {
		io_notif_flush(zc->notif);
		zc->notif = NULL;
	}
}

#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)

int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *notif;

	zc->done_io = 0;
	req->flags |= REQ_F_POLL_NO_LAZY;

	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
		return -EINVAL;
	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
	if (req->flags & REQ_F_CQE_SKIP)
		return -EINVAL;

	notif = zc->notif = io_alloc_notif(ctx);
	if (!notif)
		return -ENOMEM;
	notif->cqe.user_data = req->cqe.user_data;
	notif->cqe.res = 0;
	notif->cqe.flags = IORING_CQE_F_NOTIF;
	req->flags |= REQ_F_NEED_CLEANUP;

	zc->flags = READ_ONCE(sqe->ioprio);
	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
		if (zc->flags & ~IO_ZC_FLAGS_VALID)
			return -EINVAL;
		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
			struct io_notif_data *nd = io_notif_to_data(notif);

			nd->zc_report = true;
			nd->zc_used = false;
			nd->zc_copied = false;
		}
	}

	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
		unsigned idx = READ_ONCE(sqe->buf_index);

		if (unlikely(idx >= ctx->nr_user_bufs))
			return -EFAULT;
		idx = array_index_nospec(idx, ctx->nr_user_bufs);
		req->imu = READ_ONCE(ctx->user_bufs[idx]);
		io_req_set_rsrc_node(notif, ctx, 0);
	}

	if (req->opcode == IORING_OP_SEND_ZC) {
		if (READ_ONCE(sqe->__pad3[0]))
			return -EINVAL;
		zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
		zc->addr_len = READ_ONCE(sqe->addr_len);
	} else {
		if (unlikely(sqe->addr2 || sqe->file_index))
			return -EINVAL;
		if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
			return -EINVAL;
	}

	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
	zc->len = READ_ONCE(sqe->len);
	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
	if (zc->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		zc->msg_flags |= MSG_CMSG_COMPAT;
#endif
	return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC);
}

static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb,
				 struct iov_iter *from, size_t length)
{
	skb_zcopy_downgrade_managed(skb);
	return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);
}

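/*
 * Fill skb frags directly from a bvec iterator for registered-buffer
 * zerocopy sends, marking the skb as having managed frag references so no
 * per-page references are taken. Falls back to __zerocopy_sg_from_iter()
 * if the skb already carries unmanaged frags.
 */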
static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
			   struct iov_iter *from, size_t length)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int frag = shinfo->nr_frags;
	int ret = 0;
	struct bvec_iter bi;
	ssize_t copied = 0;
	unsigned long truesize = 0;

	if (!frag)
		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
	else if (unlikely(!skb_zcopy_managed(skb)))
		return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);

	bi.bi_size = min(from->count, length);
	bi.bi_bvec_done = from->iov_offset;
	bi.bi_idx = 0;

	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);

		copied += v.bv_len;
		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
					   v.bv_offset, v.bv_len);
		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
	}
	if (bi.bi_size)
		ret = -EMSGSIZE;

	shinfo->nr_frags = frag;
	from->bvec += bi.bi_idx;
	from->nr_segs -= bi.bi_idx;
	from->count -= copied;
	from->iov_offset = bi.bi_bvec_done;

	skb->data_len += copied;
	skb->len += copied;
	skb->truesize += truesize;

	if (sk && sk->sk_type == SOCK_STREAM) {
		sk_wmem_queued_add(sk, truesize);
		if (!skb_zcopy_pure(skb))
			sk_mem_charge(sk, truesize);
	} else {
		refcount_add(truesize, &skb->sk->sk_wmem_alloc);
	}
	return ret;
}

static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	int ret;

	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
		ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, req->imu,
				      (u64)(uintptr_t)sr->buf, sr->len);
		if (unlikely(ret))
			return ret;
		kmsg->msg.sg_from_iter = io_sg_from_iter;
	} else {
		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
		if (unlikely(ret))
			return ret;
		ret = io_notif_account_mem(sr->notif, sr->len);
		if (unlikely(ret))
			return ret;
		kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
	}

	return ret;
}

int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned msg_flags;
	int ret, min_ret = 0;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (!(req->flags & REQ_F_POLLED) &&
	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	if (!zc->done_io) {
		ret = io_send_zc_import(req, kmsg);
		if (unlikely(ret))
			return ret;
	}

	msg_flags = zc->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		msg_flags |= MSG_DONTWAIT;
	if (msg_flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;

	kmsg->msg.msg_flags = msg_flags;
	kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
	ret = sock_sendmsg(sock, &kmsg->msg);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
			zc->len -= ret;
			zc->buf += ret;
			zc->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += zc->done_io;
	else if (zc->done_io)
		ret = zc->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
	 * flushing notif to io_send_zc_cleanup()
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(zc->notif);
		io_req_msg_cleanup(req, 0);
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_OK;
}

int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_control_user = sr->msg_control;
	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
	kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
	 * flushing notif to io_send_zc_cleanup()
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(sr->notif);
		io_req_msg_cleanup(req, 0);
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_OK;
}

void io_sendrecv_fail(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	if (sr->done_io)
		req->cqe.res = sr->done_io;

	if ((req->flags & REQ_F_NEED_CLEANUP) &&
	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
		req->cqe.flags |= IORING_CQE_F_MORE;
}

#define ACCEPT_FLAGS (IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
			IORING_ACCEPT_POLL_FIRST)

int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);

	if (sqe->len || sqe->buf_index)
		return -EINVAL;

	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	accept->flags = READ_ONCE(sqe->accept_flags);
	accept->nofile = rlimit(RLIMIT_NOFILE);
	accept->iou_flags = READ_ONCE(sqe->ioprio);
	if (accept->iou_flags & ~ACCEPT_FLAGS)
		return -EINVAL;

	accept->file_slot = READ_ONCE(sqe->file_index);
	if (accept->file_slot) {
		if (accept->flags & SOCK_CLOEXEC)
			return -EINVAL;
		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
			return -EINVAL;
	}
	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
		req->flags |= REQ_F_APOLL_MULTISHOT;
	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	return 0;
}

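/*
 * Issue an accept. For multishot accepts, each successfully accepted
 * connection posts a CQE with IORING_CQE_F_MORE and the accept is retried
 * while more connections may be pending; an error or a failure to post the
 * CQE terminates the multishot sequence.
 */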
int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool fixed = !!accept->file_slot;
	struct proto_accept_arg arg = {
		.flags = force_nonblock ? O_NONBLOCK : 0,
	};
	struct file *file;
	unsigned cflags;
	int ret, fd;

	if (!(req->flags & REQ_F_POLLED) &&
	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
		return -EAGAIN;

retry:
	if (!fixed) {
		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	arg.err = 0;
	arg.is_empty = -1;
	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
			 accept->flags);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && force_nonblock &&
		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) {
			/*
			 * if it's multishot and polled, we don't need to
			 * return EAGAIN to arm the poll infra since it
			 * has already been done
			 */
			if (issue_flags & IO_URING_F_MULTISHOT)
				return IOU_ISSUE_SKIP_COMPLETE;
			return ret;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					  accept->file_slot);
	}

	cflags = 0;
	if (!arg.is_empty)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
		io_req_set_res(req, ret, cflags);
		return IOU_OK;
	}

	if (ret < 0)
		return ret;
	if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
			goto retry;
		if (issue_flags & IO_URING_F_MULTISHOT)
			return IOU_ISSUE_SKIP_COMPLETE;
		return -EAGAIN;
	}

	io_req_set_res(req, ret, cflags);
	return IOU_STOP_MULTISHOT;
}

int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);

	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
		return -EINVAL;

	sock->domain = READ_ONCE(sqe->fd);
	sock->type = READ_ONCE(sqe->off);
	sock->protocol = READ_ONCE(sqe->len);
	sock->file_slot = READ_ONCE(sqe->file_index);
	sock->nofile = rlimit(RLIMIT_NOFILE);

	sock->flags = sock->type & ~SOCK_TYPE_MASK;
	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
		return -EINVAL;
	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	return 0;
}

int io_socket(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
	bool fixed = !!sock->file_slot;
	struct file *file;
	int ret, fd;

	if (!fixed) {
		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					  sock->file_slot);
	}
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_msghdr *io;

	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	conn->addr_len = READ_ONCE(sqe->addr2);
	conn->in_progress = conn->seen_econnaborted = false;

	io = io_msg_alloc_async(req);
	if (unlikely(!io))
		return -ENOMEM;

	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
}

int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_msghdr *io = req->async_data;
	unsigned file_flags;
	int ret;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	file_flags = force_nonblock ? O_NONBLOCK : 0;

	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
				 file_flags);
	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
	    && force_nonblock) {
		if (ret == -EINPROGRESS) {
			connect->in_progress = true;
		} else if (ret == -ECONNABORTED) {
			if (connect->seen_econnaborted)
				goto out;
			connect->seen_econnaborted = true;
		}
		return -EAGAIN;
	}
	if (connect->in_progress) {
		/*
		 * At least bluetooth will return -EBADFD on a re-connect
		 * attempt, and it's (supposedly) also valid to get -EISCONN
		 * which means the previous result is good. For both of these,
		 * grab the sock_error() and use that for the completion.
		 */
		if (ret == -EBADFD || ret == -EISCONN)
			ret = sock_error(sock_from_file(req->file)->sk);
	}
	if (ret == -ERESTARTSYS)
		ret = -EINTR;
out:
	if (ret < 0)
		req_set_fail(req);
	io_req_msg_cleanup(req, issue_flags);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

void io_netmsg_cache_free(const void *entry)
{
	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;

	if (kmsg->free_iov) {
		kasan_mempool_unpoison_object(kmsg->free_iov,
				kmsg->free_iov_nr * sizeof(struct iovec));
		io_netmsg_iovec_free(kmsg);
	}
	kfree(kmsg);
}
#endif