1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/kernel.h> 3 #include <linux/errno.h> 4 #include <linux/file.h> 5 #include <linux/slab.h> 6 #include <linux/net.h> 7 #include <linux/compat.h> 8 #include <net/compat.h> 9 #include <linux/io_uring.h> 10 11 #include <uapi/linux/io_uring.h> 12 13 #include "io_uring.h" 14 #include "kbuf.h" 15 #include "alloc_cache.h" 16 #include "net.h" 17 #include "notif.h" 18 #include "rsrc.h" 19 20 #if defined(CONFIG_NET) 21 struct io_shutdown { 22 struct file *file; 23 int how; 24 }; 25 26 struct io_accept { 27 struct file *file; 28 struct sockaddr __user *addr; 29 int __user *addr_len; 30 int flags; 31 int iou_flags; 32 u32 file_slot; 33 unsigned long nofile; 34 }; 35 36 struct io_socket { 37 struct file *file; 38 int domain; 39 int type; 40 int protocol; 41 int flags; 42 u32 file_slot; 43 unsigned long nofile; 44 }; 45 46 struct io_connect { 47 struct file *file; 48 struct sockaddr __user *addr; 49 int addr_len; 50 bool in_progress; 51 bool seen_econnaborted; 52 }; 53 54 struct io_sr_msg { 55 struct file *file; 56 union { 57 struct compat_msghdr __user *umsg_compat; 58 struct user_msghdr __user *umsg; 59 void __user *buf; 60 }; 61 int len; 62 unsigned done_io; 63 unsigned msg_flags; 64 unsigned nr_multishot_loops; 65 u16 flags; 66 /* initialised and used only by !msg send variants */ 67 u16 addr_len; 68 u16 buf_group; 69 void __user *addr; 70 void __user *msg_control; 71 /* used only for send zerocopy */ 72 struct io_kiocb *notif; 73 }; 74 75 /* 76 * Number of times we'll try and do receives if there's more data. If we 77 * exceed this limit, then add us to the back of the queue and retry from 78 * there. This helps fairness between flooding clients. 79 */ 80 #define MULTISHOT_MAX_RETRY 32 81 82 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 83 { 84 struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); 85 86 if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || 87 sqe->buf_index || sqe->splice_fd_in)) 88 return -EINVAL; 89 90 shutdown->how = READ_ONCE(sqe->len); 91 req->flags |= REQ_F_FORCE_ASYNC; 92 return 0; 93 } 94 95 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) 96 { 97 struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); 98 struct socket *sock; 99 int ret; 100 101 WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); 102 103 sock = sock_from_file(req->file); 104 if (unlikely(!sock)) 105 return -ENOTSOCK; 106 107 ret = __sys_shutdown_sock(sock, shutdown->how); 108 io_req_set_res(req, ret, 0); 109 return IOU_OK; 110 } 111 112 static bool io_net_retry(struct socket *sock, int flags) 113 { 114 if (!(flags & MSG_WAITALL)) 115 return false; 116 return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; 117 } 118 119 static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg) 120 { 121 if (kmsg->free_iov) { 122 kfree(kmsg->free_iov); 123 kmsg->free_iov_nr = 0; 124 kmsg->free_iov = NULL; 125 } 126 } 127 128 static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) 129 { 130 struct io_async_msghdr *hdr = req->async_data; 131 struct iovec *iov; 132 133 /* can't recycle, ensure we free the iovec if we have one */ 134 if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { 135 io_netmsg_iovec_free(hdr); 136 return; 137 } 138 139 /* Let normal cleanup path reap it if we fail adding to the cache */ 140 iov = hdr->free_iov; 141 if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { 142 if (iov) 143 kasan_mempool_poison_object(iov); 144 req->async_data = NULL; 145 req->flags &= ~REQ_F_ASYNC_DATA; 146 } 147 } 148 149 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) 150 { 151 struct io_ring_ctx *ctx = req->ctx; 152 struct io_async_msghdr *hdr; 153 154 hdr = io_alloc_cache_get(&ctx->netmsg_cache); 155 if (hdr) { 156 if (hdr->free_iov) { 157 kasan_mempool_unpoison_object(hdr->free_iov, 158 hdr->free_iov_nr * sizeof(struct iovec)); 159 req->flags |= REQ_F_NEED_CLEANUP; 160 } 161 req->flags |= REQ_F_ASYNC_DATA; 162 req->async_data = hdr; 163 return hdr; 164 } 165 166 if (!io_alloc_async_data(req)) { 167 hdr = req->async_data; 168 hdr->free_iov_nr = 0; 169 hdr->free_iov = NULL; 170 return hdr; 171 } 172 return NULL; 173 } 174 175 /* assign new iovec to kmsg, if we need to */ 176 static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, 177 struct iovec *iov) 178 { 179 if (iov) { 180 req->flags |= REQ_F_NEED_CLEANUP; 181 kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs; 182 if (kmsg->free_iov) 183 kfree(kmsg->free_iov); 184 kmsg->free_iov = iov; 185 } 186 return 0; 187 } 188 189 static inline void io_mshot_prep_retry(struct io_kiocb *req, 190 struct io_async_msghdr *kmsg) 191 { 192 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 193 194 req->flags &= ~REQ_F_BL_EMPTY; 195 sr->done_io = 0; 196 sr->len = 0; /* get from the provided buffer */ 197 req->buf_index = sr->buf_group; 198 } 199 200 #ifdef CONFIG_COMPAT 201 static int io_compat_msg_copy_hdr(struct io_kiocb *req, 202 struct io_async_msghdr *iomsg, 203 struct compat_msghdr *msg, int ddir) 204 { 205 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 206 struct compat_iovec __user *uiov; 207 struct iovec *iov; 208 int ret, nr_segs; 209 210 if (iomsg->free_iov) { 211 nr_segs = iomsg->free_iov_nr; 212 iov = iomsg->free_iov; 213 } else { 214 iov = &iomsg->fast_iov; 215 nr_segs = 1; 216 } 217 218 if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) 219 return -EFAULT; 220 221 uiov = compat_ptr(msg->msg_iov); 222 if (req->flags & REQ_F_BUFFER_SELECT) { 223 compat_ssize_t clen; 224 225 if (msg->msg_iovlen == 0) { 226 sr->len = iov->iov_len = 0; 227 iov->iov_base = NULL; 228 } else if (msg->msg_iovlen > 1) { 229 return -EINVAL; 230 } else { 231 if (!access_ok(uiov, sizeof(*uiov))) 232 return -EFAULT; 233 if (__get_user(clen, &uiov->iov_len)) 234 return -EFAULT; 235 if (clen < 0) 236 return -EINVAL; 237 sr->len = clen; 238 } 239 240 return 0; 241 } 242 243 ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, 244 nr_segs, &iov, &iomsg->msg.msg_iter, true); 245 if (unlikely(ret < 0)) 246 return ret; 247 248 return io_net_vec_assign(req, iomsg, iov); 249 } 250 #endif 251 252 static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, 253 struct user_msghdr *msg, int ddir) 254 { 255 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 256 struct iovec *iov; 257 int ret, nr_segs; 258 259 if (iomsg->free_iov) { 260 nr_segs = iomsg->free_iov_nr; 261 iov = iomsg->free_iov; 262 } else { 263 iov = &iomsg->fast_iov; 264 nr_segs = 1; 265 } 266 267 if (!user_access_begin(sr->umsg, sizeof(*sr->umsg))) 268 return -EFAULT; 269 270 ret = -EFAULT; 271 unsafe_get_user(msg->msg_name, &sr->umsg->msg_name, ua_end); 272 unsafe_get_user(msg->msg_namelen, &sr->umsg->msg_namelen, ua_end); 273 unsafe_get_user(msg->msg_iov, &sr->umsg->msg_iov, ua_end); 274 unsafe_get_user(msg->msg_iovlen, &sr->umsg->msg_iovlen, ua_end); 275 unsafe_get_user(msg->msg_control, &sr->umsg->msg_control, ua_end); 276 unsafe_get_user(msg->msg_controllen, &sr->umsg->msg_controllen, ua_end); 277 msg->msg_flags = 0; 278 279 if (req->flags & REQ_F_BUFFER_SELECT) { 280 if (msg->msg_iovlen == 0) { 281 sr->len = iov->iov_len = 0; 282 iov->iov_base = NULL; 283 } else if (msg->msg_iovlen > 1) { 284 ret = -EINVAL; 285 goto ua_end; 286 } else { 287 /* we only need the length for provided buffers */ 288 if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t))) 289 goto ua_end; 290 unsafe_get_user(iov->iov_len, &msg->msg_iov[0].iov_len, 291 ua_end); 292 sr->len = iov->iov_len; 293 } 294 ret = 0; 295 ua_end: 296 user_access_end(); 297 return ret; 298 } 299 300 user_access_end(); 301 ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, 302 &iov, &iomsg->msg.msg_iter, false); 303 if (unlikely(ret < 0)) 304 return ret; 305 306 return io_net_vec_assign(req, iomsg, iov); 307 } 308 309 static int io_sendmsg_copy_hdr(struct io_kiocb *req, 310 struct io_async_msghdr *iomsg) 311 { 312 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 313 struct user_msghdr msg; 314 int ret; 315 316 iomsg->msg.msg_name = &iomsg->addr; 317 iomsg->msg.msg_iter.nr_segs = 0; 318 319 #ifdef CONFIG_COMPAT 320 if (unlikely(req->ctx->compat)) { 321 struct compat_msghdr cmsg; 322 323 ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE); 324 if (unlikely(ret)) 325 return ret; 326 327 return __get_compat_msghdr(&iomsg->msg, &cmsg, NULL); 328 } 329 #endif 330 331 ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE); 332 if (unlikely(ret)) 333 return ret; 334 335 ret = __copy_msghdr(&iomsg->msg, &msg, NULL); 336 337 /* save msg_control as sys_sendmsg() overwrites it */ 338 sr->msg_control = iomsg->msg.msg_control_user; 339 return ret; 340 } 341 342 void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) 343 { 344 struct io_async_msghdr *io = req->async_data; 345 346 io_netmsg_iovec_free(io); 347 } 348 349 static int io_send_setup(struct io_kiocb *req) 350 { 351 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 352 struct io_async_msghdr *kmsg = req->async_data; 353 int ret; 354 355 kmsg->msg.msg_name = NULL; 356 kmsg->msg.msg_namelen = 0; 357 kmsg->msg.msg_control = NULL; 358 kmsg->msg.msg_controllen = 0; 359 kmsg->msg.msg_ubuf = NULL; 360 361 if (sr->addr) { 362 ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr); 363 if (unlikely(ret < 0)) 364 return ret; 365 kmsg->msg.msg_name = &kmsg->addr; 366 kmsg->msg.msg_namelen = sr->addr_len; 367 } 368 if (!io_do_buffer_select(req)) { 369 ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, 370 &kmsg->msg.msg_iter); 371 if (unlikely(ret < 0)) 372 return ret; 373 } 374 return 0; 375 } 376 377 static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg) 378 { 379 struct io_async_msghdr *kmsg; 380 int ret; 381 382 kmsg = io_msg_alloc_async(req); 383 if (unlikely(!kmsg)) 384 return -ENOMEM; 385 if (!is_msg) 386 return io_send_setup(req); 387 ret = io_sendmsg_copy_hdr(req, kmsg); 388 if (!ret) 389 req->flags |= REQ_F_NEED_CLEANUP; 390 return ret; 391 } 392 393 #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) 394 395 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 396 { 397 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 398 399 sr->done_io = 0; 400 401 if (req->opcode == IORING_OP_SEND) { 402 if (READ_ONCE(sqe->__pad3[0])) 403 return -EINVAL; 404 sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 405 sr->addr_len = READ_ONCE(sqe->addr_len); 406 } else if (sqe->addr2 || sqe->file_index) { 407 return -EINVAL; 408 } 409 410 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 411 sr->len = READ_ONCE(sqe->len); 412 sr->flags = READ_ONCE(sqe->ioprio); 413 if (sr->flags & ~SENDMSG_FLAGS) 414 return -EINVAL; 415 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 416 if (sr->msg_flags & MSG_DONTWAIT) 417 req->flags |= REQ_F_NOWAIT; 418 if (sr->flags & IORING_RECVSEND_BUNDLE) { 419 if (req->opcode == IORING_OP_SENDMSG) 420 return -EINVAL; 421 if (!(req->flags & REQ_F_BUFFER_SELECT)) 422 return -EINVAL; 423 sr->msg_flags |= MSG_WAITALL; 424 sr->buf_group = req->buf_index; 425 req->buf_list = NULL; 426 } 427 if (req->flags & REQ_F_BUFFER_SELECT && sr->len) 428 return -EINVAL; 429 430 #ifdef CONFIG_COMPAT 431 if (req->ctx->compat) 432 sr->msg_flags |= MSG_CMSG_COMPAT; 433 #endif 434 return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG); 435 } 436 437 static void io_req_msg_cleanup(struct io_kiocb *req, 438 unsigned int issue_flags) 439 { 440 req->flags &= ~REQ_F_NEED_CLEANUP; 441 io_netmsg_recycle(req, issue_flags); 442 } 443 444 /* 445 * For bundle completions, we need to figure out how many segments we consumed. 446 * A bundle could be using a single ITER_UBUF if that's all we mapped, or it 447 * could be using an ITER_IOVEC. If the latter, then if we consumed all of 448 * the segments, then it's a trivial questiont o answer. If we have residual 449 * data in the iter, then loop the segments to figure out how much we 450 * transferred. 451 */ 452 static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) 453 { 454 struct iovec *iov; 455 int nbufs; 456 457 /* no data is always zero segments, and a ubuf is always 1 segment */ 458 if (ret <= 0) 459 return 0; 460 if (iter_is_ubuf(&kmsg->msg.msg_iter)) 461 return 1; 462 463 iov = kmsg->free_iov; 464 if (!iov) 465 iov = &kmsg->fast_iov; 466 467 /* if all data was transferred, it's basic pointer math */ 468 if (!iov_iter_count(&kmsg->msg.msg_iter)) 469 return iter_iov(&kmsg->msg.msg_iter) - iov; 470 471 /* short transfer, count segments */ 472 nbufs = 0; 473 do { 474 int this_len = min_t(int, iov[nbufs].iov_len, ret); 475 476 nbufs++; 477 ret -= this_len; 478 } while (ret); 479 480 return nbufs; 481 } 482 483 static inline bool io_send_finish(struct io_kiocb *req, int *ret, 484 struct io_async_msghdr *kmsg, 485 unsigned issue_flags) 486 { 487 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 488 bool bundle_finished = *ret <= 0; 489 unsigned int cflags; 490 491 if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { 492 cflags = io_put_kbuf(req, issue_flags); 493 goto finish; 494 } 495 496 cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags); 497 498 if (bundle_finished || req->flags & REQ_F_BL_EMPTY) 499 goto finish; 500 501 /* 502 * Fill CQE for this receive and see if we should keep trying to 503 * receive from this socket. 504 */ 505 if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { 506 io_mshot_prep_retry(req, kmsg); 507 return false; 508 } 509 510 /* Otherwise stop bundle and use the current result. */ 511 finish: 512 io_req_set_res(req, *ret, cflags); 513 *ret = IOU_OK; 514 return true; 515 } 516 517 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) 518 { 519 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 520 struct io_async_msghdr *kmsg = req->async_data; 521 struct socket *sock; 522 unsigned flags; 523 int min_ret = 0; 524 int ret; 525 526 sock = sock_from_file(req->file); 527 if (unlikely(!sock)) 528 return -ENOTSOCK; 529 530 if (!(req->flags & REQ_F_POLLED) && 531 (sr->flags & IORING_RECVSEND_POLL_FIRST)) 532 return -EAGAIN; 533 534 flags = sr->msg_flags; 535 if (issue_flags & IO_URING_F_NONBLOCK) 536 flags |= MSG_DONTWAIT; 537 if (flags & MSG_WAITALL) 538 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 539 540 kmsg->msg.msg_control_user = sr->msg_control; 541 542 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 543 544 if (ret < min_ret) { 545 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 546 return -EAGAIN; 547 if (ret > 0 && io_net_retry(sock, flags)) { 548 kmsg->msg.msg_controllen = 0; 549 kmsg->msg.msg_control = NULL; 550 sr->done_io += ret; 551 req->flags |= REQ_F_BL_NO_RECYCLE; 552 return -EAGAIN; 553 } 554 if (ret == -ERESTARTSYS) 555 ret = -EINTR; 556 req_set_fail(req); 557 } 558 io_req_msg_cleanup(req, issue_flags); 559 if (ret >= 0) 560 ret += sr->done_io; 561 else if (sr->done_io) 562 ret = sr->done_io; 563 io_req_set_res(req, ret, 0); 564 return IOU_OK; 565 } 566 567 int io_send(struct io_kiocb *req, unsigned int issue_flags) 568 { 569 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 570 struct io_async_msghdr *kmsg = req->async_data; 571 struct socket *sock; 572 unsigned flags; 573 int min_ret = 0; 574 int ret; 575 576 sock = sock_from_file(req->file); 577 if (unlikely(!sock)) 578 return -ENOTSOCK; 579 580 if (!(req->flags & REQ_F_POLLED) && 581 (sr->flags & IORING_RECVSEND_POLL_FIRST)) 582 return -EAGAIN; 583 584 flags = sr->msg_flags; 585 if (issue_flags & IO_URING_F_NONBLOCK) 586 flags |= MSG_DONTWAIT; 587 588 retry_bundle: 589 if (io_do_buffer_select(req)) { 590 struct buf_sel_arg arg = { 591 .iovs = &kmsg->fast_iov, 592 .max_len = INT_MAX, 593 .nr_iovs = 1, 594 .mode = KBUF_MODE_EXPAND, 595 }; 596 597 if (kmsg->free_iov) { 598 arg.nr_iovs = kmsg->free_iov_nr; 599 arg.iovs = kmsg->free_iov; 600 arg.mode |= KBUF_MODE_FREE; 601 } 602 603 if (!(sr->flags & IORING_RECVSEND_BUNDLE)) 604 arg.nr_iovs = 1; 605 606 ret = io_buffers_select(req, &arg, issue_flags); 607 if (unlikely(ret < 0)) 608 return ret; 609 610 sr->len = arg.out_len; 611 iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret, 612 arg.out_len); 613 if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { 614 kmsg->free_iov_nr = ret; 615 kmsg->free_iov = arg.iovs; 616 } 617 } 618 619 /* 620 * If MSG_WAITALL is set, or this is a bundle send, then we need 621 * the full amount. If just bundle is set, if we do a short send 622 * then we complete the bundle sequence rather than continue on. 623 */ 624 if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE) 625 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 626 627 flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; 628 kmsg->msg.msg_flags = flags; 629 ret = sock_sendmsg(sock, &kmsg->msg); 630 if (ret < min_ret) { 631 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 632 return -EAGAIN; 633 634 if (ret > 0 && io_net_retry(sock, flags)) { 635 sr->len -= ret; 636 sr->buf += ret; 637 sr->done_io += ret; 638 req->flags |= REQ_F_BL_NO_RECYCLE; 639 return -EAGAIN; 640 } 641 if (ret == -ERESTARTSYS) 642 ret = -EINTR; 643 req_set_fail(req); 644 } 645 if (ret >= 0) 646 ret += sr->done_io; 647 else if (sr->done_io) 648 ret = sr->done_io; 649 650 if (!io_send_finish(req, &ret, kmsg, issue_flags)) 651 goto retry_bundle; 652 653 io_req_msg_cleanup(req, issue_flags); 654 return ret; 655 } 656 657 static int io_recvmsg_mshot_prep(struct io_kiocb *req, 658 struct io_async_msghdr *iomsg, 659 int namelen, size_t controllen) 660 { 661 if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) == 662 (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) { 663 int hdr; 664 665 if (unlikely(namelen < 0)) 666 return -EOVERFLOW; 667 if (check_add_overflow(sizeof(struct io_uring_recvmsg_out), 668 namelen, &hdr)) 669 return -EOVERFLOW; 670 if (check_add_overflow(hdr, controllen, &hdr)) 671 return -EOVERFLOW; 672 673 iomsg->namelen = namelen; 674 iomsg->controllen = controllen; 675 return 0; 676 } 677 678 return 0; 679 } 680 681 static int io_recvmsg_copy_hdr(struct io_kiocb *req, 682 struct io_async_msghdr *iomsg) 683 { 684 struct user_msghdr msg; 685 int ret; 686 687 iomsg->msg.msg_name = &iomsg->addr; 688 iomsg->msg.msg_iter.nr_segs = 0; 689 690 #ifdef CONFIG_COMPAT 691 if (unlikely(req->ctx->compat)) { 692 struct compat_msghdr cmsg; 693 694 ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST); 695 if (unlikely(ret)) 696 return ret; 697 698 ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr); 699 if (unlikely(ret)) 700 return ret; 701 702 return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen, 703 cmsg.msg_controllen); 704 } 705 #endif 706 707 ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST); 708 if (unlikely(ret)) 709 return ret; 710 711 ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); 712 if (unlikely(ret)) 713 return ret; 714 715 return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, 716 msg.msg_controllen); 717 } 718 719 static int io_recvmsg_prep_setup(struct io_kiocb *req) 720 { 721 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 722 struct io_async_msghdr *kmsg; 723 int ret; 724 725 kmsg = io_msg_alloc_async(req); 726 if (unlikely(!kmsg)) 727 return -ENOMEM; 728 729 if (req->opcode == IORING_OP_RECV) { 730 kmsg->msg.msg_name = NULL; 731 kmsg->msg.msg_namelen = 0; 732 kmsg->msg.msg_control = NULL; 733 kmsg->msg.msg_get_inq = 1; 734 kmsg->msg.msg_controllen = 0; 735 kmsg->msg.msg_iocb = NULL; 736 kmsg->msg.msg_ubuf = NULL; 737 738 if (!io_do_buffer_select(req)) { 739 ret = import_ubuf(ITER_DEST, sr->buf, sr->len, 740 &kmsg->msg.msg_iter); 741 if (unlikely(ret)) 742 return ret; 743 } 744 return 0; 745 } 746 747 ret = io_recvmsg_copy_hdr(req, kmsg); 748 if (!ret) 749 req->flags |= REQ_F_NEED_CLEANUP; 750 return ret; 751 } 752 753 #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \ 754 IORING_RECVSEND_BUNDLE) 755 756 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 757 { 758 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 759 760 sr->done_io = 0; 761 762 if (unlikely(sqe->file_index || sqe->addr2)) 763 return -EINVAL; 764 765 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 766 sr->len = READ_ONCE(sqe->len); 767 sr->flags = READ_ONCE(sqe->ioprio); 768 if (sr->flags & ~RECVMSG_FLAGS) 769 return -EINVAL; 770 sr->msg_flags = READ_ONCE(sqe->msg_flags); 771 if (sr->msg_flags & MSG_DONTWAIT) 772 req->flags |= REQ_F_NOWAIT; 773 if (sr->msg_flags & MSG_ERRQUEUE) 774 req->flags |= REQ_F_CLEAR_POLLIN; 775 if (req->flags & REQ_F_BUFFER_SELECT) { 776 /* 777 * Store the buffer group for this multishot receive separately, 778 * as if we end up doing an io-wq based issue that selects a 779 * buffer, it has to be committed immediately and that will 780 * clear ->buf_list. This means we lose the link to the buffer 781 * list, and the eventual buffer put on completion then cannot 782 * restore it. 783 */ 784 sr->buf_group = req->buf_index; 785 req->buf_list = NULL; 786 } 787 if (sr->flags & IORING_RECV_MULTISHOT) { 788 if (!(req->flags & REQ_F_BUFFER_SELECT)) 789 return -EINVAL; 790 if (sr->msg_flags & MSG_WAITALL) 791 return -EINVAL; 792 if (req->opcode == IORING_OP_RECV && sr->len) 793 return -EINVAL; 794 req->flags |= REQ_F_APOLL_MULTISHOT; 795 } 796 if (sr->flags & IORING_RECVSEND_BUNDLE) { 797 if (req->opcode == IORING_OP_RECVMSG) 798 return -EINVAL; 799 } 800 801 #ifdef CONFIG_COMPAT 802 if (req->ctx->compat) 803 sr->msg_flags |= MSG_CMSG_COMPAT; 804 #endif 805 sr->nr_multishot_loops = 0; 806 return io_recvmsg_prep_setup(req); 807 } 808 809 /* 810 * Finishes io_recv and io_recvmsg. 811 * 812 * Returns true if it is actually finished, or false if it should run 813 * again (for multishot). 814 */ 815 static inline bool io_recv_finish(struct io_kiocb *req, int *ret, 816 struct io_async_msghdr *kmsg, 817 bool mshot_finished, unsigned issue_flags) 818 { 819 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 820 unsigned int cflags; 821 822 if (sr->flags & IORING_RECVSEND_BUNDLE) 823 cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), 824 issue_flags); 825 else 826 cflags = io_put_kbuf(req, issue_flags); 827 828 if (kmsg->msg.msg_inq > 0) 829 cflags |= IORING_CQE_F_SOCK_NONEMPTY; 830 831 /* bundle with no more immediate buffers, we're done */ 832 if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY) 833 goto finish; 834 835 /* 836 * Fill CQE for this receive and see if we should keep trying to 837 * receive from this socket. 838 */ 839 if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && 840 io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { 841 int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; 842 843 io_mshot_prep_retry(req, kmsg); 844 /* Known not-empty or unknown state, retry */ 845 if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { 846 if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) 847 return false; 848 /* mshot retries exceeded, force a requeue */ 849 sr->nr_multishot_loops = 0; 850 mshot_retry_ret = IOU_REQUEUE; 851 } 852 if (issue_flags & IO_URING_F_MULTISHOT) 853 *ret = mshot_retry_ret; 854 else 855 *ret = -EAGAIN; 856 return true; 857 } 858 859 /* Finish the request / stop multishot. */ 860 finish: 861 io_req_set_res(req, *ret, cflags); 862 863 if (issue_flags & IO_URING_F_MULTISHOT) 864 *ret = IOU_STOP_MULTISHOT; 865 else 866 *ret = IOU_OK; 867 io_req_msg_cleanup(req, issue_flags); 868 return true; 869 } 870 871 static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg, 872 struct io_sr_msg *sr, void __user **buf, 873 size_t *len) 874 { 875 unsigned long ubuf = (unsigned long) *buf; 876 unsigned long hdr; 877 878 hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen + 879 kmsg->controllen; 880 if (*len < hdr) 881 return -EFAULT; 882 883 if (kmsg->controllen) { 884 unsigned long control = ubuf + hdr - kmsg->controllen; 885 886 kmsg->msg.msg_control_user = (void __user *) control; 887 kmsg->msg.msg_controllen = kmsg->controllen; 888 } 889 890 sr->buf = *buf; /* stash for later copy */ 891 *buf = (void __user *) (ubuf + hdr); 892 kmsg->payloadlen = *len = *len - hdr; 893 return 0; 894 } 895 896 struct io_recvmsg_multishot_hdr { 897 struct io_uring_recvmsg_out msg; 898 struct sockaddr_storage addr; 899 }; 900 901 static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io, 902 struct io_async_msghdr *kmsg, 903 unsigned int flags, bool *finished) 904 { 905 int err; 906 int copy_len; 907 struct io_recvmsg_multishot_hdr hdr; 908 909 if (kmsg->namelen) 910 kmsg->msg.msg_name = &hdr.addr; 911 kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); 912 kmsg->msg.msg_namelen = 0; 913 914 if (sock->file->f_flags & O_NONBLOCK) 915 flags |= MSG_DONTWAIT; 916 917 err = sock_recvmsg(sock, &kmsg->msg, flags); 918 *finished = err <= 0; 919 if (err < 0) 920 return err; 921 922 hdr.msg = (struct io_uring_recvmsg_out) { 923 .controllen = kmsg->controllen - kmsg->msg.msg_controllen, 924 .flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT 925 }; 926 927 hdr.msg.payloadlen = err; 928 if (err > kmsg->payloadlen) 929 err = kmsg->payloadlen; 930 931 copy_len = sizeof(struct io_uring_recvmsg_out); 932 if (kmsg->msg.msg_namelen > kmsg->namelen) 933 copy_len += kmsg->namelen; 934 else 935 copy_len += kmsg->msg.msg_namelen; 936 937 /* 938 * "fromlen shall refer to the value before truncation.." 939 * 1003.1g 940 */ 941 hdr.msg.namelen = kmsg->msg.msg_namelen; 942 943 /* ensure that there is no gap between hdr and sockaddr_storage */ 944 BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) != 945 sizeof(struct io_uring_recvmsg_out)); 946 if (copy_to_user(io->buf, &hdr, copy_len)) { 947 *finished = true; 948 return -EFAULT; 949 } 950 951 return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen + 952 kmsg->controllen + err; 953 } 954 955 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) 956 { 957 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 958 struct io_async_msghdr *kmsg = req->async_data; 959 struct socket *sock; 960 unsigned flags; 961 int ret, min_ret = 0; 962 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 963 bool mshot_finished = true; 964 965 sock = sock_from_file(req->file); 966 if (unlikely(!sock)) 967 return -ENOTSOCK; 968 969 if (!(req->flags & REQ_F_POLLED) && 970 (sr->flags & IORING_RECVSEND_POLL_FIRST)) 971 return -EAGAIN; 972 973 flags = sr->msg_flags; 974 if (force_nonblock) 975 flags |= MSG_DONTWAIT; 976 977 retry_multishot: 978 if (io_do_buffer_select(req)) { 979 void __user *buf; 980 size_t len = sr->len; 981 982 buf = io_buffer_select(req, &len, issue_flags); 983 if (!buf) 984 return -ENOBUFS; 985 986 if (req->flags & REQ_F_APOLL_MULTISHOT) { 987 ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len); 988 if (ret) { 989 io_kbuf_recycle(req, issue_flags); 990 return ret; 991 } 992 } 993 994 iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len); 995 } 996 997 kmsg->msg.msg_get_inq = 1; 998 kmsg->msg.msg_inq = -1; 999 if (req->flags & REQ_F_APOLL_MULTISHOT) { 1000 ret = io_recvmsg_multishot(sock, sr, kmsg, flags, 1001 &mshot_finished); 1002 } else { 1003 /* disable partial retry for recvmsg with cmsg attached */ 1004 if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen) 1005 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 1006 1007 ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, 1008 kmsg->uaddr, flags); 1009 } 1010 1011 if (ret < min_ret) { 1012 if (ret == -EAGAIN && force_nonblock) { 1013 if (issue_flags & IO_URING_F_MULTISHOT) { 1014 io_kbuf_recycle(req, issue_flags); 1015 return IOU_ISSUE_SKIP_COMPLETE; 1016 } 1017 return -EAGAIN; 1018 } 1019 if (ret > 0 && io_net_retry(sock, flags)) { 1020 sr->done_io += ret; 1021 req->flags |= REQ_F_BL_NO_RECYCLE; 1022 return -EAGAIN; 1023 } 1024 if (ret == -ERESTARTSYS) 1025 ret = -EINTR; 1026 req_set_fail(req); 1027 } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { 1028 req_set_fail(req); 1029 } 1030 1031 if (ret > 0) 1032 ret += sr->done_io; 1033 else if (sr->done_io) 1034 ret = sr->done_io; 1035 else 1036 io_kbuf_recycle(req, issue_flags); 1037 1038 if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) 1039 goto retry_multishot; 1040 1041 return ret; 1042 } 1043 1044 static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg, 1045 size_t *len, unsigned int issue_flags) 1046 { 1047 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1048 int ret; 1049 1050 /* 1051 * If the ring isn't locked, then don't use the peek interface 1052 * to grab multiple buffers as we will lock/unlock between 1053 * this selection and posting the buffers. 1054 */ 1055 if (!(issue_flags & IO_URING_F_UNLOCKED) && 1056 sr->flags & IORING_RECVSEND_BUNDLE) { 1057 struct buf_sel_arg arg = { 1058 .iovs = &kmsg->fast_iov, 1059 .nr_iovs = 1, 1060 .mode = KBUF_MODE_EXPAND, 1061 }; 1062 1063 if (kmsg->free_iov) { 1064 arg.nr_iovs = kmsg->free_iov_nr; 1065 arg.iovs = kmsg->free_iov; 1066 arg.mode |= KBUF_MODE_FREE; 1067 } 1068 1069 if (kmsg->msg.msg_inq > 0) 1070 arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq); 1071 1072 ret = io_buffers_peek(req, &arg); 1073 if (unlikely(ret < 0)) 1074 return ret; 1075 1076 /* special case 1 vec, can be a fast path */ 1077 if (ret == 1) { 1078 sr->buf = arg.iovs[0].iov_base; 1079 sr->len = arg.iovs[0].iov_len; 1080 goto map_ubuf; 1081 } 1082 iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, 1083 arg.out_len); 1084 if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { 1085 kmsg->free_iov_nr = ret; 1086 kmsg->free_iov = arg.iovs; 1087 } 1088 } else { 1089 void __user *buf; 1090 1091 *len = sr->len; 1092 buf = io_buffer_select(req, len, issue_flags); 1093 if (!buf) 1094 return -ENOBUFS; 1095 sr->buf = buf; 1096 sr->len = *len; 1097 map_ubuf: 1098 ret = import_ubuf(ITER_DEST, sr->buf, sr->len, 1099 &kmsg->msg.msg_iter); 1100 if (unlikely(ret)) 1101 return ret; 1102 } 1103 1104 return 0; 1105 } 1106 1107 int io_recv(struct io_kiocb *req, unsigned int issue_flags) 1108 { 1109 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1110 struct io_async_msghdr *kmsg = req->async_data; 1111 struct socket *sock; 1112 unsigned flags; 1113 int ret, min_ret = 0; 1114 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 1115 size_t len = sr->len; 1116 1117 if (!(req->flags & REQ_F_POLLED) && 1118 (sr->flags & IORING_RECVSEND_POLL_FIRST)) 1119 return -EAGAIN; 1120 1121 sock = sock_from_file(req->file); 1122 if (unlikely(!sock)) 1123 return -ENOTSOCK; 1124 1125 flags = sr->msg_flags; 1126 if (force_nonblock) 1127 flags |= MSG_DONTWAIT; 1128 1129 retry_multishot: 1130 if (io_do_buffer_select(req)) { 1131 ret = io_recv_buf_select(req, kmsg, &len, issue_flags); 1132 if (unlikely(ret)) { 1133 kmsg->msg.msg_inq = -1; 1134 goto out_free; 1135 } 1136 sr->buf = NULL; 1137 } 1138 1139 kmsg->msg.msg_flags = 0; 1140 kmsg->msg.msg_inq = -1; 1141 1142 if (flags & MSG_WAITALL) 1143 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 1144 1145 ret = sock_recvmsg(sock, &kmsg->msg, flags); 1146 if (ret < min_ret) { 1147 if (ret == -EAGAIN && force_nonblock) { 1148 if (issue_flags & IO_URING_F_MULTISHOT) { 1149 io_kbuf_recycle(req, issue_flags); 1150 return IOU_ISSUE_SKIP_COMPLETE; 1151 } 1152 1153 return -EAGAIN; 1154 } 1155 if (ret > 0 && io_net_retry(sock, flags)) { 1156 sr->len -= ret; 1157 sr->buf += ret; 1158 sr->done_io += ret; 1159 req->flags |= REQ_F_BL_NO_RECYCLE; 1160 return -EAGAIN; 1161 } 1162 if (ret == -ERESTARTSYS) 1163 ret = -EINTR; 1164 req_set_fail(req); 1165 } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { 1166 out_free: 1167 req_set_fail(req); 1168 } 1169 1170 if (ret > 0) 1171 ret += sr->done_io; 1172 else if (sr->done_io) 1173 ret = sr->done_io; 1174 else 1175 io_kbuf_recycle(req, issue_flags); 1176 1177 if (!io_recv_finish(req, &ret, kmsg, ret <= 0, issue_flags)) 1178 goto retry_multishot; 1179 1180 return ret; 1181 } 1182 1183 void io_send_zc_cleanup(struct io_kiocb *req) 1184 { 1185 struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); 1186 struct io_async_msghdr *io = req->async_data; 1187 1188 if (req_has_async_data(req)) 1189 io_netmsg_iovec_free(io); 1190 if (zc->notif) { 1191 io_notif_flush(zc->notif); 1192 zc->notif = NULL; 1193 } 1194 } 1195 1196 #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF) 1197 #define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE) 1198 1199 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 1200 { 1201 struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); 1202 struct io_ring_ctx *ctx = req->ctx; 1203 struct io_kiocb *notif; 1204 1205 zc->done_io = 0; 1206 req->flags |= REQ_F_POLL_NO_LAZY; 1207 1208 if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) 1209 return -EINVAL; 1210 /* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ 1211 if (req->flags & REQ_F_CQE_SKIP) 1212 return -EINVAL; 1213 1214 notif = zc->notif = io_alloc_notif(ctx); 1215 if (!notif) 1216 return -ENOMEM; 1217 notif->cqe.user_data = req->cqe.user_data; 1218 notif->cqe.res = 0; 1219 notif->cqe.flags = IORING_CQE_F_NOTIF; 1220 req->flags |= REQ_F_NEED_CLEANUP; 1221 1222 zc->flags = READ_ONCE(sqe->ioprio); 1223 if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) { 1224 if (zc->flags & ~IO_ZC_FLAGS_VALID) 1225 return -EINVAL; 1226 if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { 1227 struct io_notif_data *nd = io_notif_to_data(notif); 1228 1229 nd->zc_report = true; 1230 nd->zc_used = false; 1231 nd->zc_copied = false; 1232 } 1233 } 1234 1235 if (zc->flags & IORING_RECVSEND_FIXED_BUF) { 1236 unsigned idx = READ_ONCE(sqe->buf_index); 1237 1238 if (unlikely(idx >= ctx->nr_user_bufs)) 1239 return -EFAULT; 1240 idx = array_index_nospec(idx, ctx->nr_user_bufs); 1241 req->imu = READ_ONCE(ctx->user_bufs[idx]); 1242 io_req_set_rsrc_node(notif, ctx, 0); 1243 } 1244 1245 if (req->opcode == IORING_OP_SEND_ZC) { 1246 if (READ_ONCE(sqe->__pad3[0])) 1247 return -EINVAL; 1248 zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 1249 zc->addr_len = READ_ONCE(sqe->addr_len); 1250 } else { 1251 if (unlikely(sqe->addr2 || sqe->file_index)) 1252 return -EINVAL; 1253 if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF)) 1254 return -EINVAL; 1255 } 1256 1257 zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 1258 zc->len = READ_ONCE(sqe->len); 1259 zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; 1260 if (zc->msg_flags & MSG_DONTWAIT) 1261 req->flags |= REQ_F_NOWAIT; 1262 1263 #ifdef CONFIG_COMPAT 1264 if (req->ctx->compat) 1265 zc->msg_flags |= MSG_CMSG_COMPAT; 1266 #endif 1267 return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC); 1268 } 1269 1270 static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb, 1271 struct iov_iter *from, size_t length) 1272 { 1273 skb_zcopy_downgrade_managed(skb); 1274 return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); 1275 } 1276 1277 static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, 1278 struct iov_iter *from, size_t length) 1279 { 1280 struct skb_shared_info *shinfo = skb_shinfo(skb); 1281 int frag = shinfo->nr_frags; 1282 int ret = 0; 1283 struct bvec_iter bi; 1284 ssize_t copied = 0; 1285 unsigned long truesize = 0; 1286 1287 if (!frag) 1288 shinfo->flags |= SKBFL_MANAGED_FRAG_REFS; 1289 else if (unlikely(!skb_zcopy_managed(skb))) 1290 return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); 1291 1292 bi.bi_size = min(from->count, length); 1293 bi.bi_bvec_done = from->iov_offset; 1294 bi.bi_idx = 0; 1295 1296 while (bi.bi_size && frag < MAX_SKB_FRAGS) { 1297 struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi); 1298 1299 copied += v.bv_len; 1300 truesize += PAGE_ALIGN(v.bv_len + v.bv_offset); 1301 __skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page, 1302 v.bv_offset, v.bv_len); 1303 bvec_iter_advance_single(from->bvec, &bi, v.bv_len); 1304 } 1305 if (bi.bi_size) 1306 ret = -EMSGSIZE; 1307 1308 shinfo->nr_frags = frag; 1309 from->bvec += bi.bi_idx; 1310 from->nr_segs -= bi.bi_idx; 1311 from->count -= copied; 1312 from->iov_offset = bi.bi_bvec_done; 1313 1314 skb->data_len += copied; 1315 skb->len += copied; 1316 skb->truesize += truesize; 1317 1318 if (sk && sk->sk_type == SOCK_STREAM) { 1319 sk_wmem_queued_add(sk, truesize); 1320 if (!skb_zcopy_pure(skb)) 1321 sk_mem_charge(sk, truesize); 1322 } else { 1323 refcount_add(truesize, &skb->sk->sk_wmem_alloc); 1324 } 1325 return ret; 1326 } 1327 1328 static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg) 1329 { 1330 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1331 int ret; 1332 1333 if (sr->flags & IORING_RECVSEND_FIXED_BUF) { 1334 ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, req->imu, 1335 (u64)(uintptr_t)sr->buf, sr->len); 1336 if (unlikely(ret)) 1337 return ret; 1338 kmsg->msg.sg_from_iter = io_sg_from_iter; 1339 } else { 1340 ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); 1341 if (unlikely(ret)) 1342 return ret; 1343 ret = io_notif_account_mem(sr->notif, sr->len); 1344 if (unlikely(ret)) 1345 return ret; 1346 kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; 1347 } 1348 1349 return ret; 1350 } 1351 1352 int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) 1353 { 1354 struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); 1355 struct io_async_msghdr *kmsg = req->async_data; 1356 struct socket *sock; 1357 unsigned msg_flags; 1358 int ret, min_ret = 0; 1359 1360 sock = sock_from_file(req->file); 1361 if (unlikely(!sock)) 1362 return -ENOTSOCK; 1363 if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) 1364 return -EOPNOTSUPP; 1365 1366 if (!(req->flags & REQ_F_POLLED) && 1367 (zc->flags & IORING_RECVSEND_POLL_FIRST)) 1368 return -EAGAIN; 1369 1370 if (!zc->done_io) { 1371 ret = io_send_zc_import(req, kmsg); 1372 if (unlikely(ret)) 1373 return ret; 1374 } 1375 1376 msg_flags = zc->msg_flags; 1377 if (issue_flags & IO_URING_F_NONBLOCK) 1378 msg_flags |= MSG_DONTWAIT; 1379 if (msg_flags & MSG_WAITALL) 1380 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 1381 msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; 1382 1383 kmsg->msg.msg_flags = msg_flags; 1384 kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; 1385 ret = sock_sendmsg(sock, &kmsg->msg); 1386 1387 if (unlikely(ret < min_ret)) { 1388 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 1389 return -EAGAIN; 1390 1391 if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) { 1392 zc->len -= ret; 1393 zc->buf += ret; 1394 zc->done_io += ret; 1395 req->flags |= REQ_F_BL_NO_RECYCLE; 1396 return -EAGAIN; 1397 } 1398 if (ret == -ERESTARTSYS) 1399 ret = -EINTR; 1400 req_set_fail(req); 1401 } 1402 1403 if (ret >= 0) 1404 ret += zc->done_io; 1405 else if (zc->done_io) 1406 ret = zc->done_io; 1407 1408 /* 1409 * If we're in io-wq we can't rely on tw ordering guarantees, defer 1410 * flushing notif to io_send_zc_cleanup() 1411 */ 1412 if (!(issue_flags & IO_URING_F_UNLOCKED)) { 1413 io_notif_flush(zc->notif); 1414 io_req_msg_cleanup(req, 0); 1415 } 1416 io_req_set_res(req, ret, IORING_CQE_F_MORE); 1417 return IOU_OK; 1418 } 1419 1420 int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) 1421 { 1422 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1423 struct io_async_msghdr *kmsg = req->async_data; 1424 struct socket *sock; 1425 unsigned flags; 1426 int ret, min_ret = 0; 1427 1428 sock = sock_from_file(req->file); 1429 if (unlikely(!sock)) 1430 return -ENOTSOCK; 1431 if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) 1432 return -EOPNOTSUPP; 1433 1434 if (!(req->flags & REQ_F_POLLED) && 1435 (sr->flags & IORING_RECVSEND_POLL_FIRST)) 1436 return -EAGAIN; 1437 1438 flags = sr->msg_flags; 1439 if (issue_flags & IO_URING_F_NONBLOCK) 1440 flags |= MSG_DONTWAIT; 1441 if (flags & MSG_WAITALL) 1442 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 1443 1444 kmsg->msg.msg_control_user = sr->msg_control; 1445 kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; 1446 kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; 1447 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 1448 1449 if (unlikely(ret < min_ret)) { 1450 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 1451 return -EAGAIN; 1452 1453 if (ret > 0 && io_net_retry(sock, flags)) { 1454 sr->done_io += ret; 1455 req->flags |= REQ_F_BL_NO_RECYCLE; 1456 return -EAGAIN; 1457 } 1458 if (ret == -ERESTARTSYS) 1459 ret = -EINTR; 1460 req_set_fail(req); 1461 } 1462 1463 if (ret >= 0) 1464 ret += sr->done_io; 1465 else if (sr->done_io) 1466 ret = sr->done_io; 1467 1468 /* 1469 * If we're in io-wq we can't rely on tw ordering guarantees, defer 1470 * flushing notif to io_send_zc_cleanup() 1471 */ 1472 if (!(issue_flags & IO_URING_F_UNLOCKED)) { 1473 io_notif_flush(sr->notif); 1474 io_req_msg_cleanup(req, 0); 1475 } 1476 io_req_set_res(req, ret, IORING_CQE_F_MORE); 1477 return IOU_OK; 1478 } 1479 1480 void io_sendrecv_fail(struct io_kiocb *req) 1481 { 1482 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1483 1484 if (sr->done_io) 1485 req->cqe.res = sr->done_io; 1486 1487 if ((req->flags & REQ_F_NEED_CLEANUP) && 1488 (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC)) 1489 req->cqe.flags |= IORING_CQE_F_MORE; 1490 } 1491 1492 #define ACCEPT_FLAGS (IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \ 1493 IORING_ACCEPT_POLL_FIRST) 1494 1495 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 1496 { 1497 struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); 1498 1499 if (sqe->len || sqe->buf_index) 1500 return -EINVAL; 1501 1502 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 1503 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 1504 accept->flags = READ_ONCE(sqe->accept_flags); 1505 accept->nofile = rlimit(RLIMIT_NOFILE); 1506 accept->iou_flags = READ_ONCE(sqe->ioprio); 1507 if (accept->iou_flags & ~ACCEPT_FLAGS) 1508 return -EINVAL; 1509 1510 accept->file_slot = READ_ONCE(sqe->file_index); 1511 if (accept->file_slot) { 1512 if (accept->flags & SOCK_CLOEXEC) 1513 return -EINVAL; 1514 if (accept->iou_flags & IORING_ACCEPT_MULTISHOT && 1515 accept->file_slot != IORING_FILE_INDEX_ALLOC) 1516 return -EINVAL; 1517 } 1518 if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 1519 return -EINVAL; 1520 if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) 1521 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 1522 if (accept->iou_flags & IORING_ACCEPT_MULTISHOT) 1523 req->flags |= REQ_F_APOLL_MULTISHOT; 1524 if (accept->iou_flags & IORING_ACCEPT_DONTWAIT) 1525 req->flags |= REQ_F_NOWAIT; 1526 return 0; 1527 } 1528 1529 int io_accept(struct io_kiocb *req, unsigned int issue_flags) 1530 { 1531 struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); 1532 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 1533 bool fixed = !!accept->file_slot; 1534 struct proto_accept_arg arg = { 1535 .flags = force_nonblock ? O_NONBLOCK : 0, 1536 }; 1537 struct file *file; 1538 unsigned cflags; 1539 int ret, fd; 1540 1541 if (!(req->flags & REQ_F_POLLED) && 1542 accept->iou_flags & IORING_ACCEPT_POLL_FIRST) 1543 return -EAGAIN; 1544 1545 retry: 1546 if (!fixed) { 1547 fd = __get_unused_fd_flags(accept->flags, accept->nofile); 1548 if (unlikely(fd < 0)) 1549 return fd; 1550 } 1551 arg.err = 0; 1552 arg.is_empty = -1; 1553 file = do_accept(req->file, &arg, accept->addr, accept->addr_len, 1554 accept->flags); 1555 if (IS_ERR(file)) { 1556 if (!fixed) 1557 put_unused_fd(fd); 1558 ret = PTR_ERR(file); 1559 if (ret == -EAGAIN && force_nonblock && 1560 !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) { 1561 /* 1562 * if it's multishot and polled, we don't need to 1563 * return EAGAIN to arm the poll infra since it 1564 * has already been done 1565 */ 1566 if (issue_flags & IO_URING_F_MULTISHOT) 1567 return IOU_ISSUE_SKIP_COMPLETE; 1568 return ret; 1569 } 1570 if (ret == -ERESTARTSYS) 1571 ret = -EINTR; 1572 req_set_fail(req); 1573 } else if (!fixed) { 1574 fd_install(fd, file); 1575 ret = fd; 1576 } else { 1577 ret = io_fixed_fd_install(req, issue_flags, file, 1578 accept->file_slot); 1579 } 1580 1581 cflags = 0; 1582 if (!arg.is_empty) 1583 cflags |= IORING_CQE_F_SOCK_NONEMPTY; 1584 1585 if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { 1586 io_req_set_res(req, ret, cflags); 1587 return IOU_OK; 1588 } 1589 1590 if (ret < 0) 1591 return ret; 1592 if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { 1593 if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1) 1594 goto retry; 1595 if (issue_flags & IO_URING_F_MULTISHOT) 1596 return IOU_ISSUE_SKIP_COMPLETE; 1597 return -EAGAIN; 1598 } 1599 1600 io_req_set_res(req, ret, cflags); 1601 return IOU_STOP_MULTISHOT; 1602 } 1603 1604 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 1605 { 1606 struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket); 1607 1608 if (sqe->addr || sqe->rw_flags || sqe->buf_index) 1609 return -EINVAL; 1610 1611 sock->domain = READ_ONCE(sqe->fd); 1612 sock->type = READ_ONCE(sqe->off); 1613 sock->protocol = READ_ONCE(sqe->len); 1614 sock->file_slot = READ_ONCE(sqe->file_index); 1615 sock->nofile = rlimit(RLIMIT_NOFILE); 1616 1617 sock->flags = sock->type & ~SOCK_TYPE_MASK; 1618 if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) 1619 return -EINVAL; 1620 if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 1621 return -EINVAL; 1622 return 0; 1623 } 1624 1625 int io_socket(struct io_kiocb *req, unsigned int issue_flags) 1626 { 1627 struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket); 1628 bool fixed = !!sock->file_slot; 1629 struct file *file; 1630 int ret, fd; 1631 1632 if (!fixed) { 1633 fd = __get_unused_fd_flags(sock->flags, sock->nofile); 1634 if (unlikely(fd < 0)) 1635 return fd; 1636 } 1637 file = __sys_socket_file(sock->domain, sock->type, sock->protocol); 1638 if (IS_ERR(file)) { 1639 if (!fixed) 1640 put_unused_fd(fd); 1641 ret = PTR_ERR(file); 1642 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 1643 return -EAGAIN; 1644 if (ret == -ERESTARTSYS) 1645 ret = -EINTR; 1646 req_set_fail(req); 1647 } else if (!fixed) { 1648 fd_install(fd, file); 1649 ret = fd; 1650 } else { 1651 ret = io_fixed_fd_install(req, issue_flags, file, 1652 sock->file_slot); 1653 } 1654 io_req_set_res(req, ret, 0); 1655 return IOU_OK; 1656 } 1657 1658 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 1659 { 1660 struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); 1661 struct io_async_msghdr *io; 1662 1663 if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) 1664 return -EINVAL; 1665 1666 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 1667 conn->addr_len = READ_ONCE(sqe->addr2); 1668 conn->in_progress = conn->seen_econnaborted = false; 1669 1670 io = io_msg_alloc_async(req); 1671 if (unlikely(!io)) 1672 return -ENOMEM; 1673 1674 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr); 1675 } 1676 1677 int io_connect(struct io_kiocb *req, unsigned int issue_flags) 1678 { 1679 struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect); 1680 struct io_async_msghdr *io = req->async_data; 1681 unsigned file_flags; 1682 int ret; 1683 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 1684 1685 file_flags = force_nonblock ? O_NONBLOCK : 0; 1686 1687 ret = __sys_connect_file(req->file, &io->addr, connect->addr_len, 1688 file_flags); 1689 if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED) 1690 && force_nonblock) { 1691 if (ret == -EINPROGRESS) { 1692 connect->in_progress = true; 1693 } else if (ret == -ECONNABORTED) { 1694 if (connect->seen_econnaborted) 1695 goto out; 1696 connect->seen_econnaborted = true; 1697 } 1698 return -EAGAIN; 1699 } 1700 if (connect->in_progress) { 1701 /* 1702 * At least bluetooth will return -EBADFD on a re-connect 1703 * attempt, and it's (supposedly) also valid to get -EISCONN 1704 * which means the previous result is good. For both of these, 1705 * grab the sock_error() and use that for the completion. 1706 */ 1707 if (ret == -EBADFD || ret == -EISCONN) 1708 ret = sock_error(sock_from_file(req->file)->sk); 1709 } 1710 if (ret == -ERESTARTSYS) 1711 ret = -EINTR; 1712 out: 1713 if (ret < 0) 1714 req_set_fail(req); 1715 io_req_msg_cleanup(req, issue_flags); 1716 io_req_set_res(req, ret, 0); 1717 return IOU_OK; 1718 } 1719 1720 void io_netmsg_cache_free(const void *entry) 1721 { 1722 struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; 1723 1724 if (kmsg->free_iov) { 1725 kasan_mempool_unpoison_object(kmsg->free_iov, 1726 kmsg->free_iov_nr * sizeof(struct iovec)); 1727 io_netmsg_iovec_free(kmsg); 1728 } 1729 kfree(kmsg); 1730 } 1731 #endif 1732