// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "alloc_cache.h"
#include "net.h"
#include "notif.h"
#include "rsrc.h"
#include "zcrx.h"

struct io_shutdown {
	struct file			*file;
	int				how;
};

struct io_accept {
	struct file			*file;
	struct sockaddr __user		*addr;
	int __user			*addr_len;
	int				flags;
	int				iou_flags;
	u32				file_slot;
	unsigned long			nofile;
};

struct io_socket {
	struct file			*file;
	int				domain;
	int				type;
	int				protocol;
	int				flags;
	u32				file_slot;
	unsigned long			nofile;
};

struct io_connect {
	struct file			*file;
	struct sockaddr __user		*addr;
	int				addr_len;
	bool				in_progress;
	bool				seen_econnaborted;
};

struct io_bind {
	struct file			*file;
	int				addr_len;
};

struct io_listen {
	struct file			*file;
	int				backlog;
};

struct io_sr_msg {
	struct file			*file;
	union {
		struct compat_msghdr __user	*umsg_compat;
		struct user_msghdr __user	*umsg;
		void __user			*buf;
	};
	int				len;
	unsigned			done_io;
	unsigned			msg_flags;
	unsigned			nr_multishot_loops;
	u16				flags;
	/* initialised and used only by !msg send variants */
	u16				buf_group;
	unsigned short			retry_flags;
	void __user			*msg_control;
	/* used only for send zerocopy */
	struct io_kiocb			*notif;
};

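/*
 * State bits kept in io_sr_msg->retry_flags while a request is being
 * re-issued: IO_SR_MSG_RETRY marks a bundle recv that is appending to an
 * earlier partial result, IO_SR_MSG_PARTIAL_MAP that the last buffer
 * selection only mapped part of what was requested.
 */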
enum sr_retry_flags {
	IO_SR_MSG_RETRY		= 1,
	IO_SR_MSG_PARTIAL_MAP	= 2,
};

/*
 * Number of times we'll try and do receives if there's more data. If we
 * exceed this limit, then add us to the back of the queue and retry from
 * there. This helps fairness between flooding clients.
 */
#define MULTISHOT_MAX_RETRY	32

struct io_recvzc {
	struct file			*file;
	unsigned			msg_flags;
	u16				flags;
	u32				len;
	struct io_zcrx_ifq		*ifq;
};

static int io_sg_from_iter_iovec(struct sk_buff *skb,
				 struct iov_iter *from, size_t length);
static int io_sg_from_iter(struct sk_buff *skb,
			   struct iov_iter *from, size_t length);

int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);

	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
		     sqe->buf_index || sqe->splice_fd_in))
		return -EINVAL;

	shutdown->how = READ_ONCE(sqe->len);
	req->flags |= REQ_F_FORCE_ASYNC;
	return 0;
}

int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
	struct socket *sock;
	int ret;

	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_shutdown_sock(sock, shutdown->how);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

static bool io_net_retry(struct socket *sock, int flags)
{
	if (!(flags & MSG_WAITALL))
		return false;
	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}

static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
{
	if (kmsg->vec.iovec)
		io_vec_free(&kmsg->vec);
}

static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_msghdr *hdr = req->async_data;

	/* can't recycle, ensure we free the iovec if we have one */
	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
		io_netmsg_iovec_free(hdr);
		return;
	}

	/* Let normal cleanup path reap it if we fail adding to the cache */
	io_alloc_cache_vec_kasan(&hdr->vec);
	if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
		io_vec_free(&hdr->vec);

	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
		req->async_data = NULL;
		req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
	}
}

static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_async_msghdr *hdr;

	hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req);
	if (!hdr)
		return NULL;

	/* If the async data was cached, we might have an iov cached inside. */
	if (hdr->vec.iovec)
		req->flags |= REQ_F_NEED_CLEANUP;
	return hdr;
}

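/*
 * Reset per-iteration state before a multishot or bundle retry, so the next
 * pass selects a fresh provided buffer and accounts its progress from zero.
 */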
static inline void io_mshot_prep_retry(struct io_kiocb *req,
				       struct io_async_msghdr *kmsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	req->flags &= ~REQ_F_BL_EMPTY;
	sr->done_io = 0;
	sr->retry_flags = 0;
	sr->len = 0; /* get from the provided buffer */
}

static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			     const struct iovec __user *uiov, unsigned uvec_seg,
			     int ddir)
{
	struct iovec *iov;
	int ret, nr_segs;

	if (iomsg->vec.iovec) {
		nr_segs = iomsg->vec.nr;
		iov = iomsg->vec.iovec;
	} else {
		nr_segs = 1;
		iov = &iomsg->fast_iov;
	}

	ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov,
			     &iomsg->msg.msg_iter, io_is_compat(req->ctx));
	if (unlikely(ret < 0))
		return ret;

	if (iov) {
		req->flags |= REQ_F_NEED_CLEANUP;
		io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs);
	}
	return 0;
}

static int io_compat_msg_copy_hdr(struct io_kiocb *req,
				  struct io_async_msghdr *iomsg,
				  struct compat_msghdr *msg, int ddir,
				  struct sockaddr __user **save_addr)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct compat_iovec __user *uiov;
	int ret;

	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
		return -EFAULT;

	ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr);
	if (ret)
		return ret;

	uiov = compat_ptr(msg->msg_iov);
	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (msg->msg_iovlen == 0) {
			sr->len = 0;
		} else if (msg->msg_iovlen > 1) {
			return -EINVAL;
		} else {
			struct compat_iovec tmp_iov;

			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
				return -EFAULT;
			sr->len = tmp_iov.iov_len;
		}
	}
	return 0;
}

static int io_copy_msghdr_from_user(struct user_msghdr *msg,
				    struct user_msghdr __user *umsg)
{
	if (!user_access_begin(umsg, sizeof(*umsg)))
		return -EFAULT;
	unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
	unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
	unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
	unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
	unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
	unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
	user_access_end();
	return 0;
ua_end:
	user_access_end();
	return -EFAULT;
}

static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			   struct user_msghdr *msg, int ddir,
			   struct sockaddr __user **save_addr)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct user_msghdr __user *umsg = sr->umsg;
	int ret;

	iomsg->msg.msg_name = &iomsg->addr;
	iomsg->msg.msg_iter.nr_segs = 0;

	if (io_is_compat(req->ctx)) {
		struct compat_msghdr cmsg;

		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
		if (ret)
			return ret;

		memset(msg, 0, sizeof(*msg));
		msg->msg_namelen = cmsg.msg_namelen;
		msg->msg_controllen = cmsg.msg_controllen;
		msg->msg_iov = compat_ptr(cmsg.msg_iov);
		msg->msg_iovlen = cmsg.msg_iovlen;
		return 0;
	}

	ret = io_copy_msghdr_from_user(msg, umsg);
	if (unlikely(ret))
		return ret;

	msg->msg_flags = 0;

	ret = __copy_msghdr(&iomsg->msg, msg, save_addr);
	if (ret)
		return ret;

	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (msg->msg_iovlen == 0) {
			sr->len = 0;
		} else if (msg->msg_iovlen > 1) {
			return -EINVAL;
		} else {
			struct iovec __user *uiov = msg->msg_iov;
			struct iovec tmp_iov;

			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
				return -EFAULT;
			sr->len = tmp_iov.iov_len;
		}
	}
	return 0;
}

void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
{
	struct io_async_msghdr *io = req->async_data;

	io_netmsg_iovec_free(io);
}

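/*
 * Prepare a non-msghdr send: copy in an optional destination address and map
 * the user buffer, unless a registered or provided buffer will be imported
 * at issue time instead.
 */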
static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	void __user *addr;
	u16 addr_len;
	int ret;

	sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));

	if (READ_ONCE(sqe->__pad3[0]))
		return -EINVAL;

	kmsg->msg.msg_name = NULL;
	kmsg->msg.msg_namelen = 0;
	kmsg->msg.msg_control = NULL;
	kmsg->msg.msg_controllen = 0;
	kmsg->msg.msg_ubuf = NULL;

	addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	addr_len = READ_ONCE(sqe->addr_len);
	if (addr) {
		ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
		if (unlikely(ret < 0))
			return ret;
		kmsg->msg.msg_name = &kmsg->addr;
		kmsg->msg.msg_namelen = addr_len;
	}
	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
		req->flags |= REQ_F_IMPORT_BUFFER;
		return 0;
	}
	if (req->flags & REQ_F_BUFFER_SELECT)
		return 0;
	return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
}

static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct user_msghdr msg;
	int ret;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
	if (unlikely(ret))
		return ret;
	/* save msg_control as sys_sendmsg() overwrites it */
	sr->msg_control = kmsg->msg.msg_control_user;

	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
		kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
		return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov,
					 msg.msg_iovlen);
	}
	if (req->flags & REQ_F_BUFFER_SELECT)
		return 0;
	return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE);
}

#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)

int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;
	sr->retry_flags = 0;
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~SENDMSG_FLAGS)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (req->flags & REQ_F_BUFFER_SELECT)
		sr->buf_group = req->buf_index;
	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		if (req->opcode == IORING_OP_SENDMSG)
			return -EINVAL;
		sr->msg_flags |= MSG_WAITALL;
		req->buf_list = NULL;
		req->flags |= REQ_F_MULTISHOT;
	}

	if (io_is_compat(req->ctx))
		sr->msg_flags |= MSG_CMSG_COMPAT;

	if (unlikely(!io_msg_alloc_async(req)))
		return -ENOMEM;
	if (req->opcode != IORING_OP_SENDMSG)
		return io_send_setup(req, sqe);
	if (unlikely(sqe->addr2 || sqe->file_index))
		return -EINVAL;
	return io_sendmsg_setup(req, sqe);
}

static void io_req_msg_cleanup(struct io_kiocb *req,
			       unsigned int issue_flags)
{
	io_netmsg_recycle(req, issue_flags);
}

/*
 * For bundle completions, we need to figure out how many segments we consumed.
 * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
 * could be using an ITER_IOVEC. If the latter, then if we consumed all of
 * the segments, then it's a trivial question to answer. If we have residual
 * data in the iter, then loop the segments to figure out how much we
 * transferred.
 */
static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
{
	struct iovec *iov;
	int nbufs;

	/* no data is always zero segments, and a ubuf is always 1 segment */
	if (ret <= 0)
		return 0;
	if (iter_is_ubuf(&kmsg->msg.msg_iter))
		return 1;

	iov = kmsg->vec.iovec;
	if (!iov)
		iov = &kmsg->fast_iov;

	/* if all data was transferred, it's basic pointer math */
	if (!iov_iter_count(&kmsg->msg.msg_iter))
		return iter_iov(&kmsg->msg.msg_iter) - iov;

	/* short transfer, count segments */
	nbufs = 0;
	do {
		int this_len = min_t(int, iov[nbufs].iov_len, ret);

		nbufs++;
		ret -= this_len;
	} while (ret);

	return nbufs;
}

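/*
 * Finish a (bundle) send: account the consumed buffers and post a CQE for
 * what was transferred. Returns false if this is a bundle that should keep
 * sending, in which case the caller retries with the next buffers.
 */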
static inline bool io_send_finish(struct io_kiocb *req, int *ret,
				  struct io_async_msghdr *kmsg,
				  unsigned issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	bool bundle_finished = *ret <= 0;
	unsigned int cflags;

	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
		cflags = io_put_kbuf(req, *ret, issue_flags);
		goto finish;
	}

	cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags);

	if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
		goto finish;

	/*
	 * Fill CQE for this receive and see if we should keep trying to
	 * receive from this socket.
	 */
	if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
		io_mshot_prep_retry(req, kmsg);
		return false;
	}

	/* Otherwise stop bundle and use the current result. */
finish:
	io_req_set_res(req, *ret, cflags);
	*ret = IOU_COMPLETE;
	return true;
}

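/*
 * Issue side of IORING_OP_SENDMSG. Partial MSG_WAITALL transfers on
 * stream/seqpacket sockets are accounted in sr->done_io and retried.
 */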
int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_control_user = sr->msg_control;

	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret > 0 && io_net_retry(sock, flags)) {
			kmsg->msg.msg_controllen = 0;
			kmsg->msg.msg_control = NULL;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	io_req_msg_cleanup(req, issue_flags);
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
				 struct io_async_msghdr *kmsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	int ret;
	struct buf_sel_arg arg = {
		.iovs = &kmsg->fast_iov,
		.max_len = min_not_zero(sr->len, INT_MAX),
		.nr_iovs = 1,
		.buf_group = sr->buf_group,
	};

	if (kmsg->vec.iovec) {
		arg.nr_iovs = kmsg->vec.nr;
		arg.iovs = kmsg->vec.iovec;
		arg.mode = KBUF_MODE_FREE;
	}

	if (!(sr->flags & IORING_RECVSEND_BUNDLE))
		arg.nr_iovs = 1;
	else
		arg.mode |= KBUF_MODE_EXPAND;

	ret = io_buffers_select(req, &arg, issue_flags);
	if (unlikely(ret < 0))
		return ret;

	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
		kmsg->vec.nr = ret;
		kmsg->vec.iovec = arg.iovs;
		req->flags |= REQ_F_NEED_CLEANUP;
	}
	sr->len = arg.out_len;

	if (ret == 1) {
		sr->buf = arg.iovs[0].iov_base;
		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret))
			return ret;
	} else {
		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
			      arg.iovs, ret, arg.out_len);
	}

	return 0;
}

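/*
 * Issue side of IORING_OP_SEND, including provided-buffer and bundle sends
 * where the payload is picked from a buffer group at issue time.
 */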
int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;

retry_bundle:
	if (io_do_buffer_select(req)) {
		ret = io_send_select_buffer(req, issue_flags, kmsg);
		if (ret)
			return ret;
	}

	/*
	 * If MSG_WAITALL is set, or this is a bundle send, then we need
	 * the full amount. If just bundle is set, if we do a short send
	 * then we complete the bundle sequence rather than continue on.
	 */
	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
	kmsg->msg.msg_flags = flags;
	ret = sock_sendmsg(sock, &kmsg->msg);
	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	if (!io_send_finish(req, &ret, kmsg, issue_flags))
		goto retry_bundle;

	io_req_msg_cleanup(req, issue_flags);
	return ret;
}

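/*
 * For multishot recvmsg with provided buffers, remember the name and control
 * lengths so the io_uring_recvmsg_out header can later be carved out of the
 * selected buffer, and check that the combined header size doesn't overflow.
 */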
static int io_recvmsg_mshot_prep(struct io_kiocb *req,
				 struct io_async_msghdr *iomsg,
				 int namelen, size_t controllen)
{
	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
			  (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
		int hdr;

		if (unlikely(namelen < 0))
			return -EOVERFLOW;
		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
				       namelen, &hdr))
			return -EOVERFLOW;
		if (check_add_overflow(hdr, controllen, &hdr))
			return -EOVERFLOW;

		iomsg->namelen = namelen;
		iomsg->controllen = controllen;
		return 0;
	}

	return 0;
}

static int io_recvmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	struct user_msghdr msg;
	int ret;

	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
	if (unlikely(ret))
		return ret;

	if (!(req->flags & REQ_F_BUFFER_SELECT)) {
		ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
					ITER_DEST);
		if (unlikely(ret))
			return ret;
	}
	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
				     msg.msg_controllen);
}

static int io_recvmsg_prep_setup(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg;

	kmsg = io_msg_alloc_async(req);
	if (unlikely(!kmsg))
		return -ENOMEM;

	if (req->opcode == IORING_OP_RECV) {
		kmsg->msg.msg_name = NULL;
		kmsg->msg.msg_namelen = 0;
		kmsg->msg.msg_inq = 0;
		kmsg->msg.msg_control = NULL;
		kmsg->msg.msg_get_inq = 1;
		kmsg->msg.msg_controllen = 0;
		kmsg->msg.msg_iocb = NULL;
		kmsg->msg.msg_ubuf = NULL;

		if (req->flags & REQ_F_BUFFER_SELECT)
			return 0;
		return import_ubuf(ITER_DEST, sr->buf, sr->len,
				   &kmsg->msg.msg_iter);
	}

	return io_recvmsg_copy_hdr(req, kmsg);
}

#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
			IORING_RECVSEND_BUNDLE)

int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;
	sr->retry_flags = 0;

	if (unlikely(sqe->file_index || sqe->addr2))
		return -EINVAL;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~RECVMSG_FLAGS)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags);
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (sr->msg_flags & MSG_ERRQUEUE)
		req->flags |= REQ_F_CLEAR_POLLIN;
	if (req->flags & REQ_F_BUFFER_SELECT) {
		/*
		 * Store the buffer group for this multishot receive separately,
		 * as if we end up doing an io-wq based issue that selects a
		 * buffer, it has to be committed immediately and that will
		 * clear ->buf_list. This means we lose the link to the buffer
		 * list, and the eventual buffer put on completion then cannot
		 * restore it.
		 */
		sr->buf_group = req->buf_index;
		req->buf_list = NULL;
	}
	if (sr->flags & IORING_RECV_MULTISHOT) {
		if (!(req->flags & REQ_F_BUFFER_SELECT))
			return -EINVAL;
		if (sr->msg_flags & MSG_WAITALL)
			return -EINVAL;
		if (req->opcode == IORING_OP_RECV && sr->len)
			return -EINVAL;
		req->flags |= REQ_F_APOLL_MULTISHOT;
	}
	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		if (req->opcode == IORING_OP_RECVMSG)
			return -EINVAL;
	}

	if (io_is_compat(req->ctx))
		sr->msg_flags |= MSG_CMSG_COMPAT;

	sr->nr_multishot_loops = 0;
	return io_recvmsg_prep_setup(req);
}

/* bits to clear in old and inherit in new cflags on bundle retry */
#define CQE_F_MASK (IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE)

/*
 * Finishes io_recv and io_recvmsg.
 *
 * Returns true if it is actually finished, or false if it should run
 * again (for multishot).
 */
static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
				  struct io_async_msghdr *kmsg,
				  bool mshot_finished, unsigned issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	unsigned int cflags = 0;

	if (kmsg->msg.msg_inq > 0)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		size_t this_ret = *ret - sr->done_io;

		cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret),
				       issue_flags);
		if (sr->retry_flags & IO_SR_MSG_RETRY)
			cflags = req->cqe.flags | (cflags & CQE_F_MASK);
		/* bundle with no more immediate buffers, we're done */
		if (req->flags & REQ_F_BL_EMPTY)
			goto finish;
		/*
		 * If more is available AND it was a full transfer, retry and
		 * append to this one
		 */
		if (!sr->retry_flags && kmsg->msg.msg_inq > 1 && this_ret > 0 &&
		    !iov_iter_count(&kmsg->msg.msg_iter)) {
			req->cqe.flags = cflags & ~CQE_F_MASK;
			sr->len = kmsg->msg.msg_inq;
			sr->done_io += this_ret;
			sr->retry_flags |= IO_SR_MSG_RETRY;
			return false;
		}
	} else {
		cflags |= io_put_kbuf(req, *ret, issue_flags);
	}

	/*
	 * Fill CQE for this receive and see if we should keep trying to
	 * receive from this socket.
	 */
	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
	    io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
		*ret = IOU_RETRY;
		io_mshot_prep_retry(req, kmsg);
		/* Known not-empty or unknown state, retry */
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
				return false;
			/* mshot retries exceeded, force a requeue */
			sr->nr_multishot_loops = 0;
			if (issue_flags & IO_URING_F_MULTISHOT)
				*ret = IOU_REQUEUE;
		}
		return true;
	}

	/* Finish the request / stop multishot. */
finish:
	io_req_set_res(req, *ret, cflags);
	*ret = IOU_COMPLETE;
	io_req_msg_cleanup(req, issue_flags);
	return true;
}

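/*
 * Carve the multishot recvmsg header out of the selected buffer: reserve room
 * for struct io_uring_recvmsg_out plus the name and control areas, and point
 * the payload at what remains.
 */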
static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
				     struct io_sr_msg *sr, void __user **buf,
				     size_t *len)
{
	unsigned long ubuf = (unsigned long) *buf;
	unsigned long hdr;

	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
		kmsg->controllen;
	if (*len < hdr)
		return -EFAULT;

	if (kmsg->controllen) {
		unsigned long control = ubuf + hdr - kmsg->controllen;

		kmsg->msg.msg_control_user = (void __user *) control;
		kmsg->msg.msg_controllen = kmsg->controllen;
	}

	sr->buf = *buf; /* stash for later copy */
	*buf = (void __user *) (ubuf + hdr);
	kmsg->payloadlen = *len = *len - hdr;
	return 0;
}

struct io_recvmsg_multishot_hdr {
	struct io_uring_recvmsg_out msg;
	struct sockaddr_storage addr;
};

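/*
 * Receive straight into the selected buffer, then copy an
 * io_uring_recvmsg_out header (plus any source address) in front of the
 * payload so userspace can parse the result of this multishot iteration.
 */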
static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
				struct io_async_msghdr *kmsg,
				unsigned int flags, bool *finished)
{
	int err;
	int copy_len;
	struct io_recvmsg_multishot_hdr hdr;

	if (kmsg->namelen)
		kmsg->msg.msg_name = &hdr.addr;
	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
	kmsg->msg.msg_namelen = 0;

	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;

	err = sock_recvmsg(sock, &kmsg->msg, flags);
	*finished = err <= 0;
	if (err < 0)
		return err;

	hdr.msg = (struct io_uring_recvmsg_out) {
		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
	};

	hdr.msg.payloadlen = err;
	if (err > kmsg->payloadlen)
		err = kmsg->payloadlen;

	copy_len = sizeof(struct io_uring_recvmsg_out);
	if (kmsg->msg.msg_namelen > kmsg->namelen)
		copy_len += kmsg->namelen;
	else
		copy_len += kmsg->msg.msg_namelen;

	/*
	 * "fromlen shall refer to the value before truncation.."
	 *			1003.1g
	 */
	hdr.msg.namelen = kmsg->msg.msg_namelen;

	/* ensure that there is no gap between hdr and sockaddr_storage */
	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
		     sizeof(struct io_uring_recvmsg_out));
	if (copy_to_user(io->buf, &hdr, copy_len)) {
		*finished = true;
		return -EFAULT;
	}

	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
			kmsg->controllen + err;
}

int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool mshot_finished = true;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

retry_multishot:
	if (io_do_buffer_select(req)) {
		void __user *buf;
		size_t len = sr->len;

		buf = io_buffer_select(req, &len, sr->buf_group, issue_flags);
		if (!buf)
			return -ENOBUFS;

		if (req->flags & REQ_F_APOLL_MULTISHOT) {
			ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
			if (ret) {
				io_kbuf_recycle(req, issue_flags);
				return ret;
			}
		}

		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
	}

	kmsg->msg.msg_get_inq = 1;
	kmsg->msg.msg_inq = -1;
	if (req->flags & REQ_F_APOLL_MULTISHOT) {
		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
					   &mshot_finished);
	} else {
		/* disable partial retry for recvmsg with cmsg attached */
		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
			min_ret = iov_iter_count(&kmsg->msg.msg_iter);

		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
					 kmsg->uaddr, flags);
	}

	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT)
				io_kbuf_recycle(req, issue_flags);

			return IOU_RETRY;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return IOU_RETRY;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		req_set_fail(req);
	}

	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
		goto retry_multishot;

	return ret;
}

static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
			      size_t *len, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	int ret;

	/*
	 * If the ring isn't locked, then don't use the peek interface
	 * to grab multiple buffers as we will lock/unlock between
	 * this selection and posting the buffers.
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
	    sr->flags & IORING_RECVSEND_BUNDLE) {
		struct buf_sel_arg arg = {
			.iovs = &kmsg->fast_iov,
			.nr_iovs = 1,
			.mode = KBUF_MODE_EXPAND,
			.buf_group = sr->buf_group,
		};

		if (kmsg->vec.iovec) {
			arg.nr_iovs = kmsg->vec.nr;
			arg.iovs = kmsg->vec.iovec;
			arg.mode |= KBUF_MODE_FREE;
		}

		if (kmsg->msg.msg_inq > 1)
			arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);

		ret = io_buffers_peek(req, &arg);
		if (unlikely(ret < 0))
			return ret;

		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
			kmsg->vec.nr = ret;
			kmsg->vec.iovec = arg.iovs;
			req->flags |= REQ_F_NEED_CLEANUP;
		}
		if (arg.partial_map)
			sr->retry_flags |= IO_SR_MSG_PARTIAL_MAP;

		/* special case 1 vec, can be a fast path */
		if (ret == 1) {
			sr->buf = arg.iovs[0].iov_base;
			sr->len = arg.iovs[0].iov_len;
			goto map_ubuf;
		}
		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
			      arg.out_len);
	} else {
		void __user *buf;

		*len = sr->len;
		buf = io_buffer_select(req, len, sr->buf_group, issue_flags);
		if (!buf)
			return -ENOBUFS;
		sr->buf = buf;
		sr->len = *len;
map_ubuf:
		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret))
			return ret;
	}

	return 0;
}

int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	size_t len = sr->len;
	bool mshot_finished;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

retry_multishot:
	if (io_do_buffer_select(req)) {
		ret = io_recv_buf_select(req, kmsg, &len, issue_flags);
		if (unlikely(ret)) {
			kmsg->msg.msg_inq = -1;
			goto out_free;
		}
		sr->buf = NULL;
	}

	kmsg->msg.msg_flags = 0;
	kmsg->msg.msg_inq = -1;

	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	ret = sock_recvmsg(sock, &kmsg->msg, flags);
	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT)
				io_kbuf_recycle(req, issue_flags);

			return IOU_RETRY;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
		req_set_fail(req);
	}

	mshot_finished = ret <= 0;
	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
		goto retry_multishot;

	return ret;
}

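/*
 * Zero-copy receive: the request is tied to a registered zcrx interface
 * queue (looked up by zcrx_ifq_idx) and must be multishot, with all data
 * completions posted as aux CQEs.
 */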
int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
	unsigned ifq_idx;

	if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3))
		return -EINVAL;

	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
	zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
	if (!zc->ifq)
		return -EINVAL;

	zc->len = READ_ONCE(sqe->len);
	zc->flags = READ_ONCE(sqe->ioprio);
	zc->msg_flags = READ_ONCE(sqe->msg_flags);
	if (zc->msg_flags)
		return -EINVAL;
	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
		return -EINVAL;
	/* multishot required */
	if (!(zc->flags & IORING_RECV_MULTISHOT))
		return -EINVAL;
	/* All data completions are posted as aux CQEs. */
	req->flags |= REQ_F_APOLL_MULTISHOT;

	return 0;
}

int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
	struct socket *sock;
	unsigned int len;
	int ret;

	if (!(req->flags & REQ_F_POLLED) &&
	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	len = zc->len;
	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
			   issue_flags, &zc->len);
	if (len && zc->len == 0) {
		io_req_set_res(req, 0, 0);

		return IOU_COMPLETE;
	}
	if (unlikely(ret <= 0) && ret != -EAGAIN) {
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		if (ret == IOU_REQUEUE)
			return IOU_REQUEUE;

		req_set_fail(req);
		io_req_set_res(req, ret, 0);
		return IOU_COMPLETE;
	}
	return IOU_RETRY;
}

void io_send_zc_cleanup(struct io_kiocb *req)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *io = req->async_data;

	if (req_has_async_data(req))
		io_netmsg_iovec_free(io);
	if (zc->notif) {
		io_notif_flush(zc->notif);
		zc->notif = NULL;
	}
}

#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
#define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)

int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_async_msghdr *iomsg;
	struct io_kiocb *notif;
	int ret;

	zc->done_io = 0;
	zc->retry_flags = 0;

	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
		return -EINVAL;
	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
	if (req->flags & REQ_F_CQE_SKIP)
		return -EINVAL;

	notif = zc->notif = io_alloc_notif(ctx);
	if (!notif)
		return -ENOMEM;
	notif->cqe.user_data = req->cqe.user_data;
	notif->cqe.res = 0;
	notif->cqe.flags = IORING_CQE_F_NOTIF;
	req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;

	zc->flags = READ_ONCE(sqe->ioprio);
	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
		if (zc->flags & ~IO_ZC_FLAGS_VALID)
			return -EINVAL;
		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
			struct io_notif_data *nd = io_notif_to_data(notif);

			nd->zc_report = true;
			nd->zc_used = false;
			nd->zc_copied = false;
		}
	}

	zc->len = READ_ONCE(sqe->len);
	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
	req->buf_index = READ_ONCE(sqe->buf_index);
	if (zc->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

	if (io_is_compat(req->ctx))
		zc->msg_flags |= MSG_CMSG_COMPAT;

	iomsg = io_msg_alloc_async(req);
	if (unlikely(!iomsg))
		return -ENOMEM;

	if (req->opcode == IORING_OP_SEND_ZC) {
		ret = io_send_setup(req, sqe);
	} else {
		if (unlikely(sqe->addr2 || sqe->file_index))
			return -EINVAL;
		ret = io_sendmsg_setup(req, sqe);
	}
	if (unlikely(ret))
		return ret;

	if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) {
		iomsg->msg.sg_from_iter = io_sg_from_iter_iovec;
		return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count);
	}
	iomsg->msg.sg_from_iter = io_sg_from_iter;
	return 0;
}

static int io_sg_from_iter_iovec(struct sk_buff *skb,
				 struct iov_iter *from, size_t length)
{
	skb_zcopy_downgrade_managed(skb);
	return zerocopy_fill_skb_from_iter(skb, from, length);
}

static int io_sg_from_iter(struct sk_buff *skb,
			   struct iov_iter *from, size_t length)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int frag = shinfo->nr_frags;
	int ret = 0;
	struct bvec_iter bi;
	ssize_t copied = 0;
	unsigned long truesize = 0;

	if (!frag)
		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
	else if (unlikely(!skb_zcopy_managed(skb)))
		return zerocopy_fill_skb_from_iter(skb, from, length);

	bi.bi_size = min(from->count, length);
	bi.bi_bvec_done = from->iov_offset;
	bi.bi_idx = 0;

	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);

		copied += v.bv_len;
		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
					   v.bv_offset, v.bv_len);
		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
	}
	if (bi.bi_size)
		ret = -EMSGSIZE;

	shinfo->nr_frags = frag;
	from->bvec += bi.bi_idx;
	from->nr_segs -= bi.bi_idx;
	from->count -= copied;
	from->iov_offset = bi.bi_bvec_done;

	skb->data_len += copied;
	skb->len += copied;
	skb->truesize += truesize;
	return ret;
}

static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;

	WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));

	sr->notif->buf_index = req->buf_index;
	return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter,
				 (u64)(uintptr_t)sr->buf, sr->len,
				 ITER_SOURCE, issue_flags);
}

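/*
 * Issue side of IORING_OP_SEND_ZC. The data pages are handed to the network
 * stack by reference via the notification's ubuf_info; the request completes
 * with IORING_CQE_F_MORE, and the separate notification CQE follows once the
 * network stack has dropped its references to the pages.
 */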
int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned msg_flags;
	int ret, min_ret = 0;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (!(req->flags & REQ_F_POLLED) &&
	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	if (req->flags & REQ_F_IMPORT_BUFFER) {
		req->flags &= ~REQ_F_IMPORT_BUFFER;
		ret = io_send_zc_import(req, issue_flags);
		if (unlikely(ret))
			return ret;
	}

	msg_flags = zc->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		msg_flags |= MSG_DONTWAIT;
	if (msg_flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;

	kmsg->msg.msg_flags = msg_flags;
	kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
	ret = sock_sendmsg(sock, &kmsg->msg);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
			zc->len -= ret;
			zc->buf += ret;
			zc->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += zc->done_io;
	else if (zc->done_io)
		ret = zc->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
	 * flushing notif to io_send_zc_cleanup()
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(zc->notif);
		zc->notif = NULL;
		io_req_msg_cleanup(req, 0);
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_COMPLETE;
}

int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;

	if (req->flags & REQ_F_IMPORT_BUFFER) {
		unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
		int ret;

		ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req,
					&kmsg->vec, uvec_segs, issue_flags);
		if (unlikely(ret))
			return ret;
		req->flags &= ~REQ_F_IMPORT_BUFFER;
	}

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_control_user = sr->msg_control;
	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
	 * flushing notif to io_send_zc_cleanup()
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(sr->notif);
		sr->notif = NULL;
		io_req_msg_cleanup(req, 0);
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_COMPLETE;
}

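/*
 * Failure hook shared by the send/recv opcodes: report any partial progress
 * as the result, and keep IORING_CQE_F_MORE for zerocopy sends whose
 * notification CQE is still outstanding.
 */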
void io_sendrecv_fail(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	if (sr->done_io)
		req->cqe.res = sr->done_io;

	if ((req->flags & REQ_F_NEED_CLEANUP) &&
	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
		req->cqe.flags |= IORING_CQE_F_MORE;
}

#define ACCEPT_FLAGS	(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
			 IORING_ACCEPT_POLL_FIRST)

int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);

	if (sqe->len || sqe->buf_index)
		return -EINVAL;

	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	accept->flags = READ_ONCE(sqe->accept_flags);
	accept->nofile = rlimit(RLIMIT_NOFILE);
	accept->iou_flags = READ_ONCE(sqe->ioprio);
	if (accept->iou_flags & ~ACCEPT_FLAGS)
		return -EINVAL;

	accept->file_slot = READ_ONCE(sqe->file_index);
	if (accept->file_slot) {
		if (accept->flags & SOCK_CLOEXEC)
			return -EINVAL;
		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
			return -EINVAL;
	}
	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
		req->flags |= REQ_F_APOLL_MULTISHOT;
	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	return 0;
}

int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool fixed = !!accept->file_slot;
	struct proto_accept_arg arg = {
		.flags = force_nonblock ? O_NONBLOCK : 0,
	};
	struct file *file;
	unsigned cflags;
	int ret, fd;

	if (!(req->flags & REQ_F_POLLED) &&
	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
		return -EAGAIN;

retry:
	if (!fixed) {
		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	arg.err = 0;
	arg.is_empty = -1;
	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
			 accept->flags);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && force_nonblock &&
		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT))
			return IOU_RETRY;

		if (ret == -ERESTARTSYS)
			ret = -EINTR;
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					  accept->file_slot);
	}

	cflags = 0;
	if (!arg.is_empty)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) &&
	    io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
			goto retry;
		return IOU_RETRY;
	}

	io_req_set_res(req, ret, cflags);
	if (ret < 0)
		req_set_fail(req);
	return IOU_COMPLETE;
}

int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);

	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
		return -EINVAL;

	sock->domain = READ_ONCE(sqe->fd);
	sock->type = READ_ONCE(sqe->off);
	sock->protocol = READ_ONCE(sqe->len);
	sock->file_slot = READ_ONCE(sqe->file_index);
	sock->nofile = rlimit(RLIMIT_NOFILE);

	sock->flags = sock->type & ~SOCK_TYPE_MASK;
	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
		return -EINVAL;
	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	return 0;
}

int io_socket(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
	bool fixed = !!sock->file_slot;
	struct file *file;
	int ret, fd;

	if (!fixed) {
		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					  sock->file_slot);
	}
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_msghdr *io;

	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	conn->addr_len = READ_ONCE(sqe->addr2);
	conn->in_progress = conn->seen_econnaborted = false;

	io = io_msg_alloc_async(req);
	if (unlikely(!io))
		return -ENOMEM;

	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
}

int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_msghdr *io = req->async_data;
	unsigned file_flags;
	int ret;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	if (unlikely(req->flags & REQ_F_FAIL)) {
		ret = -ECONNRESET;
		goto out;
	}

	file_flags = force_nonblock ? O_NONBLOCK : 0;

	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
				 file_flags);
	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
	    && force_nonblock) {
		if (ret == -EINPROGRESS) {
			connect->in_progress = true;
		} else if (ret == -ECONNABORTED) {
			if (connect->seen_econnaborted)
				goto out;
			connect->seen_econnaborted = true;
		}
		return -EAGAIN;
	}
	if (connect->in_progress) {
		/*
		 * At least bluetooth will return -EBADFD on a re-connect
		 * attempt, and it's (supposedly) also valid to get -EISCONN
		 * which means the previous result is good. For both of these,
		 * grab the sock_error() and use that for the completion.
		 */
		if (ret == -EBADFD || ret == -EISCONN)
			ret = sock_error(sock_from_file(req->file)->sk);
	}
	if (ret == -ERESTARTSYS)
		ret = -EINTR;
out:
	if (ret < 0)
		req_set_fail(req);
	io_req_msg_cleanup(req, issue_flags);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
	struct sockaddr __user *uaddr;
	struct io_async_msghdr *io;

	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	bind->addr_len = READ_ONCE(sqe->addr2);

	io = io_msg_alloc_async(req);
	if (unlikely(!io))
		return -ENOMEM;
	return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
}

int io_bind(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
	struct io_async_msghdr *io = req->async_data;
	struct socket *sock;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return 0;
}

int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);

	if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
		return -EINVAL;

	listen->backlog = READ_ONCE(sqe->len);
	return 0;
}

int io_listen(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
	struct socket *sock;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_listen_socket(sock, listen->backlog);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return 0;
}

void io_netmsg_cache_free(const void *entry)
{
	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;

	io_vec_free(&kmsg->vec);
	kfree(kmsg);
}