// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "alloc_cache.h"
#include "net.h"
#include "notif.h"
#include "rsrc.h"
#include "zcrx.h"

struct io_shutdown {
	struct file *file;
	int how;
};

struct io_accept {
	struct file *file;
	struct sockaddr __user *addr;
	int __user *addr_len;
	int flags;
	int iou_flags;
	u32 file_slot;
	unsigned long nofile;
};

struct io_socket {
	struct file *file;
	int domain;
	int type;
	int protocol;
	int flags;
	u32 file_slot;
	unsigned long nofile;
};

struct io_connect {
	struct file *file;
	struct sockaddr __user *addr;
	int addr_len;
	bool in_progress;
	bool seen_econnaborted;
};

struct io_bind {
	struct file *file;
	int addr_len;
};

struct io_listen {
	struct file *file;
	int backlog;
};

struct io_sr_msg {
	struct file *file;
	union {
		struct compat_msghdr __user *umsg_compat;
		struct user_msghdr __user *umsg;
		void __user *buf;
	};
	int len;
	unsigned done_io;
	unsigned msg_flags;
	unsigned nr_multishot_loops;
	u16 flags;
	/* initialised and used only by !msg send variants */
	u16 buf_group;
	bool retry;
	void __user *msg_control;
	/* used only for send zerocopy */
	struct io_kiocb *notif;
};

/*
 * Number of times we'll try and do receives if there's more data. If we
 * exceed this limit, then add us to the back of the queue and retry from
 * there. This helps fairness between flooding clients.
 */
#define MULTISHOT_MAX_RETRY	32
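/*
 * Illustrative only, not part of the kernel API: a minimal sketch of the
 * userspace reaping loop this retry cap interacts with (assumes liburing;
 * consume() and resubmit_multishot_recv() are hypothetical helpers). Each
 * in-line retry that finds more data posts a CQE with IORING_CQE_F_MORE;
 * once MULTISHOT_MAX_RETRY retries are used up the request is requeued so
 * one flooding socket cannot starve others:
 *
 *	struct io_uring_cqe *cqe;
 *
 *	while (!io_uring_wait_cqe(&ring, &cqe)) {
 *		if (cqe->res >= 0)
 *			consume(cqe->res, cqe->flags);
 *		if (!(cqe->flags & IORING_CQE_F_MORE))
 *			resubmit_multishot_recv(&ring);	// request terminated
 *		io_uring_cqe_seen(&ring, cqe);
 *	}
 */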
struct io_recvzc {
	struct file *file;
	unsigned msg_flags;
	u16 flags;
	u32 len;
	struct io_zcrx_ifq *ifq;
};

static int io_sg_from_iter_iovec(struct sk_buff *skb,
				 struct iov_iter *from, size_t length);
static int io_sg_from_iter(struct sk_buff *skb,
			   struct iov_iter *from, size_t length);

int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);

	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
		     sqe->buf_index || sqe->splice_fd_in))
		return -EINVAL;

	shutdown->how = READ_ONCE(sqe->len);
	req->flags |= REQ_F_FORCE_ASYNC;
	return 0;
}

int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
	struct socket *sock;
	int ret;

	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_shutdown_sock(sock, shutdown->how);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

static bool io_net_retry(struct socket *sock, int flags)
{
	if (!(flags & MSG_WAITALL))
		return false;
	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}

static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
{
	if (kmsg->vec.iovec)
		io_vec_free(&kmsg->vec);
}

static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_msghdr *hdr = req->async_data;

	/* can't recycle, ensure we free the iovec if we have one */
	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
		io_netmsg_iovec_free(hdr);
		return;
	}

	/* Let normal cleanup path reap it if we fail adding to the cache */
	io_alloc_cache_vec_kasan(&hdr->vec);
	if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
		io_vec_free(&hdr->vec);

	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
		req->async_data = NULL;
		req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
	}
}

static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_async_msghdr *hdr;

	hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req);
	if (!hdr)
		return NULL;

	/* If the async data was cached, we might have an iov cached inside. */
	if (hdr->vec.iovec)
		req->flags |= REQ_F_NEED_CLEANUP;
	return hdr;
}
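/*
 * Reset per-issue state so a multishot request can be reissued cleanly:
 * clear the buffer-list-empty flag, forget prior partial progress, and zero
 * sr->len so the next iteration sizes itself from the newly selected
 * provided buffer.
 */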
static inline void io_mshot_prep_retry(struct io_kiocb *req,
				       struct io_async_msghdr *kmsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	req->flags &= ~REQ_F_BL_EMPTY;
	sr->done_io = 0;
	sr->retry = false;
	sr->len = 0; /* get from the provided buffer */
}

static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			     const struct iovec __user *uiov, unsigned uvec_seg,
			     int ddir)
{
	struct iovec *iov;
	int ret, nr_segs;

	if (iomsg->vec.iovec) {
		nr_segs = iomsg->vec.nr;
		iov = iomsg->vec.iovec;
	} else {
		nr_segs = 1;
		iov = &iomsg->fast_iov;
	}

	ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov,
			     &iomsg->msg.msg_iter, io_is_compat(req->ctx));
	if (unlikely(ret < 0))
		return ret;

	if (iov) {
		req->flags |= REQ_F_NEED_CLEANUP;
		io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs);
	}
	return 0;
}

static int io_compat_msg_copy_hdr(struct io_kiocb *req,
				  struct io_async_msghdr *iomsg,
				  struct compat_msghdr *msg, int ddir,
				  struct sockaddr __user **save_addr)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct compat_iovec __user *uiov;
	int ret;

	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
		return -EFAULT;

	ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr);
	if (ret)
		return ret;

	uiov = compat_ptr(msg->msg_iov);
	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (msg->msg_iovlen == 0) {
			sr->len = 0;
		} else if (msg->msg_iovlen > 1) {
			return -EINVAL;
		} else {
			struct compat_iovec tmp_iov;

			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
				return -EFAULT;
			sr->len = tmp_iov.iov_len;
		}
	}
	return 0;
}

static int io_copy_msghdr_from_user(struct user_msghdr *msg,
				    struct user_msghdr __user *umsg)
{
	if (!user_access_begin(umsg, sizeof(*umsg)))
		return -EFAULT;
	unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
	unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
	unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
	unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
	unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
	unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
	user_access_end();
	return 0;
ua_end:
	user_access_end();
	return -EFAULT;
}
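/*
 * Copy a user_msghdr into the kernel, transparently handling the compat
 * layout: for compat tasks the compat_msghdr is converted into a native
 * struct user_msghdr so callers only deal with one representation. With
 * IOSQE_BUFFER_SELECT, only 0 or 1 iovecs are allowed and sr->len is taken
 * from the single entry.
 */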
static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			   struct user_msghdr *msg, int ddir,
			   struct sockaddr __user **save_addr)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct user_msghdr __user *umsg = sr->umsg;
	int ret;

	iomsg->msg.msg_name = &iomsg->addr;
	iomsg->msg.msg_iter.nr_segs = 0;

	if (io_is_compat(req->ctx)) {
		struct compat_msghdr cmsg;

		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
		if (ret)
			return ret;

		memset(msg, 0, sizeof(*msg));
		msg->msg_namelen = cmsg.msg_namelen;
		msg->msg_controllen = cmsg.msg_controllen;
		msg->msg_iov = compat_ptr(cmsg.msg_iov);
		msg->msg_iovlen = cmsg.msg_iovlen;
		return 0;
	}

	ret = io_copy_msghdr_from_user(msg, umsg);
	if (unlikely(ret))
		return ret;

	msg->msg_flags = 0;

	ret = __copy_msghdr(&iomsg->msg, msg, save_addr);
	if (ret)
		return ret;

	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (msg->msg_iovlen == 0) {
			sr->len = 0;
		} else if (msg->msg_iovlen > 1) {
			return -EINVAL;
		} else {
			struct iovec __user *uiov = msg->msg_iov;
			struct iovec tmp_iov;

			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
				return -EFAULT;
			sr->len = tmp_iov.iov_len;
		}
	}
	return 0;
}

void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
{
	struct io_async_msghdr *io = req->async_data;

	io_netmsg_iovec_free(io);
}

static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	void __user *addr;
	u16 addr_len;
	int ret;

	sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));

	if (READ_ONCE(sqe->__pad3[0]))
		return -EINVAL;

	kmsg->msg.msg_name = NULL;
	kmsg->msg.msg_namelen = 0;
	kmsg->msg.msg_control = NULL;
	kmsg->msg.msg_controllen = 0;
	kmsg->msg.msg_ubuf = NULL;

	addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	addr_len = READ_ONCE(sqe->addr_len);
	if (addr) {
		ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
		if (unlikely(ret < 0))
			return ret;
		kmsg->msg.msg_name = &kmsg->addr;
		kmsg->msg.msg_namelen = addr_len;
	}
	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
		req->flags |= REQ_F_IMPORT_BUFFER;
		return 0;
	}
	if (req->flags & REQ_F_BUFFER_SELECT)
		return 0;
	return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
}

static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct user_msghdr msg;
	int ret;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
	if (unlikely(ret))
		return ret;
	/* save msg_control as sys_sendmsg() overwrites it */
	sr->msg_control = kmsg->msg.msg_control_user;

	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
		kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
		return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov,
					 msg.msg_iovlen);
	}
	if (req->flags & REQ_F_BUFFER_SELECT)
		return 0;
	return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE);
}

#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)

int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;
	sr->retry = false;
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~SENDMSG_FLAGS)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (req->flags & REQ_F_BUFFER_SELECT)
		sr->buf_group = req->buf_index;
	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		if (req->opcode == IORING_OP_SENDMSG)
			return -EINVAL;
		sr->msg_flags |= MSG_WAITALL;
		req->buf_list = NULL;
		req->flags |= REQ_F_MULTISHOT;
	}

	if (io_is_compat(req->ctx))
		sr->msg_flags |= MSG_CMSG_COMPAT;

	if (unlikely(!io_msg_alloc_async(req)))
		return -ENOMEM;
	if (req->opcode != IORING_OP_SENDMSG)
		return io_send_setup(req, sqe);
	if (unlikely(sqe->addr2 || sqe->file_index))
		return -EINVAL;
	return io_sendmsg_setup(req, sqe);
}
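/*
 * Illustrative only: a hedged sketch of arming the bundle send accepted by
 * io_sendmsg_prep() above (assumes liburing and an already-populated
 * provided-buffer group BGID). With IORING_RECVSEND_BUNDLE the kernel picks
 * as many buffers from the group as it can and sends them in one operation;
 * a zero length means "no cap":
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *	io_uring_prep_send(sqe, sockfd, NULL, 0, 0);
 *	sqe->flags |= IOSQE_BUFFER_SELECT;	// buffers come from a group
 *	sqe->buf_group = BGID;			// provided buffer group id
 *	sqe->ioprio |= IORING_RECVSEND_BUNDLE;	// bundle flag lives in ioprio
 */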
static void io_req_msg_cleanup(struct io_kiocb *req,
			       unsigned int issue_flags)
{
	io_netmsg_recycle(req, issue_flags);
}

/*
 * For bundle completions, we need to figure out how many segments we consumed.
 * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
 * could be using an ITER_IOVEC. If the latter, then if we consumed all of
 * the segments, it's a trivial question to answer. If we have residual
 * data in the iter, then loop the segments to figure out how much we
 * transferred.
 */
static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
{
	struct iovec *iov;
	int nbufs;

	/* no data is always zero segments, and a ubuf is always 1 segment */
	if (ret <= 0)
		return 0;
	if (iter_is_ubuf(&kmsg->msg.msg_iter))
		return 1;

	iov = kmsg->vec.iovec;
	if (!iov)
		iov = &kmsg->fast_iov;

	/* if all data was transferred, it's basic pointer math */
	if (!iov_iter_count(&kmsg->msg.msg_iter))
		return iter_iov(&kmsg->msg.msg_iter) - iov;

	/* short transfer, count segments */
	nbufs = 0;
	do {
		int this_len = min_t(int, iov[nbufs].iov_len, ret);

		nbufs++;
		ret -= this_len;
	} while (ret);

	return nbufs;
}
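/*
 * Worked example for io_bundle_nbufs() above: with three selected buffers of
 * 4096 bytes each and a short send of 6000 bytes, the iter still has
 * residual data, so we walk the segments: 4096 (buffer 0) plus 1904 of
 * buffer 1 consumes the 6000 bytes and yields nbufs == 2; buffer 2 stays in
 * the group.
 */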
static inline bool io_send_finish(struct io_kiocb *req, int *ret,
				  struct io_async_msghdr *kmsg,
				  unsigned issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	bool bundle_finished = *ret <= 0;
	unsigned int cflags;

	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
		cflags = io_put_kbuf(req, *ret, issue_flags);
		goto finish;
	}

	cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags);

	if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
		goto finish;

	/*
	 * Fill CQE for this send and see if we should keep trying to
	 * send to this socket.
	 */
	if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
		io_mshot_prep_retry(req, kmsg);
		return false;
	}

	/* Otherwise stop bundle and use the current result. */
finish:
	io_req_set_res(req, *ret, cflags);
	*ret = IOU_COMPLETE;
	return true;
}

int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_control_user = sr->msg_control;

	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret > 0 && io_net_retry(sock, flags)) {
			kmsg->msg.msg_controllen = 0;
			kmsg->msg.msg_control = NULL;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	io_req_msg_cleanup(req, issue_flags);
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
				 struct io_async_msghdr *kmsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	int ret;
	struct buf_sel_arg arg = {
		.iovs = &kmsg->fast_iov,
		.max_len = min_not_zero(sr->len, INT_MAX),
		.nr_iovs = 1,
		.buf_group = sr->buf_group,
	};

	if (kmsg->vec.iovec) {
		arg.nr_iovs = kmsg->vec.nr;
		arg.iovs = kmsg->vec.iovec;
		arg.mode = KBUF_MODE_FREE;
	}

	if (!(sr->flags & IORING_RECVSEND_BUNDLE))
		arg.nr_iovs = 1;
	else
		arg.mode |= KBUF_MODE_EXPAND;

	ret = io_buffers_select(req, &arg, issue_flags);
	if (unlikely(ret < 0))
		return ret;

	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
		kmsg->vec.nr = ret;
		kmsg->vec.iovec = arg.iovs;
		req->flags |= REQ_F_NEED_CLEANUP;
	}
	sr->len = arg.out_len;

	if (ret == 1) {
		sr->buf = arg.iovs[0].iov_base;
		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret))
			return ret;
	} else {
		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
			      arg.iovs, ret, arg.out_len);
	}

	return 0;
}
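/*
 * Issue path for IORING_OP_SEND. For bundles this loops via retry_bundle:
 * each io_send_finish() that posts a CQE with IORING_CQE_F_MORE re-selects
 * buffers and sends again, until the buffer group runs dry or a send comes
 * up short.
 */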
int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;

retry_bundle:
	if (io_do_buffer_select(req)) {
		ret = io_send_select_buffer(req, issue_flags, kmsg);
		if (ret)
			return ret;
	}

	/*
	 * If MSG_WAITALL is set, or this is a bundle send, then we need
	 * the full amount. If just the bundle flag is set and we do a short
	 * send, we complete the bundle sequence rather than continue on.
	 */
	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
	kmsg->msg.msg_flags = flags;
	ret = sock_sendmsg(sock, &kmsg->msg);
	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	if (!io_send_finish(req, &ret, kmsg, issue_flags))
		goto retry_bundle;

	io_req_msg_cleanup(req, issue_flags);
	return ret;
}

static int io_recvmsg_mshot_prep(struct io_kiocb *req,
				 struct io_async_msghdr *iomsg,
				 int namelen, size_t controllen)
{
	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
	    (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
		int hdr;

		if (unlikely(namelen < 0))
			return -EOVERFLOW;
		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
				       namelen, &hdr))
			return -EOVERFLOW;
		if (check_add_overflow(hdr, controllen, &hdr))
			return -EOVERFLOW;

		iomsg->namelen = namelen;
		iomsg->controllen = controllen;
		return 0;
	}

	return 0;
}

static int io_recvmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	struct user_msghdr msg;
	int ret;

	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
	if (unlikely(ret))
		return ret;

	if (!(req->flags & REQ_F_BUFFER_SELECT)) {
		ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
					ITER_DEST);
		if (unlikely(ret))
			return ret;
	}
	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
				     msg.msg_controllen);
}

static int io_recvmsg_prep_setup(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg;

	kmsg = io_msg_alloc_async(req);
	if (unlikely(!kmsg))
		return -ENOMEM;

	if (req->opcode == IORING_OP_RECV) {
		kmsg->msg.msg_name = NULL;
		kmsg->msg.msg_namelen = 0;
		kmsg->msg.msg_inq = 0;
		kmsg->msg.msg_control = NULL;
		kmsg->msg.msg_get_inq = 1;
		kmsg->msg.msg_controllen = 0;
		kmsg->msg.msg_iocb = NULL;
		kmsg->msg.msg_ubuf = NULL;

		if (req->flags & REQ_F_BUFFER_SELECT)
			return 0;
		return import_ubuf(ITER_DEST, sr->buf, sr->len,
				   &kmsg->msg.msg_iter);
	}

	return io_recvmsg_copy_hdr(req, kmsg);
}
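/*
 * Illustrative only: a hedged userspace sketch of the multishot receive that
 * io_recvmsg_prep() below accepts (assumes liburing; BGID names a registered
 * provided-buffer group). Multishot requires buffer selection and a zero
 * length:
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *	io_uring_prep_recv_multishot(sqe, sockfd, NULL, 0, 0);
 *	sqe->flags |= IOSQE_BUFFER_SELECT;
 *	sqe->buf_group = BGID;
 *
 * Completions then arrive as a stream of CQEs carrying selected-buffer IDs
 * (IORING_CQE_F_BUFFER) and IORING_CQE_F_MORE while the request stays armed.
 */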
#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
		       IORING_RECVSEND_BUNDLE)

int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;
	sr->retry = false;

	if (unlikely(sqe->file_index || sqe->addr2))
		return -EINVAL;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~RECVMSG_FLAGS)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags);
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (sr->msg_flags & MSG_ERRQUEUE)
		req->flags |= REQ_F_CLEAR_POLLIN;
	if (req->flags & REQ_F_BUFFER_SELECT) {
		/*
		 * Store the buffer group for this multishot receive separately,
		 * as if we end up doing an io-wq based issue that selects a
		 * buffer, it has to be committed immediately and that will
		 * clear ->buf_list. This means we lose the link to the buffer
		 * list, and the eventual buffer put on completion then cannot
		 * restore it.
		 */
		sr->buf_group = req->buf_index;
		req->buf_list = NULL;
	}
	if (sr->flags & IORING_RECV_MULTISHOT) {
		if (!(req->flags & REQ_F_BUFFER_SELECT))
			return -EINVAL;
		if (sr->msg_flags & MSG_WAITALL)
			return -EINVAL;
		if (req->opcode == IORING_OP_RECV && sr->len)
			return -EINVAL;
		req->flags |= REQ_F_APOLL_MULTISHOT;
	}
	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		if (req->opcode == IORING_OP_RECVMSG)
			return -EINVAL;
	}

	if (io_is_compat(req->ctx))
		sr->msg_flags |= MSG_CMSG_COMPAT;

	sr->nr_multishot_loops = 0;
	return io_recvmsg_prep_setup(req);
}

/* bits to clear in old and inherit in new cflags on bundle retry */
#define CQE_F_MASK (IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE)

/*
 * Finishes io_recv and io_recvmsg.
 *
 * Returns true if it is actually finished, or false if it should run
 * again (for multishot).
 */
static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
				  struct io_async_msghdr *kmsg,
				  bool mshot_finished, unsigned issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	unsigned int cflags = 0;

	if (kmsg->msg.msg_inq > 0)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		size_t this_ret = *ret - sr->done_io;

		cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret),
				       issue_flags);
		if (sr->retry)
			cflags = req->cqe.flags | (cflags & CQE_F_MASK);
		/* bundle with no more immediate buffers, we're done */
		if (req->flags & REQ_F_BL_EMPTY)
			goto finish;
		/*
		 * If more is available AND it was a full transfer, retry and
		 * append to this one
		 */
		if (!sr->retry && kmsg->msg.msg_inq > 1 && this_ret > 0 &&
		    !iov_iter_count(&kmsg->msg.msg_iter)) {
			req->cqe.flags = cflags & ~CQE_F_MASK;
			sr->len = kmsg->msg.msg_inq;
			sr->done_io += this_ret;
			sr->retry = true;
			return false;
		}
	} else {
		cflags |= io_put_kbuf(req, *ret, issue_flags);
	}

	/*
	 * Fill CQE for this receive and see if we should keep trying to
	 * receive from this socket.
	 */
	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
	    io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
		*ret = IOU_RETRY;
		io_mshot_prep_retry(req, kmsg);
		/* Known not-empty or unknown state, retry */
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
				return false;
			/* mshot retries exceeded, force a requeue */
			sr->nr_multishot_loops = 0;
			if (issue_flags & IO_URING_F_MULTISHOT)
				*ret = IOU_REQUEUE;
		}
		return true;
	}

	/* Finish the request / stop multishot. */
finish:
	io_req_set_res(req, *ret, cflags);
	*ret = IOU_COMPLETE;
	io_req_msg_cleanup(req, issue_flags);
	return true;
}
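/*
 * For multishot recvmsg the selected buffer is carved up as follows (see
 * io_recvmsg_prep_multishot() below and struct io_uring_recvmsg_out in the
 * uapi header):
 *
 *	[ struct io_uring_recvmsg_out ][ name (namelen) ][ control ][ payload ]
 *
 * The fixed-size header reports the actual name/control/payload lengths and
 * flags; the payload starts at buf + sizeof(out) + namelen + controllen.
 */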
static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
				     struct io_sr_msg *sr, void __user **buf,
				     size_t *len)
{
	unsigned long ubuf = (unsigned long) *buf;
	unsigned long hdr;

	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
		kmsg->controllen;
	if (*len < hdr)
		return -EFAULT;

	if (kmsg->controllen) {
		unsigned long control = ubuf + hdr - kmsg->controllen;

		kmsg->msg.msg_control_user = (void __user *) control;
		kmsg->msg.msg_controllen = kmsg->controllen;
	}

	sr->buf = *buf; /* stash for later copy */
	*buf = (void __user *) (ubuf + hdr);
	kmsg->payloadlen = *len = *len - hdr;
	return 0;
}

struct io_recvmsg_multishot_hdr {
	struct io_uring_recvmsg_out msg;
	struct sockaddr_storage addr;
};

static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
				struct io_async_msghdr *kmsg,
				unsigned int flags, bool *finished)
{
	int err;
	int copy_len;
	struct io_recvmsg_multishot_hdr hdr;

	if (kmsg->namelen)
		kmsg->msg.msg_name = &hdr.addr;
	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
	kmsg->msg.msg_namelen = 0;

	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;

	err = sock_recvmsg(sock, &kmsg->msg, flags);
	*finished = err <= 0;
	if (err < 0)
		return err;

	hdr.msg = (struct io_uring_recvmsg_out) {
		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
	};

	hdr.msg.payloadlen = err;
	if (err > kmsg->payloadlen)
		err = kmsg->payloadlen;

	copy_len = sizeof(struct io_uring_recvmsg_out);
	if (kmsg->msg.msg_namelen > kmsg->namelen)
		copy_len += kmsg->namelen;
	else
		copy_len += kmsg->msg.msg_namelen;

	/*
	 * "fromlen shall refer to the value before truncation.."
	 *					1003.1g
	 */
	hdr.msg.namelen = kmsg->msg.msg_namelen;

	/* ensure that there is no gap between hdr and sockaddr_storage */
	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
		     sizeof(struct io_uring_recvmsg_out));
	if (copy_to_user(io->buf, &hdr, copy_len)) {
		*finished = true;
		return -EFAULT;
	}

	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
			kmsg->controllen + err;
}
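/*
 * Illustrative only: userspace can unpack the layout written by
 * io_recvmsg_multishot() above by hand, or (assuming a recent liburing)
 * with its helpers, e.g.:
 *
 *	struct io_uring_recvmsg_out *o;
 *
 *	o = io_uring_recvmsg_validate(buf, cqe->res, &msg);
 *	if (o) {
 *		void *name = io_uring_recvmsg_name(o);
 *		void *payload = io_uring_recvmsg_payload(o, &msg);
 *		unsigned plen = io_uring_recvmsg_payload_length(o,
 *							cqe->res, &msg);
 *	}
 *
 * where "msg" is the same struct msghdr the request was prepared with.
 */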
int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool mshot_finished = true;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

retry_multishot:
	if (io_do_buffer_select(req)) {
		void __user *buf;
		size_t len = sr->len;

		buf = io_buffer_select(req, &len, sr->buf_group, issue_flags);
		if (!buf)
			return -ENOBUFS;

		if (req->flags & REQ_F_APOLL_MULTISHOT) {
			ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
			if (ret) {
				io_kbuf_recycle(req, issue_flags);
				return ret;
			}
		}

		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
	}

	kmsg->msg.msg_get_inq = 1;
	kmsg->msg.msg_inq = -1;
	if (req->flags & REQ_F_APOLL_MULTISHOT) {
		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
					   &mshot_finished);
	} else {
		/* disable partial retry for recvmsg with cmsg attached */
		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
			min_ret = iov_iter_count(&kmsg->msg.msg_iter);

		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
					 kmsg->uaddr, flags);
	}

	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT)
				io_kbuf_recycle(req, issue_flags);

			return IOU_RETRY;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return IOU_RETRY;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		req_set_fail(req);
	}

	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
		goto retry_multishot;

	return ret;
}
static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
			      size_t *len, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	int ret;

	/*
	 * If the ring isn't locked, then don't use the peek interface
	 * to grab multiple buffers as we will lock/unlock between
	 * this selection and posting the buffers.
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
	    sr->flags & IORING_RECVSEND_BUNDLE) {
		struct buf_sel_arg arg = {
			.iovs = &kmsg->fast_iov,
			.nr_iovs = 1,
			.mode = KBUF_MODE_EXPAND,
			.buf_group = sr->buf_group,
		};

		if (kmsg->vec.iovec) {
			arg.nr_iovs = kmsg->vec.nr;
			arg.iovs = kmsg->vec.iovec;
			arg.mode |= KBUF_MODE_FREE;
		}

		if (kmsg->msg.msg_inq > 1)
			arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);

		ret = io_buffers_peek(req, &arg);
		if (unlikely(ret < 0))
			return ret;

		/* special case 1 vec, can be a fast path */
		if (ret == 1) {
			sr->buf = arg.iovs[0].iov_base;
			sr->len = arg.iovs[0].iov_len;
			goto map_ubuf;
		}
		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
			      arg.out_len);
		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
			kmsg->vec.nr = ret;
			kmsg->vec.iovec = arg.iovs;
			req->flags |= REQ_F_NEED_CLEANUP;
		}
	} else {
		void __user *buf;

		*len = sr->len;
		buf = io_buffer_select(req, len, sr->buf_group, issue_flags);
		if (!buf)
			return -ENOBUFS;
		sr->buf = buf;
		sr->len = *len;
map_ubuf:
		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret))
			return ret;
	}

	return 0;
}

int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	size_t len = sr->len;
	bool mshot_finished;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

retry_multishot:
	if (io_do_buffer_select(req)) {
		ret = io_recv_buf_select(req, kmsg, &len, issue_flags);
		if (unlikely(ret)) {
			kmsg->msg.msg_inq = -1;
			goto out_free;
		}
		sr->buf = NULL;
	}

	kmsg->msg.msg_flags = 0;
	kmsg->msg.msg_inq = -1;

	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	ret = sock_recvmsg(sock, &kmsg->msg, flags);
	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT)
				io_kbuf_recycle(req, issue_flags);

			return IOU_RETRY;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
		req_set_fail(req);
	}

	mshot_finished = ret <= 0;
	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
		goto retry_multishot;

	return ret;
}
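/*
 * Zero-copy receive: IORING_OP_RECV_ZC bypasses the usual copy into user
 * memory; data lands in an interface queue (struct io_zcrx_ifq) that
 * userspace registered beforehand (via IORING_REGISTER_ZCRX_IFQ) and is
 * referenced by index through sqe->zcrx_ifq_idx. The request is
 * multishot-only and posts all data completions as aux CQEs.
 */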
int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
	unsigned ifq_idx;

	if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3))
		return -EINVAL;

	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
	zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
	if (!zc->ifq)
		return -EINVAL;

	zc->len = READ_ONCE(sqe->len);
	zc->flags = READ_ONCE(sqe->ioprio);
	zc->msg_flags = READ_ONCE(sqe->msg_flags);
	if (zc->msg_flags)
		return -EINVAL;
	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
		return -EINVAL;
	/* multishot required */
	if (!(zc->flags & IORING_RECV_MULTISHOT))
		return -EINVAL;
	/* All data completions are posted as aux CQEs. */
	req->flags |= REQ_F_APOLL_MULTISHOT;

	return 0;
}

int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
	struct socket *sock;
	unsigned int len;
	int ret;

	if (!(req->flags & REQ_F_POLLED) &&
	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	len = zc->len;
	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
			   issue_flags, &zc->len);
	if (len && zc->len == 0) {
		io_req_set_res(req, 0, 0);

		return IOU_COMPLETE;
	}
	if (unlikely(ret <= 0) && ret != -EAGAIN) {
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		if (ret == IOU_REQUEUE)
			return IOU_REQUEUE;

		req_set_fail(req);
		io_req_set_res(req, ret, 0);
		return IOU_COMPLETE;
	}
	return IOU_RETRY;
}
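/*
 * Zero-copy sends complete in two steps: the request CQE (posted with
 * IORING_CQE_F_MORE) carries the send result, and a second CQE with
 * IORING_CQE_F_NOTIF and the same user_data fires once the kernel is done
 * with the user pages. Illustrative only, a hedged userspace pattern
 * (assumes liburing; the two handlers are hypothetical helpers):
 *
 *	io_uring_wait_cqe(&ring, &cqe);
 *	if (cqe->flags & IORING_CQE_F_NOTIF)
 *		buffer_reuse_ok(cqe->user_data);
 *	else if (cqe->flags & IORING_CQE_F_MORE)
 *		record_send_result(cqe->res);
 *	io_uring_cqe_seen(&ring, cqe);
 */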
void io_send_zc_cleanup(struct io_kiocb *req)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *io = req->async_data;

	if (req_has_async_data(req))
		io_netmsg_iovec_free(io);
	if (zc->notif) {
		io_notif_flush(zc->notif);
		zc->notif = NULL;
	}
}

#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)

int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_async_msghdr *iomsg;
	struct io_kiocb *notif;
	int ret;

	zc->done_io = 0;
	zc->retry = false;

	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
		return -EINVAL;
	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
	if (req->flags & REQ_F_CQE_SKIP)
		return -EINVAL;

	notif = zc->notif = io_alloc_notif(ctx);
	if (!notif)
		return -ENOMEM;
	notif->cqe.user_data = req->cqe.user_data;
	notif->cqe.res = 0;
	notif->cqe.flags = IORING_CQE_F_NOTIF;
	req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;

	zc->flags = READ_ONCE(sqe->ioprio);
	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
		if (zc->flags & ~IO_ZC_FLAGS_VALID)
			return -EINVAL;
		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
			struct io_notif_data *nd = io_notif_to_data(notif);

			nd->zc_report = true;
			nd->zc_used = false;
			nd->zc_copied = false;
		}
	}

	zc->len = READ_ONCE(sqe->len);
	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
	req->buf_index = READ_ONCE(sqe->buf_index);
	if (zc->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

	if (io_is_compat(req->ctx))
		zc->msg_flags |= MSG_CMSG_COMPAT;

	iomsg = io_msg_alloc_async(req);
	if (unlikely(!iomsg))
		return -ENOMEM;

	if (req->opcode == IORING_OP_SEND_ZC) {
		ret = io_send_setup(req, sqe);
	} else {
		if (unlikely(sqe->addr2 || sqe->file_index))
			return -EINVAL;
		ret = io_sendmsg_setup(req, sqe);
	}
	if (unlikely(ret))
		return ret;

	if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) {
		iomsg->msg.sg_from_iter = io_sg_from_iter_iovec;
		return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count);
	}
	iomsg->msg.sg_from_iter = io_sg_from_iter;
	return 0;
}

static int io_sg_from_iter_iovec(struct sk_buff *skb,
				 struct iov_iter *from, size_t length)
{
	skb_zcopy_downgrade_managed(skb);
	return zerocopy_fill_skb_from_iter(skb, from, length);
}

static int io_sg_from_iter(struct sk_buff *skb,
			   struct iov_iter *from, size_t length)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int frag = shinfo->nr_frags;
	int ret = 0;
	struct bvec_iter bi;
	ssize_t copied = 0;
	unsigned long truesize = 0;

	if (!frag)
		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
	else if (unlikely(!skb_zcopy_managed(skb)))
		return zerocopy_fill_skb_from_iter(skb, from, length);

	bi.bi_size = min(from->count, length);
	bi.bi_bvec_done = from->iov_offset;
	bi.bi_idx = 0;

	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);

		copied += v.bv_len;
		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
					   v.bv_offset, v.bv_len);
		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
	}
	if (bi.bi_size)
		ret = -EMSGSIZE;

	shinfo->nr_frags = frag;
	from->bvec += bi.bi_idx;
	from->nr_segs -= bi.bi_idx;
	from->count -= copied;
	from->iov_offset = bi.bi_bvec_done;

	skb->data_len += copied;
	skb->len += copied;
	skb->truesize += truesize;
	return ret;
}

static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;

	WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));

	sr->notif->buf_index = req->buf_index;
	return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter,
				 (u64)(uintptr_t)sr->buf, sr->len,
				 ITER_SOURCE, issue_flags);
}
int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned msg_flags;
	int ret, min_ret = 0;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (!(req->flags & REQ_F_POLLED) &&
	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	if (req->flags & REQ_F_IMPORT_BUFFER) {
		req->flags &= ~REQ_F_IMPORT_BUFFER;
		ret = io_send_zc_import(req, issue_flags);
		if (unlikely(ret))
			return ret;
	}

	msg_flags = zc->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		msg_flags |= MSG_DONTWAIT;
	if (msg_flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;

	kmsg->msg.msg_flags = msg_flags;
	kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
	ret = sock_sendmsg(sock, &kmsg->msg);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
			zc->len -= ret;
			zc->buf += ret;
			zc->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += zc->done_io;
	else if (zc->done_io)
		ret = zc->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
	 * flushing notif to io_send_zc_cleanup()
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(zc->notif);
		zc->notif = NULL;
		io_req_msg_cleanup(req, 0);
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_COMPLETE;
}

int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;

	if (req->flags & REQ_F_IMPORT_BUFFER) {
		unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
		int ret;

		ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req,
					&kmsg->vec, uvec_segs, issue_flags);
		if (unlikely(ret))
			return ret;
		req->flags &= ~REQ_F_IMPORT_BUFFER;
	}

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_control_user = sr->msg_control;
	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
	 * flushing notif to io_send_zc_cleanup()
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(sr->notif);
		sr->notif = NULL;
		io_req_msg_cleanup(req, 0);
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_COMPLETE;
}

void io_sendrecv_fail(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	if (sr->done_io)
		req->cqe.res = sr->done_io;

	if ((req->flags & REQ_F_NEED_CLEANUP) &&
	    (req->opcode == IORING_OP_SEND_ZC ||
	     req->opcode == IORING_OP_SENDMSG_ZC))
		req->cqe.flags |= IORING_CQE_F_MORE;
}
#define ACCEPT_FLAGS (IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
		      IORING_ACCEPT_POLL_FIRST)

int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);

	if (sqe->len || sqe->buf_index)
		return -EINVAL;

	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	accept->flags = READ_ONCE(sqe->accept_flags);
	accept->nofile = rlimit(RLIMIT_NOFILE);
	accept->iou_flags = READ_ONCE(sqe->ioprio);
	if (accept->iou_flags & ~ACCEPT_FLAGS)
		return -EINVAL;

	accept->file_slot = READ_ONCE(sqe->file_index);
	if (accept->file_slot) {
		if (accept->flags & SOCK_CLOEXEC)
			return -EINVAL;
		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
			return -EINVAL;
	}
	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
		req->flags |= REQ_F_APOLL_MULTISHOT;
	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	return 0;
}
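/*
 * Illustrative only: a hedged sketch of the multishot accept that
 * io_accept_prep() above accepts (assumes liburing). Each incoming
 * connection posts its own CQE with the new fd in cqe->res and
 * IORING_CQE_F_MORE set while the request stays armed:
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *	io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
 */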
int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool fixed = !!accept->file_slot;
	struct proto_accept_arg arg = {
		.flags = force_nonblock ? O_NONBLOCK : 0,
	};
	struct file *file;
	unsigned cflags;
	int ret, fd;

	if (!(req->flags & REQ_F_POLLED) &&
	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
		return -EAGAIN;

retry:
	if (!fixed) {
		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	arg.err = 0;
	arg.is_empty = -1;
	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
			 accept->flags);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && force_nonblock &&
		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT))
			return IOU_RETRY;

		if (ret == -ERESTARTSYS)
			ret = -EINTR;
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					  accept->file_slot);
	}

	cflags = 0;
	if (!arg.is_empty)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) &&
	    io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
			goto retry;
		return IOU_RETRY;
	}

	io_req_set_res(req, ret, cflags);
	if (ret < 0)
		req_set_fail(req);
	return IOU_COMPLETE;
}

int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);

	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
		return -EINVAL;

	sock->domain = READ_ONCE(sqe->fd);
	sock->type = READ_ONCE(sqe->off);
	sock->protocol = READ_ONCE(sqe->len);
	sock->file_slot = READ_ONCE(sqe->file_index);
	sock->nofile = rlimit(RLIMIT_NOFILE);

	sock->flags = sock->type & ~SOCK_TYPE_MASK;
	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
		return -EINVAL;
	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	return 0;
}

int io_socket(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
	bool fixed = !!sock->file_slot;
	struct file *file;
	int ret, fd;

	if (!fixed) {
		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					  sock->file_slot);
	}
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_msghdr *io;

	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	conn->addr_len = READ_ONCE(sqe->addr2);
	conn->in_progress = conn->seen_econnaborted = false;

	io = io_msg_alloc_async(req);
	if (unlikely(!io))
		return -ENOMEM;

	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
}

int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_msghdr *io = req->async_data;
	unsigned file_flags;
	int ret;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	if (unlikely(req->flags & REQ_F_FAIL)) {
		ret = -ECONNRESET;
		goto out;
	}

	file_flags = force_nonblock ? O_NONBLOCK : 0;

	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
				 file_flags);
	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
	    && force_nonblock) {
		if (ret == -EINPROGRESS) {
			connect->in_progress = true;
		} else if (ret == -ECONNABORTED) {
			if (connect->seen_econnaborted)
				goto out;
			connect->seen_econnaborted = true;
		}
		return -EAGAIN;
	}
	if (connect->in_progress) {
		/*
		 * At least bluetooth will return -EBADFD on a re-connect
		 * attempt, and it's (supposedly) also valid to get -EISCONN
		 * which means the previous result is good. For both of these,
		 * grab the sock_error() and use that for the completion.
		 */
		if (ret == -EBADFD || ret == -EISCONN)
			ret = sock_error(sock_from_file(req->file)->sk);
	}
	if (ret == -ERESTARTSYS)
		ret = -EINTR;
out:
	if (ret < 0)
		req_set_fail(req);
	io_req_msg_cleanup(req, issue_flags);
	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;
}

int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
	struct sockaddr __user *uaddr;
	struct io_async_msghdr *io;

	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	bind->addr_len = READ_ONCE(sqe->addr2);

	io = io_msg_alloc_async(req);
	if (unlikely(!io))
		return -ENOMEM;
	return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
}

int io_bind(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
	struct io_async_msghdr *io = req->async_data;
	struct socket *sock;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return 0;
}

int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);

	if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
		return -EINVAL;

	listen->backlog = READ_ONCE(sqe->len);
	return 0;
}

int io_listen(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
	struct socket *sock;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_listen_socket(sock, listen->backlog);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return 0;
}
void io_netmsg_cache_free(const void *entry)
{
	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;

	io_vec_free(&kmsg->vec);
	kfree(kmsg);
}