// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "alloc_cache.h"
#include "net.h"
#include "notif.h"
#include "rsrc.h"
#include "zcrx.h"

#if defined(CONFIG_NET)
struct io_shutdown {
	struct file *file;
	int how;
};

struct io_accept {
	struct file *file;
	struct sockaddr __user *addr;
	int __user *addr_len;
	int flags;
	int iou_flags;
	u32 file_slot;
	unsigned long nofile;
};

struct io_socket {
	struct file *file;
	int domain;
	int type;
	int protocol;
	int flags;
	u32 file_slot;
	unsigned long nofile;
};

struct io_connect {
	struct file *file;
	struct sockaddr __user *addr;
	int addr_len;
	bool in_progress;
	bool seen_econnaborted;
};

struct io_bind {
	struct file *file;
	int addr_len;
};

struct io_listen {
	struct file *file;
	int backlog;
};

struct io_sr_msg {
	struct file *file;
	union {
		struct compat_msghdr __user *umsg_compat;
		struct user_msghdr __user *umsg;
		void __user *buf;
	};
	int len;
	unsigned done_io;
	unsigned msg_flags;
	unsigned nr_multishot_loops;
	u16 flags;
	/* initialised and used only by !msg send variants */
	u16 buf_group;
	bool retry;
	void __user *msg_control;
	/* used only for send zerocopy */
	struct io_kiocb *notif;
};

/*
 * Number of times we'll try and do receives if there's more data. If we
 * exceed this limit, then add us to the back of the queue and retry from
 * there. This helps fairness between flooding clients.
 */
#define MULTISHOT_MAX_RETRY	32

struct io_recvzc {
	struct file *file;
	unsigned msg_flags;
	u16 flags;
	u32 len;
	struct io_zcrx_ifq *ifq;
};

static int io_sg_from_iter_iovec(struct sk_buff *skb,
				 struct iov_iter *from, size_t length);
static int io_sg_from_iter(struct sk_buff *skb,
			   struct iov_iter *from, size_t length);

int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);

	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
		     sqe->buf_index || sqe->splice_fd_in))
		return -EINVAL;

	shutdown->how = READ_ONCE(sqe->len);
	req->flags |= REQ_F_FORCE_ASYNC;
	return 0;
}

int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
	struct socket *sock;
	int ret;

	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_shutdown_sock(sock, shutdown->how);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static bool io_net_retry(struct socket *sock, int flags)
{
	if (!(flags & MSG_WAITALL))
		return false;
	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}

static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
{
	if (kmsg->vec.iovec)
		io_vec_free(&kmsg->vec);
}

static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_msghdr *hdr = req->async_data;

	/* can't recycle, ensure we free the iovec if we have one */
	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
		io_netmsg_iovec_free(hdr);
		return;
	}

	/* Let normal cleanup path reap it if we fail adding to the cache */
	io_alloc_cache_vec_kasan(&hdr->vec);
	if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
		io_vec_free(&hdr->vec);

	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
		req->async_data = NULL;
		req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
	}
}

static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_async_msghdr *hdr;

	hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req);
	if (!hdr)
		return NULL;

	/* If the async data was cached, we might have an iov cached inside. */
	if (hdr->vec.iovec)
		req->flags |= REQ_F_NEED_CLEANUP;
	return hdr;
}

static inline void io_mshot_prep_retry(struct io_kiocb *req,
				       struct io_async_msghdr *kmsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	req->flags &= ~REQ_F_BL_EMPTY;
	sr->done_io = 0;
	sr->retry = false;
	sr->len = 0; /* get from the provided buffer */
	req->buf_index = sr->buf_group;
}

static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			     const struct iovec __user *uiov, unsigned uvec_seg,
			     int ddir)
{
	struct iovec *iov;
	int ret, nr_segs;

	if (iomsg->vec.iovec) {
		nr_segs = iomsg->vec.nr;
		iov = iomsg->vec.iovec;
	} else {
		nr_segs = 1;
		iov = &iomsg->fast_iov;
	}

	ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov,
			     &iomsg->msg.msg_iter, io_is_compat(req->ctx));
	if (unlikely(ret < 0))
		return ret;

	if (iov) {
		req->flags |= REQ_F_NEED_CLEANUP;
		io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs);
	}
	return 0;
}

static int io_compat_msg_copy_hdr(struct io_kiocb *req,
				  struct io_async_msghdr *iomsg,
				  struct compat_msghdr *msg, int ddir,
				  struct sockaddr __user **save_addr)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct compat_iovec __user *uiov;
	int ret;

	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
		return -EFAULT;

	ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr);
	if (ret)
		return ret;

	uiov = compat_ptr(msg->msg_iov);
	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (msg->msg_iovlen == 0) {
			sr->len = 0;
		} else if (msg->msg_iovlen > 1) {
			return -EINVAL;
		} else {
			struct compat_iovec tmp_iov;

			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
				return -EFAULT;
			sr->len = tmp_iov.iov_len;
		}
	}
	return 0;
}

static int io_copy_msghdr_from_user(struct user_msghdr *msg,
				    struct user_msghdr __user *umsg)
{
	if (!user_access_begin(umsg, sizeof(*umsg)))
		return -EFAULT;
	unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
	unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
	unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
	unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
	unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
	unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
	user_access_end();
	return 0;
ua_end:
	user_access_end();
	return -EFAULT;
}

static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			   struct user_msghdr *msg, int ddir,
			   struct sockaddr __user **save_addr)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct user_msghdr __user *umsg = sr->umsg;
	int ret;

	iomsg->msg.msg_name = &iomsg->addr;
	iomsg->msg.msg_iter.nr_segs = 0;

	if (io_is_compat(req->ctx)) {
		struct compat_msghdr cmsg;

		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
		if (ret)
			return ret;

		memset(msg, 0, sizeof(*msg));
		msg->msg_namelen = cmsg.msg_namelen;
		msg->msg_controllen = cmsg.msg_controllen;
		msg->msg_iov = compat_ptr(cmsg.msg_iov);
		msg->msg_iovlen = cmsg.msg_iovlen;
		return 0;
	}

	ret = io_copy_msghdr_from_user(msg, umsg);
	if (unlikely(ret))
		return ret;

	msg->msg_flags = 0;
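	/*
	 * Note: msg_flags is not copied from userspace above (it is an
	 * output-only field), which is why it is cleared here before
	 * __copy_msghdr() below fills in the kernel msghdr from the copy.
	 */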

	ret = __copy_msghdr(&iomsg->msg, msg, save_addr);
	if (ret)
		return ret;

	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (msg->msg_iovlen == 0) {
			sr->len = 0;
		} else if (msg->msg_iovlen > 1) {
			return -EINVAL;
		} else {
			struct iovec __user *uiov = msg->msg_iov;
			struct iovec tmp_iov;

			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
				return -EFAULT;
			sr->len = tmp_iov.iov_len;
		}
	}
	return 0;
}

void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
{
	struct io_async_msghdr *io = req->async_data;

	io_netmsg_iovec_free(io);
}

static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	void __user *addr;
	u16 addr_len;
	int ret;

	sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));

	if (READ_ONCE(sqe->__pad3[0]))
		return -EINVAL;

	kmsg->msg.msg_name = NULL;
	kmsg->msg.msg_namelen = 0;
	kmsg->msg.msg_control = NULL;
	kmsg->msg.msg_controllen = 0;
	kmsg->msg.msg_ubuf = NULL;

	addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	addr_len = READ_ONCE(sqe->addr_len);
	if (addr) {
		ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
		if (unlikely(ret < 0))
			return ret;
		kmsg->msg.msg_name = &kmsg->addr;
		kmsg->msg.msg_namelen = addr_len;
	}
	if (sr->flags & IORING_RECVSEND_FIXED_BUF)
		return 0;
	if (!io_do_buffer_select(req)) {
		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret < 0))
			return ret;
	}
	return 0;
}

static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct user_msghdr msg;
	int ret;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
	if (unlikely(ret))
		return ret;
	/* save msg_control as sys_sendmsg() overwrites it */
	sr->msg_control = kmsg->msg.msg_control_user;

	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
		kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
		return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov,
					 msg.msg_iovlen);
	}
	if (req->flags & REQ_F_BUFFER_SELECT)
		return 0;
	return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE);
}

#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)

int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;
	sr->retry = false;
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~SENDMSG_FLAGS)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		if (req->opcode == IORING_OP_SENDMSG)
			return -EINVAL;
		if (!(req->flags & REQ_F_BUFFER_SELECT))
			return -EINVAL;
		sr->msg_flags |= MSG_WAITALL;
		sr->buf_group = req->buf_index;
		req->buf_list = NULL;
		req->flags |= REQ_F_MULTISHOT;
	}

	if (io_is_compat(req->ctx))
		sr->msg_flags |= MSG_CMSG_COMPAT;

	if (unlikely(!io_msg_alloc_async(req)))
		return -ENOMEM;
	if (req->opcode != IORING_OP_SENDMSG)
		return io_send_setup(req, sqe);
	if (unlikely(sqe->addr2 || sqe->file_index))
		return -EINVAL;
	return io_sendmsg_setup(req, sqe);
}

static void io_req_msg_cleanup(struct io_kiocb *req,
			       unsigned int issue_flags)
{
	io_netmsg_recycle(req, issue_flags);
}

/*
 * For bundle completions, we need to figure out how many segments we consumed.
 * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
 * could be using an ITER_IOVEC. If the latter, and we consumed all of the
 * segments, then it's a trivial question to answer. If we have residual
 * data in the iter, then loop the segments to figure out how much we
 * transferred.
 */
static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
{
	struct iovec *iov;
	int nbufs;

	/* no data is always zero segments, and a ubuf is always 1 segment */
	if (ret <= 0)
		return 0;
	if (iter_is_ubuf(&kmsg->msg.msg_iter))
		return 1;

	iov = kmsg->vec.iovec;
	if (!iov)
		iov = &kmsg->fast_iov;

	/* if all data was transferred, it's basic pointer math */
	if (!iov_iter_count(&kmsg->msg.msg_iter))
		return iter_iov(&kmsg->msg.msg_iter) - iov;

	/* short transfer, count segments */
	nbufs = 0;
	do {
		int this_len = min_t(int, iov[nbufs].iov_len, ret);

		nbufs++;
		ret -= this_len;
	} while (ret);

	return nbufs;
}

static inline bool io_send_finish(struct io_kiocb *req, int *ret,
				  struct io_async_msghdr *kmsg,
				  unsigned issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	bool bundle_finished = *ret <= 0;
	unsigned int cflags;

	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
		cflags = io_put_kbuf(req, *ret, issue_flags);
		goto finish;
	}

	cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags);

	if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
		goto finish;

	/*
	 * Fill CQE for this send and see if we should keep trying to
	 * send on this socket.
	 */
	if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
		io_mshot_prep_retry(req, kmsg);
		return false;
	}

	/* Otherwise stop bundle and use the current result. */
finish:
	io_req_set_res(req, *ret, cflags);
	*ret = IOU_OK;
	return true;
}

int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_control_user = sr->msg_control;

	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret > 0 && io_net_retry(sock, flags)) {
			kmsg->msg.msg_controllen = 0;
			kmsg->msg.msg_control = NULL;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	io_req_msg_cleanup(req, issue_flags);
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
				 struct io_async_msghdr *kmsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	int ret;
	struct buf_sel_arg arg = {
		.iovs = &kmsg->fast_iov,
		.max_len = min_not_zero(sr->len, INT_MAX),
		.nr_iovs = 1,
	};

	if (kmsg->vec.iovec) {
		arg.nr_iovs = kmsg->vec.nr;
		arg.iovs = kmsg->vec.iovec;
		arg.mode = KBUF_MODE_FREE;
	}

	if (!(sr->flags & IORING_RECVSEND_BUNDLE))
		arg.nr_iovs = 1;
	else
		arg.mode |= KBUF_MODE_EXPAND;

	ret = io_buffers_select(req, &arg, issue_flags);
	if (unlikely(ret < 0))
		return ret;

	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
		kmsg->vec.nr = ret;
		kmsg->vec.iovec = arg.iovs;
		req->flags |= REQ_F_NEED_CLEANUP;
	}
	sr->len = arg.out_len;

	if (ret == 1) {
		sr->buf = arg.iovs[0].iov_base;
		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret))
			return ret;
	} else {
		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
			      arg.iovs, ret, arg.out_len);
	}

	return 0;
}

int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;

retry_bundle:
	if (io_do_buffer_select(req)) {
		ret = io_send_select_buffer(req, issue_flags, kmsg);
		if (ret)
			return ret;
	}

	/*
	 * If MSG_WAITALL is set, or this is a bundle send, then we need
	 * the full amount. If just the bundle flag is set and we do a short
	 * send, then we complete the bundle sequence rather than continue on.
	 */
	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
	kmsg->msg.msg_flags = flags;
	ret = sock_sendmsg(sock, &kmsg->msg);
	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	if (!io_send_finish(req, &ret, kmsg, issue_flags))
		goto retry_bundle;

	io_req_msg_cleanup(req, issue_flags);
	return ret;
}

static int io_recvmsg_mshot_prep(struct io_kiocb *req,
				 struct io_async_msghdr *iomsg,
				 int namelen, size_t controllen)
{
	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
	    (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
		int hdr;

		if (unlikely(namelen < 0))
			return -EOVERFLOW;
		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
				       namelen, &hdr))
			return -EOVERFLOW;
		if (check_add_overflow(hdr, controllen, &hdr))
			return -EOVERFLOW;

		iomsg->namelen = namelen;
		iomsg->controllen = controllen;
		return 0;
	}

	return 0;
}

static int io_recvmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	struct user_msghdr msg;
	int ret;

	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
	if (unlikely(ret))
		return ret;

	if (!(req->flags & REQ_F_BUFFER_SELECT)) {
		ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
					ITER_DEST);
		if (unlikely(ret))
			return ret;
	}
	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
				     msg.msg_controllen);
}

static int io_recvmsg_prep_setup(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg;
	int ret;

	kmsg = io_msg_alloc_async(req);
	if (unlikely(!kmsg))
		return -ENOMEM;

	if (req->opcode == IORING_OP_RECV) {
		kmsg->msg.msg_name = NULL;
		kmsg->msg.msg_namelen = 0;
		kmsg->msg.msg_inq = 0;
		kmsg->msg.msg_control = NULL;
		kmsg->msg.msg_get_inq = 1;
		kmsg->msg.msg_controllen = 0;
		kmsg->msg.msg_iocb = NULL;
		kmsg->msg.msg_ubuf = NULL;

		if (!io_do_buffer_select(req)) {
			ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
					  &kmsg->msg.msg_iter);
			if (unlikely(ret))
				return ret;
		}
		return 0;
	}

	return io_recvmsg_copy_hdr(req, kmsg);
}

#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
		       IORING_RECVSEND_BUNDLE)

int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;
	sr->retry = false;

	if (unlikely(sqe->file_index || sqe->addr2))
		return -EINVAL;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~RECVMSG_FLAGS)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags);
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (sr->msg_flags & MSG_ERRQUEUE)
		req->flags |= REQ_F_CLEAR_POLLIN;
	if (req->flags & REQ_F_BUFFER_SELECT) {
		/*
		 * Store the buffer group for this multishot receive separately,
		 * as if we end up doing an io-wq based issue that selects a
		 * buffer, it has to be committed immediately and that will
		 * clear ->buf_list. This means we lose the link to the buffer
		 * list, and the eventual buffer put on completion then cannot
		 * restore it.
		 */
		sr->buf_group = req->buf_index;
		req->buf_list = NULL;
	}
	if (sr->flags & IORING_RECV_MULTISHOT) {
		if (!(req->flags & REQ_F_BUFFER_SELECT))
			return -EINVAL;
		if (sr->msg_flags & MSG_WAITALL)
			return -EINVAL;
		if (req->opcode == IORING_OP_RECV && sr->len)
			return -EINVAL;
		req->flags |= REQ_F_APOLL_MULTISHOT;
	}
	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		if (req->opcode == IORING_OP_RECVMSG)
			return -EINVAL;
	}

	if (io_is_compat(req->ctx))
		sr->msg_flags |= MSG_CMSG_COMPAT;

	sr->nr_multishot_loops = 0;
	return io_recvmsg_prep_setup(req);
}

/* bits to clear in old and inherit in new cflags on bundle retry */
#define CQE_F_MASK	(IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE)

/*
 * Finishes io_recv and io_recvmsg.
 *
 * Returns true if it is actually finished, or false if it should run
 * again (for multishot).
 */
static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
				  struct io_async_msghdr *kmsg,
				  bool mshot_finished, unsigned issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	unsigned int cflags = 0;

	if (kmsg->msg.msg_inq > 0)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		size_t this_ret = *ret - sr->done_io;

		cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, this_ret),
				       issue_flags);
		if (sr->retry)
			cflags = req->cqe.flags | (cflags & CQE_F_MASK);
		/* bundle with no more immediate buffers, we're done */
		if (req->flags & REQ_F_BL_EMPTY)
			goto finish;
		/*
		 * If more is available AND it was a full transfer, retry and
		 * append to this one
		 */
		if (!sr->retry && kmsg->msg.msg_inq > 0 && this_ret > 0 &&
		    !iov_iter_count(&kmsg->msg.msg_iter)) {
			req->cqe.flags = cflags & ~CQE_F_MASK;
			sr->len = kmsg->msg.msg_inq;
			sr->done_io += this_ret;
			sr->retry = true;
			return false;
		}
	} else {
		cflags |= io_put_kbuf(req, *ret, issue_flags);
	}

	/*
	 * Fill CQE for this receive and see if we should keep trying to
	 * receive from this socket.
	 */
	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
	    io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
		*ret = IOU_RETRY;
		io_mshot_prep_retry(req, kmsg);
		/* Known not-empty or unknown state, retry */
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
				return false;
			/* mshot retries exceeded, force a requeue */
			sr->nr_multishot_loops = 0;
			if (issue_flags & IO_URING_F_MULTISHOT)
				*ret = IOU_REQUEUE;
		}
		return true;
	}

	/* Finish the request / stop multishot. */
finish:
	io_req_set_res(req, *ret, cflags);
	*ret = IOU_COMPLETE;
	io_req_msg_cleanup(req, issue_flags);
	return true;
}

static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
				     struct io_sr_msg *sr, void __user **buf,
				     size_t *len)
{
	unsigned long ubuf = (unsigned long) *buf;
	unsigned long hdr;

	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
		kmsg->controllen;
	if (*len < hdr)
		return -EFAULT;

	if (kmsg->controllen) {
		unsigned long control = ubuf + hdr - kmsg->controllen;

		kmsg->msg.msg_control_user = (void __user *) control;
		kmsg->msg.msg_controllen = kmsg->controllen;
	}

	sr->buf = *buf; /* stash for later copy */
	*buf = (void __user *) (ubuf + hdr);
	kmsg->payloadlen = *len = *len - hdr;
	return 0;
}

struct io_recvmsg_multishot_hdr {
	struct io_uring_recvmsg_out msg;
	struct sockaddr_storage addr;
};

static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
				struct io_async_msghdr *kmsg,
				unsigned int flags, bool *finished)
{
	int err;
	int copy_len;
	struct io_recvmsg_multishot_hdr hdr;

	if (kmsg->namelen)
		kmsg->msg.msg_name = &hdr.addr;
	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
	kmsg->msg.msg_namelen = 0;

	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;

	err = sock_recvmsg(sock, &kmsg->msg, flags);
	*finished = err <= 0;
	if (err < 0)
		return err;

	hdr.msg = (struct io_uring_recvmsg_out) {
		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
	};

	hdr.msg.payloadlen = err;
	if (err > kmsg->payloadlen)
		err = kmsg->payloadlen;

	copy_len = sizeof(struct io_uring_recvmsg_out);
	if (kmsg->msg.msg_namelen > kmsg->namelen)
		copy_len += kmsg->namelen;
	else
		copy_len += kmsg->msg.msg_namelen;

	/*
	 * "fromlen shall refer to the value before truncation.."
	 *			1003.1g
	 */
	hdr.msg.namelen = kmsg->msg.msg_namelen;

	/* ensure that there is no gap between hdr and sockaddr_storage */
	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
		     sizeof(struct io_uring_recvmsg_out));
	if (copy_to_user(io->buf, &hdr, copy_len)) {
		*finished = true;
		return -EFAULT;
	}

	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
		kmsg->controllen + err;
}

int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool mshot_finished = true;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

retry_multishot:
	if (io_do_buffer_select(req)) {
		void __user *buf;
		size_t len = sr->len;

		buf = io_buffer_select(req, &len, issue_flags);
		if (!buf)
			return -ENOBUFS;

		if (req->flags & REQ_F_APOLL_MULTISHOT) {
			ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
			if (ret) {
				io_kbuf_recycle(req, issue_flags);
				return ret;
			}
		}

		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
	}

	kmsg->msg.msg_get_inq = 1;
	kmsg->msg.msg_inq = -1;
	if (req->flags & REQ_F_APOLL_MULTISHOT) {
		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
					   &mshot_finished);
	} else {
		/* disable partial retry for recvmsg with cmsg attached */
		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
			min_ret = iov_iter_count(&kmsg->msg.msg_iter);

		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
					 kmsg->uaddr, flags);
	}

	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT)
				io_kbuf_recycle(req, issue_flags);

			return IOU_RETRY;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return IOU_RETRY;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		req_set_fail(req);
	}

	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
		goto retry_multishot;

	return ret;
}

static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
			      size_t *len, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	int ret;

	/*
	 * If the ring isn't locked, then don't use the peek interface
	 * to grab multiple buffers as we will lock/unlock between
	 * this selection and posting the buffers.
1065 */ 1066 if (!(issue_flags & IO_URING_F_UNLOCKED) && 1067 sr->flags & IORING_RECVSEND_BUNDLE) { 1068 struct buf_sel_arg arg = { 1069 .iovs = &kmsg->fast_iov, 1070 .nr_iovs = 1, 1071 .mode = KBUF_MODE_EXPAND, 1072 }; 1073 1074 if (kmsg->vec.iovec) { 1075 arg.nr_iovs = kmsg->vec.nr; 1076 arg.iovs = kmsg->vec.iovec; 1077 arg.mode |= KBUF_MODE_FREE; 1078 } 1079 1080 if (kmsg->msg.msg_inq > 0) 1081 arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq); 1082 1083 ret = io_buffers_peek(req, &arg); 1084 if (unlikely(ret < 0)) 1085 return ret; 1086 1087 /* special case 1 vec, can be a fast path */ 1088 if (ret == 1) { 1089 sr->buf = arg.iovs[0].iov_base; 1090 sr->len = arg.iovs[0].iov_len; 1091 goto map_ubuf; 1092 } 1093 iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, 1094 arg.out_len); 1095 if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { 1096 kmsg->vec.nr = ret; 1097 kmsg->vec.iovec = arg.iovs; 1098 req->flags |= REQ_F_NEED_CLEANUP; 1099 } 1100 } else { 1101 void __user *buf; 1102 1103 *len = sr->len; 1104 buf = io_buffer_select(req, len, issue_flags); 1105 if (!buf) 1106 return -ENOBUFS; 1107 sr->buf = buf; 1108 sr->len = *len; 1109 map_ubuf: 1110 ret = import_ubuf(ITER_DEST, sr->buf, sr->len, 1111 &kmsg->msg.msg_iter); 1112 if (unlikely(ret)) 1113 return ret; 1114 } 1115 1116 return 0; 1117 } 1118 1119 int io_recv(struct io_kiocb *req, unsigned int issue_flags) 1120 { 1121 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1122 struct io_async_msghdr *kmsg = req->async_data; 1123 struct socket *sock; 1124 unsigned flags; 1125 int ret, min_ret = 0; 1126 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 1127 size_t len = sr->len; 1128 bool mshot_finished; 1129 1130 if (!(req->flags & REQ_F_POLLED) && 1131 (sr->flags & IORING_RECVSEND_POLL_FIRST)) 1132 return -EAGAIN; 1133 1134 sock = sock_from_file(req->file); 1135 if (unlikely(!sock)) 1136 return -ENOTSOCK; 1137 1138 flags = sr->msg_flags; 1139 if (force_nonblock) 1140 flags |= MSG_DONTWAIT; 1141 1142 retry_multishot: 1143 if (io_do_buffer_select(req)) { 1144 ret = io_recv_buf_select(req, kmsg, &len, issue_flags); 1145 if (unlikely(ret)) { 1146 kmsg->msg.msg_inq = -1; 1147 goto out_free; 1148 } 1149 sr->buf = NULL; 1150 } 1151 1152 kmsg->msg.msg_flags = 0; 1153 kmsg->msg.msg_inq = -1; 1154 1155 if (flags & MSG_WAITALL) 1156 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 1157 1158 ret = sock_recvmsg(sock, &kmsg->msg, flags); 1159 if (ret < min_ret) { 1160 if (ret == -EAGAIN && force_nonblock) { 1161 if (issue_flags & IO_URING_F_MULTISHOT) 1162 io_kbuf_recycle(req, issue_flags); 1163 1164 return IOU_RETRY; 1165 } 1166 if (ret > 0 && io_net_retry(sock, flags)) { 1167 sr->len -= ret; 1168 sr->buf += ret; 1169 sr->done_io += ret; 1170 req->flags |= REQ_F_BL_NO_RECYCLE; 1171 return -EAGAIN; 1172 } 1173 if (ret == -ERESTARTSYS) 1174 ret = -EINTR; 1175 req_set_fail(req); 1176 } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { 1177 out_free: 1178 req_set_fail(req); 1179 } 1180 1181 mshot_finished = ret <= 0; 1182 if (ret > 0) 1183 ret += sr->done_io; 1184 else if (sr->done_io) 1185 ret = sr->done_io; 1186 else 1187 io_kbuf_recycle(req, issue_flags); 1188 1189 if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) 1190 goto retry_multishot; 1191 1192 return ret; 1193 } 1194 1195 int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 1196 { 1197 struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); 1198 unsigned ifq_idx; 1199 
	if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr ||
		     sqe->addr3))
		return -EINVAL;

	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
	if (ifq_idx != 0)
		return -EINVAL;
	zc->ifq = req->ctx->ifq;
	if (!zc->ifq)
		return -EINVAL;
	zc->len = READ_ONCE(sqe->len);
	zc->flags = READ_ONCE(sqe->ioprio);
	zc->msg_flags = READ_ONCE(sqe->msg_flags);
	if (zc->msg_flags)
		return -EINVAL;
	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
		return -EINVAL;
	/* multishot required */
	if (!(zc->flags & IORING_RECV_MULTISHOT))
		return -EINVAL;
	/* All data completions are posted as aux CQEs. */
	req->flags |= REQ_F_APOLL_MULTISHOT;

	return 0;
}

int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
	struct socket *sock;
	unsigned int len;
	int ret;

	if (!(req->flags & REQ_F_POLLED) &&
	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	len = zc->len;
	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
			   issue_flags, &zc->len);
	if (len && zc->len == 0) {
		io_req_set_res(req, 0, 0);

		return IOU_COMPLETE;
	}
	if (unlikely(ret <= 0) && ret != -EAGAIN) {
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		if (ret == IOU_REQUEUE)
			return IOU_REQUEUE;

		req_set_fail(req);
		io_req_set_res(req, ret, 0);
		return IOU_COMPLETE;
	}
	return IOU_RETRY;
}

void io_send_zc_cleanup(struct io_kiocb *req)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *io = req->async_data;

	if (req_has_async_data(req))
		io_netmsg_iovec_free(io);
	if (zc->notif) {
		io_notif_flush(zc->notif);
		zc->notif = NULL;
	}
}

#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)

int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_async_msghdr *iomsg;
	struct io_kiocb *notif;
	int ret;

	zc->done_io = 0;
	zc->retry = false;

	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
		return -EINVAL;
	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
	if (req->flags & REQ_F_CQE_SKIP)
		return -EINVAL;

	notif = zc->notif = io_alloc_notif(ctx);
	if (!notif)
		return -ENOMEM;
	notif->cqe.user_data = req->cqe.user_data;
	notif->cqe.res = 0;
	notif->cqe.flags = IORING_CQE_F_NOTIF;
	req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;

	zc->flags = READ_ONCE(sqe->ioprio);
	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
		if (zc->flags & ~IO_ZC_FLAGS_VALID)
			return -EINVAL;
		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
			struct io_notif_data *nd = io_notif_to_data(notif);

			nd->zc_report = true;
			nd->zc_used = false;
			nd->zc_copied = false;
		}
	}

	zc->len = READ_ONCE(sqe->len);
	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
	req->buf_index = READ_ONCE(sqe->buf_index);
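	/*
	 * buf_index is only consumed when IORING_RECVSEND_FIXED_BUF is set,
	 * where it selects the registered buffer imported at issue time.
	 */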
	if (zc->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

	if (io_is_compat(req->ctx))
		zc->msg_flags |= MSG_CMSG_COMPAT;

	iomsg = io_msg_alloc_async(req);
	if (unlikely(!iomsg))
		return -ENOMEM;

	if (req->opcode == IORING_OP_SEND_ZC) {
		if (zc->flags & IORING_RECVSEND_FIXED_BUF)
			req->flags |= REQ_F_IMPORT_BUFFER;
		ret = io_send_setup(req, sqe);
	} else {
		if (unlikely(sqe->addr2 || sqe->file_index))
			return -EINVAL;
		ret = io_sendmsg_setup(req, sqe);
	}
	if (unlikely(ret))
		return ret;

	if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) {
		iomsg->msg.sg_from_iter = io_sg_from_iter_iovec;
		return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count);
	}
	iomsg->msg.sg_from_iter = io_sg_from_iter;
	return 0;
}

static int io_sg_from_iter_iovec(struct sk_buff *skb,
				 struct iov_iter *from, size_t length)
{
	skb_zcopy_downgrade_managed(skb);
	return zerocopy_fill_skb_from_iter(skb, from, length);
}

static int io_sg_from_iter(struct sk_buff *skb,
			   struct iov_iter *from, size_t length)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int frag = shinfo->nr_frags;
	int ret = 0;
	struct bvec_iter bi;
	ssize_t copied = 0;
	unsigned long truesize = 0;

	if (!frag)
		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
	else if (unlikely(!skb_zcopy_managed(skb)))
		return zerocopy_fill_skb_from_iter(skb, from, length);

	bi.bi_size = min(from->count, length);
	bi.bi_bvec_done = from->iov_offset;
	bi.bi_idx = 0;

	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);

		copied += v.bv_len;
		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
					   v.bv_offset, v.bv_len);
		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
	}
	if (bi.bi_size)
		ret = -EMSGSIZE;

	shinfo->nr_frags = frag;
	from->bvec += bi.bi_idx;
	from->nr_segs -= bi.bi_idx;
	from->count -= copied;
	from->iov_offset = bi.bi_bvec_done;

	skb->data_len += copied;
	skb->len += copied;
	skb->truesize += truesize;
	return ret;
}

static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;

	WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));

	sr->notif->buf_index = req->buf_index;
	return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter,
				 (u64)(uintptr_t)sr->buf, sr->len,
				 ITER_SOURCE, issue_flags);
}

int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned msg_flags;
	int ret, min_ret = 0;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (!(req->flags & REQ_F_POLLED) &&
	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	if (req->flags & REQ_F_IMPORT_BUFFER) {
		req->flags &= ~REQ_F_IMPORT_BUFFER;
		ret = io_send_zc_import(req, issue_flags);
		if (unlikely(ret))
			return ret;
	}

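	/*
	 * By now the payload is mapped into kmsg->msg.msg_iter, either at
	 * prep time (plain iovec/ubuf) or via the registered buffer import
	 * just above.
	 */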
	msg_flags = zc->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		msg_flags |= MSG_DONTWAIT;
	if (msg_flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;

	kmsg->msg.msg_flags = msg_flags;
	kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
	ret = sock_sendmsg(sock, &kmsg->msg);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
			zc->len -= ret;
			zc->buf += ret;
			zc->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += zc->done_io;
	else if (zc->done_io)
		ret = zc->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
	 * flushing notif to io_send_zc_cleanup()
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(zc->notif);
		zc->notif = NULL;
		io_req_msg_cleanup(req, 0);
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_OK;
}

int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;

	if (req->flags & REQ_F_IMPORT_BUFFER) {
		unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
		int ret;

		ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req,
					&kmsg->vec, uvec_segs, issue_flags);
		if (unlikely(ret))
			return ret;
		req->flags &= ~REQ_F_IMPORT_BUFFER;
	}

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_control_user = sr->msg_control;
	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
	 * flushing notif to io_send_zc_cleanup()
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(sr->notif);
		sr->notif = NULL;
		io_req_msg_cleanup(req, 0);
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_OK;
}

void io_sendrecv_fail(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	if (sr->done_io)
		req->cqe.res = sr->done_io;

	if ((req->flags & REQ_F_NEED_CLEANUP) &&
	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
		req->cqe.flags |= IORING_CQE_F_MORE;
}

#define ACCEPT_FLAGS (IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
		      IORING_ACCEPT_POLL_FIRST)

int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);

	if (sqe->len || sqe->buf_index)
		return -EINVAL;

	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	accept->flags = READ_ONCE(sqe->accept_flags);
	accept->nofile = rlimit(RLIMIT_NOFILE);
	accept->iou_flags = READ_ONCE(sqe->ioprio);
	if (accept->iou_flags & ~ACCEPT_FLAGS)
		return -EINVAL;

	accept->file_slot = READ_ONCE(sqe->file_index);
	if (accept->file_slot) {
		if (accept->flags & SOCK_CLOEXEC)
			return -EINVAL;
		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
			return -EINVAL;
	}
	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
		req->flags |= REQ_F_APOLL_MULTISHOT;
	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	return 0;
}

int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool fixed = !!accept->file_slot;
	struct proto_accept_arg arg = {
		.flags = force_nonblock ? O_NONBLOCK : 0,
	};
	struct file *file;
	unsigned cflags;
	int ret, fd;

	if (!(req->flags & REQ_F_POLLED) &&
	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
		return -EAGAIN;

retry:
	if (!fixed) {
		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	arg.err = 0;
	arg.is_empty = -1;
	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
			 accept->flags);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && force_nonblock &&
		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT))
			return IOU_RETRY;

		if (ret == -ERESTARTSYS)
			ret = -EINTR;
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					  accept->file_slot);
	}

	cflags = 0;
	if (!arg.is_empty)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) &&
	    io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
			goto retry;
		return IOU_RETRY;
	}

	io_req_set_res(req, ret, cflags);
	if (ret < 0)
		req_set_fail(req);
	return IOU_COMPLETE;
}

int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);

	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
		return -EINVAL;

	sock->domain = READ_ONCE(sqe->fd);
	sock->type = READ_ONCE(sqe->off);
	sock->protocol = READ_ONCE(sqe->len);
	sock->file_slot = READ_ONCE(sqe->file_index);
	sock->nofile = rlimit(RLIMIT_NOFILE);

	sock->flags = sock->type & ~SOCK_TYPE_MASK;
	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
		return -EINVAL;
	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	return 0;
}

int io_socket(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
	bool fixed = !!sock->file_slot;
	struct file *file;
	int ret, fd;

	if (!fixed) {
		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					  sock->file_slot);
	}
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_msghdr *io;

	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	conn->addr_len = READ_ONCE(sqe->addr2);
	conn->in_progress = conn->seen_econnaborted = false;

	io = io_msg_alloc_async(req);
	if (unlikely(!io))
		return -ENOMEM;

	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
}

int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_msghdr *io = req->async_data;
	unsigned file_flags;
	int ret;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	if (unlikely(req->flags & REQ_F_FAIL)) {
		ret = -ECONNRESET;
		goto out;
	}

	file_flags = force_nonblock ? O_NONBLOCK : 0;

	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
				 file_flags);
	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
	    && force_nonblock) {
		if (ret == -EINPROGRESS) {
			connect->in_progress = true;
		} else if (ret == -ECONNABORTED) {
			if (connect->seen_econnaborted)
				goto out;
			connect->seen_econnaborted = true;
		}
		return -EAGAIN;
	}
	if (connect->in_progress) {
		/*
		 * At least bluetooth will return -EBADFD on a re-connect
		 * attempt, and it's (supposedly) also valid to get -EISCONN
		 * which means the previous result is good. For both of these,
		 * grab the sock_error() and use that for the completion.
		 */
		if (ret == -EBADFD || ret == -EISCONN)
			ret = sock_error(sock_from_file(req->file)->sk);
	}
	if (ret == -ERESTARTSYS)
		ret = -EINTR;
out:
	if (ret < 0)
		req_set_fail(req);
	io_req_msg_cleanup(req, issue_flags);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
	struct sockaddr __user *uaddr;
	struct io_async_msghdr *io;

	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	bind->addr_len = READ_ONCE(sqe->addr2);

	io = io_msg_alloc_async(req);
	if (unlikely(!io))
		return -ENOMEM;
	return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
}

int io_bind(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
	struct io_async_msghdr *io = req->async_data;
	struct socket *sock;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return 0;
}

int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);

	if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
		return -EINVAL;

	listen->backlog = READ_ONCE(sqe->len);
	return 0;
}

int io_listen(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
	struct socket *sock;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_listen_socket(sock, listen->backlog);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return 0;
}

void io_netmsg_cache_free(const void *entry)
{
	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
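	/* entry comes from the ring's netmsg allocation cache; drop any cached iovec too */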

	io_vec_free(&kmsg->vec);
	kfree(kmsg);
}
#endif