// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "alloc_cache.h"
#include "net.h"
#include "notif.h"
#include "rsrc.h"

#if defined(CONFIG_NET)
struct io_shutdown {
	struct file	*file;
	int		how;
};

struct io_accept {
	struct file	*file;
	struct sockaddr	__user *addr;
	int __user	*addr_len;
	int		flags;
	int		iou_flags;
	u32		file_slot;
	unsigned long	nofile;
};

struct io_socket {
	struct file	*file;
	int		domain;
	int		type;
	int		protocol;
	int		flags;
	u32		file_slot;
	unsigned long	nofile;
};

struct io_connect {
	struct file	*file;
	struct sockaddr	__user *addr;
	int		addr_len;
	bool		in_progress;
	bool		seen_econnaborted;
};

struct io_bind {
	struct file	*file;
	int		addr_len;
};

struct io_listen {
	struct file	*file;
	int		backlog;
};

struct io_sr_msg {
	struct file	*file;
	union {
		struct compat_msghdr __user	*umsg_compat;
		struct user_msghdr __user	*umsg;
		void __user			*buf;
	};
	int		len;
	unsigned	done_io;
	unsigned	msg_flags;
	unsigned	nr_multishot_loops;
	u16		flags;
	/* initialised and used only by !msg send variants */
	u16		buf_group;
	u16		buf_index;
	void __user	*msg_control;
	/* used only for send zerocopy */
	struct io_kiocb	*notif;
};

/*
 * Number of times we'll try and do receives if there's more data. If we
 * exceed this limit, then add us to the back of the queue and retry from
 * there. This helps fairness between flooding clients.
 */
#define MULTISHOT_MAX_RETRY	32
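
/*
 * shutdown(2) is never issued inline: prep rejects any SQE field other
 * than ->len (the "how" argument) and forces async issue, so io_shutdown()
 * below always runs from the blocking path.
 */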
int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);

	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
		     sqe->buf_index || sqe->splice_fd_in))
		return -EINVAL;

	shutdown->how = READ_ONCE(sqe->len);
	req->flags |= REQ_F_FORCE_ASYNC;
	return 0;
}

int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
	struct socket *sock;
	int ret;

	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_shutdown_sock(sock, shutdown->how);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static bool io_net_retry(struct socket *sock, int flags)
{
	if (!(flags & MSG_WAITALL))
		return false;
	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}

static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
{
	if (kmsg->free_iov) {
		kfree(kmsg->free_iov);
		kmsg->free_iov_nr = 0;
		kmsg->free_iov = NULL;
	}
}

static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_msghdr *hdr = req->async_data;

	/* can't recycle, ensure we free the iovec if we have one */
	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
		io_netmsg_iovec_free(hdr);
		return;
	}

	/* Let normal cleanup path reap it if we fail adding to the cache */
	io_alloc_cache_kasan(&hdr->free_iov, &hdr->free_iov_nr);
	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
		req->async_data = NULL;
		req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
	}
}
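
/*
 * Allocate (or pull from the ctx netmsg cache) the async msghdr for this
 * request. A cached entry may still hold a previously allocated iovec; if
 * so, flag the request for cleanup so that iovec is freed or recycled along
 * with the rest of the async data.
 */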
static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_async_msghdr *hdr;

	hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req);
	if (!hdr)
		return NULL;

	/* If the async data was cached, we might have an iov cached inside. */
	if (hdr->free_iov)
		req->flags |= REQ_F_NEED_CLEANUP;
	return hdr;
}

/* assign new iovec to kmsg, if we need to */
static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg,
			      struct iovec *iov)
{
	if (iov) {
		req->flags |= REQ_F_NEED_CLEANUP;
		kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs;
		if (kmsg->free_iov)
			kfree(kmsg->free_iov);
		kmsg->free_iov = iov;
	}
}

static inline void io_mshot_prep_retry(struct io_kiocb *req,
				       struct io_async_msghdr *kmsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	req->flags &= ~REQ_F_BL_EMPTY;
	sr->done_io = 0;
	sr->len = 0; /* get from the provided buffer */
	req->buf_index = sr->buf_group;
}

#ifdef CONFIG_COMPAT
static int io_compat_msg_copy_hdr(struct io_kiocb *req,
				  struct io_async_msghdr *iomsg,
				  struct compat_msghdr *msg, int ddir)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct compat_iovec __user *uiov;
	struct iovec *iov;
	int ret, nr_segs;

	if (iomsg->free_iov) {
		nr_segs = iomsg->free_iov_nr;
		iov = iomsg->free_iov;
	} else {
		iov = &iomsg->fast_iov;
		nr_segs = 1;
	}

	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
		return -EFAULT;

	uiov = compat_ptr(msg->msg_iov);
	if (req->flags & REQ_F_BUFFER_SELECT) {
		compat_ssize_t clen;

		if (msg->msg_iovlen == 0) {
			sr->len = iov->iov_len = 0;
			iov->iov_base = NULL;
		} else if (msg->msg_iovlen > 1) {
			return -EINVAL;
		} else {
			if (!access_ok(uiov, sizeof(*uiov)))
				return -EFAULT;
			if (__get_user(clen, &uiov->iov_len))
				return -EFAULT;
			if (clen < 0)
				return -EINVAL;
			sr->len = clen;
		}

		return 0;
	}

	ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen,
			     nr_segs, &iov, &iomsg->msg.msg_iter, true);
	if (unlikely(ret < 0))
		return ret;

	io_net_vec_assign(req, iomsg, iov);
	return 0;
}
#endif
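
/*
 * Native (non-compat) msghdr import. The header is copied field by field
 * under user_access_begin() to amortize the cost of toggling user access
 * (STAC/CLAC on x86). For provided buffer requests, only a single iovec is
 * allowed and only its length is read here; the actual buffer is picked at
 * issue time.
 */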
static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			   struct user_msghdr *msg, int ddir)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct user_msghdr __user *umsg = sr->umsg;
	struct iovec *iov;
	int ret, nr_segs;

	if (iomsg->free_iov) {
		nr_segs = iomsg->free_iov_nr;
		iov = iomsg->free_iov;
	} else {
		iov = &iomsg->fast_iov;
		nr_segs = 1;
	}

	if (!user_access_begin(umsg, sizeof(*umsg)))
		return -EFAULT;

	ret = -EFAULT;
	unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
	unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
	unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
	unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
	unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
	unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
	msg->msg_flags = 0;

	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (msg->msg_iovlen == 0) {
			sr->len = iov->iov_len = 0;
			iov->iov_base = NULL;
		} else if (msg->msg_iovlen > 1) {
			ret = -EINVAL;
			goto ua_end;
		} else {
			struct iovec __user *uiov = msg->msg_iov;

			/* we only need the length for provided buffers */
			if (!access_ok(&uiov->iov_len, sizeof(uiov->iov_len)))
				goto ua_end;
			unsafe_get_user(iov->iov_len, &uiov->iov_len, ua_end);
			sr->len = iov->iov_len;
		}
		ret = 0;
ua_end:
		user_access_end();
		return ret;
	}

	user_access_end();
	ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs,
			     &iov, &iomsg->msg.msg_iter, false);
	if (unlikely(ret < 0))
		return ret;

	io_net_vec_assign(req, iomsg, iov);
	return 0;
}

static int io_sendmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct user_msghdr msg;
	int ret;

	iomsg->msg.msg_name = &iomsg->addr;
	iomsg->msg.msg_iter.nr_segs = 0;

#ifdef CONFIG_COMPAT
	if (unlikely(req->ctx->compat)) {
		struct compat_msghdr cmsg;

		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE);
		if (unlikely(ret))
			return ret;

		ret = __get_compat_msghdr(&iomsg->msg, &cmsg, NULL);
		sr->msg_control = iomsg->msg.msg_control_user;
		return ret;
	}
#endif

	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE);
	if (unlikely(ret))
		return ret;

	ret = __copy_msghdr(&iomsg->msg, &msg, NULL);

	/* save msg_control as sys_sendmsg() overwrites it */
	sr->msg_control = iomsg->msg.msg_control_user;
	return ret;
}

void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
{
	struct io_async_msghdr *io = req->async_data;

	io_netmsg_iovec_free(io);
}

static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	void __user *addr;
	u16 addr_len;
	int ret;

	sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));

	if (READ_ONCE(sqe->__pad3[0]))
		return -EINVAL;

	kmsg->msg.msg_name = NULL;
	kmsg->msg.msg_namelen = 0;
	kmsg->msg.msg_control = NULL;
	kmsg->msg.msg_controllen = 0;
	kmsg->msg.msg_ubuf = NULL;

	addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	addr_len = READ_ONCE(sqe->addr_len);
	if (addr) {
		ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
		if (unlikely(ret < 0))
			return ret;
		kmsg->msg.msg_name = &kmsg->addr;
		kmsg->msg.msg_namelen = addr_len;
	}
	if (!io_do_buffer_select(req)) {
		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret < 0))
			return ret;
	}
	return 0;
}

static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	int ret;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));

	ret = io_sendmsg_copy_hdr(req, kmsg);
	if (!ret)
		req->flags |= REQ_F_NEED_CLEANUP;
	return ret;
}

#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)
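
/*
 * Shared prep for IORING_OP_SEND and IORING_OP_SENDMSG. Bundles are only
 * supported for the non-msg variant and require provided buffers; a bundle
 * send also implies MSG_WAITALL, so short sends on stream sockets are
 * retried at issue time. The async msghdr is always allocated here,
 * whichever setup path is taken.
 */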
int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;

	if (req->opcode != IORING_OP_SEND) {
		if (sqe->addr2 || sqe->file_index)
			return -EINVAL;
	}

	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~SENDMSG_FLAGS)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		if (req->opcode == IORING_OP_SENDMSG)
			return -EINVAL;
		if (!(req->flags & REQ_F_BUFFER_SELECT))
			return -EINVAL;
		sr->msg_flags |= MSG_WAITALL;
		sr->buf_group = req->buf_index;
		req->buf_list = NULL;
	}

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
	if (unlikely(!io_msg_alloc_async(req)))
		return -ENOMEM;
	if (req->opcode != IORING_OP_SENDMSG)
		return io_send_setup(req, sqe);
	return io_sendmsg_setup(req, sqe);
}

static void io_req_msg_cleanup(struct io_kiocb *req,
			       unsigned int issue_flags)
{
	io_netmsg_recycle(req, issue_flags);
}

/*
 * For bundle completions, we need to figure out how many segments we consumed.
 * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
 * could be using an ITER_IOVEC. If the latter, then if we consumed all of
 * the segments, then it's a trivial question to answer. If we have residual
 * data in the iter, then loop the segments to figure out how much we
 * transferred.
 */
static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
{
	struct iovec *iov;
	int nbufs;

	/* no data is always zero segments, and a ubuf is always 1 segment */
	if (ret <= 0)
		return 0;
	if (iter_is_ubuf(&kmsg->msg.msg_iter))
		return 1;

	iov = kmsg->free_iov;
	if (!iov)
		iov = &kmsg->fast_iov;

	/* if all data was transferred, it's basic pointer math */
	if (!iov_iter_count(&kmsg->msg.msg_iter))
		return iter_iov(&kmsg->msg.msg_iter) - iov;

	/* short transfer, count segments */
	nbufs = 0;
	do {
		int this_len = min_t(int, iov[nbufs].iov_len, ret);

		nbufs++;
		ret -= this_len;
	} while (ret);

	return nbufs;
}
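
/*
 * Finish a send, or post one CQE of a bundle. For bundles, the number of
 * consumed buffers comes from io_bundle_nbufs() above: e.g. with three
 * provided buffers of 4k each and a short send of 6k, two buffers are
 * accounted as consumed. Returning false means the bundle should be
 * retried from the issue loop.
 */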
static inline bool io_send_finish(struct io_kiocb *req, int *ret,
				  struct io_async_msghdr *kmsg,
				  unsigned issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	bool bundle_finished = *ret <= 0;
	unsigned int cflags;

	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
		cflags = io_put_kbuf(req, *ret, issue_flags);
		goto finish;
	}

	cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags);

	if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
		goto finish;

	/*
	 * Fill CQE for this send and see if we should keep trying to
	 * send on this socket.
	 */
	if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
		io_mshot_prep_retry(req, kmsg);
		return false;
	}

	/* Otherwise stop bundle and use the current result. */
finish:
	io_req_set_res(req, *ret, cflags);
	*ret = IOU_OK;
	return true;
}

int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_control_user = sr->msg_control;

	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret > 0 && io_net_retry(sock, flags)) {
			kmsg->msg.msg_controllen = 0;
			kmsg->msg.msg_control = NULL;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	io_req_msg_cleanup(req, issue_flags);
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
				 struct io_async_msghdr *kmsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	int ret;
	struct buf_sel_arg arg = {
		.iovs = &kmsg->fast_iov,
		.max_len = min_not_zero(sr->len, INT_MAX),
		.nr_iovs = 1,
	};

	if (kmsg->free_iov) {
		arg.nr_iovs = kmsg->free_iov_nr;
		arg.iovs = kmsg->free_iov;
		arg.mode = KBUF_MODE_FREE;
	}

	if (!(sr->flags & IORING_RECVSEND_BUNDLE))
		arg.nr_iovs = 1;
	else
		arg.mode |= KBUF_MODE_EXPAND;

	ret = io_buffers_select(req, &arg, issue_flags);
	if (unlikely(ret < 0))
		return ret;

	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
		kmsg->free_iov_nr = ret;
		kmsg->free_iov = arg.iovs;
		req->flags |= REQ_F_NEED_CLEANUP;
	}
	sr->len = arg.out_len;

	if (ret == 1) {
		sr->buf = arg.iovs[0].iov_base;
		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret))
			return ret;
	} else {
		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
			      arg.iovs, ret, arg.out_len);
	}

	return 0;
}
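
/*
 * Issue path for IORING_OP_SEND, with or without a bundle of provided
 * buffers. Partial progress on a retryable socket is stashed in
 * sr->done_io and the request is punted with -EAGAIN; the final
 * completion folds done_io back into the result.
 */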
int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;

retry_bundle:
	if (io_do_buffer_select(req)) {
		ret = io_send_select_buffer(req, issue_flags, kmsg);
		if (ret)
			return ret;
	}

	/*
	 * If MSG_WAITALL is set, or this is a bundle send, then we need
	 * the full amount. If just bundle is set, if we do a short send
	 * then we complete the bundle sequence rather than continue on.
	 */
	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
	kmsg->msg.msg_flags = flags;
	ret = sock_sendmsg(sock, &kmsg->msg);
	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	if (!io_send_finish(req, &ret, kmsg, issue_flags))
		goto retry_bundle;

	io_req_msg_cleanup(req, issue_flags);
	return ret;
}

static int io_recvmsg_mshot_prep(struct io_kiocb *req,
				 struct io_async_msghdr *iomsg,
				 int namelen, size_t controllen)
{
	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
	    (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
		int hdr;

		if (unlikely(namelen < 0))
			return -EOVERFLOW;
		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
				       namelen, &hdr))
			return -EOVERFLOW;
		if (check_add_overflow(hdr, controllen, &hdr))
			return -EOVERFLOW;

		iomsg->namelen = namelen;
		iomsg->controllen = controllen;
		return 0;
	}

	return 0;
}

static int io_recvmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	struct user_msghdr msg;
	int ret;

	iomsg->msg.msg_name = &iomsg->addr;
	iomsg->msg.msg_iter.nr_segs = 0;

#ifdef CONFIG_COMPAT
	if (unlikely(req->ctx->compat)) {
		struct compat_msghdr cmsg;

		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST);
		if (unlikely(ret))
			return ret;

		ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr);
		if (unlikely(ret))
			return ret;

		return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen,
					     cmsg.msg_controllen);
	}
#endif

	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST);
	if (unlikely(ret))
		return ret;

	ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
	if (unlikely(ret))
		return ret;

	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
				     msg.msg_controllen);
}

static int io_recvmsg_prep_setup(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg;
	int ret;

	kmsg = io_msg_alloc_async(req);
	if (unlikely(!kmsg))
		return -ENOMEM;

	if (req->opcode == IORING_OP_RECV) {
		kmsg->msg.msg_name = NULL;
		kmsg->msg.msg_namelen = 0;
		kmsg->msg.msg_inq = 0;
		kmsg->msg.msg_control = NULL;
		kmsg->msg.msg_get_inq = 1;
		kmsg->msg.msg_controllen = 0;
		kmsg->msg.msg_iocb = NULL;
		kmsg->msg.msg_ubuf = NULL;

		if (!io_do_buffer_select(req)) {
			ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
					  &kmsg->msg.msg_iter);
			if (unlikely(ret))
				return ret;
		}
		return 0;
	}

	ret = io_recvmsg_copy_hdr(req, kmsg);
	if (!ret)
		req->flags |= REQ_F_NEED_CLEANUP;
	return ret;
}

#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
		       IORING_RECVSEND_BUNDLE)
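
/*
 * Shared prep for IORING_OP_RECV and IORING_OP_RECVMSG. Multishot receive
 * requires provided buffers and is incompatible with MSG_WAITALL, and for
 * plain RECV the length must be 0 since each retry picks a fresh buffer.
 * Bundles are only supported for the non-msg variant.
 */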
int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;

	if (unlikely(sqe->file_index || sqe->addr2))
		return -EINVAL;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~RECVMSG_FLAGS)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags);
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (sr->msg_flags & MSG_ERRQUEUE)
		req->flags |= REQ_F_CLEAR_POLLIN;
	if (req->flags & REQ_F_BUFFER_SELECT) {
		/*
		 * Store the buffer group for this multishot receive separately,
		 * as if we end up doing an io-wq based issue that selects a
		 * buffer, it has to be committed immediately and that will
		 * clear ->buf_list. This means we lose the link to the buffer
		 * list, and the eventual buffer put on completion then cannot
		 * restore it.
		 */
		sr->buf_group = req->buf_index;
		req->buf_list = NULL;
	}
	if (sr->flags & IORING_RECV_MULTISHOT) {
		if (!(req->flags & REQ_F_BUFFER_SELECT))
			return -EINVAL;
		if (sr->msg_flags & MSG_WAITALL)
			return -EINVAL;
		if (req->opcode == IORING_OP_RECV && sr->len)
			return -EINVAL;
		req->flags |= REQ_F_APOLL_MULTISHOT;
	}
	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		if (req->opcode == IORING_OP_RECVMSG)
			return -EINVAL;
	}

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
	sr->nr_multishot_loops = 0;
	return io_recvmsg_prep_setup(req);
}

/*
 * Finishes io_recv and io_recvmsg.
 *
 * Returns true if it is actually finished, or false if it should run
 * again (for multishot).
 */
static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
				  struct io_async_msghdr *kmsg,
				  bool mshot_finished, unsigned issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	unsigned int cflags = 0;

	if (kmsg->msg.msg_inq > 0)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (sr->flags & IORING_RECVSEND_BUNDLE) {
		cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
				       issue_flags);
		/* bundle with no more immediate buffers, we're done */
		if (req->flags & REQ_F_BL_EMPTY)
			goto finish;
	} else {
		cflags |= io_put_kbuf(req, *ret, issue_flags);
	}

	/*
	 * Fill CQE for this receive and see if we should keep trying to
	 * receive from this socket.
	 */
	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
	    io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
		int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;

		io_mshot_prep_retry(req, kmsg);
		/* Known not-empty or unknown state, retry */
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
				return false;
			/* mshot retries exceeded, force a requeue */
			sr->nr_multishot_loops = 0;
			mshot_retry_ret = IOU_REQUEUE;
		}
		if (issue_flags & IO_URING_F_MULTISHOT)
			*ret = mshot_retry_ret;
		else
			*ret = -EAGAIN;
		return true;
	}

	/* Finish the request / stop multishot. */
finish:
	io_req_set_res(req, *ret, cflags);

	if (issue_flags & IO_URING_F_MULTISHOT)
		*ret = IOU_STOP_MULTISHOT;
	else
		*ret = IOU_OK;
	io_req_msg_cleanup(req, issue_flags);
	return true;
}
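
/*
 * For multishot recvmsg, each provided buffer is laid out as a
 * struct io_uring_recvmsg_out header, followed by the name and control
 * data, followed by the payload. io_recvmsg_prep_multishot() carves that
 * header space out of the selected buffer before the receive is issued.
 */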
static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
				     struct io_sr_msg *sr, void __user **buf,
				     size_t *len)
{
	unsigned long ubuf = (unsigned long) *buf;
	unsigned long hdr;

	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
		kmsg->controllen;
	if (*len < hdr)
		return -EFAULT;

	if (kmsg->controllen) {
		unsigned long control = ubuf + hdr - kmsg->controllen;

		kmsg->msg.msg_control_user = (void __user *) control;
		kmsg->msg.msg_controllen = kmsg->controllen;
	}

	sr->buf = *buf; /* stash for later copy */
	*buf = (void __user *) (ubuf + hdr);
	kmsg->payloadlen = *len = *len - hdr;
	return 0;
}

struct io_recvmsg_multishot_hdr {
	struct io_uring_recvmsg_out msg;
	struct sockaddr_storage addr;
};

static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
				struct io_async_msghdr *kmsg,
				unsigned int flags, bool *finished)
{
	int err;
	int copy_len;
	struct io_recvmsg_multishot_hdr hdr;

	if (kmsg->namelen)
		kmsg->msg.msg_name = &hdr.addr;
	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
	kmsg->msg.msg_namelen = 0;

	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;

	err = sock_recvmsg(sock, &kmsg->msg, flags);
	*finished = err <= 0;
	if (err < 0)
		return err;

	hdr.msg = (struct io_uring_recvmsg_out) {
		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
	};

	hdr.msg.payloadlen = err;
	if (err > kmsg->payloadlen)
		err = kmsg->payloadlen;

	copy_len = sizeof(struct io_uring_recvmsg_out);
	if (kmsg->msg.msg_namelen > kmsg->namelen)
		copy_len += kmsg->namelen;
	else
		copy_len += kmsg->msg.msg_namelen;

	/*
	 * "fromlen shall refer to the value before truncation.."
	 *			1003.1g
	 */
	hdr.msg.namelen = kmsg->msg.msg_namelen;

	/* ensure that there is no gap between hdr and sockaddr_storage */
	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
		     sizeof(struct io_uring_recvmsg_out));
	if (copy_to_user(io->buf, &hdr, copy_len)) {
		*finished = true;
		return -EFAULT;
	}

	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
			kmsg->controllen + err;
}
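
/*
 * Issue path for IORING_OP_RECVMSG, covering both the single-shot case
 * (which funnels into __sys_recvmsg_sock()) and the multishot case above.
 * msg_inq is primed to -1 ("unknown") so io_recv_finish() can tell whether
 * more data might be pending when deciding to re-arm.
 */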
int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool mshot_finished = true;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

retry_multishot:
	if (io_do_buffer_select(req)) {
		void __user *buf;
		size_t len = sr->len;

		buf = io_buffer_select(req, &len, issue_flags);
		if (!buf)
			return -ENOBUFS;

		if (req->flags & REQ_F_APOLL_MULTISHOT) {
			ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
			if (ret) {
				io_kbuf_recycle(req, issue_flags);
				return ret;
			}
		}

		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
	}

	kmsg->msg.msg_get_inq = 1;
	kmsg->msg.msg_inq = -1;
	if (req->flags & REQ_F_APOLL_MULTISHOT) {
		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
					   &mshot_finished);
	} else {
		/* disable partial retry for recvmsg with cmsg attached */
		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
			min_ret = iov_iter_count(&kmsg->msg.msg_iter);

		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
					 kmsg->uaddr, flags);
	}

	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT) {
				io_kbuf_recycle(req, issue_flags);
				return IOU_ISSUE_SKIP_COMPLETE;
			}
			return -EAGAIN;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		req_set_fail(req);
	}

	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
		goto retry_multishot;

	return ret;
}
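
/*
 * Pick the receive buffer(s) for a plain RECV. With a bundle on a locked
 * ring, this peeks a batch of provided buffers sized against the pending
 * socket data (msg_inq) where known; otherwise a single buffer is
 * selected and mapped as an ITER_UBUF.
 */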
static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
			      size_t *len, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	int ret;

	/*
	 * If the ring isn't locked, then don't use the peek interface
	 * to grab multiple buffers as we will lock/unlock between
	 * this selection and posting the buffers.
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
	    sr->flags & IORING_RECVSEND_BUNDLE) {
		struct buf_sel_arg arg = {
			.iovs = &kmsg->fast_iov,
			.nr_iovs = 1,
			.mode = KBUF_MODE_EXPAND,
		};

		if (kmsg->free_iov) {
			arg.nr_iovs = kmsg->free_iov_nr;
			arg.iovs = kmsg->free_iov;
			arg.mode |= KBUF_MODE_FREE;
		}

		if (kmsg->msg.msg_inq > 0)
			arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);

		ret = io_buffers_peek(req, &arg);
		if (unlikely(ret < 0))
			return ret;

		/* special case 1 vec, can be a fast path */
		if (ret == 1) {
			sr->buf = arg.iovs[0].iov_base;
			sr->len = arg.iovs[0].iov_len;
			goto map_ubuf;
		}
		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
			      arg.out_len);
		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
			kmsg->free_iov_nr = ret;
			kmsg->free_iov = arg.iovs;
			req->flags |= REQ_F_NEED_CLEANUP;
		}
	} else {
		void __user *buf;

		*len = sr->len;
		buf = io_buffer_select(req, len, issue_flags);
		if (!buf)
			return -ENOBUFS;
		sr->buf = buf;
		sr->len = *len;
map_ubuf:
		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
				  &kmsg->msg.msg_iter);
		if (unlikely(ret))
			return ret;
	}

	return 0;
}

int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	size_t len = sr->len;
	bool mshot_finished;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

retry_multishot:
	if (io_do_buffer_select(req)) {
		ret = io_recv_buf_select(req, kmsg, &len, issue_flags);
		if (unlikely(ret)) {
			kmsg->msg.msg_inq = -1;
			goto out_free;
		}
		sr->buf = NULL;
	}

	kmsg->msg.msg_flags = 0;
	kmsg->msg.msg_inq = -1;

	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	ret = sock_recvmsg(sock, &kmsg->msg, flags);
	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT) {
				io_kbuf_recycle(req, issue_flags);
				return IOU_ISSUE_SKIP_COMPLETE;
			}

			return -EAGAIN;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
		req_set_fail(req);
	}

	mshot_finished = ret <= 0;
	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags))
		goto retry_multishot;

	return ret;
}
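
/*
 * Zero-copy send support follows. Each request carries a notification
 * (zc->notif) that posts a second CQE with IORING_CQE_F_NOTIF once the
 * kernel is done with the user pages; cleanup must flush a still-pending
 * notification if the request is torn down early.
 */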
void io_send_zc_cleanup(struct io_kiocb *req)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *io = req->async_data;

	if (req_has_async_data(req))
		io_netmsg_iovec_free(io);
	if (zc->notif) {
		io_notif_flush(zc->notif);
		zc->notif = NULL;
	}
}

#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
#define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)

int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *notif;

	zc->done_io = 0;
	req->flags |= REQ_F_POLL_NO_LAZY;

	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
		return -EINVAL;
	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
	if (req->flags & REQ_F_CQE_SKIP)
		return -EINVAL;

	notif = zc->notif = io_alloc_notif(ctx);
	if (!notif)
		return -ENOMEM;
	notif->cqe.user_data = req->cqe.user_data;
	notif->cqe.res = 0;
	notif->cqe.flags = IORING_CQE_F_NOTIF;
	req->flags |= REQ_F_NEED_CLEANUP;

	zc->flags = READ_ONCE(sqe->ioprio);
	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
		if (zc->flags & ~IO_ZC_FLAGS_VALID)
			return -EINVAL;
		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
			struct io_notif_data *nd = io_notif_to_data(notif);

			nd->zc_report = true;
			nd->zc_used = false;
			nd->zc_copied = false;
		}
	}

	if (req->opcode != IORING_OP_SEND_ZC) {
		if (unlikely(sqe->addr2 || sqe->file_index))
			return -EINVAL;
		if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
			return -EINVAL;
	}

	zc->len = READ_ONCE(sqe->len);
	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
	zc->buf_index = READ_ONCE(sqe->buf_index);
	if (zc->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		zc->msg_flags |= MSG_CMSG_COMPAT;
#endif
	if (unlikely(!io_msg_alloc_async(req)))
		return -ENOMEM;
	if (req->opcode != IORING_OP_SENDMSG_ZC)
		return io_send_setup(req, sqe);
	return io_sendmsg_setup(req, sqe);
}

static int io_sg_from_iter_iovec(struct sk_buff *skb,
				 struct iov_iter *from, size_t length)
{
	skb_zcopy_downgrade_managed(skb);
	return zerocopy_fill_skb_from_iter(skb, from, length);
}
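
/*
 * Fill an skb straight from a bvec iter for fixed-buffer zero-copy sends:
 * the pages are referenced in place as skb frags rather than copied. If the
 * skb runs out of frag slots before the iter is drained, -EMSGSIZE is
 * returned with the iter advanced past what did fit.
 */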
static int io_sg_from_iter(struct sk_buff *skb,
			   struct iov_iter *from, size_t length)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int frag = shinfo->nr_frags;
	int ret = 0;
	struct bvec_iter bi;
	ssize_t copied = 0;
	unsigned long truesize = 0;

	if (!frag)
		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
	else if (unlikely(!skb_zcopy_managed(skb)))
		return zerocopy_fill_skb_from_iter(skb, from, length);

	bi.bi_size = min(from->count, length);
	bi.bi_bvec_done = from->iov_offset;
	bi.bi_idx = 0;

	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);

		copied += v.bv_len;
		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
					   v.bv_offset, v.bv_len);
		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
	}
	if (bi.bi_size)
		ret = -EMSGSIZE;

	shinfo->nr_frags = frag;
	from->bvec += bi.bi_idx;
	from->nr_segs -= bi.bi_idx;
	from->count -= copied;
	from->iov_offset = bi.bi_bvec_done;

	skb->data_len += copied;
	skb->len += copied;
	skb->truesize += truesize;
	return ret;
}

static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	int ret;

	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
		struct io_ring_ctx *ctx = req->ctx;
		struct io_rsrc_node *node;

		ret = -EFAULT;
		io_ring_submit_lock(ctx, issue_flags);
		node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index);
		if (node) {
			io_req_assign_buf_node(sr->notif, node);
			ret = 0;
		}
		io_ring_submit_unlock(ctx, issue_flags);

		if (unlikely(ret))
			return ret;

		ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter,
				      node->buf, (u64)(uintptr_t)sr->buf,
				      sr->len);
		if (unlikely(ret))
			return ret;
		kmsg->msg.sg_from_iter = io_sg_from_iter;
	} else {
		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
		if (unlikely(ret))
			return ret;
		ret = io_notif_account_mem(sr->notif, sr->len);
		if (unlikely(ret))
			return ret;
		kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
	}

	return ret;
}

int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned msg_flags;
	int ret, min_ret = 0;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (!(req->flags & REQ_F_POLLED) &&
	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	if (!zc->done_io) {
		ret = io_send_zc_import(req, issue_flags);
		if (unlikely(ret))
			return ret;
	}

	msg_flags = zc->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		msg_flags |= MSG_DONTWAIT;
	if (msg_flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;

	kmsg->msg.msg_flags = msg_flags;
	kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
	ret = sock_sendmsg(sock, &kmsg->msg);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) {
			zc->len -= ret;
			zc->buf += ret;
			zc->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += zc->done_io;
	else if (zc->done_io)
		ret = zc->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
	 * flushing notif to io_send_zc_cleanup()
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(zc->notif);
		zc->notif = NULL;
		io_req_msg_cleanup(req, 0);
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_OK;
}
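
/*
 * Zero-copy variant of sendmsg. There is no fixed-buffer mode here (prep
 * rejects IORING_RECVSEND_FIXED_BUF for the msg variant), so the skb fill
 * callback is always io_sg_from_iter_iovec(), which downgrades any
 * managed-frag skb before copying the iovec data in.
 */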
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *kmsg = req->async_data;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_control_user = sr->msg_control;
	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
	kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_BL_NO_RECYCLE;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
	 * flushing notif to io_send_zc_cleanup()
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(sr->notif);
		sr->notif = NULL;
		io_req_msg_cleanup(req, 0);
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_OK;
}

void io_sendrecv_fail(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	if (sr->done_io)
		req->cqe.res = sr->done_io;

	if ((req->flags & REQ_F_NEED_CLEANUP) &&
	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
		req->cqe.flags |= IORING_CQE_F_MORE;
}

#define ACCEPT_FLAGS (IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
		      IORING_ACCEPT_POLL_FIRST)

int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);

	if (sqe->len || sqe->buf_index)
		return -EINVAL;

	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	accept->flags = READ_ONCE(sqe->accept_flags);
	accept->nofile = rlimit(RLIMIT_NOFILE);
	accept->iou_flags = READ_ONCE(sqe->ioprio);
	if (accept->iou_flags & ~ACCEPT_FLAGS)
		return -EINVAL;

	accept->file_slot = READ_ONCE(sqe->file_index);
	if (accept->file_slot) {
		if (accept->flags & SOCK_CLOEXEC)
			return -EINVAL;
		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
			return -EINVAL;
	}
	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
		req->flags |= REQ_F_APOLL_MULTISHOT;
	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	return 0;
}
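
/*
 * Issue one accept, or loop accepting connections for multishot. Each
 * accepted socket posts a CQE flagged IORING_CQE_F_MORE; the loop keeps
 * going while the backlog is known (or suspected) to be non-empty, and
 * otherwise re-arms via poll.
 */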
int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool fixed = !!accept->file_slot;
	struct proto_accept_arg arg = {
		.flags = force_nonblock ? O_NONBLOCK : 0,
	};
	struct file *file;
	unsigned cflags;
	int ret, fd;

	if (!(req->flags & REQ_F_POLLED) &&
	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
		return -EAGAIN;

retry:
	if (!fixed) {
		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	arg.err = 0;
	arg.is_empty = -1;
	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
			 accept->flags);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && force_nonblock &&
		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) {
			/*
			 * if it's multishot and polled, we don't need to
			 * return EAGAIN to arm the poll infra since it
			 * has already been done
			 */
			if (issue_flags & IO_URING_F_MULTISHOT)
				return IOU_ISSUE_SKIP_COMPLETE;
			return ret;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					  accept->file_slot);
	}

	cflags = 0;
	if (!arg.is_empty)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
		io_req_set_res(req, ret, cflags);
		return IOU_OK;
	}

	if (ret < 0)
		return ret;
	if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
			goto retry;
		if (issue_flags & IO_URING_F_MULTISHOT)
			return IOU_ISSUE_SKIP_COMPLETE;
		return -EAGAIN;
	}

	io_req_set_res(req, ret, cflags);
	return IOU_STOP_MULTISHOT;
}

int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);

	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
		return -EINVAL;

	sock->domain = READ_ONCE(sqe->fd);
	sock->type = READ_ONCE(sqe->off);
	sock->protocol = READ_ONCE(sqe->len);
	sock->file_slot = READ_ONCE(sqe->file_index);
	sock->nofile = rlimit(RLIMIT_NOFILE);

	sock->flags = sock->type & ~SOCK_TYPE_MASK;
	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
		return -EINVAL;
	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	return 0;
}

int io_socket(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
	bool fixed = !!sock->file_slot;
	struct file *file;
	int ret, fd;

	if (!fixed) {
		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					  sock->file_slot);
	}
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
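
/*
 * connect(2) prep copies the target address into the async data up front,
 * so the issue side never touches user memory. io_connect() below then
 * deals with the nonblocking quirks: -EINPROGRESS marks the connect as in
 * flight, and -ECONNABORTED is retried exactly once before being surfaced.
 */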
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_msghdr *io;

	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	conn->addr_len = READ_ONCE(sqe->addr2);
	conn->in_progress = conn->seen_econnaborted = false;

	io = io_msg_alloc_async(req);
	if (unlikely(!io))
		return -ENOMEM;

	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
}

int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_msghdr *io = req->async_data;
	unsigned file_flags;
	int ret;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	if (unlikely(req->flags & REQ_F_FAIL)) {
		ret = -ECONNRESET;
		goto out;
	}

	file_flags = force_nonblock ? O_NONBLOCK : 0;

	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
				 file_flags);
	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
	    && force_nonblock) {
		if (ret == -EINPROGRESS) {
			connect->in_progress = true;
		} else if (ret == -ECONNABORTED) {
			if (connect->seen_econnaborted)
				goto out;
			connect->seen_econnaborted = true;
		}
		return -EAGAIN;
	}
	if (connect->in_progress) {
		/*
		 * At least bluetooth will return -EBADFD on a re-connect
		 * attempt, and it's (supposedly) also valid to get -EISCONN
		 * which means the previous result is good. For both of these,
		 * grab the sock_error() and use that for the completion.
		 */
		if (ret == -EBADFD || ret == -EISCONN)
			ret = sock_error(sock_from_file(req->file)->sk);
	}
	if (ret == -ERESTARTSYS)
		ret = -EINTR;
out:
	if (ret < 0)
		req_set_fail(req);
	io_req_msg_cleanup(req, issue_flags);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
	struct sockaddr __user *uaddr;
	struct io_async_msghdr *io;

	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	bind->addr_len = READ_ONCE(sqe->addr2);

	io = io_msg_alloc_async(req);
	if (unlikely(!io))
		return -ENOMEM;
	return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
}

int io_bind(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
	struct io_async_msghdr *io = req->async_data;
	struct socket *sock;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return 0;
}

int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);

	if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
		return -EINVAL;

	listen->backlog = READ_ONCE(sqe->len);
	return 0;
}

int io_listen(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
	struct socket *sock;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_listen_socket(sock, listen->backlog);
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return 0;
}

void io_netmsg_cache_free(const void *entry)
{
	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;

	if (kmsg->free_iov)
		io_netmsg_iovec_free(kmsg);
	kfree(kmsg);
}
#endif