// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */

#ifdef _KERNEL

#include <sys/errno.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/string.h>
#include <sys/zfs_refcount.h>
#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/mman.h>

/*
 * Move "n" bytes at byte address "p"; "rw" indicates the direction
 * of the move, and the I/O parameters are provided in "uio", which is
 * updated to reflect the data which was moved. Returns 0 on success or
 * a non-zero errno on failure.
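 *
 * As an illustration (hypothetical values): with two iovecs of 512 bytes
 * each, uio_skip = 0 and n = 768, the first 512 bytes are copied to or
 * from iov[0] (depending on rw), uio_iov is advanced and uio_iovcnt
 * decremented, then the remaining 256 bytes are copied to or from iov[1],
 * leaving uio_skip = 256 with uio_resid and uio_loffset adjusted by 768.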
 */
static int
zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct iovec *iov = uio->uio_iov;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
	while (n && uio->uio_resid) {
		cnt = MIN(iov->iov_len - skip, n);
		if (rw == UIO_READ)
			memcpy(iov->iov_base + skip, p, cnt);
		else
			memcpy(p, iov->iov_base + skip, cnt);
		skip += cnt;
		if (skip == iov->iov_len) {
			skip = 0;
			uio->uio_iov = (++iov);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

static int
zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct bio_vec *bv = uio->uio_bvec;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		void *paddr;
		cnt = MIN(bv->bv_len - skip, n);

		paddr = zfs_kmap_local(bv->bv_page);
		if (rw == UIO_READ) {
			/* Copy from buffer 'p' to the bvec data */
			memcpy(paddr + bv->bv_offset + skip, p, cnt);
		} else {
			/* Copy from bvec data to buffer 'p' */
			memcpy(p, paddr + bv->bv_offset + skip, cnt);
		}
		zfs_kunmap_local(paddr);

		skip += cnt;
		if (skip == bv->bv_len) {
			skip = 0;
			uio->uio_bvec = (++bv);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

static void
zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
    struct bio_vec *bv)
{
	void *paddr;

	paddr = zfs_kmap_local(bv->bv_page);
	if (rw == UIO_READ) {
		/* Copy from buffer 'p' to the bvec data */
		memcpy(paddr + bv->bv_offset + skip, p, cnt);
	} else {
		/* Copy from bvec data to buffer 'p' */
		memcpy(p, paddr + bv->bv_offset + skip, cnt);
	}
	zfs_kunmap_local(paddr);
}

/*
 * Copy 'n' bytes of data between the buffer p[] and the data represented
 * by the request in the uio.
 */
static int
zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	struct request *rq = uio->rq;
	struct bio_vec bv;
	struct req_iterator iter;
	size_t this_seg_start;	/* logical offset */
	size_t this_seg_end;	/* logical offset */
	size_t skip_in_seg;
	size_t copy_from_seg;
	size_t orig_loffset;
	int copied = 0;

	/*
	 * Get the original logical offset of this entire request (because
	 * uio->uio_loffset will be modified over time).
	 */
	orig_loffset = io_offset(NULL, rq);
	this_seg_start = orig_loffset;

	rq_for_each_segment(bv, rq, iter) {
		/*
		 * Lookup what the logical offset of the last byte of this
		 * segment is.
		 */
		this_seg_end = this_seg_start + bv.bv_len - 1;

		/*
		 * We only need to operate on segments that have data we're
		 * copying.
		 */
		if (uio->uio_loffset >= this_seg_start &&
		    uio->uio_loffset <= this_seg_end) {
			/*
			 * Some, or all, of the data in this segment needs
			 * to be copied.
			 */

			/*
			 * We may not be copying from the first byte in the
			 * segment. Figure out how many bytes to skip copying
			 * from the beginning of this segment.
			 */
			skip_in_seg = uio->uio_loffset - this_seg_start;

			/*
			 * Calculate the total number of bytes from this
			 * segment that we will be copying.
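			 *
			 * As a hypothetical example: if this segment covers
			 * logical offsets 4096-8191, uio_loffset is 6144 and
			 * n is 1024, then skip_in_seg is 2048 and
			 * copy_from_seg is MIN(4096 - 2048, 1024) = 1024.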
			 */
			copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);

			/* Copy the bytes */
			zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
			p = ((char *)p) + copy_from_seg;

			n -= copy_from_seg;
			uio->uio_resid -= copy_from_seg;
			uio->uio_loffset += copy_from_seg;
			copied = 1;	/* We copied some data */
		}

		this_seg_start = this_seg_end + 1;
	}

	if (!copied) {
		/* Didn't copy anything */
		uio->uio_resid = 0;
	}
	return (0);
}

static int
zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->rq != NULL)
		return (zfs_uiomove_bvec_rq(p, n, rw, uio));
	return (zfs_uiomove_bvec_impl(p, n, rw, uio));
}

static int
zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
    boolean_t revert)
{
	size_t cnt = MIN(n, uio->uio_resid);

	if (uio->uio_skip)
		iov_iter_advance(uio->uio_iter, uio->uio_skip);

	if (rw == UIO_READ)
		cnt = copy_to_iter(p, cnt, uio->uio_iter);
	else
		cnt = copy_from_iter(p, cnt, uio->uio_iter);

	/*
	 * When operating on a full pipe no bytes are processed; in that
	 * case return EFAULT, which is converted to EAGAIN by the kernel's
	 * generic_file_splice_read() function.
	 */
	if (cnt == 0)
		return (EFAULT);

	/*
	 * Revert advancing the uio_iter. This is set by zfs_uiocopy()
	 * to avoid consuming the uio and its iov_iter structure.
	 */
	if (revert)
		iov_iter_revert(uio->uio_iter, cnt);

	uio->uio_resid -= cnt;
	uio->uio_loffset += cnt;

	return (0);
}

int
zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_BVEC)
		return (zfs_uiomove_bvec(p, n, rw, uio));
	else if (uio->uio_segflg == UIO_ITER)
		return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
	else
		return (zfs_uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(zfs_uiomove);

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified. Any
 * error ends this best-effort attempt to get the pages resident.
 */
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
	    (uio->uio_extflg & UIO_DIRECT)) {
		/*
		 * There is never a need to fault in kernel pages or Direct
		 * I/O write pages. Direct I/O write pages have been pinned
		 * in, so a fault can never occur for these pages.
		 */
		return (0);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_ITER);
		/*
		 * Since at least the Linux 4.18 kernel,
		 * iov_iter_fault_in_readable() can be relied on to fault in
		 * user pages when referenced.
		 */
		if (iov_iter_fault_in_readable(uio->uio_iter, n))
			return (EFAULT);
	}

	return (0);
}
EXPORT_SYMBOL(zfs_uio_prefaultpages);

/*
 * The same as zfs_uiomove(), but does not modify the uio structure.
 * Returns in cbytes how many bytes were copied.
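 *
 * This works on a stack copy of the uio, so the caller's offsets and
 * residual count are left untouched. For UIO_ITER the underlying iov_iter
 * is shared with the original uio, so zfs_uiomove_iter() is asked to
 * revert its advance as well (revert == B_TRUE). A hypothetical caller
 * peeking at the data described by a uio might do:
 *
 *	size_t copied;
 *	int err = zfs_uiocopy(buf, 512, UIO_WRITE, uio, &copied);
 *
 * after which copied holds the number of bytes transferred and uio is
 * unchanged.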
 */
int
zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
{
	zfs_uio_t uio_copy;
	int ret;

	memcpy(&uio_copy, uio, sizeof (zfs_uio_t));

	if (uio->uio_segflg == UIO_BVEC)
		ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
	else if (uio->uio_segflg == UIO_ITER)
		ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
	else
		ret = zfs_uiomove_iov(p, n, rw, &uio_copy);

	*cbytes = uio->uio_resid - uio_copy.uio_resid;

	return (ret);
}
EXPORT_SYMBOL(zfs_uiocopy);

/*
 * Drop the next n chars out of *uio.
 */
void
zfs_uioskip(zfs_uio_t *uio, size_t n)
{
	if (n > uio->uio_resid)
		return;
	/*
	 * When using a uio with a struct request, we simply
	 * use uio_loffset as a pointer to the next logical byte to
	 * copy in the request. We don't have to do any fancy
	 * accounting with uio_bvec/uio_iovcnt since we don't use
	 * them.
	 */
	if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_bvec->bv_len) {
			uio->uio_skip -= uio->uio_bvec->bv_len;
			uio->uio_bvec++;
			uio->uio_iovcnt--;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		iov_iter_advance(uio->uio_iter, n);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_iov->iov_len) {
			uio->uio_skip -= uio->uio_iov->iov_len;
			uio->uio_iov++;
			uio->uio_iovcnt--;
		}
	}

	uio->uio_loffset += n;
	uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);

/*
 * Check if the uio is page-aligned in memory.
 */
boolean_t
zfs_uio_page_aligned(zfs_uio_t *uio)
{
	boolean_t aligned = B_TRUE;

	if (uio->uio_segflg == UIO_SYSSPACE) {
		const struct iovec *iov = uio->uio_iov;
		size_t skip = uio->uio_skip;

		for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
			uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
			size_t size = iov->iov_len - skip;
			if ((addr & (PAGE_SIZE - 1)) ||
			    (size & (PAGE_SIZE - 1))) {
				aligned = B_FALSE;
				break;
			}
			skip = 0;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		unsigned long alignment =
		    iov_iter_alignment(uio->uio_iter);
		aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
	} else {
		/* Currently not supported */
		aligned = B_FALSE;
	}

	return (aligned);
}

#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
#define	ZFS_MARKED_PAGE		0x0
#define	IS_ZFS_MARKED_PAGE(_p)	0
#define	zfs_mark_page(_p)
#define	zfs_unmark_page(_p)
#define	IS_ZERO_PAGE(_p)	0

#else
/*
 * Mark pages to know if they were allocated to replace ZERO_PAGE() for
 * Direct I/O writes.
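 *
 * A marked page carries the ZFS_MARKED_PAGE magic value in its
 * page_private() field, which lets zfs_uio_free_dio_pages() tell a locally
 * allocated replacement page apart from an ordinary user page and release
 * it with __free_page() instead of put_page().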
 */
#define	ZFS_MARKED_PAGE		0x5a465350414745	/* ASCII: ZFSPAGE */
#define	IS_ZFS_MARKED_PAGE(_p) \
	(page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
#define	IS_ZERO_PAGE(_p)	((_p) == ZERO_PAGE(0))

static inline void
zfs_mark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ZFS_MARKED_PAGE);
}

static inline void
zfs_unmark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */

static void
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];
		lock_page(p);

		if (IS_ZERO_PAGE(p)) {
			/*
			 * If the user page points at the kernel's
			 * ZERO_PAGE(), a new zero-filled page is allocated
			 * in its place so the contents of the page cannot be
			 * changed by the user while a Direct I/O write is
			 * taking place.
			 */
			gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
			    __GFP_ZERO | GFP_KERNEL;

			ASSERT0(IS_ZFS_MARKED_PAGE(p));
			unlock_page(p);
			put_page(p);

			uio->uio_dio.pages[i] =
			    __page_cache_alloc(gfp_zero_page);
			zfs_mark_page(uio->uio_dio.pages[i]);
		} else {
			unlock_page(p);
		}
	}
}

void
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#endif
	} else {
		for (long i = 0; i < uio->uio_dio.npages; i++) {
			struct page *p = uio->uio_dio.pages[i];

			if (IS_ZFS_MARKED_PAGE(p)) {
				zfs_unmark_page(p);
				__free_page(p);
				continue;
			}

			put_page(p);
		}
	}

	vmem_free(uio->uio_dio.pages,
	    uio->uio_dio.npages * sizeof (struct page *));
}

#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
static int
zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	long res;
	size_t skip = uio->uio_skip;
	size_t len = uio->uio_resid - skip;
	unsigned int gup_flags = 0;
	unsigned long addr;
	unsigned long nr_pages;

	/*
	 * Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag. This flag could
	 * possibly be used here in the future to allow for P2P operations
	 * with user pages.
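	 *
	 * For a UIO_READ the data is copied into the user pages, so the
	 * pages are pinned with FOLL_WRITE to request write access; for a
	 * UIO_WRITE they are only read from and no extra flag is needed.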
	 */
	if (rw == UIO_READ)
		gup_flags = FOLL_WRITE;

	if (len == 0)
		return (0);

	uio->uio_dio.pinned = B_TRUE;
#if defined(HAVE_ITER_IS_UBUF)
	if (iter_is_ubuf(uio->uio_iter)) {
		nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
		addr = (unsigned long)uio->uio_iter->ubuf + skip;
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (len != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}
		uio->uio_dio.npages += res;
		return (0);
	}
#endif
	const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter);
	for (int i = 0; i < uio->uio_iovcnt; i++) {
		size_t amt = iovp->iov_len - skip;
		if (amt == 0) {
			iovp++;
			skip = 0;
			continue;
		}

		addr = (unsigned long)iovp->iov_base + skip;
		nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE);
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (amt != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}

		len -= amt;
		uio->uio_dio.npages += res;
		skip = 0;
		iovp++;
	}

	ASSERT0(len);

	return (0);
}
#endif

static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	size_t start;
	size_t wanted = uio->uio_resid - uio->uio_skip;
	ssize_t rollback = 0;
	ssize_t cnt;
	unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

	while (wanted) {
#if defined(HAVE_IOV_ITER_GET_PAGES2)
		cnt = iov_iter_get_pages2(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#else
		cnt = iov_iter_get_pages(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#endif
		if (cnt < 0) {
			iov_iter_revert(uio->uio_iter, rollback);
			return (SET_ERROR(-cnt));
		}
		/*
		 * All Direct I/O operations must be page aligned.
		 */
		ASSERT(IS_P2ALIGNED(start, PAGE_SIZE));
		uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
		rollback += cnt;
		wanted -= cnt;
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
		/*
		 * Unlike iov_iter_get_pages2(), iov_iter_get_pages() does
		 * not advance the iov_iter on success, so advance it
		 * manually here.
		 */
		iov_iter_advance(uio->uio_iter, cnt);
#endif
	}
	ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
	iov_iter_revert(uio->uio_iter, rollback);

	return (0);
}

/*
 * This function pins user pages. In the event that the user pages are not
 * successfully pinned, an error value is returned.
 *
 * On success, 0 is returned.
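 *
 * The page array is allocated with vmem_alloc(). The pages themselves are
 * either pinned with pin_user_pages_unlocked() (for user-backed iov_iters,
 * when the kernel provides it) or obtained with iov_iter_get_pages() /
 * iov_iter_get_pages2(). On success UIO_DIRECT is set in uio_extflg, and
 * the pages are expected to be released later with zfs_uio_free_dio_pages().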
 */
int
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	int error = 0;
	long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
	size_t size = npages * sizeof (struct page *);

	if (uio->uio_segflg == UIO_ITER) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		if (zfs_user_backed_iov_iter(uio->uio_iter))
			error = zfs_uio_pin_user_pages(uio, rw);
		else
			error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#else
		error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#endif
	} else {
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT3S(uio->uio_dio.npages, >=, 0);

	if (error) {
		if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
			unpin_user_pages(uio->uio_dio.pages,
			    uio->uio_dio.npages);
#endif
		} else {
			for (long i = 0; i < uio->uio_dio.npages; i++)
				put_page(uio->uio_dio.pages[i]);
		}

		vmem_free(uio->uio_dio.pages, size);
		return (error);
	} else {
		ASSERT3S(uio->uio_dio.npages, ==, npages);
	}

	if (rw == UIO_WRITE && !uio->uio_dio.pinned)
		zfs_uio_dio_check_for_zero_page(uio);

	uio->uio_extflg |= UIO_DIRECT;

	return (0);
}

#endif /* _KERNEL */