// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */

#ifdef _KERNEL

#include <sys/errno.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/string.h>
#include <sys/zfs_refcount.h>
#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
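
/*
 * A zfs_uio_t describes an I/O request in one of three forms, selected
 * by uio_segflg: a kernel iovec array (UIO_SYSSPACE), a bio_vec array
 * or block layer request (UIO_BVEC), or an iov_iter (UIO_ITER).  The
 * helpers below move data to and from each of these representations.
 */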

/*
 * Move "n" bytes at byte address "p"; "rw" indicates the direction
 * of the move, and the I/O parameters are provided in "uio", which is
 * updated to reflect the data which was moved.  Returns 0 on success or
 * a non-zero errno on failure.
 */
static int
zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct iovec *iov = uio->uio_iov;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
	while (n && uio->uio_resid) {
		cnt = MIN(iov->iov_len - skip, n);
		if (rw == UIO_READ)
			memcpy(iov->iov_base + skip, p, cnt);
		else
			memcpy(p, iov->iov_base + skip, cnt);
		skip += cnt;
		if (skip == iov->iov_len) {
			skip = 0;
			uio->uio_iov = (++iov);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}
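
/*
 * As zfs_uiomove_iov(), but move data between "p" and a bio_vec array.
 * Each page is temporarily mapped with zfs_kmap_local() for the copy.
 */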
static int
zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct bio_vec *bv = uio->uio_bvec;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		void *paddr;
		cnt = MIN(bv->bv_len - skip, n);

		paddr = zfs_kmap_local(bv->bv_page);
		if (rw == UIO_READ) {
			/* Copy from buffer 'p' to the bvec data */
			memcpy(paddr + bv->bv_offset + skip, p, cnt);
		} else {
			/* Copy from bvec data to buffer 'p' */
			memcpy(p, paddr + bv->bv_offset + skip, cnt);
		}
		zfs_kunmap_local(paddr);

		skip += cnt;
		if (skip == bv->bv_len) {
			skip = 0;
			uio->uio_bvec = (++bv);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

static void
zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
    struct bio_vec *bv)
{
	void *paddr;

	paddr = zfs_kmap_local(bv->bv_page);
	if (rw == UIO_READ) {
		/* Copy from buffer 'p' to the bvec data */
		memcpy(paddr + bv->bv_offset + skip, p, cnt);
	} else {
		/* Copy from bvec data to buffer 'p' */
		memcpy(p, paddr + bv->bv_offset + skip, cnt);
	}
	zfs_kunmap_local(paddr);
}

/*
 * Copy 'n' bytes of data between the buffer p[] and the data represented
 * by the request in the uio.
 */
static int
zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	struct request *rq = uio->rq;
	struct bio_vec bv;
	struct req_iterator iter;
	size_t this_seg_start;	/* logical offset */
	size_t this_seg_end;	/* logical offset */
	size_t skip_in_seg;
	size_t copy_from_seg;
	size_t orig_loffset;
	int copied = 0;

	/*
	 * Get the original logical offset of this entire request (because
	 * uio->uio_loffset will be modified over time).
	 */
	orig_loffset = io_offset(NULL, rq);
	this_seg_start = orig_loffset;

	rq_for_each_segment(bv, rq, iter) {
		/*
		 * Look up the logical offset of the last byte of this
		 * segment.
		 */
		this_seg_end = this_seg_start + bv.bv_len - 1;

		/*
		 * We only need to operate on segments that have data we're
		 * copying.
		 */
		if (uio->uio_loffset >= this_seg_start &&
		    uio->uio_loffset <= this_seg_end) {
			/*
			 * Some, or all, of the data in this segment needs to
			 * be copied.
			 */

			/*
			 * We may not be copying from the first byte in the
			 * segment.  Figure out how many bytes to skip copying
			 * from the beginning of this segment.
			 */
			skip_in_seg = uio->uio_loffset - this_seg_start;

			/*
			 * Calculate the total number of bytes from this
			 * segment that we will be copying.
			 */
			copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);

			/* Copy the bytes */
			zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
			p = ((char *)p) + copy_from_seg;

			n -= copy_from_seg;
			uio->uio_resid -= copy_from_seg;
			uio->uio_loffset += copy_from_seg;
			copied = 1;	/* We copied some data */
		}

		this_seg_start = this_seg_end + 1;
	}

	if (!copied) {
		/* Didn't copy anything */
		uio->uio_resid = 0;
	}
	return (0);
}

static int
zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->rq != NULL)
		return (zfs_uiomove_bvec_rq(p, n, rw, uio));
	return (zfs_uiomove_bvec_impl(p, n, rw, uio));
}

static int
zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
    boolean_t revert)
{
	size_t cnt = MIN(n, uio->uio_resid);

	if (rw == UIO_READ)
		cnt = copy_to_iter(p, cnt, uio->uio_iter);
	else
		cnt = copy_from_iter(p, cnt, uio->uio_iter);

	/*
	 * When operating on a full pipe no bytes are processed, in which
	 * case we return EFAULT, which is converted to EAGAIN by the
	 * kernel's generic_file_splice_read() function.
	 */
	if (cnt == 0)
		return (EFAULT);

	/*
	 * Revert advancing the uio_iter.  This is set by zfs_uiocopy()
	 * to avoid consuming the uio and its iov_iter structure.
	 */
	if (revert)
		iov_iter_revert(uio->uio_iter, cnt);

	uio->uio_resid -= cnt;
	uio->uio_loffset += cnt;

	return (0);
}
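
/*
 * Move "n" bytes between "p" and the memory described by the uio,
 * dispatching on the uio segment type.  For example, a read path that
 * has staged data in a kernel buffer might (illustratively) call:
 *
 *	error = zfs_uiomove(buf, len, UIO_READ, uio);
 *
 * to copy "len" bytes out of "buf" into whatever the uio describes.
 */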
int
zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_BVEC)
		return (zfs_uiomove_bvec(p, n, rw, uio));
	else if (uio->uio_segflg == UIO_ITER)
		return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
	else
		return (zfs_uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(zfs_uiomove);

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified.  Any
 * error terminates the operation, as this is only a best-effort attempt
 * to get the pages resident.
 */
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
	    (uio->uio_extflg & UIO_DIRECT)) {
		/*
		 * There's never a need to fault in kernel pages or Direct I/O
		 * write pages.  Direct I/O write pages have been pinned, so
		 * a fault can never occur for those pages.
		 */
		return (0);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_ITER);
		/*
		 * Since at least the Linux 4.18 kernel,
		 * iov_iter_fault_in_readable() can be relied on to fault in
		 * user pages when referenced.
		 */
		if (iov_iter_fault_in_readable(uio->uio_iter, n))
			return (EFAULT);
	}

	return (0);
}
EXPORT_SYMBOL(zfs_uio_prefaultpages);

/*
 * The same as zfs_uiomove() but doesn't modify the uio structure.
 * Returns in cbytes how many bytes were copied.
 */
int
zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
{
	zfs_uio_t uio_copy;
	int ret;

	memcpy(&uio_copy, uio, sizeof (zfs_uio_t));

	if (uio->uio_segflg == UIO_BVEC)
		ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
	else if (uio->uio_segflg == UIO_ITER)
		ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
	else
		ret = zfs_uiomove_iov(p, n, rw, &uio_copy);

	*cbytes = uio->uio_resid - uio_copy.uio_resid;

	return (ret);
}
EXPORT_SYMBOL(zfs_uiocopy);

/*
 * Drop the next n chars out of *uio.
 */
void
zfs_uioskip(zfs_uio_t *uio, size_t n)
{
	if (n > uio->uio_resid)
		return;
	/*
	 * When using a uio with a struct request, we simply
	 * use uio_loffset as a pointer to the next logical byte to
	 * copy in the request.  We don't have to do any fancy
	 * accounting with uio_bvec/uio_iovcnt since we don't use
	 * them.
	 */
	if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_bvec->bv_len) {
			uio->uio_skip -= uio->uio_bvec->bv_len;
			uio->uio_bvec++;
			uio->uio_iovcnt--;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		iov_iter_advance(uio->uio_iter, n);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_iov->iov_len) {
			uio->uio_skip -= uio->uio_iov->iov_len;
			uio->uio_iov++;
			uio->uio_iovcnt--;
		}
	}

	uio->uio_loffset += n;
	uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);

/*
 * Check if the uio is page-aligned in memory.
 */
boolean_t
zfs_uio_page_aligned(zfs_uio_t *uio)
{
	boolean_t aligned = B_TRUE;

	if (uio->uio_segflg == UIO_SYSSPACE) {
		const struct iovec *iov = uio->uio_iov;
		size_t skip = uio->uio_skip;

		for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
			uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
			size_t size = iov->iov_len - skip;
			if ((addr & (PAGE_SIZE - 1)) ||
			    (size & (PAGE_SIZE - 1))) {
				aligned = B_FALSE;
				break;
			}
			skip = 0;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		unsigned long alignment =
		    iov_iter_alignment(uio->uio_iter);
		aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
	} else {
		/* Currently not supported */
		aligned = B_FALSE;
	}

	return (aligned);
}

#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
#define	ZFS_MARKED_PAGE		0x0
#define	IS_ZFS_MARKED_PAGE(_p)	0
#define	zfs_mark_page(_p)
#define	zfs_unmark_page(_p)
#define	IS_ZERO_PAGE(_p)	0

#else
/*
 * Mark pages to know if they were allocated to replace ZERO_PAGE() for
 * Direct I/O writes.
 */
#define	ZFS_MARKED_PAGE		0x5a465350414745 /* ASCII: ZFSPAGE */
#define	IS_ZFS_MARKED_PAGE(_p) \
	(page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
#define	IS_ZERO_PAGE(_p)	((_p) == ZERO_PAGE(0))

static inline void
zfs_mark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ZFS_MARKED_PAGE);
}

static inline void
zfs_unmark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */
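
/*
 * Scan the Direct I/O write pages held in uio->uio_dio.pages and replace
 * any that map the kernel's shared ZERO_PAGE() with a newly allocated,
 * marked zero page, so the page contents cannot change while the write
 * is in flight.
 */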
static void
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];
		lock_page(p);

		if (IS_ZERO_PAGE(p)) {
			/*
			 * If the user page points to the kernel's ZERO_PAGE(),
			 * a new zero-filled page is allocated so the contents
			 * of the page cannot be changed by the user while a
			 * Direct I/O write is taking place.
			 */
			gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
			    __GFP_ZERO | GFP_KERNEL;

			ASSERT0(IS_ZFS_MARKED_PAGE(p));
			unlock_page(p);
			put_page(p);

			uio->uio_dio.pages[i] =
			    __page_cache_alloc(gfp_zero_page);
			zfs_mark_page(uio->uio_dio.pages[i]);
		} else {
			unlock_page(p);
		}
	}
}

void
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#endif
	} else {
		for (long i = 0; i < uio->uio_dio.npages; i++) {
			struct page *p = uio->uio_dio.pages[i];

			if (IS_ZFS_MARKED_PAGE(p)) {
				zfs_unmark_page(p);
				__free_page(p);
				continue;
			}

			put_page(p);
		}
	}

	vmem_free(uio->uio_dio.pages,
	    uio->uio_dio.npages * sizeof (struct page *));
}

#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
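/*
 * Pin the user pages backing a UIO_ITER uio with pin_user_pages_unlocked(),
 * storing them in uio->uio_dio.pages and updating uio->uio_dio.npages.
 * Returns 0 on success, or an errno if the full range could not be pinned.
 */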
static int
zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	long res;
	size_t skip = uio->uio_iter->iov_offset;
	size_t len = uio->uio_resid - skip;
	unsigned int gup_flags = 0;
	unsigned long addr;
	unsigned long nr_pages;

	ASSERT3U(uio->uio_segflg, ==, UIO_ITER);

	/*
	 * Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag. This flag could
	 * possibly be used here in the future to allow for P2P operations with
	 * user pages.
	 */
	if (rw == UIO_READ)
		gup_flags = FOLL_WRITE;

	if (len == 0)
		return (0);

	uio->uio_dio.pinned = B_TRUE;
#if defined(HAVE_ITER_IS_UBUF)
	if (iter_is_ubuf(uio->uio_iter)) {
		nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
		addr = (unsigned long)uio->uio_iter->ubuf + skip;
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (len != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}
		uio->uio_dio.npages += res;
		return (0);
	}
#endif
	const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter);
	for (int i = 0; i < uio->uio_iovcnt; i++) {
		size_t amt = iovp->iov_len - skip;
		if (amt == 0) {
			iovp++;
			skip = 0;
			continue;
		}

		addr = (unsigned long)iovp->iov_base + skip;
		nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE);
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (amt != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}

		len -= amt;
		uio->uio_dio.npages += res;
		skip = 0;
		iovp++;
	}

	ASSERT0(len);

	return (0);
}
#endif
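
/*
 * Take references on the pages of an iov_iter-backed uio using
 * iov_iter_get_pages2() (or iov_iter_get_pages() on older kernels),
 * recording them in uio->uio_dio.pages.  The iov_iter is reverted before
 * returning so the caller's view of the uio is left unchanged.
 */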
static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	size_t start;
	size_t wanted = uio->uio_resid;
	ssize_t rollback = 0;
	ssize_t cnt;
	unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

	while (wanted) {
#if defined(HAVE_IOV_ITER_GET_PAGES2)
		cnt = iov_iter_get_pages2(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#else
		cnt = iov_iter_get_pages(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#endif
		if (cnt < 0) {
			iov_iter_revert(uio->uio_iter, rollback);
			return (SET_ERROR(-cnt));
		}
		/*
		 * All Direct I/O operations must be page aligned.
		 */
		ASSERT(IS_P2ALIGNED(start, PAGE_SIZE));
		uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
		rollback += cnt;
		wanted -= cnt;
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
		/*
		 * Unlike iov_iter_get_pages2(), iov_iter_get_pages() does not
		 * advance the iov_iter on success, so advance it manually.
		 */
		iov_iter_advance(uio->uio_iter, cnt);
#endif
	}
	ASSERT3U(rollback, ==, uio->uio_resid);
	iov_iter_revert(uio->uio_iter, rollback);

	return (0);
}

/*
 * This function pins user pages.  If the pages cannot be successfully
 * pinned, an error value is returned.
 *
 * On success, 0 is returned.
 */
int
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	int error = 0;
	long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
	size_t size = npages * sizeof (struct page *);

	if (uio->uio_segflg == UIO_ITER) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		if (zfs_user_backed_iov_iter(uio->uio_iter))
			error = zfs_uio_pin_user_pages(uio, rw);
		else
			error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#else
		error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#endif
	} else {
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT3S(uio->uio_dio.npages, >=, 0);

	if (error) {
		if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
			unpin_user_pages(uio->uio_dio.pages,
			    uio->uio_dio.npages);
#endif
		} else {
			for (long i = 0; i < uio->uio_dio.npages; i++)
				put_page(uio->uio_dio.pages[i]);
		}

		vmem_free(uio->uio_dio.pages, size);
		return (error);
	} else {
		ASSERT3S(uio->uio_dio.npages, ==, npages);
	}

	if (rw == UIO_WRITE && !uio->uio_dio.pinned)
		zfs_uio_dio_check_for_zero_page(uio);

	uio->uio_extflg |= UIO_DIRECT;

	return (0);
}

#endif /* _KERNEL */