/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
/*
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */

#ifdef _KERNEL

#include <sys/errno.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/uio_impl.h>
#include <sys/string.h>
#include <sys/zfs_refcount.h>
#include <sys/zfs_debug.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/mman.h>

/*
 * Move "n" bytes at byte address "p"; "rw" indicates the direction
 * of the move, and the I/O parameters are provided in "uio", which is
 * updated to reflect the data which was moved.  Returns 0 on success or
 * a non-zero errno on failure.
 */
static int
zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct iovec *iov = uio->uio_iov;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
	while (n && uio->uio_resid) {
		cnt = MIN(iov->iov_len - skip, n);
		if (rw == UIO_READ)
			memcpy(iov->iov_base + skip, p, cnt);
		else
			memcpy(p, iov->iov_base + skip, cnt);
		skip += cnt;
		if (skip == iov->iov_len) {
			skip = 0;
			uio->uio_iov = (++iov);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}
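/*
 * Move "n" bytes between the buffer "p" and a UIO_BVEC uio by walking
 * its bio_vec array, temporarily kmapping each page while the data for
 * that segment is copied.
 */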
static int
zfs_uiomove_bvec_impl(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	const struct bio_vec *bv = uio->uio_bvec;
	size_t skip = uio->uio_skip;
	ulong_t cnt;

	while (n && uio->uio_resid) {
		void *paddr;
		cnt = MIN(bv->bv_len - skip, n);

		paddr = zfs_kmap_local(bv->bv_page);
		if (rw == UIO_READ) {
			/* Copy from buffer 'p' to the bvec data */
			memcpy(paddr + bv->bv_offset + skip, p, cnt);
		} else {
			/* Copy from bvec data to buffer 'p' */
			memcpy(p, paddr + bv->bv_offset + skip, cnt);
		}
		zfs_kunmap_local(paddr);

		skip += cnt;
		if (skip == bv->bv_len) {
			skip = 0;
			uio->uio_bvec = (++bv);
			uio->uio_iovcnt--;
		}
		uio->uio_skip = skip;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		p = (caddr_t)p + cnt;
		n -= cnt;
	}
	return (0);
}

static void
zfs_copy_bvec(void *p, size_t skip, size_t cnt, zfs_uio_rw_t rw,
    struct bio_vec *bv)
{
	void *paddr;

	paddr = zfs_kmap_local(bv->bv_page);
	if (rw == UIO_READ) {
		/* Copy from buffer 'p' to the bvec data */
		memcpy(paddr + bv->bv_offset + skip, p, cnt);
	} else {
		/* Copy from bvec data to buffer 'p' */
		memcpy(p, paddr + bv->bv_offset + skip, cnt);
	}
	zfs_kunmap_local(paddr);
}

/*
 * Copy 'n' bytes of data between the buffer p[] and the data represented
 * by the request in the uio.
 */
static int
zfs_uiomove_bvec_rq(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	struct request *rq = uio->rq;
	struct bio_vec bv;
	struct req_iterator iter;
	size_t this_seg_start;	/* logical offset */
	size_t this_seg_end;	/* logical offset */
	size_t skip_in_seg;
	size_t copy_from_seg;
	size_t orig_loffset;
	int copied = 0;

	/*
	 * Get the original logical offset of this entire request (because
	 * uio->uio_loffset will be modified over time).
	 */
	orig_loffset = io_offset(NULL, rq);
	this_seg_start = orig_loffset;

	rq_for_each_segment(bv, rq, iter) {
		/*
		 * Look up the logical offset of the last byte of this
		 * segment.
		 */
		this_seg_end = this_seg_start + bv.bv_len - 1;

		/*
		 * We only need to operate on segments that have data we're
		 * copying.
		 */
		if (uio->uio_loffset >= this_seg_start &&
		    uio->uio_loffset <= this_seg_end) {
			/*
			 * Some, or all, of the data in this segment needs to
			 * be copied.
			 */

			/*
			 * We may not be copying from the first byte in the
			 * segment.  Figure out how many bytes to skip copying
			 * from the beginning of this segment.
			 */
			skip_in_seg = uio->uio_loffset - this_seg_start;

			/*
			 * Calculate the total number of bytes from this
			 * segment that we will be copying.
			 */
			copy_from_seg = MIN(bv.bv_len - skip_in_seg, n);

			/* Copy the bytes */
			zfs_copy_bvec(p, skip_in_seg, copy_from_seg, rw, &bv);
			p = ((char *)p) + copy_from_seg;

			n -= copy_from_seg;
			uio->uio_resid -= copy_from_seg;
			uio->uio_loffset += copy_from_seg;
			copied = 1;	/* We copied some data */
		}

		this_seg_start = this_seg_end + 1;
	}

	if (!copied) {
		/* Didn't copy anything */
		uio->uio_resid = 0;
	}
	return (0);
}

static int
zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->rq != NULL)
		return (zfs_uiomove_bvec_rq(p, n, rw, uio));
	return (zfs_uiomove_bvec_impl(p, n, rw, uio));
}
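/*
 * Move "n" bytes between the buffer "p" and a UIO_ITER uio using the
 * kernel's copy_to_iter()/copy_from_iter() interfaces.  When "revert"
 * is set, the iov_iter is rolled back after the copy so the caller's
 * iterator is left unchanged (used by zfs_uiocopy()).
 */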
static int
zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
    boolean_t revert)
{
	size_t cnt = MIN(n, uio->uio_resid);

	if (uio->uio_skip)
		iov_iter_advance(uio->uio_iter, uio->uio_skip);

	if (rw == UIO_READ)
		cnt = copy_to_iter(p, cnt, uio->uio_iter);
	else
		cnt = copy_from_iter(p, cnt, uio->uio_iter);

	/*
	 * When operating on a full pipe no bytes are processed.
	 * In which case return EFAULT which is converted to EAGAIN
	 * by the kernel's generic_file_splice_read() function.
	 */
	if (cnt == 0)
		return (EFAULT);

	/*
	 * Revert advancing the uio_iter. This is set by zfs_uiocopy()
	 * to avoid consuming the uio and its iov_iter structure.
	 */
	if (revert)
		iov_iter_revert(uio->uio_iter, cnt);

	uio->uio_resid -= cnt;
	uio->uio_loffset += cnt;

	return (0);
}

int
zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_BVEC)
		return (zfs_uiomove_bvec(p, n, rw, uio));
	else if (uio->uio_segflg == UIO_ITER)
		return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
	else
		return (zfs_uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(zfs_uiomove);

/*
 * Fault in the pages of the first n bytes specified by the uio structure.
 * 1 byte in each page is touched and the uio struct is unmodified. Any
 * error will terminate the process as this is only a best attempt to get
 * the pages resident.
 */
int
zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
{
	if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC ||
	    (uio->uio_extflg & UIO_DIRECT)) {
		/*
		 * There's never a need to fault in kernel pages or Direct I/O
		 * write pages.  Direct I/O write pages have already been
		 * pinned, so a fault can never occur on them.
		 */
		return (0);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_ITER);
		/*
		 * At least as of the Linux 4.18 kernel,
		 * iov_iter_fault_in_readable() can be relied on to fault in
		 * user pages when referenced.
		 */
		if (iov_iter_fault_in_readable(uio->uio_iter, n))
			return (EFAULT);
	}

	return (0);
}
EXPORT_SYMBOL(zfs_uio_prefaultpages);

/*
 * The same as zfs_uiomove() but doesn't modify the uio structure.
 * Returns in cbytes how many bytes were copied.
 */
int
zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
{
	zfs_uio_t uio_copy;
	int ret;

	memcpy(&uio_copy, uio, sizeof (zfs_uio_t));

	if (uio->uio_segflg == UIO_BVEC)
		ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
	else if (uio->uio_segflg == UIO_ITER)
		ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
	else
		ret = zfs_uiomove_iov(p, n, rw, &uio_copy);

	*cbytes = uio->uio_resid - uio_copy.uio_resid;

	return (ret);
}
EXPORT_SYMBOL(zfs_uiocopy);

/*
 * Drop the next n chars out of *uio.
 */
void
zfs_uioskip(zfs_uio_t *uio, size_t n)
{
	if (n > uio->uio_resid)
		return;
	/*
	 * When using a uio with a struct request, we simply
	 * use uio_loffset as a pointer to the next logical byte to
	 * copy in the request.  We don't have to do any fancy
	 * accounting with uio_bvec/uio_iovcnt since we don't use
	 * them.
	 */
	if (uio->uio_segflg == UIO_BVEC && uio->rq == NULL) {
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_bvec->bv_len) {
			uio->uio_skip -= uio->uio_bvec->bv_len;
			uio->uio_bvec++;
			uio->uio_iovcnt--;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		iov_iter_advance(uio->uio_iter, n);
	} else {
		ASSERT3S(uio->uio_segflg, ==, UIO_SYSSPACE);
		uio->uio_skip += n;
		while (uio->uio_iovcnt &&
		    uio->uio_skip >= uio->uio_iov->iov_len) {
			uio->uio_skip -= uio->uio_iov->iov_len;
			uio->uio_iov++;
			uio->uio_iovcnt--;
		}
	}

	uio->uio_loffset += n;
	uio->uio_resid -= n;
}
EXPORT_SYMBOL(zfs_uioskip);

/*
 * Check if the uio is page-aligned in memory.
 */
boolean_t
zfs_uio_page_aligned(zfs_uio_t *uio)
{
	boolean_t aligned = B_TRUE;

	if (uio->uio_segflg == UIO_SYSSPACE) {
		const struct iovec *iov = uio->uio_iov;
		size_t skip = uio->uio_skip;

		for (int i = uio->uio_iovcnt; i > 0; iov++, i--) {
			uintptr_t addr = (uintptr_t)(iov->iov_base + skip);
			size_t size = iov->iov_len - skip;
			if ((addr & (PAGE_SIZE - 1)) ||
			    (size & (PAGE_SIZE - 1))) {
				aligned = B_FALSE;
				break;
			}
			skip = 0;
		}
	} else if (uio->uio_segflg == UIO_ITER) {
		unsigned long alignment =
		    iov_iter_alignment(uio->uio_iter);
		aligned = IS_P2ALIGNED(alignment, PAGE_SIZE);
	} else {
		/* Currently not supported */
		aligned = B_FALSE;
	}

	return (aligned);
}

#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64)
#define	ZFS_MARKED_PAGE		0x0
#define	IS_ZFS_MARKED_PAGE(_p)	0
#define	zfs_mark_page(_p)
#define	zfs_unmark_page(_p)
#define	IS_ZERO_PAGE(_p)	0

#else
/*
 * Mark pages to know if they were allocated to replace ZERO_PAGE() for
 * Direct I/O writes.
 */
#define	ZFS_MARKED_PAGE		0x5a465350414745 /* ASCII: ZFSPAGE */
#define	IS_ZFS_MARKED_PAGE(_p) \
	(page_private(_p) == (unsigned long)ZFS_MARKED_PAGE)
#define	IS_ZERO_PAGE(_p)	((_p) == ZERO_PAGE(0))

static inline void
zfs_mark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ZFS_MARKED_PAGE);
}

static inline void
zfs_unmark_page(struct page *page)
{
	ASSERT3P(page, !=, NULL);
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */
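/*
 * Scan the pages taken for a Direct I/O write and replace any that map
 * the kernel's shared ZERO_PAGE() with a freshly allocated, zero-filled
 * page.  The replacement pages are marked (see zfs_mark_page()) so they
 * can be identified and freed once the I/O completes.
 */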
static void
zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio)
{
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	for (long i = 0; i < uio->uio_dio.npages; i++) {
		struct page *p = uio->uio_dio.pages[i];
		lock_page(p);

		if (IS_ZERO_PAGE(p)) {
			/*
			 * If the user page points to the kernel's ZERO_PAGE(),
			 * a new zero-filled page is allocated in its place so
			 * the contents of the page cannot be changed by the
			 * user while a Direct I/O write is taking place.
			 */
			gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO |
			    __GFP_ZERO | GFP_KERNEL;

			ASSERT0(IS_ZFS_MARKED_PAGE(p));
			unlock_page(p);
			put_page(p);

			uio->uio_dio.pages[i] =
			    __page_cache_alloc(gfp_zero_page);
			zfs_mark_page(uio->uio_dio.pages[i]);
		} else {
			unlock_page(p);
		}
	}
}
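/*
 * Release the pages backing a Direct I/O request: unpin them if they were
 * pinned with pin_user_pages_unlocked(), otherwise drop the references
 * taken on them (freeing any marked replacement zero pages), and then
 * free the page array itself.
 */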
void
zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3P(uio->uio_dio.pages, !=, NULL);

	if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		unpin_user_pages(uio->uio_dio.pages, uio->uio_dio.npages);
#endif
	} else {
		for (long i = 0; i < uio->uio_dio.npages; i++) {
			struct page *p = uio->uio_dio.pages[i];

			if (IS_ZFS_MARKED_PAGE(p)) {
				zfs_unmark_page(p);
				__free_page(p);
				continue;
			}

			put_page(p);
		}
	}

	vmem_free(uio->uio_dio.pages,
	    uio->uio_dio.npages * sizeof (struct page *));
}

#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
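/*
 * Pin the user pages backing the iov_iter with pin_user_pages_unlocked().
 * Pages are pinned with FOLL_WRITE when servicing a read, since the
 * pinned pages will be written to.  Returns 0 on success, or an errno if
 * the full range could not be pinned.
 */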
static int
zfs_uio_pin_user_pages(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	long res;
	size_t skip = uio->uio_skip;
	size_t len = uio->uio_resid - skip;
	unsigned int gup_flags = 0;
	unsigned long addr;
	unsigned long nr_pages;

	/*
	 * Kernel 6.2 introduced the FOLL_PCI_P2PDMA flag. This flag could
	 * possibly be used here in the future to allow for P2P operations
	 * with user pages.
	 */
	if (rw == UIO_READ)
		gup_flags = FOLL_WRITE;

	if (len == 0)
		return (0);

	uio->uio_dio.pinned = B_TRUE;
#if defined(HAVE_ITER_IS_UBUF)
	if (iter_is_ubuf(uio->uio_iter)) {
		nr_pages = DIV_ROUND_UP(len, PAGE_SIZE);
		addr = (unsigned long)uio->uio_iter->ubuf + skip;
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (len != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}
		uio->uio_dio.npages += res;
		return (0);
	}
#endif
	const struct iovec *iovp = zfs_uio_iter_iov(uio->uio_iter);
	for (int i = 0; i < uio->uio_iovcnt; i++) {
		size_t amt = iovp->iov_len - skip;
		if (amt == 0) {
			iovp++;
			skip = 0;
			continue;
		}

		addr = (unsigned long)iovp->iov_base + skip;
		nr_pages = DIV_ROUND_UP(amt, PAGE_SIZE);
		res = pin_user_pages_unlocked(addr, nr_pages,
		    &uio->uio_dio.pages[uio->uio_dio.npages], gup_flags);
		if (res < 0) {
			return (SET_ERROR(-res));
		} else if (amt != (res * PAGE_SIZE)) {
			uio->uio_dio.npages += res;
			return (SET_ERROR(EFAULT));
		}

		len -= amt;
		uio->uio_dio.npages += res;
		skip = 0;
		iovp++;
	}

	ASSERT0(len);

	return (0);
}
#endif
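/*
 * Grab references on the pages backing the iov_iter using
 * iov_iter_get_pages2() (or iov_iter_get_pages() on older kernels),
 * looping until the entire requested range is covered.  The iterator is
 * reverted afterwards so the caller's uio is left unmodified.
 */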
static int
zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	size_t start;
	size_t wanted = uio->uio_resid - uio->uio_skip;
	ssize_t rollback = 0;
	ssize_t cnt;
	unsigned maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE);

	while (wanted) {
#if defined(HAVE_IOV_ITER_GET_PAGES2)
		cnt = iov_iter_get_pages2(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#else
		cnt = iov_iter_get_pages(uio->uio_iter,
		    &uio->uio_dio.pages[uio->uio_dio.npages],
		    wanted, maxpages, &start);
#endif
		if (cnt < 0) {
			iov_iter_revert(uio->uio_iter, rollback);
			return (SET_ERROR(-cnt));
		}
		/*
		 * All Direct I/O operations must be page aligned.
		 */
		ASSERT(IS_P2ALIGNED(start, PAGE_SIZE));
		uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE);
		rollback += cnt;
		wanted -= cnt;
#if !defined(HAVE_IOV_ITER_GET_PAGES2)
		/*
		 * Unlike iov_iter_get_pages2(), iov_iter_get_pages() does not
		 * advance the iov_iter on success, so advance it manually.
		 */
		iov_iter_advance(uio->uio_iter, cnt);
#endif
	}
	ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip);
	iov_iter_revert(uio->uio_iter, rollback);

	return (0);
}

/*
 * This function pins user pages. In the event that the user pages were not
 * successfully pinned an error value is returned.
 *
 * On success, 0 is returned.
 */
int
zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
	int error = 0;
	long npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE);
	size_t size = npages * sizeof (struct page *);

	if (uio->uio_segflg == UIO_ITER) {
		uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP);
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
		if (zfs_user_backed_iov_iter(uio->uio_iter))
			error = zfs_uio_pin_user_pages(uio, rw);
		else
			error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#else
		error = zfs_uio_get_dio_pages_iov_iter(uio, rw);
#endif
	} else {
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT3S(uio->uio_dio.npages, >=, 0);

	if (error) {
		if (uio->uio_dio.pinned) {
#if defined(HAVE_PIN_USER_PAGES_UNLOCKED)
			unpin_user_pages(uio->uio_dio.pages,
			    uio->uio_dio.npages);
#endif
		} else {
			for (long i = 0; i < uio->uio_dio.npages; i++)
				put_page(uio->uio_dio.pages[i]);
		}

		vmem_free(uio->uio_dio.pages, size);
		return (error);
	} else {
		ASSERT3S(uio->uio_dio.npages, ==, npages);
	}

	if (rw == UIO_WRITE && !uio->uio_dio.pinned)
		zfs_uio_dio_check_for_zero_page(uio);

	uio->uio_extflg |= UIO_DIRECT;

	return (0);
}

#endif /* _KERNEL */