// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>
#include <linux/iov_iter.h>

static __always_inline
size_t copy_to_user_iter(void __user *iter_to, size_t progress,
			 size_t len, void *from, void *priv2)
{
	if (should_fail_usercopy())
		return len;
	if (access_ok(iter_to, len)) {
		from += progress;
		instrument_copy_to_user(iter_to, from, len);
		len = raw_copy_to_user(iter_to, from, len);
	}
	return len;
}

static __always_inline
size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress,
				 size_t len, void *from, void *priv2)
{
	ssize_t res;

	if (should_fail_usercopy())
		return len;

	from += progress;
	res = copy_to_user_nofault(iter_to, from, len);
	return res < 0 ? len : res;
}

static __always_inline
size_t copy_from_user_iter(void __user *iter_from, size_t progress,
			   size_t len, void *to, void *priv2)
{
	size_t res = len;

	if (should_fail_usercopy())
		return len;
	if (can_do_masked_user_access()) {
		iter_from = mask_user_address(iter_from);
	} else {
		if (!access_ok(iter_from, len))
			return res;

		/*
		 * Ensure that bad access_ok() speculation will not
		 * lead to nasty side effects *after* the copy is
		 * finished:
		 */
		barrier_nospec();
	}
	to += progress;
	instrument_copy_from_user_before(to, iter_from, len);
	res = raw_copy_from_user(to, iter_from, len);
	instrument_copy_from_user_after(to, iter_from, len, res);

	return res;
}

static __always_inline
size_t memcpy_to_iter(void *iter_to, size_t progress,
		      size_t len, void *from, void *priv2)
{
	memcpy(iter_to, from + progress, len);
	return 0;
}

static __always_inline
size_t memcpy_from_iter(void *iter_from, size_t progress,
			size_t len, void *to, void *priv2)
{
	memcpy(to + progress, iter_from, len);
	return 0;
}

/*
 * fault_in_iov_iter_readable - fault in iov iterator for reading
 * @i: iterator
 * @size: maximum length
 *
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * @size. For each iovec, fault in each page that constitutes the iovec.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 *
 * Always returns 0 for non-userspace iterators.
 */
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
{
	if (iter_is_ubuf(i)) {
		size_t n = min(size, iov_iter_count(i));
		n -= fault_in_readable(i->ubuf + i->iov_offset, n);
		return size - n;
	} else if (iter_is_iovec(i)) {
		size_t count = min(size, iov_iter_count(i));
		const struct iovec *p;
		size_t skip;

		size -= count;
		for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
			size_t len = min(count, p->iov_len - skip);
			size_t ret;

			if (unlikely(!len))
				continue;
			ret = fault_in_readable(p->iov_base + skip, len);
			count -= len - ret;
			if (ret)
				break;
		}
		return count + size;
	}
	return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_readable);

/*
 * fault_in_iov_iter_writeable - fault in iov iterator for writing
 * @i: iterator
 * @size: maximum length
 *
 * Faults in the iterator using get_user_pages(), i.e., without triggering
 * hardware page faults. This is primarily useful when we already know that
 * some or all of the pages in @i aren't in memory.
 *
 * Returns the number of bytes not faulted in, like copy_to_user() and
 * copy_from_user().
 *
 * Always returns 0 for non-user-space iterators.
 */
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
{
	if (iter_is_ubuf(i)) {
		size_t n = min(size, iov_iter_count(i));
		n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n);
		return size - n;
	} else if (iter_is_iovec(i)) {
		size_t count = min(size, iov_iter_count(i));
		const struct iovec *p;
		size_t skip;

		size -= count;
		for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
			size_t len = min(count, p->iov_len - skip);
			size_t ret;

			if (unlikely(!len))
				continue;
			ret = fault_in_safe_writeable(p->iov_base + skip, len);
			count -= len - ret;
			if (ret)
				break;
		}
		return count + size;
	}
	return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_writeable);

void iov_iter_init(struct iov_iter *i, unsigned int direction,
		   const struct iovec *iov, unsigned long nr_segs,
		   size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter) {
		.iter_type = ITER_IOVEC,
		.nofault = false,
		.data_source = direction,
		.__iov = iov,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_init);

size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (WARN_ON_ONCE(i->data_source))
		return 0;
	if (user_backed_iter(i))
		might_fault();
	return iterate_and_advance(i, bytes, (void *)addr,
				   copy_to_user_iter, memcpy_to_iter);
}
EXPORT_SYMBOL(_copy_to_iter);

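/*
 * Illustrative sketch (not part of the original file): the usual way callers
 * pair copy_to_iter() with fault_in_iov_iter_writeable() when the copy must
 * run with page faults disabled (for example while holding a lock).  The
 * function name is hypothetical; only the iov_iter calls are real.
 */
static __maybe_unused ssize_t example_copy_out_nofault(const void *buf, size_t len,
							struct iov_iter *to)
{
	size_t copied;

	do {
		/* The region in which we must not take a page fault. */
		pagefault_disable();
		copied = copy_to_iter(buf, len, to);
		pagefault_enable();

		if (copied == len)
			return copied;

		/*
		 * Short copy: undo the partial advance and try to fault the
		 * destination in.  fault_in_iov_iter_writeable() returns the
		 * number of bytes *not* faulted in, so give up only if it
		 * made no progress at all.
		 */
		iov_iter_revert(to, copied);
	} while (fault_in_iov_iter_writeable(to, len) != len);

	return -EFAULT;
}
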
#ifdef CONFIG_ARCH_HAS_COPY_MC
static __always_inline
size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress,
			    size_t len, void *from, void *priv2)
{
	if (access_ok(iter_to, len)) {
		from += progress;
		instrument_copy_to_user(iter_to, from, len);
		len = copy_mc_to_user(iter_to, from, len);
	}
	return len;
}

static __always_inline
size_t memcpy_to_iter_mc(void *iter_to, size_t progress,
			 size_t len, void *from, void *priv2)
{
	return copy_mc_to_kernel(iter_to, from + progress, len);
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and typical _copy_to_iter() are:
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again. Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC and ITER_BVEC can return short copies. Compare to
 *   copy_to_iter() where only ITER_IOVEC attempts might return a short copy.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (WARN_ON_ONCE(i->data_source))
		return 0;
	if (user_backed_iter(i))
		might_fault();
	return iterate_and_advance(i, bytes, (void *)addr,
				   copy_to_user_iter_mc, memcpy_to_iter_mc);
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

static __always_inline
size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
	return iterate_and_advance(i, bytes, addr,
				   copy_from_user_iter, memcpy_from_iter);
}

size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
	if (WARN_ON_ONCE(!i->data_source))
		return 0;

	if (user_backed_iter(i))
		might_fault();
	return __copy_from_iter(addr, bytes, i);
}
EXPORT_SYMBOL(_copy_from_iter);

static __always_inline
size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress,
				   size_t len, void *to, void *priv2)
{
	return __copy_from_user_inatomic_nocache(to + progress, iter_from, len);
}

size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (WARN_ON_ONCE(!i->data_source))
		return 0;

	return iterate_and_advance(i, bytes, addr,
				   copy_from_user_iter_nocache,
				   memcpy_from_iter);
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

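/*
 * Illustrative sketch (not part of the original file): a typical write-path
 * consumer of _copy_from_iter() via the copy_from_iter() wrapper, pulling a
 * fixed-size header out of a user-supplied iterator.  struct example_hdr and
 * the function name are hypothetical.
 */
struct example_hdr {
	u32 opcode;
	u32 len;
};

static __maybe_unused int example_parse_header(struct iov_iter *from,
					       struct example_hdr *hdr)
{
	if (iov_iter_count(from) < sizeof(*hdr))
		return -EINVAL;
	/* A short copy here means the user memory was not readable. */
	if (copy_from_iter(hdr, sizeof(*hdr), from) != sizeof(*hdr))
		return -EFAULT;
	return 0;
}
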
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
static __always_inline
size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress,
				      size_t len, void *to, void *priv2)
{
	return __copy_from_user_flushcache(to + progress, iter_from, len);
}

static __always_inline
size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress,
				   size_t len, void *to, void *priv2)
{
	memcpy_flushcache(to + progress, iter_from, len);
	return 0;
}

/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache. It is differentiated from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
 * all iterator types. The _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (WARN_ON_ONCE(!i->data_source))
		return 0;

	return iterate_and_advance(i, bytes, addr,
				   copy_from_user_iter_flushcache,
				   memcpy_from_iter_flushcache);
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif

static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
	struct page *head;
	size_t v = n + offset;

	/*
	 * The general case needs to access the page order in order
	 * to compute the page size.
	 * However, we mostly deal with order-0 pages and thus can
	 * avoid a possible cache line miss for requests that fit all
	 * page orders.
	 */
	if (n <= v && v <= PAGE_SIZE)
		return true;

	head = compound_head(page);
	v += (page - head) << PAGE_SHIFT;

	if (WARN_ON(n > v || v > page_size(head)))
		return false;
	return true;
}

size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t res = 0;
	if (!page_copy_sane(page, offset, bytes))
		return 0;
	if (WARN_ON_ONCE(i->data_source))
		return 0;
	page += offset / PAGE_SIZE; // first subpage
	offset %= PAGE_SIZE;
	while (1) {
		void *kaddr = kmap_local_page(page);
		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
		n = _copy_to_iter(kaddr + offset, n, i);
		kunmap_local(kaddr);
		res += n;
		bytes -= n;
		if (!bytes || !n)
			break;
		offset += n;
		if (offset == PAGE_SIZE) {
			page++;
			offset = 0;
		}
	}
	return res;
}
EXPORT_SYMBOL(copy_page_to_iter);

size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
				 struct iov_iter *i)
{
	size_t res = 0;

	if (!page_copy_sane(page, offset, bytes))
		return 0;
	if (WARN_ON_ONCE(i->data_source))
		return 0;
	page += offset / PAGE_SIZE; // first subpage
	offset %= PAGE_SIZE;
	while (1) {
		void *kaddr = kmap_local_page(page);
		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);

		n = iterate_and_advance(i, n, kaddr + offset,
					copy_to_user_iter_nofault,
					memcpy_to_iter);
		kunmap_local(kaddr);
		res += n;
		bytes -= n;
		if (!bytes || !n)
			break;
		offset += n;
		if (offset == PAGE_SIZE) {
			page++;
			offset = 0;
		}
	}
	return res;
}
EXPORT_SYMBOL(copy_page_to_iter_nofault);

size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
			   struct iov_iter *i)
{
	size_t res = 0;
	if (!page_copy_sane(page, offset, bytes))
		return 0;
	page += offset / PAGE_SIZE; // first subpage
	offset %= PAGE_SIZE;
	while (1) {
		void *kaddr = kmap_local_page(page);
		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
		n = _copy_from_iter(kaddr + offset, n, i);
		kunmap_local(kaddr);
		res += n;
		bytes -= n;
		if (!bytes || !n)
			break;
		offset += n;
		if (offset == PAGE_SIZE) {
			page++;
			offset = 0;
		}
	}
	return res;
}
EXPORT_SYMBOL(copy_page_from_iter);

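/*
 * Illustrative sketch (not part of the original file): a read-style loop that
 * feeds page contents to an iterator with copy_page_to_iter() and stops on a
 * short copy, much as a filesystem read path would.  The pages[] array,
 * nr_pages and the function name are assumptions of the example.
 */
static __maybe_unused size_t example_send_pages(struct page **pages, size_t nr_pages,
						size_t count, struct iov_iter *to)
{
	size_t done = 0;

	for (size_t idx = 0; idx < nr_pages && count; idx++) {
		size_t n = min_t(size_t, count, PAGE_SIZE);
		size_t copied = copy_page_to_iter(pages[idx], 0, n, to);

		done += copied;
		count -= copied;
		if (copied != n)	/* faulted or iterator exhausted */
			break;
	}
	return done;
}
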
static __always_inline
size_t zero_to_user_iter(void __user *iter_to, size_t progress,
			 size_t len, void *priv, void *priv2)
{
	return clear_user(iter_to, len);
}

static __always_inline
size_t zero_to_iter(void *iter_to, size_t progress,
		    size_t len, void *priv, void *priv2)
{
	memset(iter_to, 0, len);
	return 0;
}

size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
	return iterate_and_advance(i, bytes, NULL,
				   zero_to_user_iter, zero_to_iter);
}
EXPORT_SYMBOL(iov_iter_zero);

size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset,
				   size_t bytes, struct iov_iter *i)
{
	size_t n, copied = 0;

	if (!page_copy_sane(&folio->page, offset, bytes))
		return 0;
	if (WARN_ON_ONCE(!i->data_source))
		return 0;

	do {
		char *to = kmap_local_folio(folio, offset);

		n = bytes - copied;
		if (folio_test_partial_kmap(folio) &&
		    n > PAGE_SIZE - offset_in_page(offset))
			n = PAGE_SIZE - offset_in_page(offset);

		pagefault_disable();
		n = __copy_from_iter(to, n, i);
		pagefault_enable();
		kunmap_local(to);
		copied += n;
		offset += n;
	} while (copied != bytes && n > 0);

	return copied;
}
EXPORT_SYMBOL(copy_folio_from_iter_atomic);

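/*
 * Illustrative sketch (not part of the original file): the buffered-write
 * pattern built on copy_folio_from_iter_atomic(), which itself runs with
 * page faults disabled.  A short copy is handled by faulting the source in
 * with fault_in_iov_iter_readable() and retrying.  Real callers fault in
 * before locking the folio; this sketch ignores locking, and its name is
 * hypothetical.
 */
static __maybe_unused size_t example_fill_folio(struct folio *folio,
						struct iov_iter *from)
{
	size_t offset = 0;
	size_t copied = 0;

	while (offset < folio_size(folio) && iov_iter_count(from)) {
		size_t bytes = min_t(size_t, folio_size(folio) - offset,
				     iov_iter_count(from));
		size_t n;

		n = copy_folio_from_iter_atomic(folio, offset, bytes, from);
		copied += n;
		offset += n;

		if (n != bytes) {
			/*
			 * Short copy: the source pages were not resident.
			 * Fault them in (this may sleep) and retry; give up
			 * only if nothing could be faulted in.
			 */
			if (fault_in_iov_iter_readable(from, bytes - n) ==
			    bytes - n)
				break;
		}
	}
	return copied;
}
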
static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{
	const struct bio_vec *bvec, *end;

	if (!i->count)
		return;
	i->count -= size;

	size += i->iov_offset;

	for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) {
		if (likely(size < bvec->bv_len))
			break;
		size -= bvec->bv_len;
	}
	i->iov_offset = size;
	i->nr_segs -= bvec - i->bvec;
	i->bvec = bvec;
}

static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
{
	const struct iovec *iov, *end;

	if (!i->count)
		return;
	i->count -= size;

	size += i->iov_offset; // from beginning of current segment
	for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) {
		if (likely(size < iov->iov_len))
			break;
		size -= iov->iov_len;
	}
	i->iov_offset = size;
	i->nr_segs -= iov - iter_iov(i);
	i->__iov = iov;
}

static void iov_iter_folioq_advance(struct iov_iter *i, size_t size)
{
	const struct folio_queue *folioq = i->folioq;
	unsigned int slot = i->folioq_slot;

	if (!i->count)
		return;
	i->count -= size;

	if (slot >= folioq_nr_slots(folioq)) {
		folioq = folioq->next;
		slot = 0;
	}

	size += i->iov_offset; /* From beginning of current segment. */
	do {
		size_t fsize = folioq_folio_size(folioq, slot);

		if (likely(size < fsize))
			break;
		size -= fsize;
		slot++;
		if (slot >= folioq_nr_slots(folioq) && folioq->next) {
			folioq = folioq->next;
			slot = 0;
		}
	} while (size);

	i->iov_offset = size;
	i->folioq_slot = slot;
	i->folioq = folioq;
}

void iov_iter_advance(struct iov_iter *i, size_t size)
{
	if (unlikely(i->count < size))
		size = i->count;
	if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
		i->iov_offset += size;
		i->count -= size;
	} else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
		/* iovec and kvec have identical layouts */
		iov_iter_iovec_advance(i, size);
	} else if (iov_iter_is_bvec(i)) {
		iov_iter_bvec_advance(i, size);
	} else if (iov_iter_is_folioq(i)) {
		iov_iter_folioq_advance(i, size);
	} else if (iov_iter_is_discard(i)) {
		i->count -= size;
	}
}
EXPORT_SYMBOL(iov_iter_advance);

static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll)
{
	const struct folio_queue *folioq = i->folioq;
	unsigned int slot = i->folioq_slot;

	for (;;) {
		size_t fsize;

		if (slot == 0) {
			folioq = folioq->prev;
			slot = folioq_nr_slots(folioq);
		}
		slot--;

		fsize = folioq_folio_size(folioq, slot);
		if (unroll <= fsize) {
			i->iov_offset = fsize - unroll;
			break;
		}
		unroll -= fsize;
	}

	i->folioq_slot = slot;
	i->folioq = folioq;
}

void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
	if (!unroll)
		return;
	if (WARN_ON(unroll > MAX_RW_COUNT))
		return;
	i->count += unroll;
	if (unlikely(iov_iter_is_discard(i)))
		return;
	if (unroll <= i->iov_offset) {
		i->iov_offset -= unroll;
		return;
	}
	unroll -= i->iov_offset;
	if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
		BUG(); /* We should never go beyond the start of the specified
			* range since we might then be straying into pages that
			* aren't pinned.
			*/
	} else if (iov_iter_is_bvec(i)) {
		const struct bio_vec *bvec = i->bvec;
		while (1) {
			size_t n = (--bvec)->bv_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->bvec = bvec;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	} else if (iov_iter_is_folioq(i)) {
		i->iov_offset = 0;
		iov_iter_folioq_revert(i, unroll);
	} else { /* same logics for iovec and kvec */
		const struct iovec *iov = iter_iov(i);
		while (1) {
			size_t n = (--iov)->iov_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->__iov = iov;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	}
}
EXPORT_SYMBOL(iov_iter_revert);

/*
 * Return the count of just the current iov_iter segment.
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
	if (i->nr_segs > 1) {
		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
			return min(i->count, iter_iov(i)->iov_len - i->iov_offset);
		if (iov_iter_is_bvec(i))
			return min(i->count, i->bvec->bv_len - i->iov_offset);
	}
	if (unlikely(iov_iter_is_folioq(i)))
		return !i->count ? 0 :
			umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count);
	return i->count;
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
		   const struct kvec *kvec, unsigned long nr_segs,
		   size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_KVEC,
		.data_source = direction,
		.kvec = kvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_kvec);

void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
		   const struct bio_vec *bvec, unsigned long nr_segs,
		   size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_BVEC,
		.data_source = direction,
		.bvec = bvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_bvec);

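/*
 * Illustrative sketch (not part of the original file): wrapping a kernel
 * buffer in an ITER_KVEC so that code written against iov_iter can also be
 * fed kernel memory.  Here the iterator is a destination (ITER_DEST) and is
 * filled from a page with copy_page_to_iter(); @len is assumed to be at most
 * PAGE_SIZE, and the function name is hypothetical.
 */
static __maybe_unused size_t example_page_to_kernel_buf(struct page *page,
							void *buf, size_t len)
{
	struct kvec kv = { .iov_base = buf, .iov_len = len };
	struct iov_iter to;

	/* ITER_DEST (== READ): data will be written into the kvec buffer. */
	iov_iter_kvec(&to, ITER_DEST, &kv, 1, len);
	return copy_page_to_iter(page, 0, len, &to);
}
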
/**
 * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @folioq: The starting point in the folio queue.
 * @first_slot: The first slot in the folio queue to use
 * @offset: The offset into the folio in the first slot to start at
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages. The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 */
void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
			  const struct folio_queue *folioq, unsigned int first_slot,
			  unsigned int offset, size_t count)
{
	BUG_ON(direction & ~1);
	*i = (struct iov_iter) {
		.iter_type = ITER_FOLIOQ,
		.data_source = direction,
		.folioq = folioq,
		.folioq_slot = first_slot,
		.count = count,
		.iov_offset = offset,
	};
}
EXPORT_SYMBOL(iov_iter_folio_queue);

/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages. The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
		     struct xarray *xarray, loff_t start, size_t count)
{
	BUG_ON(direction & ~1);
	*i = (struct iov_iter) {
		.iter_type = ITER_XARRAY,
		.data_source = direction,
		.xarray = xarray,
		.xarray_start = start,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_xarray);

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that just discards everything that's written to it.
 * It's only available as a READ iterator.
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
	BUG_ON(direction != READ);
	*i = (struct iov_iter){
		.iter_type = ITER_DISCARD,
		.data_source = false,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_discard);

static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
	const struct iovec *iov = iter_iov(i);
	unsigned long res = 0;
	size_t size = i->count;
	size_t skip = i->iov_offset;

	do {
		size_t len = iov->iov_len - skip;
		if (len) {
			res |= (unsigned long)iov->iov_base + skip;
			if (len > size)
				len = size;
			res |= len;
			size -= len;
		}
		iov++;
		skip = 0;
	} while (size);
	return res;
}

static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
{
	const struct bio_vec *bvec = i->bvec;
	unsigned res = 0;
	size_t size = i->count;
	unsigned skip = i->iov_offset;

	do {
		size_t len = bvec->bv_len - skip;
		res |= (unsigned long)bvec->bv_offset + skip;
		if (len > size)
			len = size;
		res |= len;
		bvec++;
		size -= len;
		skip = 0;
	} while (size);

	return res;
}

unsigned long iov_iter_alignment(const struct iov_iter *i)
{
	if (likely(iter_is_ubuf(i))) {
		size_t size = i->count;
		if (size)
			return ((unsigned long)i->ubuf + i->iov_offset) | size;
		return 0;
	}

	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_iter_alignment_iovec(i);

	if (iov_iter_is_bvec(i))
		return iov_iter_alignment_bvec(i);

	/* With both xarray and folioq types, we're dealing with whole folios. */
	if (iov_iter_is_folioq(i))
		return i->iov_offset | i->count;
	if (iov_iter_is_xarray(i))
		return (i->xarray_start + i->iov_offset) | i->count;

	return 0;
}
EXPORT_SYMBOL(iov_iter_alignment);

unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
	unsigned long res = 0;
	unsigned long v = 0;
	size_t size = i->count;
	unsigned k;

	if (iter_is_ubuf(i))
		return 0;

	if (WARN_ON(!iter_is_iovec(i)))
		return ~0U;

	for (k = 0; k < i->nr_segs; k++) {
		const struct iovec *iov = iter_iov(i) + k;
		if (iov->iov_len) {
			unsigned long base = (unsigned long)iov->iov_base;
			if (v) // if not the first one
				res |= base | v; // this start | previous end
			v = base + iov->iov_len;
			if (size <= iov->iov_len)
				break;
			size -= iov->iov_len;
		}
	}
	return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

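/*
 * Illustrative sketch (not part of the original file): how a direct-I/O path
 * typically uses iov_iter_alignment() - the OR of all segment addresses and
 * lengths - to reject requests that do not satisfy a device's alignment
 * requirement.  The function name and the blocksize parameter are assumptions
 * of the example.
 */
static __maybe_unused bool example_iter_is_aligned(const struct iov_iter *iter,
						   unsigned int blocksize)
{
	/* blocksize is expected to be a power of two. */
	return (iov_iter_alignment(iter) & (blocksize - 1)) == 0;
}
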
static int want_pages_array(struct page ***res, size_t size,
			    size_t start, unsigned int maxpages)
{
	unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE);

	if (count > maxpages)
		count = maxpages;
	WARN_ON(!count);	// caller should've prevented that
	if (!*res) {
		*res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
		if (!*res)
			return 0;
	}
	return count;
}

static ssize_t iter_folioq_get_pages(struct iov_iter *iter,
				     struct page ***ppages, size_t maxsize,
				     unsigned maxpages, size_t *_start_offset)
{
	const struct folio_queue *folioq = iter->folioq;
	struct page **pages;
	unsigned int slot = iter->folioq_slot;
	size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset;

	if (slot >= folioq_nr_slots(folioq)) {
		folioq = folioq->next;
		slot = 0;
		if (WARN_ON(iov_offset != 0))
			return -EIO;
	}

	maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages);
	if (!maxpages)
		return -ENOMEM;
	*_start_offset = iov_offset & ~PAGE_MASK;
	pages = *ppages;

	for (;;) {
		struct folio *folio = folioq_folio(folioq, slot);
		size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot);
		size_t part = PAGE_SIZE - offset % PAGE_SIZE;

		if (offset < fsize) {
			part = umin(part, umin(maxsize - extracted, fsize - offset));
			count -= part;
			iov_offset += part;
			extracted += part;

			*pages = folio_page(folio, offset / PAGE_SIZE);
			get_page(*pages);
			pages++;
			maxpages--;
		}

		if (maxpages == 0 || extracted >= maxsize)
			break;

		if (iov_offset >= fsize) {
			iov_offset = 0;
			slot++;
			if (slot == folioq_nr_slots(folioq) && folioq->next) {
				folioq = folioq->next;
				slot = 0;
			}
		}
	}

	iter->count = count;
	iter->iov_offset = iov_offset;
	iter->folioq = folioq;
	iter->folioq_slot = slot;
	return extracted;
}

static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
					  pgoff_t index, unsigned int nr_pages)
{
	XA_STATE(xas, xa, index);
	struct folio *folio;
	unsigned int ret = 0;

	rcu_read_lock();
	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
		if (xas_retry(&xas, folio))
			continue;

		/* Has the folio moved or been split? */
		if (unlikely(folio != xas_reload(&xas))) {
			xas_reset(&xas);
			continue;
		}

		pages[ret] = folio_file_page(folio, xas.xa_index);
		folio_get(folio);
		if (++ret == nr_pages)
			break;
	}
	rcu_read_unlock();
	return ret;
}

static ssize_t iter_xarray_get_pages(struct iov_iter *i,
				     struct page ***pages, size_t maxsize,
				     unsigned maxpages, size_t *_start_offset)
{
	unsigned nr, offset, count;
	pgoff_t index;
	loff_t pos;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	count = want_pages_array(pages, maxsize, offset, maxpages);
	if (!count)
		return -ENOMEM;
	nr = iter_xarray_populate_pages(*pages, i->xarray, index, count);
	if (nr == 0)
		return 0;

	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
	i->iov_offset += maxsize;
	i->count -= maxsize;
	return maxsize;
}

/* must be done on non-empty ITER_UBUF or ITER_IOVEC one */
static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
{
	size_t skip;
	long k;

	if (iter_is_ubuf(i))
		return (unsigned long)i->ubuf + i->iov_offset;

	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
		const struct iovec *iov = iter_iov(i) + k;
		size_t len = iov->iov_len - skip;

		if (unlikely(!len))
			continue;
		if (*size > len)
			*size = len;
		return (unsigned long)iov->iov_base + skip;
	}
	BUG(); // if it had been empty, we wouldn't get called
}

/* must be done on non-empty ITER_BVEC one */
static struct page *first_bvec_segment(const struct iov_iter *i,
				       size_t *size, size_t *start)
{
	struct page *page;
	size_t skip = i->iov_offset, len;

	len = i->bvec->bv_len - skip;
	if (*size > len)
		*size = len;
	skip += i->bvec->bv_offset;
	page = i->bvec->bv_page + skip / PAGE_SIZE;
	*start = skip % PAGE_SIZE;
	return page;
}

static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
					  struct page ***pages, size_t maxsize,
					  unsigned int maxpages, size_t *start)
{
	unsigned int n, gup_flags = 0;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;
	if (maxsize > MAX_RW_COUNT)
		maxsize = MAX_RW_COUNT;

	if (likely(user_backed_iter(i))) {
		unsigned long addr;
		int res;

		if (iov_iter_rw(i) != WRITE)
			gup_flags |= FOLL_WRITE;
		if (i->nofault)
			gup_flags |= FOLL_NOFAULT;

		addr = first_iovec_segment(i, &maxsize);
		*start = addr % PAGE_SIZE;
		addr &= PAGE_MASK;
		n = want_pages_array(pages, maxsize, *start, maxpages);
		if (!n)
			return -ENOMEM;
		res = get_user_pages_fast(addr, n, gup_flags, *pages);
		if (unlikely(res <= 0))
			return res;
		maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start);
		iov_iter_advance(i, maxsize);
		return maxsize;
	}
	if (iov_iter_is_bvec(i)) {
		struct page **p;
		struct page *page;

		page = first_bvec_segment(i, &maxsize, start);
		n = want_pages_array(pages, maxsize, *start, maxpages);
		if (!n)
			return -ENOMEM;
		p = *pages;
		for (int k = 0; k < n; k++) {
			struct folio *folio = page_folio(page + k);
			p[k] = page + k;
			if (!folio_test_slab(folio))
				folio_get(folio);
		}
		maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
		i->count -= maxsize;
		i->iov_offset += maxsize;
		if (i->iov_offset == i->bvec->bv_len) {
			i->iov_offset = 0;
			i->bvec++;
			i->nr_segs--;
		}
		return maxsize;
	}
	if (iov_iter_is_folioq(i))
		return iter_folioq_get_pages(i, pages, maxsize, maxpages, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
	return -EFAULT;
}

ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
			    size_t maxsize, unsigned maxpages, size_t *start)
{
	if (!maxpages)
		return 0;
	BUG_ON(!pages);

	return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start);
}
EXPORT_SYMBOL(iov_iter_get_pages2);

ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
				  struct page ***pages, size_t maxsize, size_t *start)
{
	ssize_t len;

	*pages = NULL;

	len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start);
	if (len <= 0) {
		kvfree(*pages);
		*pages = NULL;
	}
	return len;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc2);

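/*
 * Illustrative sketch (not part of the original file): grabbing references to
 * the pages backing the next chunk of a user-backed iterator with
 * iov_iter_get_pages_alloc2(), then dropping them again.  Real users keep the
 * pages around for I/O before releasing them; note that the iterator has
 * already been advanced over the returned bytes, which this sketch undoes
 * because it only peeks.  The function name is hypothetical, and the cleanup
 * assumes every returned page holds a reference (true for user-backed iters).
 */
static __maybe_unused ssize_t example_peek_pages(struct iov_iter *i, size_t maxsize)
{
	struct page **pages = NULL;
	size_t offset;
	ssize_t bytes;

	bytes = iov_iter_get_pages_alloc2(i, &pages, maxsize, &offset);
	if (bytes <= 0)
		return bytes;	/* 0, -ENOMEM or -EFAULT; nothing to clean up */

	/* ... hand pages[], offset and bytes to the actual I/O here ... */

	for (size_t n = DIV_ROUND_UP(offset + bytes, PAGE_SIZE); n > 0; n--)
		put_page(pages[n - 1]);
	kvfree(pages);

	/* Undo the advance since this sketch only peeked at the pages. */
	iov_iter_revert(i, bytes);
	return bytes;
}
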
static int iov_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct iovec *p;
	int npages = 0;

	for (p = iter_iov(i); size; skip = 0, p++) {
		unsigned offs = offset_in_page(p->iov_base + skip);
		size_t len = min(p->iov_len - skip, size);

		if (len) {
			size -= len;
			npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
			if (unlikely(npages > maxpages))
				return maxpages;
		}
	}
	return npages;
}

static int bvec_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct bio_vec *p;
	int npages = 0;

	for (p = i->bvec; size; skip = 0, p++) {
		unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
		size_t len = min(p->bv_len - skip, size);

		size -= len;
		npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
		if (unlikely(npages > maxpages))
			return maxpages;
	}
	return npages;
}

int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
	if (unlikely(!i->count))
		return 0;
	if (likely(iter_is_ubuf(i))) {
		unsigned offs = offset_in_page(i->ubuf + i->iov_offset);
		int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_npages(i, maxpages);
	if (iov_iter_is_bvec(i))
		return bvec_npages(i, maxpages);
	if (iov_iter_is_folioq(i)) {
		unsigned offset = i->iov_offset % PAGE_SIZE;
		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	if (iov_iter_is_xarray(i)) {
		unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_npages);

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
	*new = *old;
	if (iov_iter_is_bvec(new))
		return new->bvec = kmemdup(new->bvec,
				    new->nr_segs * sizeof(struct bio_vec),
				    flags);
	else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
		/* iovec and kvec have identical layout */
		return new->__iov = kmemdup(new->__iov,
				   new->nr_segs * sizeof(struct iovec),
				   flags);
	return NULL;
}
EXPORT_SYMBOL(dup_iter);

static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, u32 nr_segs)
{
	const struct compat_iovec __user *uiov =
		(const struct compat_iovec __user *)uvec;
	int ret = -EFAULT;
	u32 i;

	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
		return -EFAULT;

	for (i = 0; i < nr_segs; i++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

		/* check for compat_size_t not fitting in compat_ssize_t .. */
		if (len < 0) {
			ret = -EINVAL;
			goto uaccess_end;
		}
		iov[i].iov_base = compat_ptr(buf);
		iov[i].iov_len = len;
	}

	ret = 0;
uaccess_end:
	user_access_end();
	return ret;
}

static __noclone int copy_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uiov, unsigned long nr_segs)
{
	int ret = -EFAULT;

	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
		return -EFAULT;

	do {
		void __user *buf;
		ssize_t len;

		unsafe_get_user(len, &uiov->iov_len, uaccess_end);
		unsafe_get_user(buf, &uiov->iov_base, uaccess_end);

		/* check for size_t not fitting in ssize_t .. */
		if (unlikely(len < 0)) {
			ret = -EINVAL;
			goto uaccess_end;
		}
		iov->iov_base = buf;
		iov->iov_len = len;

		uiov++; iov++;
	} while (--nr_segs);

	ret = 0;
uaccess_end:
	user_access_end();
	return ret;
}

struct iovec *iovec_from_user(const struct iovec __user *uvec,
		unsigned long nr_segs, unsigned long fast_segs,
		struct iovec *fast_iov, bool compat)
{
	struct iovec *iov = fast_iov;
	int ret;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument was
	 * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return iov;
	if (nr_segs > UIO_MAXIOV)
		return ERR_PTR(-EINVAL);
	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!iov)
			return ERR_PTR(-ENOMEM);
	}

	if (unlikely(compat))
		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
	else
		ret = copy_iovec_from_user(iov, uvec, nr_segs);
	if (ret) {
		if (iov != fast_iov)
			kfree(iov);
		return ERR_PTR(ret);
	}

	return iov;
}

/*
 * Single segment iovec supplied by the user, import it as ITER_UBUF.
 */
static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec,
				   struct iovec **iovp, struct iov_iter *i,
				   bool compat)
{
	struct iovec *iov = *iovp;
	ssize_t ret;

	*iovp = NULL;

	if (compat)
		ret = copy_compat_iovec_from_user(iov, uvec, 1);
	else
		ret = copy_iovec_from_user(iov, uvec, 1);
	if (unlikely(ret))
		return ret;

	ret = import_ubuf(type, iov->iov_base, iov->iov_len, i);
	if (unlikely(ret))
		return ret;
	return i->count;
}

ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i, bool compat)
{
	ssize_t total_len = 0;
	unsigned long seg;
	struct iovec *iov;

	if (nr_segs == 1)
		return __import_iovec_ubuf(type, uvec, iovp, i, compat);

	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
	if (IS_ERR(iov)) {
		*iovp = NULL;
		return PTR_ERR(iov);
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL if
	 * an element length is < 0 when cast to ssize_t or if the total length
	 * would overflow the ssize_t return value of the system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	for (seg = 0; seg < nr_segs; seg++) {
		ssize_t len = (ssize_t)iov[seg].iov_len;

		if (!access_ok(iov[seg].iov_base, len)) {
			if (iov != *iovp)
				kfree(iov);
			*iovp = NULL;
			return -EFAULT;
		}

		if (len > MAX_RW_COUNT - total_len) {
			len = MAX_RW_COUNT - total_len;
			iov[seg].iov_len = len;
		}
		total_len += len;
	}

	iov_iter_init(i, type, iov, nr_segs, total_len);
	if (iov == *iovp)
		*iovp = NULL;
	else
		*iovp = iov;
	return total_len;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in *@iovp.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs,
		 struct iovec **iovp, struct iov_iter *i)
{
	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
			      in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);

int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
{
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	if (unlikely(!access_ok(buf, len)))
		return -EFAULT;

	iov_iter_ubuf(i, rw, buf, len);
	return 0;
}
EXPORT_SYMBOL_GPL(import_ubuf);

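/*
 * Illustrative sketch (not part of the original file): the standard calling
 * convention for import_iovec() as used by readv/writev-style syscalls.  The
 * on-stack iovstack array is reused when it is big enough; kfree(iov) is safe
 * either way because *iovp is set to %NULL when the stack array was used.
 * The function name and the commented-out consumer are hypothetical.
 */
static __maybe_unused ssize_t example_vectored_write(const struct iovec __user *uvec,
						     unsigned long nr_segs)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t ret;

	ret = import_iovec(ITER_SOURCE, uvec, nr_segs, ARRAY_SIZE(iovstack),
			   &iov, &iter);
	if (ret < 0)
		return ret;

	/* ret is the total number of importable bytes; &iter describes them. */
	/* ret = example_do_write(&iter);   hypothetical consumer */

	kfree(iov);
	return ret;
}
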
/**
 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
 *     iov_iter_save_state() was called.
 *
 * @i: &struct iov_iter to restore
 * @state: state to restore from
 *
 * Used after iov_iter_save_state() to restore @i, if operations may have
 * advanced it.
 *
 * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
 */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
			 !iter_is_ubuf(i)) && !iov_iter_is_kvec(i))
		return;
	i->iov_offset = state->iov_offset;
	i->count = state->count;
	if (iter_is_ubuf(i))
		return;
	/*
	 * For the *vec iters, nr_segs + iov is constant - if we increment
	 * the vec, then we also decrement the nr_segs count. Hence we don't
	 * need to track both of these, just one is enough and we can deduct
	 * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
	 * size, so we can just increment the iov pointer as they are unionized.
	 * ITER_BVEC _may_ be the same size on some archs, but on others it is
	 * not. Be safe and handle it separately.
	 */
	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
	if (iov_iter_is_bvec(i))
		i->bvec -= state->nr_segs - i->nr_segs;
	else
		i->__iov -= state->nr_segs - i->nr_segs;
	i->nr_segs = state->nr_segs;
}

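/*
 * Illustrative sketch (not part of the original file): pairing
 * iov_iter_save_state() (a helper in <linux/uio.h>) with iov_iter_restore()
 * so that an operation which may consume part of the iterator can be retried
 * from the same starting point.  The function name and the commented-out
 * example_try_once() call are hypothetical.
 */
static __maybe_unused ssize_t example_retryable(struct iov_iter *i)
{
	struct iov_iter_state state;
	ssize_t ret = -EAGAIN;

	iov_iter_save_state(i, &state);

	/* ret = example_try_once(i);   hypothetical; may advance @i */

	if (ret == -EAGAIN) {
		/* Put the iterator back exactly where it started. */
		iov_iter_restore(i, &state);
	}
	return ret;
}
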
/*
 * Extract a list of contiguous pages from an ITER_FOLIOQ iterator. This does
 * not get references on the pages, nor does it get a pin on them.
 */
static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i,
					     struct page ***pages, size_t maxsize,
					     unsigned int maxpages,
					     iov_iter_extraction_t extraction_flags,
					     size_t *offset0)
{
	const struct folio_queue *folioq = i->folioq;
	struct page **p;
	unsigned int nr = 0;
	size_t extracted = 0, offset, slot = i->folioq_slot;

	if (slot >= folioq_nr_slots(folioq)) {
		folioq = folioq->next;
		slot = 0;
		if (WARN_ON(i->iov_offset != 0))
			return -EIO;
	}

	offset = i->iov_offset & ~PAGE_MASK;
	*offset0 = offset;

	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
	if (!maxpages)
		return -ENOMEM;
	p = *pages;

	for (;;) {
		struct folio *folio = folioq_folio(folioq, slot);
		size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot);
		size_t part = PAGE_SIZE - offset % PAGE_SIZE;

		if (offset < fsize) {
			part = umin(part, umin(maxsize - extracted, fsize - offset));
			i->count -= part;
			i->iov_offset += part;
			extracted += part;

			p[nr++] = folio_page(folio, offset / PAGE_SIZE);
		}

		if (nr >= maxpages || extracted >= maxsize)
			break;

		if (i->iov_offset >= fsize) {
			i->iov_offset = 0;
			slot++;
			if (slot == folioq_nr_slots(folioq) && folioq->next) {
				folioq = folioq->next;
				slot = 0;
			}
		}
	}

	i->folioq = folioq;
	i->folioq_slot = slot;
	return extracted;
}

/*
 * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not
 * get references on the pages, nor does it get a pin on them.
 */
static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
					     struct page ***pages, size_t maxsize,
					     unsigned int maxpages,
					     iov_iter_extraction_t extraction_flags,
					     size_t *offset0)
{
	struct page **p;
	struct folio *folio;
	unsigned int nr = 0, offset;
	loff_t pos = i->xarray_start + i->iov_offset;
	XA_STATE(xas, i->xarray, pos >> PAGE_SHIFT);

	offset = pos & ~PAGE_MASK;
	*offset0 = offset;

	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
	if (!maxpages)
		return -ENOMEM;
	p = *pages;

	rcu_read_lock();
	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
		if (xas_retry(&xas, folio))
			continue;

		/* Has the folio moved or been split? */
		if (unlikely(folio != xas_reload(&xas))) {
			xas_reset(&xas);
			continue;
		}

		p[nr++] = folio_file_page(folio, xas.xa_index);
		if (nr == maxpages)
			break;
	}
	rcu_read_unlock();

	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
	iov_iter_advance(i, maxsize);
	return maxsize;
}

/*
 * Extract a list of virtually contiguous pages from an ITER_BVEC iterator.
 * This does not get references on the pages, nor does it get a pin on them.
 */
static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
					   struct page ***pages, size_t maxsize,
					   unsigned int maxpages,
					   iov_iter_extraction_t extraction_flags,
					   size_t *offset0)
{
	size_t skip = i->iov_offset, size = 0;
	struct bvec_iter bi;
	int k = 0;

	if (i->nr_segs == 0)
		return 0;

	if (i->iov_offset == i->bvec->bv_len) {
		i->iov_offset = 0;
		i->nr_segs--;
		i->bvec++;
		skip = 0;
	}
	bi.bi_idx = 0;
	bi.bi_size = maxsize;
	bi.bi_bvec_done = skip;

	maxpages = want_pages_array(pages, maxsize, skip, maxpages);

	while (bi.bi_size && bi.bi_idx < i->nr_segs) {
		struct bio_vec bv = bvec_iter_bvec(i->bvec, bi);

		/*
		 * The iov_iter_extract_pages interface only allows an offset
		 * into the first page. Break out of the loop if we see an
		 * offset into subsequent pages, the caller will have to call
		 * iov_iter_extract_pages again for the remainder.
		 */
		if (k) {
			if (bv.bv_offset)
				break;
		} else {
			*offset0 = bv.bv_offset;
		}

		(*pages)[k++] = bv.bv_page;
		size += bv.bv_len;

		if (k >= maxpages)
			break;

		/*
		 * We are done when the end of the bvec doesn't align to a page
		 * boundary as that would create a hole in the returned space.
		 * The caller will handle this with another call to
		 * iov_iter_extract_pages.
		 */
		if (bv.bv_offset + bv.bv_len != PAGE_SIZE)
			break;

		bvec_iter_advance_single(i->bvec, &bi, bv.bv_len);
	}

	iov_iter_advance(i, size);
	return size;
}

/*
 * Extract a list of virtually contiguous pages from an ITER_KVEC iterator.
 * This does not get references on the pages, nor does it get a pin on them.
 */
static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i,
					   struct page ***pages, size_t maxsize,
					   unsigned int maxpages,
					   iov_iter_extraction_t extraction_flags,
					   size_t *offset0)
{
	struct page **p, *page;
	const void *kaddr;
	size_t skip = i->iov_offset, offset, len, size;
	int k;

	for (;;) {
		if (i->nr_segs == 0)
			return 0;
		size = min(maxsize, i->kvec->iov_len - skip);
		if (size)
			break;
		i->iov_offset = 0;
		i->nr_segs--;
		i->kvec++;
		skip = 0;
	}

	kaddr = i->kvec->iov_base + skip;
	offset = (unsigned long)kaddr & ~PAGE_MASK;
	*offset0 = offset;

	maxpages = want_pages_array(pages, size, offset, maxpages);
	if (!maxpages)
		return -ENOMEM;
	p = *pages;

	kaddr -= offset;
	len = offset + size;
	for (k = 0; k < maxpages; k++) {
		size_t seg = min_t(size_t, len, PAGE_SIZE);

		if (is_vmalloc_or_module_addr(kaddr))
			page = vmalloc_to_page(kaddr);
		else
			page = virt_to_page(kaddr);

		p[k] = page;
		len -= seg;
		kaddr += PAGE_SIZE;
	}

	size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
	iov_iter_advance(i, size);
	return size;
}

/*
 * Extract a list of contiguous pages from a user iterator and get a pin on
 * each of them. This should only be used if the iterator is user-backed
 * (IOVEC/UBUF).
 *
 * It does not get refs on the pages, but the pages must be unpinned by the
 * caller once the transfer is complete.
 *
 * This is safe to be used where background IO/DMA *is* going to be modifying
 * the buffer; using a pin rather than a ref forces fork() to give the
 * child a copy of the page.
 */
static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
					   struct page ***pages,
					   size_t maxsize,
					   unsigned int maxpages,
					   iov_iter_extraction_t extraction_flags,
					   size_t *offset0)
{
	unsigned long addr;
	unsigned int gup_flags = 0;
	size_t offset;
	int res;

	if (i->data_source == ITER_DEST)
		gup_flags |= FOLL_WRITE;
	if (extraction_flags & ITER_ALLOW_P2PDMA)
		gup_flags |= FOLL_PCI_P2PDMA;
	if (i->nofault)
		gup_flags |= FOLL_NOFAULT;

	addr = first_iovec_segment(i, &maxsize);
	*offset0 = offset = addr % PAGE_SIZE;
	addr &= PAGE_MASK;
	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
	if (!maxpages)
		return -ENOMEM;
	res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages);
	if (unlikely(res <= 0))
		return res;
	maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset);
	iov_iter_advance(i, maxsize);
	return maxsize;
}

/**
 * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
 * @i: The iterator to extract from
 * @pages: Where to return the list of pages
 * @maxsize: The maximum amount of iterator to extract
 * @maxpages: The maximum size of the list of pages
 * @extraction_flags: Flags to qualify request
 * @offset0: Where to return the starting offset into (*@pages)[0]
 *
 * Extract a list of contiguous pages from the current point of the iterator,
 * advancing the iterator. The maximum number of pages and the maximum amount
 * of page contents can be set.
 *
 * If *@pages is NULL, a page list will be allocated to the required size and
 * *@pages will be set to its base. If *@pages is not NULL, it will be assumed
 * that the caller allocated a page list at least @maxpages in size and this
 * will be filled in.
 *
 * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
 * be allowed on the pages extracted.
 *
 * The iov_iter_extract_will_pin() function can be used to query how cleanup
 * should be performed.
 *
 * Extra refs or pins on the pages may be obtained as follows:
 *
 * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be
 *     added to the pages, but refs will not be taken.
 *     iov_iter_extract_will_pin() will return true.
 *
 * (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the
 *     pages are merely listed; no extra refs or pins are obtained.
 *     iov_iter_extract_will_pin() will return false.
 *
 * Note also:
 *
 * (*) Use with ITER_DISCARD is not supported as that has no content.
 *
 * On success, the function sets *@pages to the new pagelist, if allocated, and
 * sets *offset0 to the offset into the first page.
 *
 * It may also return -ENOMEM and -EFAULT.
 */
ssize_t iov_iter_extract_pages(struct iov_iter *i,
			       struct page ***pages,
			       size_t maxsize,
			       unsigned int maxpages,
			       iov_iter_extraction_t extraction_flags,
			       size_t *offset0)
{
	maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT);
	if (!maxsize)
		return 0;

	if (likely(user_backed_iter(i)))
		return iov_iter_extract_user_pages(i, pages, maxsize,
						   maxpages, extraction_flags,
						   offset0);
	if (iov_iter_is_kvec(i))
		return iov_iter_extract_kvec_pages(i, pages, maxsize,
						   maxpages, extraction_flags,
						   offset0);
	if (iov_iter_is_bvec(i))
		return iov_iter_extract_bvec_pages(i, pages, maxsize,
						   maxpages, extraction_flags,
						   offset0);
	if (iov_iter_is_folioq(i))
		return iov_iter_extract_folioq_pages(i, pages, maxsize,
						     maxpages, extraction_flags,
						     offset0);
	if (iov_iter_is_xarray(i))
		return iov_iter_extract_xarray_pages(i, pages, maxsize,
						     maxpages, extraction_flags,
						     offset0);
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(iov_iter_extract_pages);

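/*
 * Illustrative sketch (not part of the original file): extracting pages for
 * DMA with iov_iter_extract_pages() and cleaning up afterwards.  Whether the
 * pages were pinned depends on the iterator type, which is what
 * iov_iter_extract_will_pin() (a helper in <linux/uio.h>) reports; only
 * pinned pages need to be released, with unpin_user_page().  The function
 * name is hypothetical.
 */
static __maybe_unused ssize_t example_extract_and_release(struct iov_iter *i,
							   size_t maxsize,
							   unsigned int maxpages)
{
	struct page **pages = NULL;
	bool need_unpin = iov_iter_extract_will_pin(i);
	size_t offset;
	ssize_t len;

	len = iov_iter_extract_pages(i, &pages, maxsize, maxpages, 0, &offset);
	if (len > 0) {
		/* ... set up and run the I/O on pages[], offset and len ... */

		if (need_unpin) {
			size_t npages = DIV_ROUND_UP(offset + len, PAGE_SIZE);

			while (npages--)
				unpin_user_page(pages[npages]);
		}
	}
	kvfree(pages);
	return len;
}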