1 // SPDX-License-Identifier: GPL-2.0-only 2 #include <crypto/hash.h> 3 #include <linux/export.h> 4 #include <linux/bvec.h> 5 #include <linux/fault-inject-usercopy.h> 6 #include <linux/uio.h> 7 #include <linux/pagemap.h> 8 #include <linux/highmem.h> 9 #include <linux/slab.h> 10 #include <linux/vmalloc.h> 11 #include <linux/splice.h> 12 #include <linux/compat.h> 13 #include <net/checksum.h> 14 #include <linux/scatterlist.h> 15 #include <linux/instrumented.h> 16 17 #define PIPE_PARANOIA /* for now */ 18 19 /* covers iovec and kvec alike */ 20 #define iterate_iovec(i, n, base, len, off, __p, STEP) { \ 21 size_t off = 0; \ 22 size_t skip = i->iov_offset; \ 23 do { \ 24 len = min(n, __p->iov_len - skip); \ 25 if (likely(len)) { \ 26 base = __p->iov_base + skip; \ 27 len -= (STEP); \ 28 off += len; \ 29 skip += len; \ 30 n -= len; \ 31 if (skip < __p->iov_len) \ 32 break; \ 33 } \ 34 __p++; \ 35 skip = 0; \ 36 } while (n); \ 37 i->iov_offset = skip; \ 38 n = off; \ 39 } 40 41 #define iterate_bvec(i, n, base, len, off, p, STEP) { \ 42 size_t off = 0; \ 43 unsigned skip = i->iov_offset; \ 44 while (n) { \ 45 unsigned offset = p->bv_offset + skip; \ 46 unsigned left; \ 47 void *kaddr = kmap_local_page(p->bv_page + \ 48 offset / PAGE_SIZE); \ 49 base = kaddr + offset % PAGE_SIZE; \ 50 len = min(min(n, (size_t)(p->bv_len - skip)), \ 51 (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \ 52 left = (STEP); \ 53 kunmap_local(kaddr); \ 54 len -= left; \ 55 off += len; \ 56 skip += len; \ 57 if (skip == p->bv_len) { \ 58 skip = 0; \ 59 p++; \ 60 } \ 61 n -= len; \ 62 if (left) \ 63 break; \ 64 } \ 65 i->iov_offset = skip; \ 66 n = off; \ 67 } 68 69 #define iterate_xarray(i, n, base, len, __off, STEP) { \ 70 __label__ __out; \ 71 size_t __off = 0; \ 72 struct folio *folio; \ 73 loff_t start = i->xarray_start + i->iov_offset; \ 74 pgoff_t index = start / PAGE_SIZE; \ 75 XA_STATE(xas, i->xarray, index); \ 76 \ 77 len = PAGE_SIZE - offset_in_page(start); \ 78 rcu_read_lock(); \ 79 xas_for_each(&xas, folio, ULONG_MAX) { \ 80 unsigned left; \ 81 size_t offset; \ 82 if (xas_retry(&xas, folio)) \ 83 continue; \ 84 if (WARN_ON(xa_is_value(folio))) \ 85 break; \ 86 if (WARN_ON(folio_test_hugetlb(folio))) \ 87 break; \ 88 offset = offset_in_folio(folio, start + __off); \ 89 while (offset < folio_size(folio)) { \ 90 base = kmap_local_folio(folio, offset); \ 91 len = min(n, len); \ 92 left = (STEP); \ 93 kunmap_local(base); \ 94 len -= left; \ 95 __off += len; \ 96 n -= len; \ 97 if (left || n == 0) \ 98 goto __out; \ 99 offset += len; \ 100 len = PAGE_SIZE; \ 101 } \ 102 } \ 103 __out: \ 104 rcu_read_unlock(); \ 105 i->iov_offset += __off; \ 106 n = __off; \ 107 } 108 109 #define __iterate_and_advance(i, n, base, len, off, I, K) { \ 110 if (unlikely(i->count < n)) \ 111 n = i->count; \ 112 if (likely(n)) { \ 113 if (likely(iter_is_iovec(i))) { \ 114 const struct iovec *iov = i->iov; \ 115 void __user *base; \ 116 size_t len; \ 117 iterate_iovec(i, n, base, len, off, \ 118 iov, (I)) \ 119 i->nr_segs -= iov - i->iov; \ 120 i->iov = iov; \ 121 } else if (iov_iter_is_bvec(i)) { \ 122 const struct bio_vec *bvec = i->bvec; \ 123 void *base; \ 124 size_t len; \ 125 iterate_bvec(i, n, base, len, off, \ 126 bvec, (K)) \ 127 i->nr_segs -= bvec - i->bvec; \ 128 i->bvec = bvec; \ 129 } else if (iov_iter_is_kvec(i)) { \ 130 const struct kvec *kvec = i->kvec; \ 131 void *base; \ 132 size_t len; \ 133 iterate_iovec(i, n, base, len, off, \ 134 kvec, (K)) \ 135 i->nr_segs -= kvec - i->kvec; \ 136 i->kvec = kvec; \ 137 } else if 
(iov_iter_is_xarray(i)) { \ 138 void *base; \ 139 size_t len; \ 140 iterate_xarray(i, n, base, len, off, \ 141 (K)) \ 142 } \ 143 i->count -= n; \ 144 } \ 145 } 146 #define iterate_and_advance(i, n, base, len, off, I, K) \ 147 __iterate_and_advance(i, n, base, len, off, I, ((void)(K),0)) 148 149 static int copyout(void __user *to, const void *from, size_t n) 150 { 151 if (should_fail_usercopy()) 152 return n; 153 if (access_ok(to, n)) { 154 instrument_copy_to_user(to, from, n); 155 n = raw_copy_to_user(to, from, n); 156 } 157 return n; 158 } 159 160 static int copyin(void *to, const void __user *from, size_t n) 161 { 162 if (should_fail_usercopy()) 163 return n; 164 if (access_ok(from, n)) { 165 instrument_copy_from_user(to, from, n); 166 n = raw_copy_from_user(to, from, n); 167 } 168 return n; 169 } 170 171 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, 172 struct iov_iter *i) 173 { 174 size_t skip, copy, left, wanted; 175 const struct iovec *iov; 176 char __user *buf; 177 void *kaddr, *from; 178 179 if (unlikely(bytes > i->count)) 180 bytes = i->count; 181 182 if (unlikely(!bytes)) 183 return 0; 184 185 might_fault(); 186 wanted = bytes; 187 iov = i->iov; 188 skip = i->iov_offset; 189 buf = iov->iov_base + skip; 190 copy = min(bytes, iov->iov_len - skip); 191 192 if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) { 193 kaddr = kmap_atomic(page); 194 from = kaddr + offset; 195 196 /* first chunk, usually the only one */ 197 left = copyout(buf, from, copy); 198 copy -= left; 199 skip += copy; 200 from += copy; 201 bytes -= copy; 202 203 while (unlikely(!left && bytes)) { 204 iov++; 205 buf = iov->iov_base; 206 copy = min(bytes, iov->iov_len); 207 left = copyout(buf, from, copy); 208 copy -= left; 209 skip = copy; 210 from += copy; 211 bytes -= copy; 212 } 213 if (likely(!bytes)) { 214 kunmap_atomic(kaddr); 215 goto done; 216 } 217 offset = from - kaddr; 218 buf += copy; 219 kunmap_atomic(kaddr); 220 copy = min(bytes, iov->iov_len - skip); 221 } 222 /* Too bad - revert to non-atomic kmap */ 223 224 kaddr = kmap(page); 225 from = kaddr + offset; 226 left = copyout(buf, from, copy); 227 copy -= left; 228 skip += copy; 229 from += copy; 230 bytes -= copy; 231 while (unlikely(!left && bytes)) { 232 iov++; 233 buf = iov->iov_base; 234 copy = min(bytes, iov->iov_len); 235 left = copyout(buf, from, copy); 236 copy -= left; 237 skip = copy; 238 from += copy; 239 bytes -= copy; 240 } 241 kunmap(page); 242 243 done: 244 if (skip == iov->iov_len) { 245 iov++; 246 skip = 0; 247 } 248 i->count -= wanted - bytes; 249 i->nr_segs -= iov - i->iov; 250 i->iov = iov; 251 i->iov_offset = skip; 252 return wanted - bytes; 253 } 254 255 static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes, 256 struct iov_iter *i) 257 { 258 size_t skip, copy, left, wanted; 259 const struct iovec *iov; 260 char __user *buf; 261 void *kaddr, *to; 262 263 if (unlikely(bytes > i->count)) 264 bytes = i->count; 265 266 if (unlikely(!bytes)) 267 return 0; 268 269 might_fault(); 270 wanted = bytes; 271 iov = i->iov; 272 skip = i->iov_offset; 273 buf = iov->iov_base + skip; 274 copy = min(bytes, iov->iov_len - skip); 275 276 if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) { 277 kaddr = kmap_atomic(page); 278 to = kaddr + offset; 279 280 /* first chunk, usually the only one */ 281 left = copyin(to, buf, copy); 282 copy -= left; 283 skip += copy; 284 to += copy; 285 bytes -= copy; 286 287 while (unlikely(!left && bytes)) { 288 iov++; 289 
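			/*
			 * The previous iovec was fully consumed: advance to
			 * the next one and keep filling the same kmapped
			 * destination page.
			 */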
buf = iov->iov_base; 290 copy = min(bytes, iov->iov_len); 291 left = copyin(to, buf, copy); 292 copy -= left; 293 skip = copy; 294 to += copy; 295 bytes -= copy; 296 } 297 if (likely(!bytes)) { 298 kunmap_atomic(kaddr); 299 goto done; 300 } 301 offset = to - kaddr; 302 buf += copy; 303 kunmap_atomic(kaddr); 304 copy = min(bytes, iov->iov_len - skip); 305 } 306 /* Too bad - revert to non-atomic kmap */ 307 308 kaddr = kmap(page); 309 to = kaddr + offset; 310 left = copyin(to, buf, copy); 311 copy -= left; 312 skip += copy; 313 to += copy; 314 bytes -= copy; 315 while (unlikely(!left && bytes)) { 316 iov++; 317 buf = iov->iov_base; 318 copy = min(bytes, iov->iov_len); 319 left = copyin(to, buf, copy); 320 copy -= left; 321 skip = copy; 322 to += copy; 323 bytes -= copy; 324 } 325 kunmap(page); 326 327 done: 328 if (skip == iov->iov_len) { 329 iov++; 330 skip = 0; 331 } 332 i->count -= wanted - bytes; 333 i->nr_segs -= iov - i->iov; 334 i->iov = iov; 335 i->iov_offset = skip; 336 return wanted - bytes; 337 } 338 339 #ifdef PIPE_PARANOIA 340 static bool sanity(const struct iov_iter *i) 341 { 342 struct pipe_inode_info *pipe = i->pipe; 343 unsigned int p_head = pipe->head; 344 unsigned int p_tail = pipe->tail; 345 unsigned int p_mask = pipe->ring_size - 1; 346 unsigned int p_occupancy = pipe_occupancy(p_head, p_tail); 347 unsigned int i_head = i->head; 348 unsigned int idx; 349 350 if (i->iov_offset) { 351 struct pipe_buffer *p; 352 if (unlikely(p_occupancy == 0)) 353 goto Bad; // pipe must be non-empty 354 if (unlikely(i_head != p_head - 1)) 355 goto Bad; // must be at the last buffer... 356 357 p = &pipe->bufs[i_head & p_mask]; 358 if (unlikely(p->offset + p->len != i->iov_offset)) 359 goto Bad; // ... at the end of segment 360 } else { 361 if (i_head != p_head) 362 goto Bad; // must be right after the last buffer 363 } 364 return true; 365 Bad: 366 printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset); 367 printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n", 368 p_head, p_tail, pipe->ring_size); 369 for (idx = 0; idx < pipe->ring_size; idx++) 370 printk(KERN_ERR "[%p %p %d %d]\n", 371 pipe->bufs[idx].ops, 372 pipe->bufs[idx].page, 373 pipe->bufs[idx].offset, 374 pipe->bufs[idx].len); 375 WARN_ON(1); 376 return false; 377 } 378 #else 379 #define sanity(i) true 380 #endif 381 382 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, 383 struct iov_iter *i) 384 { 385 struct pipe_inode_info *pipe = i->pipe; 386 struct pipe_buffer *buf; 387 unsigned int p_tail = pipe->tail; 388 unsigned int p_mask = pipe->ring_size - 1; 389 unsigned int i_head = i->head; 390 size_t off; 391 392 if (unlikely(bytes > i->count)) 393 bytes = i->count; 394 395 if (unlikely(!bytes)) 396 return 0; 397 398 if (!sanity(i)) 399 return 0; 400 401 off = i->iov_offset; 402 buf = &pipe->bufs[i_head & p_mask]; 403 if (off) { 404 if (offset == off && buf->page == page) { 405 /* merge with the last one */ 406 buf->len += bytes; 407 i->iov_offset += bytes; 408 goto out; 409 } 410 i_head++; 411 buf = &pipe->bufs[i_head & p_mask]; 412 } 413 if (pipe_full(i_head, p_tail, pipe->max_usage)) 414 return 0; 415 416 buf->ops = &page_cache_pipe_buf_ops; 417 get_page(page); 418 buf->page = page; 419 buf->offset = offset; 420 buf->len = bytes; 421 422 pipe->head = i_head + 1; 423 i->iov_offset = offset + bytes; 424 i->head = i_head; 425 out: 426 i->count -= bytes; 427 return bytes; 428 } 429 430 /* 431 * fault_in_iov_iter_readable - fault in iov iterator for reading 432 * @i: iterator 433 * 
@size: maximum length 434 * 435 * Fault in one or more iovecs of the given iov_iter, to a maximum length of 436 * @size. For each iovec, fault in each page that constitutes the iovec. 437 * 438 * Returns the number of bytes not faulted in (like copy_to_user() and 439 * copy_from_user()). 440 * 441 * Always returns 0 for non-userspace iterators. 442 */ 443 size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) 444 { 445 if (iter_is_iovec(i)) { 446 size_t count = min(size, iov_iter_count(i)); 447 const struct iovec *p; 448 size_t skip; 449 450 size -= count; 451 for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { 452 size_t len = min(count, p->iov_len - skip); 453 size_t ret; 454 455 if (unlikely(!len)) 456 continue; 457 ret = fault_in_readable(p->iov_base + skip, len); 458 count -= len - ret; 459 if (ret) 460 break; 461 } 462 return count + size; 463 } 464 return 0; 465 } 466 EXPORT_SYMBOL(fault_in_iov_iter_readable); 467 468 /* 469 * fault_in_iov_iter_writeable - fault in iov iterator for writing 470 * @i: iterator 471 * @size: maximum length 472 * 473 * Faults in the iterator using get_user_pages(), i.e., without triggering 474 * hardware page faults. This is primarily useful when we already know that 475 * some or all of the pages in @i aren't in memory. 476 * 477 * Returns the number of bytes not faulted in, like copy_to_user() and 478 * copy_from_user(). 479 * 480 * Always returns 0 for non-user-space iterators. 481 */ 482 size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) 483 { 484 if (iter_is_iovec(i)) { 485 size_t count = min(size, iov_iter_count(i)); 486 const struct iovec *p; 487 size_t skip; 488 489 size -= count; 490 for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { 491 size_t len = min(count, p->iov_len - skip); 492 size_t ret; 493 494 if (unlikely(!len)) 495 continue; 496 ret = fault_in_safe_writeable(p->iov_base + skip, len); 497 count -= len - ret; 498 if (ret) 499 break; 500 } 501 return count + size; 502 } 503 return 0; 504 } 505 EXPORT_SYMBOL(fault_in_iov_iter_writeable); 506 507 void iov_iter_init(struct iov_iter *i, unsigned int direction, 508 const struct iovec *iov, unsigned long nr_segs, 509 size_t count) 510 { 511 WARN_ON(direction & ~(READ | WRITE)); 512 *i = (struct iov_iter) { 513 .iter_type = ITER_IOVEC, 514 .nofault = false, 515 .data_source = direction, 516 .iov = iov, 517 .nr_segs = nr_segs, 518 .iov_offset = 0, 519 .count = count 520 }; 521 } 522 EXPORT_SYMBOL(iov_iter_init); 523 524 static inline bool allocated(struct pipe_buffer *buf) 525 { 526 return buf->ops == &default_pipe_buf_ops; 527 } 528 529 static inline void data_start(const struct iov_iter *i, 530 unsigned int *iter_headp, size_t *offp) 531 { 532 unsigned int p_mask = i->pipe->ring_size - 1; 533 unsigned int iter_head = i->head; 534 size_t off = i->iov_offset; 535 536 if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) || 537 off == PAGE_SIZE)) { 538 iter_head++; 539 off = 0; 540 } 541 *iter_headp = iter_head; 542 *offp = off; 543 } 544 545 static size_t push_pipe(struct iov_iter *i, size_t size, 546 int *iter_headp, size_t *offp) 547 { 548 struct pipe_inode_info *pipe = i->pipe; 549 unsigned int p_tail = pipe->tail; 550 unsigned int p_mask = pipe->ring_size - 1; 551 unsigned int iter_head; 552 size_t off; 553 ssize_t left; 554 555 if (unlikely(size > i->count)) 556 size = i->count; 557 if (unlikely(!size)) 558 return 0; 559 560 left = size; 561 data_start(i, &iter_head, &off); 562 *iter_headp = iter_head; 563 *offp = off; 
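	/*
	 * If the last, partially-filled pipe buffer still has room, extend
	 * it first; anything beyond that is covered by the freshly
	 * allocated pages below.
	 */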
564 if (off) { 565 left -= PAGE_SIZE - off; 566 if (left <= 0) { 567 pipe->bufs[iter_head & p_mask].len += size; 568 return size; 569 } 570 pipe->bufs[iter_head & p_mask].len = PAGE_SIZE; 571 iter_head++; 572 } 573 while (!pipe_full(iter_head, p_tail, pipe->max_usage)) { 574 struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask]; 575 struct page *page = alloc_page(GFP_USER); 576 if (!page) 577 break; 578 579 buf->ops = &default_pipe_buf_ops; 580 buf->page = page; 581 buf->offset = 0; 582 buf->len = min_t(ssize_t, left, PAGE_SIZE); 583 left -= buf->len; 584 iter_head++; 585 pipe->head = iter_head; 586 587 if (left == 0) 588 return size; 589 } 590 return size - left; 591 } 592 593 static size_t copy_pipe_to_iter(const void *addr, size_t bytes, 594 struct iov_iter *i) 595 { 596 struct pipe_inode_info *pipe = i->pipe; 597 unsigned int p_mask = pipe->ring_size - 1; 598 unsigned int i_head; 599 size_t n, off; 600 601 if (!sanity(i)) 602 return 0; 603 604 bytes = n = push_pipe(i, bytes, &i_head, &off); 605 if (unlikely(!n)) 606 return 0; 607 do { 608 size_t chunk = min_t(size_t, n, PAGE_SIZE - off); 609 memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk); 610 i->head = i_head; 611 i->iov_offset = off + chunk; 612 n -= chunk; 613 addr += chunk; 614 off = 0; 615 i_head++; 616 } while (n); 617 i->count -= bytes; 618 return bytes; 619 } 620 621 static __wsum csum_and_memcpy(void *to, const void *from, size_t len, 622 __wsum sum, size_t off) 623 { 624 __wsum next = csum_partial_copy_nocheck(from, to, len); 625 return csum_block_add(sum, next, off); 626 } 627 628 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes, 629 struct iov_iter *i, __wsum *sump) 630 { 631 struct pipe_inode_info *pipe = i->pipe; 632 unsigned int p_mask = pipe->ring_size - 1; 633 __wsum sum = *sump; 634 size_t off = 0; 635 unsigned int i_head; 636 size_t r; 637 638 if (!sanity(i)) 639 return 0; 640 641 bytes = push_pipe(i, bytes, &i_head, &r); 642 while (bytes) { 643 size_t chunk = min_t(size_t, bytes, PAGE_SIZE - r); 644 char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page); 645 sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off); 646 kunmap_local(p); 647 i->head = i_head; 648 i->iov_offset = r + chunk; 649 bytes -= chunk; 650 off += chunk; 651 r = 0; 652 i_head++; 653 } 654 *sump = sum; 655 i->count -= off; 656 return off; 657 } 658 659 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) 660 { 661 if (unlikely(iov_iter_is_pipe(i))) 662 return copy_pipe_to_iter(addr, bytes, i); 663 if (iter_is_iovec(i)) 664 might_fault(); 665 iterate_and_advance(i, bytes, base, len, off, 666 copyout(base, addr + off, len), 667 memcpy(base, addr + off, len) 668 ) 669 670 return bytes; 671 } 672 EXPORT_SYMBOL(_copy_to_iter); 673 674 #ifdef CONFIG_ARCH_HAS_COPY_MC 675 static int copyout_mc(void __user *to, const void *from, size_t n) 676 { 677 if (access_ok(to, n)) { 678 instrument_copy_to_user(to, from, n); 679 n = copy_mc_to_user((__force void *) to, from, n); 680 } 681 return n; 682 } 683 684 static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, 685 struct iov_iter *i) 686 { 687 struct pipe_inode_info *pipe = i->pipe; 688 unsigned int p_mask = pipe->ring_size - 1; 689 unsigned int i_head; 690 size_t n, off, xfer = 0; 691 692 if (!sanity(i)) 693 return 0; 694 695 n = push_pipe(i, bytes, &i_head, &off); 696 while (n) { 697 size_t chunk = min_t(size_t, n, PAGE_SIZE - off); 698 char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page); 699 unsigned long rem; 700 rem 
= copy_mc_to_kernel(p + off, addr + xfer, chunk);
		chunk -= rem;
		kunmap_local(p);
		i->head = i_head;
		i->iov_offset = off + chunk;
		xfer += chunk;
		if (rem)
			break;
		n -= chunk;
		off = 0;
		i_head++;
	}
	i->count -= xfer;
	return xfer;
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer). Upon #MC, read(2) aborts and returns -EIO or the number
 * of bytes successfully copied.
 *
 * The main differences between this and typical _copy_to_iter() are:
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again. Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 *   a short copy.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_mc_pipe_to_iter(addr, bytes, i);
	if (iter_is_iovec(i))
		might_fault();
	__iterate_and_advance(i, bytes, base, len, off,
		copyout_mc(base, addr + off, len),
		copy_mc_to_kernel(base, addr + off, len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	if (iter_is_iovec(i))
		might_fault();
	iterate_and_advance(i, bytes, base, len, off,
		copyin(addr + off, base, len),
		memcpy(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter);

size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		__copy_from_user_inatomic_nocache(addr + off, base, len),
		memcpy(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache. It is differentiated from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
 * all iterator types. _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
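 *
 * A minimal usage sketch (an editor's illustration, not from the original
 * source; the "foo_" name is hypothetical): a dax_operations
 * ->copy_from_iter() hook would typically just forward to this helper:
 *
 *	static size_t foo_dax_copy_from_iter(struct dax_device *dax_dev,
 *			pgoff_t pgoff, void *addr, size_t bytes,
 *			struct iov_iter *i)
 *	{
 *		return _copy_from_iter_flushcache(addr, bytes, i);
 *	}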
803 * 804 * Return: number of bytes copied (may be %0) 805 */ 806 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) 807 { 808 if (unlikely(iov_iter_is_pipe(i))) { 809 WARN_ON(1); 810 return 0; 811 } 812 iterate_and_advance(i, bytes, base, len, off, 813 __copy_from_user_flushcache(addr + off, base, len), 814 memcpy_flushcache(addr + off, base, len) 815 ) 816 817 return bytes; 818 } 819 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); 820 #endif 821 822 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) 823 { 824 struct page *head; 825 size_t v = n + offset; 826 827 /* 828 * The general case needs to access the page order in order 829 * to compute the page size. 830 * However, we mostly deal with order-0 pages and thus can 831 * avoid a possible cache line miss for requests that fit all 832 * page orders. 833 */ 834 if (n <= v && v <= PAGE_SIZE) 835 return true; 836 837 head = compound_head(page); 838 v += (page - head) << PAGE_SHIFT; 839 840 if (likely(n <= v && v <= (page_size(head)))) 841 return true; 842 WARN_ON(1); 843 return false; 844 } 845 846 static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes, 847 struct iov_iter *i) 848 { 849 if (likely(iter_is_iovec(i))) 850 return copy_page_to_iter_iovec(page, offset, bytes, i); 851 if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) { 852 void *kaddr = kmap_local_page(page); 853 size_t wanted = _copy_to_iter(kaddr + offset, bytes, i); 854 kunmap_local(kaddr); 855 return wanted; 856 } 857 if (iov_iter_is_pipe(i)) 858 return copy_page_to_iter_pipe(page, offset, bytes, i); 859 if (unlikely(iov_iter_is_discard(i))) { 860 if (unlikely(i->count < bytes)) 861 bytes = i->count; 862 i->count -= bytes; 863 return bytes; 864 } 865 WARN_ON(1); 866 return 0; 867 } 868 869 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, 870 struct iov_iter *i) 871 { 872 size_t res = 0; 873 if (unlikely(!page_copy_sane(page, offset, bytes))) 874 return 0; 875 page += offset / PAGE_SIZE; // first subpage 876 offset %= PAGE_SIZE; 877 while (1) { 878 size_t n = __copy_page_to_iter(page, offset, 879 min(bytes, (size_t)PAGE_SIZE - offset), i); 880 res += n; 881 bytes -= n; 882 if (!bytes || !n) 883 break; 884 offset += n; 885 if (offset == PAGE_SIZE) { 886 page++; 887 offset = 0; 888 } 889 } 890 return res; 891 } 892 EXPORT_SYMBOL(copy_page_to_iter); 893 894 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, 895 struct iov_iter *i) 896 { 897 if (unlikely(!page_copy_sane(page, offset, bytes))) 898 return 0; 899 if (likely(iter_is_iovec(i))) 900 return copy_page_from_iter_iovec(page, offset, bytes, i); 901 if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) { 902 void *kaddr = kmap_local_page(page); 903 size_t wanted = _copy_from_iter(kaddr + offset, bytes, i); 904 kunmap_local(kaddr); 905 return wanted; 906 } 907 WARN_ON(1); 908 return 0; 909 } 910 EXPORT_SYMBOL(copy_page_from_iter); 911 912 static size_t pipe_zero(size_t bytes, struct iov_iter *i) 913 { 914 struct pipe_inode_info *pipe = i->pipe; 915 unsigned int p_mask = pipe->ring_size - 1; 916 unsigned int i_head; 917 size_t n, off; 918 919 if (!sanity(i)) 920 return 0; 921 922 bytes = n = push_pipe(i, bytes, &i_head, &off); 923 if (unlikely(!n)) 924 return 0; 925 926 do { 927 size_t chunk = min_t(size_t, n, PAGE_SIZE - off); 928 char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page); 929 memset(p + off, 0, chunk); 930 kunmap_local(p); 931 i->head = 
i_head; 932 i->iov_offset = off + chunk; 933 n -= chunk; 934 off = 0; 935 i_head++; 936 } while (n); 937 i->count -= bytes; 938 return bytes; 939 } 940 941 size_t iov_iter_zero(size_t bytes, struct iov_iter *i) 942 { 943 if (unlikely(iov_iter_is_pipe(i))) 944 return pipe_zero(bytes, i); 945 iterate_and_advance(i, bytes, base, len, count, 946 clear_user(base, len), 947 memset(base, 0, len) 948 ) 949 950 return bytes; 951 } 952 EXPORT_SYMBOL(iov_iter_zero); 953 954 size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes, 955 struct iov_iter *i) 956 { 957 char *kaddr = kmap_atomic(page), *p = kaddr + offset; 958 if (unlikely(!page_copy_sane(page, offset, bytes))) { 959 kunmap_atomic(kaddr); 960 return 0; 961 } 962 if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { 963 kunmap_atomic(kaddr); 964 WARN_ON(1); 965 return 0; 966 } 967 iterate_and_advance(i, bytes, base, len, off, 968 copyin(p + off, base, len), 969 memcpy(p + off, base, len) 970 ) 971 kunmap_atomic(kaddr); 972 return bytes; 973 } 974 EXPORT_SYMBOL(copy_page_from_iter_atomic); 975 976 static inline void pipe_truncate(struct iov_iter *i) 977 { 978 struct pipe_inode_info *pipe = i->pipe; 979 unsigned int p_tail = pipe->tail; 980 unsigned int p_head = pipe->head; 981 unsigned int p_mask = pipe->ring_size - 1; 982 983 if (!pipe_empty(p_head, p_tail)) { 984 struct pipe_buffer *buf; 985 unsigned int i_head = i->head; 986 size_t off = i->iov_offset; 987 988 if (off) { 989 buf = &pipe->bufs[i_head & p_mask]; 990 buf->len = off - buf->offset; 991 i_head++; 992 } 993 while (p_head != i_head) { 994 p_head--; 995 pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]); 996 } 997 998 pipe->head = p_head; 999 } 1000 } 1001 1002 static void pipe_advance(struct iov_iter *i, size_t size) 1003 { 1004 struct pipe_inode_info *pipe = i->pipe; 1005 if (size) { 1006 struct pipe_buffer *buf; 1007 unsigned int p_mask = pipe->ring_size - 1; 1008 unsigned int i_head = i->head; 1009 size_t off = i->iov_offset, left = size; 1010 1011 if (off) /* make it relative to the beginning of buffer */ 1012 left += off - pipe->bufs[i_head & p_mask].offset; 1013 while (1) { 1014 buf = &pipe->bufs[i_head & p_mask]; 1015 if (left <= buf->len) 1016 break; 1017 left -= buf->len; 1018 i_head++; 1019 } 1020 i->head = i_head; 1021 i->iov_offset = buf->offset + left; 1022 } 1023 i->count -= size; 1024 /* ... 
and discard everything past that point */ 1025 pipe_truncate(i); 1026 } 1027 1028 static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) 1029 { 1030 struct bvec_iter bi; 1031 1032 bi.bi_size = i->count; 1033 bi.bi_bvec_done = i->iov_offset; 1034 bi.bi_idx = 0; 1035 bvec_iter_advance(i->bvec, &bi, size); 1036 1037 i->bvec += bi.bi_idx; 1038 i->nr_segs -= bi.bi_idx; 1039 i->count = bi.bi_size; 1040 i->iov_offset = bi.bi_bvec_done; 1041 } 1042 1043 static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) 1044 { 1045 const struct iovec *iov, *end; 1046 1047 if (!i->count) 1048 return; 1049 i->count -= size; 1050 1051 size += i->iov_offset; // from beginning of current segment 1052 for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) { 1053 if (likely(size < iov->iov_len)) 1054 break; 1055 size -= iov->iov_len; 1056 } 1057 i->iov_offset = size; 1058 i->nr_segs -= iov - i->iov; 1059 i->iov = iov; 1060 } 1061 1062 void iov_iter_advance(struct iov_iter *i, size_t size) 1063 { 1064 if (unlikely(i->count < size)) 1065 size = i->count; 1066 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { 1067 /* iovec and kvec have identical layouts */ 1068 iov_iter_iovec_advance(i, size); 1069 } else if (iov_iter_is_bvec(i)) { 1070 iov_iter_bvec_advance(i, size); 1071 } else if (iov_iter_is_pipe(i)) { 1072 pipe_advance(i, size); 1073 } else if (unlikely(iov_iter_is_xarray(i))) { 1074 i->iov_offset += size; 1075 i->count -= size; 1076 } else if (iov_iter_is_discard(i)) { 1077 i->count -= size; 1078 } 1079 } 1080 EXPORT_SYMBOL(iov_iter_advance); 1081 1082 void iov_iter_revert(struct iov_iter *i, size_t unroll) 1083 { 1084 if (!unroll) 1085 return; 1086 if (WARN_ON(unroll > MAX_RW_COUNT)) 1087 return; 1088 i->count += unroll; 1089 if (unlikely(iov_iter_is_pipe(i))) { 1090 struct pipe_inode_info *pipe = i->pipe; 1091 unsigned int p_mask = pipe->ring_size - 1; 1092 unsigned int i_head = i->head; 1093 size_t off = i->iov_offset; 1094 while (1) { 1095 struct pipe_buffer *b = &pipe->bufs[i_head & p_mask]; 1096 size_t n = off - b->offset; 1097 if (unroll < n) { 1098 off -= unroll; 1099 break; 1100 } 1101 unroll -= n; 1102 if (!unroll && i_head == i->start_head) { 1103 off = 0; 1104 break; 1105 } 1106 i_head--; 1107 b = &pipe->bufs[i_head & p_mask]; 1108 off = b->offset + b->len; 1109 } 1110 i->iov_offset = off; 1111 i->head = i_head; 1112 pipe_truncate(i); 1113 return; 1114 } 1115 if (unlikely(iov_iter_is_discard(i))) 1116 return; 1117 if (unroll <= i->iov_offset) { 1118 i->iov_offset -= unroll; 1119 return; 1120 } 1121 unroll -= i->iov_offset; 1122 if (iov_iter_is_xarray(i)) { 1123 BUG(); /* We should never go beyond the start of the specified 1124 * range since we might then be straying into pages that 1125 * aren't pinned. 1126 */ 1127 } else if (iov_iter_is_bvec(i)) { 1128 const struct bio_vec *bvec = i->bvec; 1129 while (1) { 1130 size_t n = (--bvec)->bv_len; 1131 i->nr_segs++; 1132 if (unroll <= n) { 1133 i->bvec = bvec; 1134 i->iov_offset = n - unroll; 1135 return; 1136 } 1137 unroll -= n; 1138 } 1139 } else { /* same logics for iovec and kvec */ 1140 const struct iovec *iov = i->iov; 1141 while (1) { 1142 size_t n = (--iov)->iov_len; 1143 i->nr_segs++; 1144 if (unroll <= n) { 1145 i->iov = iov; 1146 i->iov_offset = n - unroll; 1147 return; 1148 } 1149 unroll -= n; 1150 } 1151 } 1152 } 1153 EXPORT_SYMBOL(iov_iter_revert); 1154 1155 /* 1156 * Return the count of just the current iov_iter segment. 
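 * For pipe, xarray and discard iterators this is simply the number of
 * bytes left in the iterator.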
1157 */ 1158 size_t iov_iter_single_seg_count(const struct iov_iter *i) 1159 { 1160 if (i->nr_segs > 1) { 1161 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) 1162 return min(i->count, i->iov->iov_len - i->iov_offset); 1163 if (iov_iter_is_bvec(i)) 1164 return min(i->count, i->bvec->bv_len - i->iov_offset); 1165 } 1166 return i->count; 1167 } 1168 EXPORT_SYMBOL(iov_iter_single_seg_count); 1169 1170 void iov_iter_kvec(struct iov_iter *i, unsigned int direction, 1171 const struct kvec *kvec, unsigned long nr_segs, 1172 size_t count) 1173 { 1174 WARN_ON(direction & ~(READ | WRITE)); 1175 *i = (struct iov_iter){ 1176 .iter_type = ITER_KVEC, 1177 .data_source = direction, 1178 .kvec = kvec, 1179 .nr_segs = nr_segs, 1180 .iov_offset = 0, 1181 .count = count 1182 }; 1183 } 1184 EXPORT_SYMBOL(iov_iter_kvec); 1185 1186 void iov_iter_bvec(struct iov_iter *i, unsigned int direction, 1187 const struct bio_vec *bvec, unsigned long nr_segs, 1188 size_t count) 1189 { 1190 WARN_ON(direction & ~(READ | WRITE)); 1191 *i = (struct iov_iter){ 1192 .iter_type = ITER_BVEC, 1193 .data_source = direction, 1194 .bvec = bvec, 1195 .nr_segs = nr_segs, 1196 .iov_offset = 0, 1197 .count = count 1198 }; 1199 } 1200 EXPORT_SYMBOL(iov_iter_bvec); 1201 1202 void iov_iter_pipe(struct iov_iter *i, unsigned int direction, 1203 struct pipe_inode_info *pipe, 1204 size_t count) 1205 { 1206 BUG_ON(direction != READ); 1207 WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size)); 1208 *i = (struct iov_iter){ 1209 .iter_type = ITER_PIPE, 1210 .data_source = false, 1211 .pipe = pipe, 1212 .head = pipe->head, 1213 .start_head = pipe->head, 1214 .iov_offset = 0, 1215 .count = count 1216 }; 1217 } 1218 EXPORT_SYMBOL(iov_iter_pipe); 1219 1220 /** 1221 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray 1222 * @i: The iterator to initialise. 1223 * @direction: The direction of the transfer. 1224 * @xarray: The xarray to access. 1225 * @start: The start file position. 1226 * @count: The size of the I/O buffer in bytes. 1227 * 1228 * Set up an I/O iterator to either draw data out of the pages attached to an 1229 * inode or to inject data into those pages. The pages *must* be prevented 1230 * from evaporation, either by taking a ref on them or locking them by the 1231 * caller. 1232 */ 1233 void iov_iter_xarray(struct iov_iter *i, unsigned int direction, 1234 struct xarray *xarray, loff_t start, size_t count) 1235 { 1236 BUG_ON(direction & ~1); 1237 *i = (struct iov_iter) { 1238 .iter_type = ITER_XARRAY, 1239 .data_source = direction, 1240 .xarray = xarray, 1241 .xarray_start = start, 1242 .count = count, 1243 .iov_offset = 0 1244 }; 1245 } 1246 EXPORT_SYMBOL(iov_iter_xarray); 1247 1248 /** 1249 * iov_iter_discard - Initialise an I/O iterator that discards data 1250 * @i: The iterator to initialise. 1251 * @direction: The direction of the transfer. 1252 * @count: The size of the I/O buffer in bytes. 1253 * 1254 * Set up an I/O iterator that just discards everything that's written to it. 1255 * It's only available as a READ iterator. 
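 *
 * A usage sketch (an editor's illustration, not from the original source;
 * file, pos and ret are assumed locals): draining @count bytes from an
 * already-opened file without keeping the data might look like
 *
 *	struct iov_iter iter;
 *	struct kiocb kiocb;
 *
 *	init_sync_kiocb(&kiocb, file);
 *	kiocb.ki_pos = pos;
 *	iov_iter_discard(&iter, READ, count);
 *	ret = call_read_iter(file, &kiocb, &iter);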
1256 */ 1257 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) 1258 { 1259 BUG_ON(direction != READ); 1260 *i = (struct iov_iter){ 1261 .iter_type = ITER_DISCARD, 1262 .data_source = false, 1263 .count = count, 1264 .iov_offset = 0 1265 }; 1266 } 1267 EXPORT_SYMBOL(iov_iter_discard); 1268 1269 static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) 1270 { 1271 unsigned long res = 0; 1272 size_t size = i->count; 1273 size_t skip = i->iov_offset; 1274 unsigned k; 1275 1276 for (k = 0; k < i->nr_segs; k++, skip = 0) { 1277 size_t len = i->iov[k].iov_len - skip; 1278 if (len) { 1279 res |= (unsigned long)i->iov[k].iov_base + skip; 1280 if (len > size) 1281 len = size; 1282 res |= len; 1283 size -= len; 1284 if (!size) 1285 break; 1286 } 1287 } 1288 return res; 1289 } 1290 1291 static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) 1292 { 1293 unsigned res = 0; 1294 size_t size = i->count; 1295 unsigned skip = i->iov_offset; 1296 unsigned k; 1297 1298 for (k = 0; k < i->nr_segs; k++, skip = 0) { 1299 size_t len = i->bvec[k].bv_len - skip; 1300 res |= (unsigned long)i->bvec[k].bv_offset + skip; 1301 if (len > size) 1302 len = size; 1303 res |= len; 1304 size -= len; 1305 if (!size) 1306 break; 1307 } 1308 return res; 1309 } 1310 1311 unsigned long iov_iter_alignment(const struct iov_iter *i) 1312 { 1313 /* iovec and kvec have identical layouts */ 1314 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) 1315 return iov_iter_alignment_iovec(i); 1316 1317 if (iov_iter_is_bvec(i)) 1318 return iov_iter_alignment_bvec(i); 1319 1320 if (iov_iter_is_pipe(i)) { 1321 unsigned int p_mask = i->pipe->ring_size - 1; 1322 size_t size = i->count; 1323 1324 if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask])) 1325 return size | i->iov_offset; 1326 return size; 1327 } 1328 1329 if (iov_iter_is_xarray(i)) 1330 return (i->xarray_start + i->iov_offset) | i->count; 1331 1332 return 0; 1333 } 1334 EXPORT_SYMBOL(iov_iter_alignment); 1335 1336 unsigned long iov_iter_gap_alignment(const struct iov_iter *i) 1337 { 1338 unsigned long res = 0; 1339 unsigned long v = 0; 1340 size_t size = i->count; 1341 unsigned k; 1342 1343 if (WARN_ON(!iter_is_iovec(i))) 1344 return ~0U; 1345 1346 for (k = 0; k < i->nr_segs; k++) { 1347 if (i->iov[k].iov_len) { 1348 unsigned long base = (unsigned long)i->iov[k].iov_base; 1349 if (v) // if not the first one 1350 res |= base | v; // this start | previous end 1351 v = base + i->iov[k].iov_len; 1352 if (size <= i->iov[k].iov_len) 1353 break; 1354 size -= i->iov[k].iov_len; 1355 } 1356 } 1357 return res; 1358 } 1359 EXPORT_SYMBOL(iov_iter_gap_alignment); 1360 1361 static inline ssize_t __pipe_get_pages(struct iov_iter *i, 1362 size_t maxsize, 1363 struct page **pages, 1364 int iter_head, 1365 size_t *start) 1366 { 1367 struct pipe_inode_info *pipe = i->pipe; 1368 unsigned int p_mask = pipe->ring_size - 1; 1369 ssize_t n = push_pipe(i, maxsize, &iter_head, start); 1370 if (!n) 1371 return -EFAULT; 1372 1373 maxsize = n; 1374 n += *start; 1375 while (n > 0) { 1376 get_page(*pages++ = pipe->bufs[iter_head & p_mask].page); 1377 iter_head++; 1378 n -= PAGE_SIZE; 1379 } 1380 1381 return maxsize; 1382 } 1383 1384 static ssize_t pipe_get_pages(struct iov_iter *i, 1385 struct page **pages, size_t maxsize, unsigned maxpages, 1386 size_t *start) 1387 { 1388 unsigned int iter_head, npages; 1389 size_t capacity; 1390 1391 if (!sanity(i)) 1392 return -EFAULT; 1393 1394 data_start(i, &iter_head, start); 1395 /* Amount of free 
space: some of this one + all after this one */ 1396 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe); 1397 capacity = min(npages, maxpages) * PAGE_SIZE - *start; 1398 1399 return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start); 1400 } 1401 1402 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, 1403 pgoff_t index, unsigned int nr_pages) 1404 { 1405 XA_STATE(xas, xa, index); 1406 struct page *page; 1407 unsigned int ret = 0; 1408 1409 rcu_read_lock(); 1410 for (page = xas_load(&xas); page; page = xas_next(&xas)) { 1411 if (xas_retry(&xas, page)) 1412 continue; 1413 1414 /* Has the page moved or been split? */ 1415 if (unlikely(page != xas_reload(&xas))) { 1416 xas_reset(&xas); 1417 continue; 1418 } 1419 1420 pages[ret] = find_subpage(page, xas.xa_index); 1421 get_page(pages[ret]); 1422 if (++ret == nr_pages) 1423 break; 1424 } 1425 rcu_read_unlock(); 1426 return ret; 1427 } 1428 1429 static ssize_t iter_xarray_get_pages(struct iov_iter *i, 1430 struct page **pages, size_t maxsize, 1431 unsigned maxpages, size_t *_start_offset) 1432 { 1433 unsigned nr, offset; 1434 pgoff_t index, count; 1435 size_t size = maxsize, actual; 1436 loff_t pos; 1437 1438 if (!size || !maxpages) 1439 return 0; 1440 1441 pos = i->xarray_start + i->iov_offset; 1442 index = pos >> PAGE_SHIFT; 1443 offset = pos & ~PAGE_MASK; 1444 *_start_offset = offset; 1445 1446 count = 1; 1447 if (size > PAGE_SIZE - offset) { 1448 size -= PAGE_SIZE - offset; 1449 count += size >> PAGE_SHIFT; 1450 size &= ~PAGE_MASK; 1451 if (size) 1452 count++; 1453 } 1454 1455 if (count > maxpages) 1456 count = maxpages; 1457 1458 nr = iter_xarray_populate_pages(pages, i->xarray, index, count); 1459 if (nr == 0) 1460 return 0; 1461 1462 actual = PAGE_SIZE * nr; 1463 actual -= offset; 1464 if (nr == count && size > 0) { 1465 unsigned last_offset = (nr > 1) ? 
0 : offset; 1466 actual -= PAGE_SIZE - (last_offset + size); 1467 } 1468 return actual; 1469 } 1470 1471 /* must be done on non-empty ITER_IOVEC one */ 1472 static unsigned long first_iovec_segment(const struct iov_iter *i, 1473 size_t *size, size_t *start, 1474 size_t maxsize, unsigned maxpages) 1475 { 1476 size_t skip; 1477 long k; 1478 1479 for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { 1480 unsigned long addr = (unsigned long)i->iov[k].iov_base + skip; 1481 size_t len = i->iov[k].iov_len - skip; 1482 1483 if (unlikely(!len)) 1484 continue; 1485 if (len > maxsize) 1486 len = maxsize; 1487 len += (*start = addr % PAGE_SIZE); 1488 if (len > maxpages * PAGE_SIZE) 1489 len = maxpages * PAGE_SIZE; 1490 *size = len; 1491 return addr & PAGE_MASK; 1492 } 1493 BUG(); // if it had been empty, we wouldn't get called 1494 } 1495 1496 /* must be done on non-empty ITER_BVEC one */ 1497 static struct page *first_bvec_segment(const struct iov_iter *i, 1498 size_t *size, size_t *start, 1499 size_t maxsize, unsigned maxpages) 1500 { 1501 struct page *page; 1502 size_t skip = i->iov_offset, len; 1503 1504 len = i->bvec->bv_len - skip; 1505 if (len > maxsize) 1506 len = maxsize; 1507 skip += i->bvec->bv_offset; 1508 page = i->bvec->bv_page + skip / PAGE_SIZE; 1509 len += (*start = skip % PAGE_SIZE); 1510 if (len > maxpages * PAGE_SIZE) 1511 len = maxpages * PAGE_SIZE; 1512 *size = len; 1513 return page; 1514 } 1515 1516 ssize_t iov_iter_get_pages(struct iov_iter *i, 1517 struct page **pages, size_t maxsize, unsigned maxpages, 1518 size_t *start) 1519 { 1520 size_t len; 1521 int n, res; 1522 1523 if (maxsize > i->count) 1524 maxsize = i->count; 1525 if (!maxsize) 1526 return 0; 1527 1528 if (likely(iter_is_iovec(i))) { 1529 unsigned int gup_flags = 0; 1530 unsigned long addr; 1531 1532 if (iov_iter_rw(i) != WRITE) 1533 gup_flags |= FOLL_WRITE; 1534 if (i->nofault) 1535 gup_flags |= FOLL_NOFAULT; 1536 1537 addr = first_iovec_segment(i, &len, start, maxsize, maxpages); 1538 n = DIV_ROUND_UP(len, PAGE_SIZE); 1539 res = get_user_pages_fast(addr, n, gup_flags, pages); 1540 if (unlikely(res <= 0)) 1541 return res; 1542 return (res == n ? 
len : res * PAGE_SIZE) - *start; 1543 } 1544 if (iov_iter_is_bvec(i)) { 1545 struct page *page; 1546 1547 page = first_bvec_segment(i, &len, start, maxsize, maxpages); 1548 n = DIV_ROUND_UP(len, PAGE_SIZE); 1549 while (n--) 1550 get_page(*pages++ = page++); 1551 return len - *start; 1552 } 1553 if (iov_iter_is_pipe(i)) 1554 return pipe_get_pages(i, pages, maxsize, maxpages, start); 1555 if (iov_iter_is_xarray(i)) 1556 return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); 1557 return -EFAULT; 1558 } 1559 EXPORT_SYMBOL(iov_iter_get_pages); 1560 1561 static struct page **get_pages_array(size_t n) 1562 { 1563 return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL); 1564 } 1565 1566 static ssize_t pipe_get_pages_alloc(struct iov_iter *i, 1567 struct page ***pages, size_t maxsize, 1568 size_t *start) 1569 { 1570 struct page **p; 1571 unsigned int iter_head, npages; 1572 ssize_t n; 1573 1574 if (!sanity(i)) 1575 return -EFAULT; 1576 1577 data_start(i, &iter_head, start); 1578 /* Amount of free space: some of this one + all after this one */ 1579 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe); 1580 n = npages * PAGE_SIZE - *start; 1581 if (maxsize > n) 1582 maxsize = n; 1583 else 1584 npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE); 1585 p = get_pages_array(npages); 1586 if (!p) 1587 return -ENOMEM; 1588 n = __pipe_get_pages(i, maxsize, p, iter_head, start); 1589 if (n > 0) 1590 *pages = p; 1591 else 1592 kvfree(p); 1593 return n; 1594 } 1595 1596 static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i, 1597 struct page ***pages, size_t maxsize, 1598 size_t *_start_offset) 1599 { 1600 struct page **p; 1601 unsigned nr, offset; 1602 pgoff_t index, count; 1603 size_t size = maxsize, actual; 1604 loff_t pos; 1605 1606 if (!size) 1607 return 0; 1608 1609 pos = i->xarray_start + i->iov_offset; 1610 index = pos >> PAGE_SHIFT; 1611 offset = pos & ~PAGE_MASK; 1612 *_start_offset = offset; 1613 1614 count = 1; 1615 if (size > PAGE_SIZE - offset) { 1616 size -= PAGE_SIZE - offset; 1617 count += size >> PAGE_SHIFT; 1618 size &= ~PAGE_MASK; 1619 if (size) 1620 count++; 1621 } 1622 1623 p = get_pages_array(count); 1624 if (!p) 1625 return -ENOMEM; 1626 *pages = p; 1627 1628 nr = iter_xarray_populate_pages(p, i->xarray, index, count); 1629 if (nr == 0) 1630 return 0; 1631 1632 actual = PAGE_SIZE * nr; 1633 actual -= offset; 1634 if (nr == count && size > 0) { 1635 unsigned last_offset = (nr > 1) ? 0 : offset; 1636 actual -= PAGE_SIZE - (last_offset + size); 1637 } 1638 return actual; 1639 } 1640 1641 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, 1642 struct page ***pages, size_t maxsize, 1643 size_t *start) 1644 { 1645 struct page **p; 1646 size_t len; 1647 int n, res; 1648 1649 if (maxsize > i->count) 1650 maxsize = i->count; 1651 if (!maxsize) 1652 return 0; 1653 1654 if (likely(iter_is_iovec(i))) { 1655 unsigned int gup_flags = 0; 1656 unsigned long addr; 1657 1658 if (iov_iter_rw(i) != WRITE) 1659 gup_flags |= FOLL_WRITE; 1660 if (i->nofault) 1661 gup_flags |= FOLL_NOFAULT; 1662 1663 addr = first_iovec_segment(i, &len, start, maxsize, ~0U); 1664 n = DIV_ROUND_UP(len, PAGE_SIZE); 1665 p = get_pages_array(n); 1666 if (!p) 1667 return -ENOMEM; 1668 res = get_user_pages_fast(addr, n, gup_flags, p); 1669 if (unlikely(res <= 0)) { 1670 kvfree(p); 1671 *pages = NULL; 1672 return res; 1673 } 1674 *pages = p; 1675 return (res == n ? 
len : res * PAGE_SIZE) - *start; 1676 } 1677 if (iov_iter_is_bvec(i)) { 1678 struct page *page; 1679 1680 page = first_bvec_segment(i, &len, start, maxsize, ~0U); 1681 n = DIV_ROUND_UP(len, PAGE_SIZE); 1682 *pages = p = get_pages_array(n); 1683 if (!p) 1684 return -ENOMEM; 1685 while (n--) 1686 get_page(*p++ = page++); 1687 return len - *start; 1688 } 1689 if (iov_iter_is_pipe(i)) 1690 return pipe_get_pages_alloc(i, pages, maxsize, start); 1691 if (iov_iter_is_xarray(i)) 1692 return iter_xarray_get_pages_alloc(i, pages, maxsize, start); 1693 return -EFAULT; 1694 } 1695 EXPORT_SYMBOL(iov_iter_get_pages_alloc); 1696 1697 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, 1698 struct iov_iter *i) 1699 { 1700 __wsum sum, next; 1701 sum = *csum; 1702 if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { 1703 WARN_ON(1); 1704 return 0; 1705 } 1706 iterate_and_advance(i, bytes, base, len, off, ({ 1707 next = csum_and_copy_from_user(base, addr + off, len); 1708 sum = csum_block_add(sum, next, off); 1709 next ? 0 : len; 1710 }), ({ 1711 sum = csum_and_memcpy(addr + off, base, len, sum, off); 1712 }) 1713 ) 1714 *csum = sum; 1715 return bytes; 1716 } 1717 EXPORT_SYMBOL(csum_and_copy_from_iter); 1718 1719 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, 1720 struct iov_iter *i) 1721 { 1722 struct csum_state *csstate = _csstate; 1723 __wsum sum, next; 1724 1725 if (unlikely(iov_iter_is_discard(i))) { 1726 WARN_ON(1); /* for now */ 1727 return 0; 1728 } 1729 1730 sum = csum_shift(csstate->csum, csstate->off); 1731 if (unlikely(iov_iter_is_pipe(i))) 1732 bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum); 1733 else iterate_and_advance(i, bytes, base, len, off, ({ 1734 next = csum_and_copy_to_user(addr + off, base, len); 1735 sum = csum_block_add(sum, next, off); 1736 next ? 
0 : len; 1737 }), ({ 1738 sum = csum_and_memcpy(base, addr + off, len, sum, off); 1739 }) 1740 ) 1741 csstate->csum = csum_shift(sum, csstate->off); 1742 csstate->off += bytes; 1743 return bytes; 1744 } 1745 EXPORT_SYMBOL(csum_and_copy_to_iter); 1746 1747 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, 1748 struct iov_iter *i) 1749 { 1750 #ifdef CONFIG_CRYPTO_HASH 1751 struct ahash_request *hash = hashp; 1752 struct scatterlist sg; 1753 size_t copied; 1754 1755 copied = copy_to_iter(addr, bytes, i); 1756 sg_init_one(&sg, addr, copied); 1757 ahash_request_set_crypt(hash, &sg, NULL, copied); 1758 crypto_ahash_update(hash); 1759 return copied; 1760 #else 1761 return 0; 1762 #endif 1763 } 1764 EXPORT_SYMBOL(hash_and_copy_to_iter); 1765 1766 static int iov_npages(const struct iov_iter *i, int maxpages) 1767 { 1768 size_t skip = i->iov_offset, size = i->count; 1769 const struct iovec *p; 1770 int npages = 0; 1771 1772 for (p = i->iov; size; skip = 0, p++) { 1773 unsigned offs = offset_in_page(p->iov_base + skip); 1774 size_t len = min(p->iov_len - skip, size); 1775 1776 if (len) { 1777 size -= len; 1778 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); 1779 if (unlikely(npages > maxpages)) 1780 return maxpages; 1781 } 1782 } 1783 return npages; 1784 } 1785 1786 static int bvec_npages(const struct iov_iter *i, int maxpages) 1787 { 1788 size_t skip = i->iov_offset, size = i->count; 1789 const struct bio_vec *p; 1790 int npages = 0; 1791 1792 for (p = i->bvec; size; skip = 0, p++) { 1793 unsigned offs = (p->bv_offset + skip) % PAGE_SIZE; 1794 size_t len = min(p->bv_len - skip, size); 1795 1796 size -= len; 1797 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); 1798 if (unlikely(npages > maxpages)) 1799 return maxpages; 1800 } 1801 return npages; 1802 } 1803 1804 int iov_iter_npages(const struct iov_iter *i, int maxpages) 1805 { 1806 if (unlikely(!i->count)) 1807 return 0; 1808 /* iovec and kvec have identical layouts */ 1809 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) 1810 return iov_npages(i, maxpages); 1811 if (iov_iter_is_bvec(i)) 1812 return bvec_npages(i, maxpages); 1813 if (iov_iter_is_pipe(i)) { 1814 unsigned int iter_head; 1815 int npages; 1816 size_t off; 1817 1818 if (!sanity(i)) 1819 return 0; 1820 1821 data_start(i, &iter_head, &off); 1822 /* some of this one + all after this one */ 1823 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe); 1824 return min(npages, maxpages); 1825 } 1826 if (iov_iter_is_xarray(i)) { 1827 unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; 1828 int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); 1829 return min(npages, maxpages); 1830 } 1831 return 0; 1832 } 1833 EXPORT_SYMBOL(iov_iter_npages); 1834 1835 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) 1836 { 1837 *new = *old; 1838 if (unlikely(iov_iter_is_pipe(new))) { 1839 WARN_ON(1); 1840 return NULL; 1841 } 1842 if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new))) 1843 return NULL; 1844 if (iov_iter_is_bvec(new)) 1845 return new->bvec = kmemdup(new->bvec, 1846 new->nr_segs * sizeof(struct bio_vec), 1847 flags); 1848 else 1849 /* iovec and kvec have identical layout */ 1850 return new->iov = kmemdup(new->iov, 1851 new->nr_segs * sizeof(struct iovec), 1852 flags); 1853 } 1854 EXPORT_SYMBOL(dup_iter); 1855 1856 static int copy_compat_iovec_from_user(struct iovec *iov, 1857 const struct iovec __user *uvec, unsigned long nr_segs) 1858 { 1859 const struct compat_iovec __user *uiov = 1860 (const struct 
compat_iovec __user *)uvec; 1861 int ret = -EFAULT, i; 1862 1863 if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) 1864 return -EFAULT; 1865 1866 for (i = 0; i < nr_segs; i++) { 1867 compat_uptr_t buf; 1868 compat_ssize_t len; 1869 1870 unsafe_get_user(len, &uiov[i].iov_len, uaccess_end); 1871 unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end); 1872 1873 /* check for compat_size_t not fitting in compat_ssize_t .. */ 1874 if (len < 0) { 1875 ret = -EINVAL; 1876 goto uaccess_end; 1877 } 1878 iov[i].iov_base = compat_ptr(buf); 1879 iov[i].iov_len = len; 1880 } 1881 1882 ret = 0; 1883 uaccess_end: 1884 user_access_end(); 1885 return ret; 1886 } 1887 1888 static int copy_iovec_from_user(struct iovec *iov, 1889 const struct iovec __user *uvec, unsigned long nr_segs) 1890 { 1891 unsigned long seg; 1892 1893 if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec))) 1894 return -EFAULT; 1895 for (seg = 0; seg < nr_segs; seg++) { 1896 if ((ssize_t)iov[seg].iov_len < 0) 1897 return -EINVAL; 1898 } 1899 1900 return 0; 1901 } 1902 1903 struct iovec *iovec_from_user(const struct iovec __user *uvec, 1904 unsigned long nr_segs, unsigned long fast_segs, 1905 struct iovec *fast_iov, bool compat) 1906 { 1907 struct iovec *iov = fast_iov; 1908 int ret; 1909 1910 /* 1911 * SuS says "The readv() function *may* fail if the iovcnt argument was 1912 * less than or equal to 0, or greater than {IOV_MAX}. Linux has 1913 * traditionally returned zero for zero segments, so... 1914 */ 1915 if (nr_segs == 0) 1916 return iov; 1917 if (nr_segs > UIO_MAXIOV) 1918 return ERR_PTR(-EINVAL); 1919 if (nr_segs > fast_segs) { 1920 iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); 1921 if (!iov) 1922 return ERR_PTR(-ENOMEM); 1923 } 1924 1925 if (compat) 1926 ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); 1927 else 1928 ret = copy_iovec_from_user(iov, uvec, nr_segs); 1929 if (ret) { 1930 if (iov != fast_iov) 1931 kfree(iov); 1932 return ERR_PTR(ret); 1933 } 1934 1935 return iov; 1936 } 1937 1938 ssize_t __import_iovec(int type, const struct iovec __user *uvec, 1939 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, 1940 struct iov_iter *i, bool compat) 1941 { 1942 ssize_t total_len = 0; 1943 unsigned long seg; 1944 struct iovec *iov; 1945 1946 iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); 1947 if (IS_ERR(iov)) { 1948 *iovp = NULL; 1949 return PTR_ERR(iov); 1950 } 1951 1952 /* 1953 * According to the Single Unix Specification we should return EINVAL if 1954 * an element length is < 0 when cast to ssize_t or if the total length 1955 * would overflow the ssize_t return value of the system call. 1956 * 1957 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the 1958 * overflow case. 1959 */ 1960 for (seg = 0; seg < nr_segs; seg++) { 1961 ssize_t len = (ssize_t)iov[seg].iov_len; 1962 1963 if (!access_ok(iov[seg].iov_base, len)) { 1964 if (iov != *iovp) 1965 kfree(iov); 1966 *iovp = NULL; 1967 return -EFAULT; 1968 } 1969 1970 if (len > MAX_RW_COUNT - total_len) { 1971 len = MAX_RW_COUNT - total_len; 1972 iov[seg].iov_len = len; 1973 } 1974 total_len += len; 1975 } 1976 1977 iov_iter_init(i, type, iov, nr_segs, total_len); 1978 if (iov == *iovp) 1979 *iovp = NULL; 1980 else 1981 *iovp = iov; 1982 return total_len; 1983 } 1984 1985 /** 1986 * import_iovec() - Copy an array of &struct iovec from userspace 1987 * into the kernel, check that it is valid, and initialize a new 1988 * &struct iov_iter iterator to access it. 1989 * 1990 * @type: One of %READ or %WRITE. 
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in the (usually on-stack) array that
 *     *@iovp initially points to.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs,
		 struct iovec **iovp, struct iov_iter *i)
{
	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
			      in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);

int import_single_range(int rw, void __user *buf, size_t len,
		 struct iovec *iov, struct iov_iter *i)
{
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	if (unlikely(!access_ok(buf, len)))
		return -EFAULT;

	iov->iov_base = buf;
	iov->iov_len = len;
	iov_iter_init(i, rw, iov, 1, len);
	return 0;
}
EXPORT_SYMBOL(import_single_range);

/**
 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
 *     iov_iter_save_state() was called.
 *
 * @i: &struct iov_iter to restore
 * @state: state to restore from
 *
 * Used after iov_iter_save_state() to restore @i, if operations may have
 * advanced it.
 *
 * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
 */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
			 !iov_iter_is_kvec(i)))
		return;
	i->iov_offset = state->iov_offset;
	i->count = state->count;
	/*
	 * For the *vec iters, nr_segs + iov is constant - if we increment
	 * the vec, then we also decrement the nr_segs count. Hence we don't
	 * need to track both of these, just one is enough and we can deduct
	 * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
	 * size, so we can just increment the iov pointer as they are unionized.
	 * ITER_BVEC _may_ be the same size on some archs, but on others it is
	 * not. Be safe and handle it separately.
	 */
	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
	if (iov_iter_is_bvec(i))
		i->bvec -= state->nr_segs - i->nr_segs;
	else
		i->iov -= state->nr_segs - i->nr_segs;
	i->nr_segs = state->nr_segs;
}
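/*
 * Usage sketch (an editor's illustration, not part of the original file;
 * the "foo_" identifiers are hypothetical): a typical read-side path
 * imports a userspace iovec array and then copies kernel data into
 * whatever the resulting iterator describes.  kfree(iov) is safe in both
 * cases because import_iovec() stores %NULL in *@iovp when the on-stack
 * array was used.
 *
 *	static ssize_t foo_read_iovec(struct file *file,
 *				      const struct iovec __user *uvec,
 *				      unsigned long nr_segs)
 *	{
 *		struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
 *		struct iov_iter iter;
 *		ssize_t ret;
 *
 *		ret = import_iovec(READ, uvec, nr_segs, UIO_FASTIOV, &iov, &iter);
 *		if (ret < 0)
 *			return ret;
 *
 *		ret = copy_to_iter(foo_buffer(file), iov_iter_count(&iter), &iter);
 *		kfree(iov);
 *		return ret;
 *	}
 */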