// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 HUAWEI, Inc.
 *             https://www.huawei.com/
 * Copyright (C) 2022 Alibaba Cloud
 */
#include "compress.h"
#include <linux/psi.h>
#include <linux/cpuhotplug.h>
#include <trace/events/erofs.h>

#define Z_EROFS_PCLUSTER_MAX_PAGES	(Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
#define Z_EROFS_INLINE_BVECS		2

/*
 * let's leave a type here in case of introducing
 * another tagged pointer later.
 */
typedef void *z_erofs_next_pcluster_t;

struct z_erofs_bvec {
	struct page *page;
	int offset;
	unsigned int end;
};

#define __Z_EROFS_BVSET(name, total) \
struct name { \
	/* point to the next page which contains the following bvecs */ \
	struct page *nextpage; \
	struct z_erofs_bvec bvec[total]; \
}
__Z_EROFS_BVSET(z_erofs_bvset,);
__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only
 *    for everyone else;
 *
 * L: Field should be protected by the pcluster lock;
 *
 * A: Field should be accessed / updated in atomic for parallelized code.
 */
struct z_erofs_pcluster {
	struct erofs_workgroup obj;
	struct mutex lock;

	/* A: point to next chained pcluster or TAILs */
	z_erofs_next_pcluster_t next;

	/* L: the maximum decompression size of this round */
	unsigned int length;

	/* L: total number of bvecs */
	unsigned int vcnt;

	/* I: page offset of start position of decompression */
	unsigned short pageofs_out;

	/* I: page offset of inline compressed data */
	unsigned short pageofs_in;

	union {
		/* L: inline a certain number of bvec for bootstrap */
		struct z_erofs_bvset_inline bvset;

		/* I: can be used to free the pcluster by RCU.
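		 *    (it shares storage with `bvset' above, so it is only touched
		 *    once the pcluster is being freed, see z_erofs_rcu_callback)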
*/ 70 struct rcu_head rcu; 71 }; 72 73 union { 74 /* I: physical cluster size in pages */ 75 unsigned short pclusterpages; 76 77 /* I: tailpacking inline compressed size */ 78 unsigned short tailpacking_size; 79 }; 80 81 /* I: compression algorithm format */ 82 unsigned char algorithmformat; 83 84 /* L: whether partial decompression or not */ 85 bool partial; 86 87 /* L: indicate several pageofs_outs or not */ 88 bool multibases; 89 90 /* A: compressed bvecs (can be cached or inplaced pages) */ 91 struct z_erofs_bvec compressed_bvecs[]; 92 }; 93 94 /* the end of a chain of pclusters */ 95 #define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA) 96 #define Z_EROFS_PCLUSTER_NIL (NULL) 97 98 struct z_erofs_decompressqueue { 99 struct super_block *sb; 100 atomic_t pending_bios; 101 z_erofs_next_pcluster_t head; 102 103 union { 104 struct completion done; 105 struct work_struct work; 106 struct kthread_work kthread_work; 107 } u; 108 bool eio, sync; 109 }; 110 111 static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) 112 { 113 return !pcl->obj.index; 114 } 115 116 static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) 117 { 118 if (z_erofs_is_inline_pcluster(pcl)) 119 return 1; 120 return pcl->pclusterpages; 121 } 122 123 /* 124 * bit 30: I/O error occurred on this page 125 * bit 0 - 29: remaining parts to complete this page 126 */ 127 #define Z_EROFS_PAGE_EIO (1 << 30) 128 129 static inline void z_erofs_onlinepage_init(struct page *page) 130 { 131 union { 132 atomic_t o; 133 unsigned long v; 134 } u = { .o = ATOMIC_INIT(1) }; 135 136 set_page_private(page, u.v); 137 smp_wmb(); 138 SetPagePrivate(page); 139 } 140 141 static inline void z_erofs_onlinepage_split(struct page *page) 142 { 143 atomic_inc((atomic_t *)&page->private); 144 } 145 146 static void z_erofs_onlinepage_endio(struct page *page, int err) 147 { 148 int orig, v; 149 150 DBG_BUGON(!PagePrivate(page)); 151 152 do { 153 orig = atomic_read((atomic_t *)&page->private); 154 v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0); 155 } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig); 156 157 if (!(v & ~Z_EROFS_PAGE_EIO)) { 158 set_page_private(page, 0); 159 ClearPagePrivate(page); 160 if (!(v & Z_EROFS_PAGE_EIO)) 161 SetPageUptodate(page); 162 unlock_page(page); 163 } 164 } 165 166 #define Z_EROFS_ONSTACK_PAGES 32 167 168 /* 169 * since pclustersize is variable for big pcluster feature, introduce slab 170 * pools implementation for different pcluster sizes. 
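 * Each pool backs pclusters holding at most `maxpages' compressed pages;
 * z_erofs_alloc_pcluster() picks the smallest pool that fits.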
171 */ 172 struct z_erofs_pcluster_slab { 173 struct kmem_cache *slab; 174 unsigned int maxpages; 175 char name[48]; 176 }; 177 178 #define _PCLP(n) { .maxpages = n } 179 180 static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { 181 _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), 182 _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) 183 }; 184 185 struct z_erofs_bvec_iter { 186 struct page *bvpage; 187 struct z_erofs_bvset *bvset; 188 unsigned int nr, cur; 189 }; 190 191 static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) 192 { 193 if (iter->bvpage) 194 kunmap_local(iter->bvset); 195 return iter->bvpage; 196 } 197 198 static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) 199 { 200 unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; 201 /* have to access nextpage in advance, otherwise it will be unmapped */ 202 struct page *nextpage = iter->bvset->nextpage; 203 struct page *oldpage; 204 205 DBG_BUGON(!nextpage); 206 oldpage = z_erofs_bvec_iter_end(iter); 207 iter->bvpage = nextpage; 208 iter->bvset = kmap_local_page(nextpage); 209 iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); 210 iter->cur = 0; 211 return oldpage; 212 } 213 214 static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, 215 struct z_erofs_bvset_inline *bvset, 216 unsigned int bootstrap_nr, 217 unsigned int cur) 218 { 219 *iter = (struct z_erofs_bvec_iter) { 220 .nr = bootstrap_nr, 221 .bvset = (struct z_erofs_bvset *)bvset, 222 }; 223 224 while (cur > iter->nr) { 225 cur -= iter->nr; 226 z_erofs_bvset_flip(iter); 227 } 228 iter->cur = cur; 229 } 230 231 static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, 232 struct z_erofs_bvec *bvec, 233 struct page **candidate_bvpage, 234 struct page **pagepool) 235 { 236 if (iter->cur >= iter->nr) { 237 struct page *nextpage = *candidate_bvpage; 238 239 if (!nextpage) { 240 nextpage = erofs_allocpage(pagepool, GFP_NOFS); 241 if (!nextpage) 242 return -ENOMEM; 243 set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); 244 } 245 DBG_BUGON(iter->bvset->nextpage); 246 iter->bvset->nextpage = nextpage; 247 z_erofs_bvset_flip(iter); 248 249 iter->bvset->nextpage = NULL; 250 *candidate_bvpage = NULL; 251 } 252 iter->bvset->bvec[iter->cur++] = *bvec; 253 return 0; 254 } 255 256 static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, 257 struct z_erofs_bvec *bvec, 258 struct page **old_bvpage) 259 { 260 if (iter->cur == iter->nr) 261 *old_bvpage = z_erofs_bvset_flip(iter); 262 else 263 *old_bvpage = NULL; 264 *bvec = iter->bvset->bvec[iter->cur++]; 265 } 266 267 static void z_erofs_destroy_pcluster_pool(void) 268 { 269 int i; 270 271 for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { 272 if (!pcluster_pool[i].slab) 273 continue; 274 kmem_cache_destroy(pcluster_pool[i].slab); 275 pcluster_pool[i].slab = NULL; 276 } 277 } 278 279 static int z_erofs_create_pcluster_pool(void) 280 { 281 struct z_erofs_pcluster_slab *pcs; 282 struct z_erofs_pcluster *a; 283 unsigned int size; 284 285 for (pcs = pcluster_pool; 286 pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { 287 size = struct_size(a, compressed_bvecs, pcs->maxpages); 288 289 sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); 290 pcs->slab = kmem_cache_create(pcs->name, size, 0, 291 SLAB_RECLAIM_ACCOUNT, NULL); 292 if (pcs->slab) 293 continue; 294 295 z_erofs_destroy_pcluster_pool(); 296 return -ENOMEM; 297 } 298 return 0; 299 } 300 301 static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) 302 { 303 int i; 304 
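	/* scan the pools from the smallest maxpages upwards; first fit wins */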
305 for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { 306 struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; 307 struct z_erofs_pcluster *pcl; 308 309 if (nrpages > pcs->maxpages) 310 continue; 311 312 pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); 313 if (!pcl) 314 return ERR_PTR(-ENOMEM); 315 pcl->pclusterpages = nrpages; 316 return pcl; 317 } 318 return ERR_PTR(-EINVAL); 319 } 320 321 static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) 322 { 323 unsigned int pclusterpages = z_erofs_pclusterpages(pcl); 324 int i; 325 326 for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { 327 struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; 328 329 if (pclusterpages > pcs->maxpages) 330 continue; 331 332 kmem_cache_free(pcs->slab, pcl); 333 return; 334 } 335 DBG_BUGON(1); 336 } 337 338 static struct workqueue_struct *z_erofs_workqueue __read_mostly; 339 340 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD 341 static struct kthread_worker __rcu **z_erofs_pcpu_workers; 342 343 static void erofs_destroy_percpu_workers(void) 344 { 345 struct kthread_worker *worker; 346 unsigned int cpu; 347 348 for_each_possible_cpu(cpu) { 349 worker = rcu_dereference_protected( 350 z_erofs_pcpu_workers[cpu], 1); 351 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); 352 if (worker) 353 kthread_destroy_worker(worker); 354 } 355 kfree(z_erofs_pcpu_workers); 356 } 357 358 static struct kthread_worker *erofs_init_percpu_worker(int cpu) 359 { 360 struct kthread_worker *worker = 361 kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu); 362 363 if (IS_ERR(worker)) 364 return worker; 365 if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI)) 366 sched_set_fifo_low(worker->task); 367 return worker; 368 } 369 370 static int erofs_init_percpu_workers(void) 371 { 372 struct kthread_worker *worker; 373 unsigned int cpu; 374 375 z_erofs_pcpu_workers = kcalloc(num_possible_cpus(), 376 sizeof(struct kthread_worker *), GFP_ATOMIC); 377 if (!z_erofs_pcpu_workers) 378 return -ENOMEM; 379 380 for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? 
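	   (if CPU hotplug support is enabled, erofs_cpu_online() below
	    re-creates workers for CPUs that come online afterwards)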
*/ 381 worker = erofs_init_percpu_worker(cpu); 382 if (!IS_ERR(worker)) 383 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); 384 } 385 return 0; 386 } 387 #else 388 static inline void erofs_destroy_percpu_workers(void) {} 389 static inline int erofs_init_percpu_workers(void) { return 0; } 390 #endif 391 392 #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD) 393 static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock); 394 static enum cpuhp_state erofs_cpuhp_state; 395 396 static int erofs_cpu_online(unsigned int cpu) 397 { 398 struct kthread_worker *worker, *old; 399 400 worker = erofs_init_percpu_worker(cpu); 401 if (IS_ERR(worker)) 402 return PTR_ERR(worker); 403 404 spin_lock(&z_erofs_pcpu_worker_lock); 405 old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], 406 lockdep_is_held(&z_erofs_pcpu_worker_lock)); 407 if (!old) 408 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); 409 spin_unlock(&z_erofs_pcpu_worker_lock); 410 if (old) 411 kthread_destroy_worker(worker); 412 return 0; 413 } 414 415 static int erofs_cpu_offline(unsigned int cpu) 416 { 417 struct kthread_worker *worker; 418 419 spin_lock(&z_erofs_pcpu_worker_lock); 420 worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], 421 lockdep_is_held(&z_erofs_pcpu_worker_lock)); 422 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); 423 spin_unlock(&z_erofs_pcpu_worker_lock); 424 425 synchronize_rcu(); 426 if (worker) 427 kthread_destroy_worker(worker); 428 return 0; 429 } 430 431 static int erofs_cpu_hotplug_init(void) 432 { 433 int state; 434 435 state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, 436 "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline); 437 if (state < 0) 438 return state; 439 440 erofs_cpuhp_state = state; 441 return 0; 442 } 443 444 static void erofs_cpu_hotplug_destroy(void) 445 { 446 if (erofs_cpuhp_state) 447 cpuhp_remove_state_nocalls(erofs_cpuhp_state); 448 } 449 #else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */ 450 static inline int erofs_cpu_hotplug_init(void) { return 0; } 451 static inline void erofs_cpu_hotplug_destroy(void) {} 452 #endif 453 454 void z_erofs_exit_zip_subsystem(void) 455 { 456 erofs_cpu_hotplug_destroy(); 457 erofs_destroy_percpu_workers(); 458 destroy_workqueue(z_erofs_workqueue); 459 z_erofs_destroy_pcluster_pool(); 460 } 461 462 int __init z_erofs_init_zip_subsystem(void) 463 { 464 int err = z_erofs_create_pcluster_pool(); 465 466 if (err) 467 goto out_error_pcluster_pool; 468 469 z_erofs_workqueue = alloc_workqueue("erofs_worker", 470 WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus()); 471 if (!z_erofs_workqueue) { 472 err = -ENOMEM; 473 goto out_error_workqueue_init; 474 } 475 476 err = erofs_init_percpu_workers(); 477 if (err) 478 goto out_error_pcpu_worker; 479 480 err = erofs_cpu_hotplug_init(); 481 if (err < 0) 482 goto out_error_cpuhp_init; 483 return err; 484 485 out_error_cpuhp_init: 486 erofs_destroy_percpu_workers(); 487 out_error_pcpu_worker: 488 destroy_workqueue(z_erofs_workqueue); 489 out_error_workqueue_init: 490 z_erofs_destroy_pcluster_pool(); 491 out_error_pcluster_pool: 492 return err; 493 } 494 495 enum z_erofs_pclustermode { 496 Z_EROFS_PCLUSTER_INFLIGHT, 497 /* 498 * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it 499 * could be dispatched into bypass queue later due to uptodated managed 500 * pages. All related online pages cannot be reused for inplace I/O (or 501 * bvpage) since it can be directly decoded without I/O submission. 
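 * For instance, z_erofs_bind_cache() switches to this mode once every
 * compressed page of the pcluster is already found in the managed cache.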
502 */ 503 Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, 504 /* 505 * The pcluster was just linked to a decompression chain by us. It can 506 * also be linked with the remaining pclusters, which means if the 507 * processing page is the tail page of a pcluster, this pcluster can 508 * safely use the whole page (since the previous pcluster is within the 509 * same chain) for in-place I/O, as illustrated below: 510 * ___________________________________________________ 511 * | tail (partial) page | head (partial) page | 512 * | (of the current pcl) | (of the previous pcl) | 513 * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| 514 * 515 * [ (*) the page above can be used as inplace I/O. ] 516 */ 517 Z_EROFS_PCLUSTER_FOLLOWED, 518 }; 519 520 struct z_erofs_decompress_frontend { 521 struct inode *const inode; 522 struct erofs_map_blocks map; 523 struct z_erofs_bvec_iter biter; 524 525 struct page *pagepool; 526 struct page *candidate_bvpage; 527 struct z_erofs_pcluster *pcl; 528 z_erofs_next_pcluster_t owned_head; 529 enum z_erofs_pclustermode mode; 530 531 erofs_off_t headoffset; 532 533 /* a pointer used to pick up inplace I/O pages */ 534 unsigned int icur; 535 }; 536 537 #define DECOMPRESS_FRONTEND_INIT(__i) { \ 538 .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ 539 .mode = Z_EROFS_PCLUSTER_FOLLOWED } 540 541 static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) 542 { 543 unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; 544 545 if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) 546 return false; 547 548 if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED)) 549 return true; 550 551 if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && 552 fe->map.m_la < fe->headoffset) 553 return true; 554 555 return false; 556 } 557 558 static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) 559 { 560 struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); 561 struct z_erofs_pcluster *pcl = fe->pcl; 562 bool shouldalloc = z_erofs_should_alloc_cache(fe); 563 bool standalone = true; 564 /* 565 * optimistic allocation without direct reclaim since inplace I/O 566 * can be used if low memory otherwise. 567 */ 568 gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | 569 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; 570 unsigned int i; 571 572 if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) 573 return; 574 575 for (i = 0; i < pcl->pclusterpages; ++i) { 576 struct page *page; 577 void *t; /* mark pages just found for debugging */ 578 struct page *newpage = NULL; 579 580 /* the compressed page was loaded before */ 581 if (READ_ONCE(pcl->compressed_bvecs[i].page)) 582 continue; 583 584 page = find_get_page(mc, pcl->obj.index + i); 585 586 if (page) { 587 t = (void *)((unsigned long)page | 1); 588 } else { 589 /* I/O is needed, no possible to decompress directly */ 590 standalone = false; 591 if (!shouldalloc) 592 continue; 593 594 /* 595 * try to use cached I/O if page allocation 596 * succeeds or fallback to in-place I/O instead 597 * to avoid any direct reclaim. 
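	 * (such pages are only tagged PREALLOCATED here; they are actually
	 * added to the managed cache later in pickup_page_for_submission())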
598 */ 599 newpage = erofs_allocpage(&fe->pagepool, gfp); 600 if (!newpage) 601 continue; 602 set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); 603 t = (void *)((unsigned long)newpage | 1); 604 } 605 606 if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) 607 continue; 608 609 if (page) 610 put_page(page); 611 else if (newpage) 612 erofs_pagepool_add(&fe->pagepool, newpage); 613 } 614 615 /* 616 * don't do inplace I/O if all compressed pages are available in 617 * managed cache since it can be moved to the bypass queue instead. 618 */ 619 if (standalone) 620 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; 621 } 622 623 /* called by erofs_shrinker to get rid of all compressed_pages */ 624 int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, 625 struct erofs_workgroup *grp) 626 { 627 struct z_erofs_pcluster *const pcl = 628 container_of(grp, struct z_erofs_pcluster, obj); 629 int i; 630 631 DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); 632 /* 633 * refcount of workgroup is now freezed as 0, 634 * therefore no need to worry about available decompression users. 635 */ 636 for (i = 0; i < pcl->pclusterpages; ++i) { 637 struct page *page = pcl->compressed_bvecs[i].page; 638 639 if (!page) 640 continue; 641 642 /* block other users from reclaiming or migrating the page */ 643 if (!trylock_page(page)) 644 return -EBUSY; 645 646 if (!erofs_page_is_managed(sbi, page)) 647 continue; 648 649 /* barrier is implied in the following 'unlock_page' */ 650 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); 651 detach_page_private(page); 652 unlock_page(page); 653 } 654 return 0; 655 } 656 657 static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) 658 { 659 struct z_erofs_pcluster *pcl = folio_get_private(folio); 660 bool ret; 661 int i; 662 663 if (!folio_test_private(folio)) 664 return true; 665 666 ret = false; 667 spin_lock(&pcl->obj.lockref.lock); 668 if (pcl->obj.lockref.count > 0) 669 goto out; 670 671 DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); 672 for (i = 0; i < pcl->pclusterpages; ++i) { 673 if (pcl->compressed_bvecs[i].page == &folio->page) { 674 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); 675 ret = true; 676 break; 677 } 678 } 679 if (ret) 680 folio_detach_private(folio); 681 out: 682 spin_unlock(&pcl->obj.lockref.lock); 683 return ret; 684 } 685 686 /* 687 * It will be called only on inode eviction. In case that there are still some 688 * decompression requests in progress, wait with rescheduling for a bit here. 689 * An extra lock could be introduced instead but it seems unnecessary. 
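 * (z_erofs_cache_release_folio() keeps failing while pcl->obj.lockref.count
 * is still elevated, hence the cond_resched() retry loop below.)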
690 */ 691 static void z_erofs_cache_invalidate_folio(struct folio *folio, 692 size_t offset, size_t length) 693 { 694 const size_t stop = length + offset; 695 696 /* Check for potential overflow in debug mode */ 697 DBG_BUGON(stop > folio_size(folio) || stop < length); 698 699 if (offset == 0 && stop == folio_size(folio)) 700 while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) 701 cond_resched(); 702 } 703 704 static const struct address_space_operations z_erofs_cache_aops = { 705 .release_folio = z_erofs_cache_release_folio, 706 .invalidate_folio = z_erofs_cache_invalidate_folio, 707 }; 708 709 int erofs_init_managed_cache(struct super_block *sb) 710 { 711 struct inode *const inode = new_inode(sb); 712 713 if (!inode) 714 return -ENOMEM; 715 716 set_nlink(inode, 1); 717 inode->i_size = OFFSET_MAX; 718 inode->i_mapping->a_ops = &z_erofs_cache_aops; 719 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); 720 EROFS_SB(sb)->managed_cache = inode; 721 return 0; 722 } 723 724 static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, 725 struct z_erofs_bvec *bvec) 726 { 727 struct z_erofs_pcluster *const pcl = fe->pcl; 728 729 while (fe->icur > 0) { 730 if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, 731 NULL, bvec->page)) { 732 pcl->compressed_bvecs[fe->icur] = *bvec; 733 return true; 734 } 735 } 736 return false; 737 } 738 739 /* callers must be with pcluster lock held */ 740 static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, 741 struct z_erofs_bvec *bvec, bool exclusive) 742 { 743 int ret; 744 745 if (exclusive) { 746 /* give priority for inplaceio to use file pages first */ 747 if (z_erofs_try_inplace_io(fe, bvec)) 748 return 0; 749 /* otherwise, check if it can be used as a bvpage */ 750 if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && 751 !fe->candidate_bvpage) 752 fe->candidate_bvpage = bvec->page; 753 } 754 ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, 755 &fe->pagepool); 756 fe->pcl->vcnt += (ret >= 0); 757 return ret; 758 } 759 760 static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) 761 { 762 struct z_erofs_pcluster *pcl = f->pcl; 763 z_erofs_next_pcluster_t *owned_head = &f->owned_head; 764 765 /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ 766 if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, 767 *owned_head) == Z_EROFS_PCLUSTER_NIL) { 768 *owned_head = &pcl->next; 769 /* so we can attach this pcluster to our submission chain. */ 770 f->mode = Z_EROFS_PCLUSTER_FOLLOWED; 771 return; 772 } 773 774 /* type 2, it belongs to an ongoing chain */ 775 f->mode = Z_EROFS_PCLUSTER_INFLIGHT; 776 } 777 778 static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) 779 { 780 struct erofs_map_blocks *map = &fe->map; 781 bool ztailpacking = map->m_flags & EROFS_MAP_META; 782 struct z_erofs_pcluster *pcl; 783 struct erofs_workgroup *grp; 784 int err; 785 786 if (!(map->m_flags & EROFS_MAP_ENCODED) || 787 (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { 788 DBG_BUGON(1); 789 return -EFSCORRUPTED; 790 } 791 792 /* no available pcluster, let's allocate one */ 793 pcl = z_erofs_alloc_pcluster(ztailpacking ? 
1 : 794 map->m_plen >> PAGE_SHIFT); 795 if (IS_ERR(pcl)) 796 return PTR_ERR(pcl); 797 798 spin_lock_init(&pcl->obj.lockref.lock); 799 pcl->algorithmformat = map->m_algorithmformat; 800 pcl->length = 0; 801 pcl->partial = true; 802 803 /* new pclusters should be claimed as type 1, primary and followed */ 804 pcl->next = fe->owned_head; 805 pcl->pageofs_out = map->m_la & ~PAGE_MASK; 806 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; 807 808 /* 809 * lock all primary followed works before visible to others 810 * and mutex_trylock *never* fails for a new pcluster. 811 */ 812 mutex_init(&pcl->lock); 813 DBG_BUGON(!mutex_trylock(&pcl->lock)); 814 815 if (ztailpacking) { 816 pcl->obj.index = 0; /* which indicates ztailpacking */ 817 pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); 818 pcl->tailpacking_size = map->m_plen; 819 } else { 820 pcl->obj.index = map->m_pa >> PAGE_SHIFT; 821 822 grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); 823 if (IS_ERR(grp)) { 824 err = PTR_ERR(grp); 825 goto err_out; 826 } 827 828 if (grp != &pcl->obj) { 829 fe->pcl = container_of(grp, 830 struct z_erofs_pcluster, obj); 831 err = -EEXIST; 832 goto err_out; 833 } 834 } 835 fe->owned_head = &pcl->next; 836 fe->pcl = pcl; 837 return 0; 838 839 err_out: 840 mutex_unlock(&pcl->lock); 841 z_erofs_free_pcluster(pcl); 842 return err; 843 } 844 845 static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) 846 { 847 struct erofs_map_blocks *map = &fe->map; 848 struct super_block *sb = fe->inode->i_sb; 849 erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); 850 struct erofs_workgroup *grp = NULL; 851 int ret; 852 853 DBG_BUGON(fe->pcl); 854 855 /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ 856 DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); 857 858 if (!(map->m_flags & EROFS_MAP_META)) { 859 grp = erofs_find_workgroup(sb, blknr); 860 } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { 861 DBG_BUGON(1); 862 return -EFSCORRUPTED; 863 } 864 865 if (grp) { 866 fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); 867 ret = -EEXIST; 868 } else { 869 ret = z_erofs_register_pcluster(fe); 870 } 871 872 if (ret == -EEXIST) { 873 mutex_lock(&fe->pcl->lock); 874 z_erofs_try_to_claim_pcluster(fe); 875 } else if (ret) { 876 return ret; 877 } 878 879 z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, 880 Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); 881 if (!z_erofs_is_inline_pcluster(fe->pcl)) { 882 /* bind cache first when cached decompression is preferred */ 883 z_erofs_bind_cache(fe); 884 } else { 885 void *mptr; 886 887 mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); 888 if (IS_ERR(mptr)) { 889 ret = PTR_ERR(mptr); 890 erofs_err(sb, "failed to get inline data %d", ret); 891 return ret; 892 } 893 get_page(map->buf.page); 894 WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); 895 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; 896 } 897 /* file-backed inplace I/O pages are traversed in reverse order */ 898 fe->icur = z_erofs_pclusterpages(fe->pcl); 899 return 0; 900 } 901 902 /* 903 * keep in mind that no referenced pclusters will be freed 904 * only after a RCU grace period. 
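 * erofs_workgroup_free_rcu() defers the actual free via call_rcu(), so
 * lockless lookups that still hold the pointer can finish safely.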
905 */ 906 static void z_erofs_rcu_callback(struct rcu_head *head) 907 { 908 z_erofs_free_pcluster(container_of(head, 909 struct z_erofs_pcluster, rcu)); 910 } 911 912 void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) 913 { 914 struct z_erofs_pcluster *const pcl = 915 container_of(grp, struct z_erofs_pcluster, obj); 916 917 call_rcu(&pcl->rcu, z_erofs_rcu_callback); 918 } 919 920 static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) 921 { 922 struct z_erofs_pcluster *pcl = fe->pcl; 923 924 if (!pcl) 925 return; 926 927 z_erofs_bvec_iter_end(&fe->biter); 928 mutex_unlock(&pcl->lock); 929 930 if (fe->candidate_bvpage) 931 fe->candidate_bvpage = NULL; 932 933 /* 934 * if all pending pages are added, don't hold its reference 935 * any longer if the pcluster isn't hosted by ourselves. 936 */ 937 if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) 938 erofs_workgroup_put(&pcl->obj); 939 940 fe->pcl = NULL; 941 } 942 943 static int z_erofs_read_fragment(struct super_block *sb, struct page *page, 944 unsigned int cur, unsigned int end, erofs_off_t pos) 945 { 946 struct inode *packed_inode = EROFS_SB(sb)->packed_inode; 947 struct erofs_buf buf = __EROFS_BUF_INITIALIZER; 948 unsigned int cnt; 949 u8 *src; 950 951 if (!packed_inode) 952 return -EFSCORRUPTED; 953 954 buf.inode = packed_inode; 955 for (; cur < end; cur += cnt, pos += cnt) { 956 cnt = min_t(unsigned int, end - cur, 957 sb->s_blocksize - erofs_blkoff(sb, pos)); 958 src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); 959 if (IS_ERR(src)) { 960 erofs_put_metabuf(&buf); 961 return PTR_ERR(src); 962 } 963 memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); 964 } 965 erofs_put_metabuf(&buf); 966 return 0; 967 } 968 969 static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, 970 struct page *page) 971 { 972 struct inode *const inode = fe->inode; 973 struct erofs_map_blocks *const map = &fe->map; 974 const loff_t offset = page_offset(page); 975 bool tight = true, exclusive; 976 unsigned int cur, end, len, split; 977 int err = 0; 978 979 z_erofs_onlinepage_init(page); 980 981 split = 0; 982 end = PAGE_SIZE; 983 repeat: 984 if (offset + end - 1 < map->m_la || 985 offset + end - 1 >= map->m_la + map->m_llen) { 986 z_erofs_pcluster_end(fe); 987 map->m_la = offset + end - 1; 988 map->m_llen = 0; 989 err = z_erofs_map_blocks_iter(inode, map, 0); 990 if (err) 991 goto out; 992 } 993 994 cur = offset > map->m_la ? 0 : map->m_la - offset; 995 /* bump split parts first to avoid several separate cases */ 996 ++split; 997 998 if (!(map->m_flags & EROFS_MAP_MAPPED)) { 999 zero_user_segment(page, cur, end); 1000 tight = false; 1001 goto next_part; 1002 } 1003 1004 if (map->m_flags & EROFS_MAP_FRAGMENT) { 1005 erofs_off_t fpos = offset + cur - map->m_la; 1006 1007 len = min_t(unsigned int, map->m_llen - fpos, end - cur); 1008 err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, 1009 EROFS_I(inode)->z_fragmentoff + fpos); 1010 if (err) 1011 goto out; 1012 tight = false; 1013 goto next_part; 1014 } 1015 1016 if (!fe->pcl) { 1017 err = z_erofs_pcluster_begin(fe); 1018 if (err) 1019 goto out; 1020 } 1021 1022 /* 1023 * Ensure the current partial page belongs to this submit chain rather 1024 * than other concurrent submit chains or the noio(bypass) chain since 1025 * those chains are handled asynchronously thus the page cannot be used 1026 * for inplace I/O or bvpage (should be processed in a strict order.) 
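	 * In other words, `tight' survives only while this pcluster is
	 * FOLLOWED by our own chain; INFLIGHT and NOINPLACE pclusters drop it.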
1027 */ 1028 tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); 1029 exclusive = (!cur && ((split <= 1) || tight)); 1030 if (cur) 1031 tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); 1032 1033 err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { 1034 .page = page, 1035 .offset = offset - map->m_la, 1036 .end = end, 1037 }), exclusive); 1038 if (err) 1039 goto out; 1040 1041 z_erofs_onlinepage_split(page); 1042 if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) 1043 fe->pcl->multibases = true; 1044 if (fe->pcl->length < offset + end - map->m_la) { 1045 fe->pcl->length = offset + end - map->m_la; 1046 fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; 1047 } 1048 if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && 1049 !(map->m_flags & EROFS_MAP_PARTIAL_REF) && 1050 fe->pcl->length == map->m_llen) 1051 fe->pcl->partial = false; 1052 next_part: 1053 /* shorten the remaining extent to update progress */ 1054 map->m_llen = offset + cur - map->m_la; 1055 map->m_flags &= ~EROFS_MAP_FULL_MAPPED; 1056 1057 end = cur; 1058 if (end > 0) 1059 goto repeat; 1060 1061 out: 1062 z_erofs_onlinepage_endio(page, err); 1063 return err; 1064 } 1065 1066 static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, 1067 unsigned int readahead_pages) 1068 { 1069 /* auto: enable for read_folio, disable for readahead */ 1070 if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) && 1071 !readahead_pages) 1072 return true; 1073 1074 if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) && 1075 (readahead_pages <= sbi->opt.max_sync_decompress_pages)) 1076 return true; 1077 1078 return false; 1079 } 1080 1081 static bool z_erofs_page_is_invalidated(struct page *page) 1082 { 1083 return !page->mapping && !z_erofs_is_shortlived_page(page); 1084 } 1085 1086 struct z_erofs_decompress_backend { 1087 struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; 1088 struct super_block *sb; 1089 struct z_erofs_pcluster *pcl; 1090 1091 /* pages with the longest decompressed length for deduplication */ 1092 struct page **decompressed_pages; 1093 /* pages to keep the compressed data */ 1094 struct page **compressed_pages; 1095 1096 struct list_head decompressed_secondary_bvecs; 1097 struct page **pagepool; 1098 unsigned int onstack_used, nr_pages; 1099 }; 1100 1101 struct z_erofs_bvec_item { 1102 struct z_erofs_bvec bvec; 1103 struct list_head list; 1104 }; 1105 1106 static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, 1107 struct z_erofs_bvec *bvec) 1108 { 1109 struct z_erofs_bvec_item *item; 1110 unsigned int pgnr; 1111 1112 if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) && 1113 (bvec->end == PAGE_SIZE || 1114 bvec->offset + bvec->end == be->pcl->length)) { 1115 pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; 1116 DBG_BUGON(pgnr >= be->nr_pages); 1117 if (!be->decompressed_pages[pgnr]) { 1118 be->decompressed_pages[pgnr] = bvec->page; 1119 return; 1120 } 1121 } 1122 1123 /* (cold path) one pcluster is requested multiple times */ 1124 item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); 1125 item->bvec = *bvec; 1126 list_add(&item->list, &be->decompressed_secondary_bvecs); 1127 } 1128 1129 static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, 1130 int err) 1131 { 1132 unsigned int off0 = be->pcl->pageofs_out; 1133 struct list_head *p, *n; 1134 1135 list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) { 1136 struct z_erofs_bvec_item *bvi; 1137 unsigned int end, cur; 1138 void *dst, *src; 1139 1140 bvi = container_of(p, struct 
z_erofs_bvec_item, list); 1141 cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0; 1142 end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset, 1143 bvi->bvec.end); 1144 dst = kmap_local_page(bvi->bvec.page); 1145 while (cur < end) { 1146 unsigned int pgnr, scur, len; 1147 1148 pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT; 1149 DBG_BUGON(pgnr >= be->nr_pages); 1150 1151 scur = bvi->bvec.offset + cur - 1152 ((pgnr << PAGE_SHIFT) - off0); 1153 len = min_t(unsigned int, end - cur, PAGE_SIZE - scur); 1154 if (!be->decompressed_pages[pgnr]) { 1155 err = -EFSCORRUPTED; 1156 cur += len; 1157 continue; 1158 } 1159 src = kmap_local_page(be->decompressed_pages[pgnr]); 1160 memcpy(dst + cur, src + scur, len); 1161 kunmap_local(src); 1162 cur += len; 1163 } 1164 kunmap_local(dst); 1165 z_erofs_onlinepage_endio(bvi->bvec.page, err); 1166 list_del(p); 1167 kfree(bvi); 1168 } 1169 } 1170 1171 static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) 1172 { 1173 struct z_erofs_pcluster *pcl = be->pcl; 1174 struct z_erofs_bvec_iter biter; 1175 struct page *old_bvpage; 1176 int i; 1177 1178 z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); 1179 for (i = 0; i < pcl->vcnt; ++i) { 1180 struct z_erofs_bvec bvec; 1181 1182 z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); 1183 1184 if (old_bvpage) 1185 z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); 1186 1187 DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); 1188 z_erofs_do_decompressed_bvec(be, &bvec); 1189 } 1190 1191 old_bvpage = z_erofs_bvec_iter_end(&biter); 1192 if (old_bvpage) 1193 z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); 1194 } 1195 1196 static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, 1197 bool *overlapped) 1198 { 1199 struct z_erofs_pcluster *pcl = be->pcl; 1200 unsigned int pclusterpages = z_erofs_pclusterpages(pcl); 1201 int i, err = 0; 1202 1203 *overlapped = false; 1204 for (i = 0; i < pclusterpages; ++i) { 1205 struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; 1206 struct page *page = bvec->page; 1207 1208 /* compressed pages ought to be present before decompressing */ 1209 if (!page) { 1210 DBG_BUGON(1); 1211 continue; 1212 } 1213 be->compressed_pages[i] = page; 1214 1215 if (z_erofs_is_inline_pcluster(pcl)) { 1216 if (!PageUptodate(page)) 1217 err = -EIO; 1218 continue; 1219 } 1220 1221 DBG_BUGON(z_erofs_page_is_invalidated(page)); 1222 if (!z_erofs_is_shortlived_page(page)) { 1223 if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { 1224 if (!PageUptodate(page)) 1225 err = -EIO; 1226 continue; 1227 } 1228 z_erofs_do_decompressed_bvec(be, bvec); 1229 *overlapped = true; 1230 } 1231 } 1232 1233 if (err) 1234 return err; 1235 return 0; 1236 } 1237 1238 static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, 1239 int err) 1240 { 1241 struct erofs_sb_info *const sbi = EROFS_SB(be->sb); 1242 struct z_erofs_pcluster *pcl = be->pcl; 1243 unsigned int pclusterpages = z_erofs_pclusterpages(pcl); 1244 const struct z_erofs_decompressor *decompressor = 1245 &erofs_decompressors[pcl->algorithmformat]; 1246 unsigned int i, inputsize; 1247 int err2; 1248 struct page *page; 1249 bool overlapped; 1250 1251 mutex_lock(&pcl->lock); 1252 be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; 1253 1254 /* allocate (de)compressed page arrays if cannot be kept on stack */ 1255 be->decompressed_pages = NULL; 1256 be->compressed_pages = NULL; 1257 be->onstack_used = 0; 1258 if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) { 1259 
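		/* small output: serve decompressed_pages[] from the on-stack array */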
be->decompressed_pages = be->onstack_pages; 1260 be->onstack_used = be->nr_pages; 1261 memset(be->decompressed_pages, 0, 1262 sizeof(struct page *) * be->nr_pages); 1263 } 1264 1265 if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) 1266 be->compressed_pages = be->onstack_pages + be->onstack_used; 1267 1268 if (!be->decompressed_pages) 1269 be->decompressed_pages = 1270 kvcalloc(be->nr_pages, sizeof(struct page *), 1271 GFP_KERNEL | __GFP_NOFAIL); 1272 if (!be->compressed_pages) 1273 be->compressed_pages = 1274 kvcalloc(pclusterpages, sizeof(struct page *), 1275 GFP_KERNEL | __GFP_NOFAIL); 1276 1277 z_erofs_parse_out_bvecs(be); 1278 err2 = z_erofs_parse_in_bvecs(be, &overlapped); 1279 if (err2) 1280 err = err2; 1281 if (err) 1282 goto out; 1283 1284 if (z_erofs_is_inline_pcluster(pcl)) 1285 inputsize = pcl->tailpacking_size; 1286 else 1287 inputsize = pclusterpages * PAGE_SIZE; 1288 1289 err = decompressor->decompress(&(struct z_erofs_decompress_req) { 1290 .sb = be->sb, 1291 .in = be->compressed_pages, 1292 .out = be->decompressed_pages, 1293 .pageofs_in = pcl->pageofs_in, 1294 .pageofs_out = pcl->pageofs_out, 1295 .inputsize = inputsize, 1296 .outputsize = pcl->length, 1297 .alg = pcl->algorithmformat, 1298 .inplace_io = overlapped, 1299 .partial_decoding = pcl->partial, 1300 .fillgaps = pcl->multibases, 1301 }, be->pagepool); 1302 1303 out: 1304 /* must handle all compressed pages before actual file pages */ 1305 if (z_erofs_is_inline_pcluster(pcl)) { 1306 page = pcl->compressed_bvecs[0].page; 1307 WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); 1308 put_page(page); 1309 } else { 1310 for (i = 0; i < pclusterpages; ++i) { 1311 page = pcl->compressed_bvecs[i].page; 1312 1313 if (erofs_page_is_managed(sbi, page)) 1314 continue; 1315 1316 /* recycle all individual short-lived pages */ 1317 (void)z_erofs_put_shortlivedpage(be->pagepool, page); 1318 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); 1319 } 1320 } 1321 if (be->compressed_pages < be->onstack_pages || 1322 be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) 1323 kvfree(be->compressed_pages); 1324 z_erofs_fill_other_copies(be, err); 1325 1326 for (i = 0; i < be->nr_pages; ++i) { 1327 page = be->decompressed_pages[i]; 1328 if (!page) 1329 continue; 1330 1331 DBG_BUGON(z_erofs_page_is_invalidated(page)); 1332 1333 /* recycle all individual short-lived pages */ 1334 if (z_erofs_put_shortlivedpage(be->pagepool, page)) 1335 continue; 1336 z_erofs_onlinepage_endio(page, err); 1337 } 1338 1339 if (be->decompressed_pages != be->onstack_pages) 1340 kvfree(be->decompressed_pages); 1341 1342 pcl->length = 0; 1343 pcl->partial = true; 1344 pcl->multibases = false; 1345 pcl->bvset.nextpage = NULL; 1346 pcl->vcnt = 0; 1347 1348 /* pcluster lock MUST be taken before the following line */ 1349 WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); 1350 mutex_unlock(&pcl->lock); 1351 return err; 1352 } 1353 1354 static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, 1355 struct page **pagepool) 1356 { 1357 struct z_erofs_decompress_backend be = { 1358 .sb = io->sb, 1359 .pagepool = pagepool, 1360 .decompressed_secondary_bvecs = 1361 LIST_HEAD_INIT(be.decompressed_secondary_bvecs), 1362 }; 1363 z_erofs_next_pcluster_t owned = io->head; 1364 1365 while (owned != Z_EROFS_PCLUSTER_TAIL) { 1366 DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); 1367 1368 be.pcl = container_of(owned, struct z_erofs_pcluster, next); 1369 owned = READ_ONCE(be.pcl->next); 1370 1371 z_erofs_decompress_pcluster(&be, io->eio ? 
-EIO : 0); 1372 if (z_erofs_is_inline_pcluster(be.pcl)) 1373 z_erofs_free_pcluster(be.pcl); 1374 else 1375 erofs_workgroup_put(&be.pcl->obj); 1376 } 1377 } 1378 1379 static void z_erofs_decompressqueue_work(struct work_struct *work) 1380 { 1381 struct z_erofs_decompressqueue *bgq = 1382 container_of(work, struct z_erofs_decompressqueue, u.work); 1383 struct page *pagepool = NULL; 1384 1385 DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL); 1386 z_erofs_decompress_queue(bgq, &pagepool); 1387 erofs_release_pages(&pagepool); 1388 kvfree(bgq); 1389 } 1390 1391 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD 1392 static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work) 1393 { 1394 z_erofs_decompressqueue_work((struct work_struct *)work); 1395 } 1396 #endif 1397 1398 static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, 1399 int bios) 1400 { 1401 struct erofs_sb_info *const sbi = EROFS_SB(io->sb); 1402 1403 /* wake up the caller thread for sync decompression */ 1404 if (io->sync) { 1405 if (!atomic_add_return(bios, &io->pending_bios)) 1406 complete(&io->u.done); 1407 return; 1408 } 1409 1410 if (atomic_add_return(bios, &io->pending_bios)) 1411 return; 1412 /* Use (kthread_)work and sync decompression for atomic contexts only */ 1413 if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) { 1414 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD 1415 struct kthread_worker *worker; 1416 1417 rcu_read_lock(); 1418 worker = rcu_dereference( 1419 z_erofs_pcpu_workers[raw_smp_processor_id()]); 1420 if (!worker) { 1421 INIT_WORK(&io->u.work, z_erofs_decompressqueue_work); 1422 queue_work(z_erofs_workqueue, &io->u.work); 1423 } else { 1424 kthread_queue_work(worker, &io->u.kthread_work); 1425 } 1426 rcu_read_unlock(); 1427 #else 1428 queue_work(z_erofs_workqueue, &io->u.work); 1429 #endif 1430 /* enable sync decompression for readahead */ 1431 if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) 1432 sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; 1433 return; 1434 } 1435 z_erofs_decompressqueue_work(&io->u.work); 1436 } 1437 1438 static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, 1439 unsigned int nr, 1440 struct page **pagepool, 1441 struct address_space *mc) 1442 { 1443 const pgoff_t index = pcl->obj.index; 1444 gfp_t gfp = mapping_gfp_mask(mc); 1445 bool tocache = false; 1446 1447 struct address_space *mapping; 1448 struct page *oldpage, *page; 1449 int justfound; 1450 1451 repeat: 1452 page = READ_ONCE(pcl->compressed_bvecs[nr].page); 1453 oldpage = page; 1454 1455 if (!page) 1456 goto out_allocpage; 1457 1458 justfound = (unsigned long)page & 1UL; 1459 page = (struct page *)((unsigned long)page & ~1UL); 1460 1461 /* 1462 * preallocated cached pages, which is used to avoid direct reclaim 1463 * otherwise, it will go inplace I/O path instead. 1464 */ 1465 if (page->private == Z_EROFS_PREALLOCATED_PAGE) { 1466 WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); 1467 set_page_private(page, 0); 1468 tocache = true; 1469 goto out_tocache; 1470 } 1471 mapping = READ_ONCE(page->mapping); 1472 1473 /* 1474 * file-backed online pages in plcuster are all locked steady, 1475 * therefore it is impossible for `mapping' to be NULL. 
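 * A mapping that differs from the managed cache `mc' indicates an unmanaged
 * (in-place I/O) file page, which is simply returned as-is below.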
1476 */ 1477 if (mapping && mapping != mc) 1478 /* ought to be unmanaged pages */ 1479 goto out; 1480 1481 /* directly return for shortlived page as well */ 1482 if (z_erofs_is_shortlived_page(page)) 1483 goto out; 1484 1485 lock_page(page); 1486 1487 /* only true if page reclaim goes wrong, should never happen */ 1488 DBG_BUGON(justfound && PagePrivate(page)); 1489 1490 /* the page is still in manage cache */ 1491 if (page->mapping == mc) { 1492 WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); 1493 1494 if (!PagePrivate(page)) { 1495 /* 1496 * impossible to be !PagePrivate(page) for 1497 * the current restriction as well if 1498 * the page is already in compressed_bvecs[]. 1499 */ 1500 DBG_BUGON(!justfound); 1501 1502 justfound = 0; 1503 set_page_private(page, (unsigned long)pcl); 1504 SetPagePrivate(page); 1505 } 1506 1507 /* no need to submit io if it is already up-to-date */ 1508 if (PageUptodate(page)) { 1509 unlock_page(page); 1510 page = NULL; 1511 } 1512 goto out; 1513 } 1514 1515 /* 1516 * the managed page has been truncated, it's unsafe to 1517 * reuse this one, let's allocate a new cache-managed page. 1518 */ 1519 DBG_BUGON(page->mapping); 1520 DBG_BUGON(!justfound); 1521 1522 tocache = true; 1523 unlock_page(page); 1524 put_page(page); 1525 out_allocpage: 1526 page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); 1527 if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, 1528 oldpage, page)) { 1529 erofs_pagepool_add(pagepool, page); 1530 cond_resched(); 1531 goto repeat; 1532 } 1533 out_tocache: 1534 if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { 1535 /* turn into temporary page if fails (1 ref) */ 1536 set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); 1537 goto out; 1538 } 1539 attach_page_private(page, pcl); 1540 /* drop a refcount added by allocpage (then we have 2 refs here) */ 1541 put_page(page); 1542 1543 out: /* the only exit (for tracing and debugging) */ 1544 return page; 1545 } 1546 1547 static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, 1548 struct z_erofs_decompressqueue *fgq, bool *fg) 1549 { 1550 struct z_erofs_decompressqueue *q; 1551 1552 if (fg && !*fg) { 1553 q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN); 1554 if (!q) { 1555 *fg = true; 1556 goto fg_out; 1557 } 1558 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD 1559 kthread_init_work(&q->u.kthread_work, 1560 z_erofs_decompressqueue_kthread_work); 1561 #else 1562 INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); 1563 #endif 1564 } else { 1565 fg_out: 1566 q = fgq; 1567 init_completion(&fgq->u.done); 1568 atomic_set(&fgq->pending_bios, 0); 1569 q->eio = false; 1570 q->sync = true; 1571 } 1572 q->sb = sb; 1573 q->head = Z_EROFS_PCLUSTER_TAIL; 1574 return q; 1575 } 1576 1577 /* define decompression jobqueue types */ 1578 enum { 1579 JQ_BYPASS, 1580 JQ_SUBMIT, 1581 NR_JOBQUEUES, 1582 }; 1583 1584 static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, 1585 z_erofs_next_pcluster_t qtail[], 1586 z_erofs_next_pcluster_t owned_head) 1587 { 1588 z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; 1589 z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; 1590 1591 WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL); 1592 1593 WRITE_ONCE(*submit_qtail, owned_head); 1594 WRITE_ONCE(*bypass_qtail, &pcl->next); 1595 1596 qtail[JQ_BYPASS] = &pcl->next; 1597 } 1598 1599 static void z_erofs_decompressqueue_endio(struct bio *bio) 1600 { 1601 struct z_erofs_decompressqueue *q = bio->bi_private; 1602 blk_status_t err = bio->bi_status; 1603 struct bio_vec 
*bvec; 1604 struct bvec_iter_all iter_all; 1605 1606 bio_for_each_segment_all(bvec, bio, iter_all) { 1607 struct page *page = bvec->bv_page; 1608 1609 DBG_BUGON(PageUptodate(page)); 1610 DBG_BUGON(z_erofs_page_is_invalidated(page)); 1611 1612 if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { 1613 if (!err) 1614 SetPageUptodate(page); 1615 unlock_page(page); 1616 } 1617 } 1618 if (err) 1619 q->eio = true; 1620 z_erofs_decompress_kickoff(q, -1); 1621 bio_put(bio); 1622 } 1623 1624 static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, 1625 struct z_erofs_decompressqueue *fgq, 1626 bool *force_fg, bool readahead) 1627 { 1628 struct super_block *sb = f->inode->i_sb; 1629 struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); 1630 z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; 1631 struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; 1632 z_erofs_next_pcluster_t owned_head = f->owned_head; 1633 /* bio is NULL initially, so no need to initialize last_{index,bdev} */ 1634 pgoff_t last_index; 1635 struct block_device *last_bdev; 1636 unsigned int nr_bios = 0; 1637 struct bio *bio = NULL; 1638 unsigned long pflags; 1639 int memstall = 0; 1640 1641 /* 1642 * if managed cache is enabled, bypass jobqueue is needed, 1643 * no need to read from device for all pclusters in this queue. 1644 */ 1645 q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); 1646 q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); 1647 1648 qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; 1649 qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; 1650 1651 /* by default, all need io submission */ 1652 q[JQ_SUBMIT]->head = owned_head; 1653 1654 do { 1655 struct erofs_map_dev mdev; 1656 struct z_erofs_pcluster *pcl; 1657 pgoff_t cur, end; 1658 unsigned int i = 0; 1659 bool bypass = true; 1660 1661 DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); 1662 pcl = container_of(owned_head, struct z_erofs_pcluster, next); 1663 owned_head = READ_ONCE(pcl->next); 1664 1665 if (z_erofs_is_inline_pcluster(pcl)) { 1666 move_to_bypass_jobqueue(pcl, qtail, owned_head); 1667 continue; 1668 } 1669 1670 /* no device id here, thus it will always succeed */ 1671 mdev = (struct erofs_map_dev) { 1672 .m_pa = erofs_pos(sb, pcl->obj.index), 1673 }; 1674 (void)erofs_map_dev(sb, &mdev); 1675 1676 cur = erofs_blknr(sb, mdev.m_pa); 1677 end = cur + pcl->pclusterpages; 1678 1679 do { 1680 struct page *page; 1681 1682 page = pickup_page_for_submission(pcl, i++, 1683 &f->pagepool, mc); 1684 if (!page) 1685 continue; 1686 1687 if (bio && (cur != last_index + 1 || 1688 last_bdev != mdev.m_bdev)) { 1689 submit_bio_retry: 1690 submit_bio(bio); 1691 if (memstall) { 1692 psi_memstall_leave(&pflags); 1693 memstall = 0; 1694 } 1695 bio = NULL; 1696 } 1697 1698 if (unlikely(PageWorkingset(page)) && !memstall) { 1699 psi_memstall_enter(&pflags); 1700 memstall = 1; 1701 } 1702 1703 if (!bio) { 1704 bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, 1705 REQ_OP_READ, GFP_NOIO); 1706 bio->bi_end_io = z_erofs_decompressqueue_endio; 1707 1708 last_bdev = mdev.m_bdev; 1709 bio->bi_iter.bi_sector = (sector_t)cur << 1710 (sb->s_blocksize_bits - 9); 1711 bio->bi_private = q[JQ_SUBMIT]; 1712 if (readahead) 1713 bio->bi_opf |= REQ_RAHEAD; 1714 ++nr_bios; 1715 } 1716 1717 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) 1718 goto submit_bio_retry; 1719 1720 last_index = cur; 1721 bypass = false; 1722 } while (++cur < end); 1723 1724 if (!bypass) 1725 qtail[JQ_SUBMIT] = &pcl->next; 1726 else 1727 move_to_bypass_jobqueue(pcl, qtail, owned_head); 1728 } while (owned_head != 
Z_EROFS_PCLUSTER_TAIL); 1729 1730 if (bio) { 1731 submit_bio(bio); 1732 if (memstall) 1733 psi_memstall_leave(&pflags); 1734 } 1735 1736 /* 1737 * although background is preferred, no one is pending for submission. 1738 * don't issue decompression but drop it directly instead. 1739 */ 1740 if (!*force_fg && !nr_bios) { 1741 kvfree(q[JQ_SUBMIT]); 1742 return; 1743 } 1744 z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); 1745 } 1746 1747 static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, 1748 bool force_fg, bool ra) 1749 { 1750 struct z_erofs_decompressqueue io[NR_JOBQUEUES]; 1751 1752 if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) 1753 return; 1754 z_erofs_submit_queue(f, io, &force_fg, ra); 1755 1756 /* handle bypass queue (no i/o pclusters) immediately */ 1757 z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); 1758 1759 if (!force_fg) 1760 return; 1761 1762 /* wait until all bios are completed */ 1763 wait_for_completion_io(&io[JQ_SUBMIT].u.done); 1764 1765 /* handle synchronous decompress queue in the caller context */ 1766 z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); 1767 } 1768 1769 /* 1770 * Since partial uptodate is still unimplemented for now, we have to use 1771 * approximate readmore strategies as a start. 1772 */ 1773 static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, 1774 struct readahead_control *rac, bool backmost) 1775 { 1776 struct inode *inode = f->inode; 1777 struct erofs_map_blocks *map = &f->map; 1778 erofs_off_t cur, end, headoffset = f->headoffset; 1779 int err; 1780 1781 if (backmost) { 1782 if (rac) 1783 end = headoffset + readahead_length(rac) - 1; 1784 else 1785 end = headoffset + PAGE_SIZE - 1; 1786 map->m_la = end; 1787 err = z_erofs_map_blocks_iter(inode, map, 1788 EROFS_GET_BLOCKS_READMORE); 1789 if (err) 1790 return; 1791 1792 /* expand ra for the trailing edge if readahead */ 1793 if (rac) { 1794 cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); 1795 readahead_expand(rac, headoffset, cur - headoffset); 1796 return; 1797 } 1798 end = round_up(end, PAGE_SIZE); 1799 } else { 1800 end = round_up(map->m_la, PAGE_SIZE); 1801 1802 if (!map->m_llen) 1803 return; 1804 } 1805 1806 cur = map->m_la + map->m_llen - 1; 1807 while ((cur >= end) && (cur < i_size_read(inode))) { 1808 pgoff_t index = cur >> PAGE_SHIFT; 1809 struct page *page; 1810 1811 page = erofs_grab_cache_page_nowait(inode->i_mapping, index); 1812 if (page) { 1813 if (PageUptodate(page)) 1814 unlock_page(page); 1815 else 1816 (void)z_erofs_do_read_page(f, page); 1817 put_page(page); 1818 } 1819 1820 if (cur < PAGE_SIZE) 1821 break; 1822 cur = (index << PAGE_SHIFT) - 1; 1823 } 1824 } 1825 1826 static int z_erofs_read_folio(struct file *file, struct folio *folio) 1827 { 1828 struct inode *const inode = folio->mapping->host; 1829 struct erofs_sb_info *const sbi = EROFS_I_SB(inode); 1830 struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); 1831 int err; 1832 1833 trace_erofs_read_folio(folio, false); 1834 f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; 1835 1836 z_erofs_pcluster_readmore(&f, NULL, true); 1837 err = z_erofs_do_read_page(&f, &folio->page); 1838 z_erofs_pcluster_readmore(&f, NULL, false); 1839 z_erofs_pcluster_end(&f); 1840 1841 /* if some compressed cluster ready, need submit them anyway */ 1842 z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); 1843 1844 if (err && err != -EINTR) 1845 erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", 1846 err, folio->index, EROFS_I(inode)->nid); 1847 1848 
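	/* the metabuf in f.map is reused across z_erofs_map_blocks_iter() calls; release it once here */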
	erofs_put_metabuf(&f.map.buf);
	erofs_release_pages(&f.pagepool);
	return err;
}

static void z_erofs_readahead(struct readahead_control *rac)
{
	struct inode *const inode = rac->mapping->host;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	struct folio *head = NULL, *folio;
	unsigned int nr_folios;
	int err;

	f.headoffset = readahead_pos(rac);

	z_erofs_pcluster_readmore(&f, rac, true);
	nr_folios = readahead_count(rac);
	trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false);

	while ((folio = readahead_folio(rac))) {
		folio->private = head;
		head = folio;
	}

	/* traverse in reverse order for best metadata I/O performance */
	while (head) {
		folio = head;
		head = folio_get_private(folio);

		err = z_erofs_do_read_page(&f, &folio->page);
		if (err && err != -EINTR)
			erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
				  folio->index, EROFS_I(inode)->nid);
	}
	z_erofs_pcluster_readmore(&f, rac, false);
	z_erofs_pcluster_end(&f);

	z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true);
	erofs_put_metabuf(&f.map.buf);
	erofs_release_pages(&f.pagepool);
}

const struct address_space_operations z_erofs_aops = {
	.read_folio = z_erofs_read_folio,
	.readahead = z_erofs_readahead,
};