// SPDX-License-Identifier: GPL-2.0
/*
 * fs/f2fs/gc.c
 *
 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
 * http://www.samsung.com/
 */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/f2fs_fs.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/sched/signal.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "gc.h"
#include "iostat.h"
#include <trace/events/f2fs.h>

static struct kmem_cache *victim_entry_slab;

static unsigned int count_bits(const unsigned long *addr,
				unsigned int offset, unsigned int len);

static int gc_thread_func(void *data)
{
	struct f2fs_sb_info *sbi = data;
	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
	wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
	wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq;
	unsigned int wait_ms;
	struct f2fs_gc_control gc_control = {
		.victim_segno = NULL_SEGNO,
		.should_migrate_blocks = false,
		.err_gc_skipped = false,
		.one_time = false };

	wait_ms = gc_th->min_sleep_time;

	set_freezable();
	do {
		bool sync_mode, foreground = false, gc_boost = false;

		wait_event_freezable_timeout(*wq,
				kthread_should_stop() ||
				waitqueue_active(fggc_wq) ||
				gc_th->gc_wake,
				msecs_to_jiffies(wait_ms));

		if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq)) {
			foreground = true;
			gc_control.one_time = false;
		} else if (f2fs_sb_has_blkzoned(sbi)) {
			gc_control.one_time = true;
		}

		/* give it a try one time */
		if (gc_th->gc_wake)
			gc_th->gc_wake = false;

		if (f2fs_readonly(sbi->sb)) {
			stat_other_skip_bggc_count(sbi);
			continue;
		}
		if (kthread_should_stop())
			break;

		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
			increase_sleep_time(gc_th, &wait_ms);
			stat_other_skip_bggc_count(sbi);
			continue;
		}

		if (time_to_inject(sbi, FAULT_CHECKPOINT))
			f2fs_stop_checkpoint(sbi, false,
					STOP_CP_REASON_FAULT_INJECT);

		if (!sb_start_write_trylock(sbi->sb)) {
			stat_other_skip_bggc_count(sbi);
			continue;
		}

		/*
		 * [GC triggering condition]
		 * 0. GC is not conducted currently.
		 * 1. There are enough dirty segments.
		 * 2. IO subsystem is idle by checking the # of writeback pages.
		 * 3. IO subsystem is idle by checking the # of requests in
		 *    bdev's request list.
		 *
		 * Note) We have to avoid triggering GCs frequently.
		 * Because it is possible that some segments can be
		 * invalidated soon after by user update or deletion.
		 * So, I'd like to wait some time to collect dirty segments.
		 */
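		/*
		 * Urgent modes skip the idle and free-space checks below:
		 * GC runs back-to-back, paced only by the short
		 * urgent_sleep_time interval.
		 */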
		if (sbi->gc_mode == GC_URGENT_HIGH ||
				sbi->gc_mode == GC_URGENT_MID) {
			wait_ms = gc_th->urgent_sleep_time;
			f2fs_down_write(&sbi->gc_lock);
			goto do_gc;
		}

		if (foreground) {
			f2fs_down_write(&sbi->gc_lock);
			goto do_gc;
		} else if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
			stat_other_skip_bggc_count(sbi);
			goto next;
		}

		if (!is_idle(sbi, GC_TIME)) {
			increase_sleep_time(gc_th, &wait_ms);
			f2fs_up_write(&sbi->gc_lock);
			stat_io_skip_bggc_count(sbi);
			goto next;
		}

		if (f2fs_sb_has_blkzoned(sbi)) {
			if (has_enough_free_blocks(sbi,
						gc_th->no_zoned_gc_percent)) {
				wait_ms = gc_th->no_gc_sleep_time;
				f2fs_up_write(&sbi->gc_lock);
				goto next;
			}
			if (wait_ms == gc_th->no_gc_sleep_time)
				wait_ms = gc_th->max_sleep_time;
		}

		if (need_to_boost_gc(sbi)) {
			decrease_sleep_time(gc_th, &wait_ms);
			if (f2fs_sb_has_blkzoned(sbi))
				gc_boost = true;
		} else {
			increase_sleep_time(gc_th, &wait_ms);
		}
do_gc:
		stat_inc_gc_call_count(sbi, foreground ?
					FOREGROUND : BACKGROUND);

		sync_mode = (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) ||
				(gc_boost && gc_th->boost_gc_greedy);

		/* foreground GC has been triggered via f2fs_balance_fs() */
		if (foreground && !f2fs_sb_has_blkzoned(sbi))
			sync_mode = false;

		gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC;
		gc_control.no_bg_gc = foreground;
		gc_control.nr_free_secs = foreground ? 1 : 0;

		/* if return value is not zero, no victim was selected */
		if (f2fs_gc(sbi, &gc_control)) {
			/* don't bother wait_ms by foreground gc */
			if (!foreground)
				wait_ms = gc_th->no_gc_sleep_time;
		} else {
			/* reset wait_ms to default sleep time */
			if (wait_ms == gc_th->no_gc_sleep_time)
				wait_ms = gc_th->min_sleep_time;
		}

		if (foreground)
			wake_up_all(&gc_th->fggc_wq);

		trace_f2fs_background_gc(sbi->sb, wait_ms,
				prefree_segments(sbi), free_segments(sbi));

		/* balancing f2fs's metadata periodically */
		f2fs_balance_fs_bg(sbi, true);
next:
		if (sbi->gc_mode != GC_NORMAL) {
			spin_lock(&sbi->gc_remaining_trials_lock);
			if (sbi->gc_remaining_trials) {
				sbi->gc_remaining_trials--;
				if (!sbi->gc_remaining_trials)
					sbi->gc_mode = GC_NORMAL;
			}
			spin_unlock(&sbi->gc_remaining_trials_lock);
		}
		sb_end_write(sbi->sb);

	} while (!kthread_should_stop());
	return 0;
}

int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
{
	struct f2fs_gc_kthread *gc_th;
	dev_t dev = sbi->sb->s_bdev->bd_dev;

	gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
	if (!gc_th)
		return -ENOMEM;

	gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
	gc_th->valid_thresh_ratio = DEF_GC_THREAD_VALID_THRESH_RATIO;
	gc_th->boost_gc_multiple = BOOST_GC_MULTIPLE;
	gc_th->boost_gc_greedy = GC_GREEDY;

	if (f2fs_sb_has_blkzoned(sbi)) {
		gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME_ZONED;
		gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME_ZONED;
		gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME_ZONED;
		gc_th->no_zoned_gc_percent = LIMIT_NO_ZONED_GC;
		gc_th->boost_zoned_gc_percent = LIMIT_BOOST_ZONED_GC;
	} else {
		gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
		gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
		gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
		gc_th->no_zoned_gc_percent = 0;
		gc_th->boost_zoned_gc_percent = 0;
	}
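
	/*
	 * Publish gc_th and initialise the wait queues before kthread_run():
	 * gc_thread_func() dereferences sbi->gc_thread and its wait queues
	 * as soon as it starts running.
	 */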
	gc_th->gc_wake = false;

	sbi->gc_thread = gc_th;
	init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
	init_waitqueue_head(&sbi->gc_thread->fggc_wq);
	sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
			"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
	if (IS_ERR(gc_th->f2fs_gc_task)) {
		int err = PTR_ERR(gc_th->f2fs_gc_task);

		kfree(gc_th);
		sbi->gc_thread = NULL;
		return err;
	}

	return 0;
}

void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
{
	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;

	if (!gc_th)
		return;
	kthread_stop(gc_th->f2fs_gc_task);
	wake_up_all(&gc_th->fggc_wq);
	kfree(gc_th);
	sbi->gc_thread = NULL;
}

static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
{
	int gc_mode;

	if (gc_type == BG_GC) {
		if (sbi->am.atgc_enabled)
			gc_mode = GC_AT;
		else
			gc_mode = GC_CB;
	} else {
		gc_mode = GC_GREEDY;
	}

	switch (sbi->gc_mode) {
	case GC_IDLE_CB:
	case GC_URGENT_LOW:
	case GC_URGENT_MID:
		gc_mode = GC_CB;
		break;
	case GC_IDLE_GREEDY:
	case GC_URGENT_HIGH:
		gc_mode = GC_GREEDY;
		break;
	case GC_IDLE_AT:
		gc_mode = GC_AT;
		break;
	}

	return gc_mode;
}

static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
			int type, struct victim_sel_policy *p)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);

	if (p->alloc_mode == SSR || p->alloc_mode == AT_SSR) {
		p->gc_mode = GC_GREEDY;
		p->dirty_bitmap = dirty_i->dirty_segmap[type];
		p->max_search = dirty_i->nr_dirty[type];
		p->ofs_unit = 1;
	} else {
		p->gc_mode = select_gc_type(sbi, gc_type);
		p->ofs_unit = SEGS_PER_SEC(sbi);
		if (__is_large_section(sbi)) {
			p->dirty_bitmap = dirty_i->dirty_secmap;
			p->max_search = count_bits(p->dirty_bitmap,
						0, MAIN_SECS(sbi));
		} else {
			p->dirty_bitmap = dirty_i->dirty_segmap[DIRTY];
			p->max_search = dirty_i->nr_dirty[DIRTY];
		}
	}

	/*
	 * adjust candidates range, should select all dirty segments for
	 * foreground GC and urgent GC cases.
	 */
	if (gc_type != FG_GC &&
			(sbi->gc_mode != GC_URGENT_HIGH) &&
			(p->gc_mode != GC_AT && p->alloc_mode != AT_SSR) &&
			p->max_search > sbi->max_victim_search)
		p->max_search = sbi->max_victim_search;

	/* let's select beginning hot/small space first. */
	if (f2fs_need_rand_seg(sbi))
		p->offset = get_random_u32_below(MAIN_SECS(sbi) *
						SEGS_PER_SEC(sbi));
	else if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
		p->offset = 0;
	else
		p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
}
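
/*
 * get_max_cost() supplies the initial p->min_cost bound for victim search:
 * a candidate is only recorded if its cost drops below this value. SSR caps
 * it at one segment's worth of blocks, greedy LFS GC at two sections' worth,
 * and CB/AT use UINT_MAX so that every candidate qualifies.
 */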
static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
				struct victim_sel_policy *p)
{
	/* SSR allocates in a segment unit */
	if (p->alloc_mode == SSR)
		return BLKS_PER_SEG(sbi);
	else if (p->alloc_mode == AT_SSR)
		return UINT_MAX;

	/* LFS */
	if (p->gc_mode == GC_GREEDY)
		return SEGS_TO_BLKS(sbi, 2 * p->ofs_unit);
	else if (p->gc_mode == GC_CB)
		return UINT_MAX;
	else if (p->gc_mode == GC_AT)
		return UINT_MAX;
	else /* No other gc_mode */
		return 0;
}

static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	unsigned int secno;

	/*
	 * If the gc_type is FG_GC, we can select victim segments
	 * selected by background GC before.
	 * Those segments are guaranteed to have few valid blocks.
	 */
	for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
		if (sec_usage_check(sbi, secno))
			continue;
		clear_bit(secno, dirty_i->victim_secmap);
		return GET_SEG_FROM_SEC(sbi, secno);
	}
	return NULL_SEGNO;
}

static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
{
	struct sit_info *sit_i = SIT_I(sbi);
	unsigned long long mtime = 0;
	unsigned int vblocks;
	unsigned char age = 0;
	unsigned char u;
	unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi);

	mtime = f2fs_get_section_mtime(sbi, segno);
	f2fs_bug_on(sbi, mtime == INVALID_MTIME);
	vblocks = get_valid_blocks(sbi, segno, true);
	vblocks = div_u64(vblocks, usable_segs_per_sec);

	u = BLKS_TO_SEGS(sbi, vblocks * 100);

	/* Handle the case where the system time was changed by the user */
	if (mtime < sit_i->min_mtime)
		sit_i->min_mtime = mtime;
	if (mtime > sit_i->max_mtime)
		sit_i->max_mtime = mtime;
	if (sit_i->max_mtime != sit_i->min_mtime)
		age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
				sit_i->max_mtime - sit_i->min_mtime);

	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}

static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
			unsigned int segno, struct victim_sel_policy *p,
			unsigned int valid_thresh_ratio)
{
	if (p->alloc_mode == SSR)
		return get_seg_entry(sbi, segno)->ckpt_valid_blocks;

	if (p->one_time_gc && (valid_thresh_ratio < 100) &&
		(get_valid_blocks(sbi, segno, true) >=
			CAP_BLKS_PER_SEC(sbi) * valid_thresh_ratio / 100))
		return UINT_MAX;

	/* alloc_mode == LFS */
	if (p->gc_mode == GC_GREEDY)
		return get_valid_blocks(sbi, segno, true);
	else if (p->gc_mode == GC_CB)
		return get_cb_cost(sbi, segno);

	f2fs_bug_on(sbi, 1);
	return 0;
}

static unsigned int count_bits(const unsigned long *addr,
				unsigned int offset, unsigned int len)
{
	unsigned int end = offset + len, sum = 0;

	while (offset < end) {
		if (test_bit(offset++, addr))
			++sum;
	}
	return sum;
}

static bool f2fs_check_victim_tree(struct f2fs_sb_info *sbi,
				struct rb_root_cached *root)
{
#ifdef CONFIG_F2FS_CHECK_FS
	struct rb_node *cur = rb_first_cached(root), *next;
	struct victim_entry *cur_ve, *next_ve;

	while (cur) {
		next = rb_next(cur);
		if (!next)
			return true;

		cur_ve = rb_entry(cur, struct victim_entry, rb_node);
		next_ve = rb_entry(next, struct victim_entry, rb_node);

		if (cur_ve->mtime > next_ve->mtime) {
			f2fs_info(sbi, "broken victim_rbtree, "
				"cur_mtime(%llu) next_mtime(%llu)",
				cur_ve->mtime, next_ve->mtime);
			return false;
		}
		cur = next;
	}
#endif
	return true;
}
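
/*
 * Walk the mtime-ordered victim tree and return the last entry visited on
 * the search path for @mtime, i.e. an entry adjacent to where @mtime would
 * be inserted (or NULL if the tree is empty). atssr_lookup_victim() uses it
 * as the centre of its neighbourhood scan.
 */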
static struct victim_entry *__lookup_victim_entry(struct f2fs_sb_info *sbi,
					unsigned long long mtime)
{
	struct atgc_management *am = &sbi->am;
	struct rb_node *node = am->root.rb_root.rb_node;
	struct victim_entry *ve = NULL;

	while (node) {
		ve = rb_entry(node, struct victim_entry, rb_node);

		if (mtime < ve->mtime)
			node = node->rb_left;
		else
			node = node->rb_right;
	}
	return ve;
}

static struct victim_entry *__create_victim_entry(struct f2fs_sb_info *sbi,
		unsigned long long mtime, unsigned int segno)
{
	struct atgc_management *am = &sbi->am;
	struct victim_entry *ve;

	ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS, true, NULL);

	ve->mtime = mtime;
	ve->segno = segno;

	list_add_tail(&ve->list, &am->victim_list);
	am->victim_count++;

	return ve;
}

static void __insert_victim_entry(struct f2fs_sb_info *sbi,
				unsigned long long mtime, unsigned int segno)
{
	struct atgc_management *am = &sbi->am;
	struct rb_root_cached *root = &am->root;
	struct rb_node **p = &root->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct victim_entry *ve;
	bool left_most = true;

	/* look up rb tree to find parent node */
	while (*p) {
		parent = *p;
		ve = rb_entry(parent, struct victim_entry, rb_node);

		if (mtime < ve->mtime) {
			p = &(*p)->rb_left;
		} else {
			p = &(*p)->rb_right;
			left_most = false;
		}
	}

	ve = __create_victim_entry(sbi, mtime, segno);

	rb_link_node(&ve->rb_node, parent, p);
	rb_insert_color_cached(&ve->rb_node, root, left_most);
}

static void add_victim_entry(struct f2fs_sb_info *sbi,
				struct victim_sel_policy *p, unsigned int segno)
{
	struct sit_info *sit_i = SIT_I(sbi);
	unsigned long long mtime = 0;

	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
		if (p->gc_mode == GC_AT &&
			get_valid_blocks(sbi, segno, true) == 0)
			return;
	}

	mtime = f2fs_get_section_mtime(sbi, segno);
	f2fs_bug_on(sbi, mtime == INVALID_MTIME);

	/* Handle the case where the system time was changed by the user */
	if (mtime < sit_i->min_mtime)
		sit_i->min_mtime = mtime;
	if (mtime > sit_i->max_mtime)
		sit_i->max_mtime = mtime;
	if (mtime < sit_i->dirty_min_mtime)
		sit_i->dirty_min_mtime = mtime;
	if (mtime > sit_i->dirty_max_mtime)
		sit_i->dirty_max_mtime = mtime;

	/* don't choose young section as candidate */
	if (sit_i->dirty_max_mtime - mtime < p->age_threshold)
		return;

	__insert_victim_entry(sbi, mtime, segno);
}
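
/*
 * ATGC cost model: each candidate gets cost = UINT_MAX - (age + u), where
 * age grows with the section's mtime distance from the newest dirty section
 * (weighted by age_weight) and u grows with the number of free blocks in
 * the section (weighted by 100 - age_weight, the 60/40 split noted in the
 * comments below). The oldest and emptiest candidate, i.e. the one with the
 * smallest cost, wins.
 */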
static void atgc_lookup_victim(struct f2fs_sb_info *sbi,
						struct victim_sel_policy *p)
{
	struct sit_info *sit_i = SIT_I(sbi);
	struct atgc_management *am = &sbi->am;
	struct rb_root_cached *root = &am->root;
	struct rb_node *node;
	struct victim_entry *ve;
	unsigned long long total_time;
	unsigned long long age, u, accu;
	unsigned long long max_mtime = sit_i->dirty_max_mtime;
	unsigned long long min_mtime = sit_i->dirty_min_mtime;
	unsigned int sec_blocks = CAP_BLKS_PER_SEC(sbi);
	unsigned int vblocks;
	unsigned int dirty_threshold = max(am->max_candidate_count,
					am->candidate_ratio *
					am->victim_count / 100);
	unsigned int age_weight = am->age_weight;
	unsigned int cost;
	unsigned int iter = 0;

	if (max_mtime < min_mtime)
		return;

	max_mtime += 1;
	total_time = max_mtime - min_mtime;

	accu = div64_u64(ULLONG_MAX, total_time);
	accu = min_t(unsigned long long, div_u64(accu, 100),
					DEFAULT_ACCURACY_CLASS);

	node = rb_first_cached(root);
next:
	ve = rb_entry_safe(node, struct victim_entry, rb_node);
	if (!ve)
		return;

	if (ve->mtime >= max_mtime || ve->mtime < min_mtime)
		goto skip;

	/* age = 10000 * x% * 60 */
	age = div64_u64(accu * (max_mtime - ve->mtime), total_time) *
								age_weight;

	vblocks = get_valid_blocks(sbi, ve->segno, true);
	f2fs_bug_on(sbi, !vblocks || vblocks == sec_blocks);

	/* u = 10000 * x% * 40 */
	u = div64_u64(accu * (sec_blocks - vblocks), sec_blocks) *
							(100 - age_weight);

	f2fs_bug_on(sbi, age + u >= UINT_MAX);

	cost = UINT_MAX - (age + u);
	iter++;

	if (cost < p->min_cost ||
			(cost == p->min_cost && age > p->oldest_age)) {
		p->min_cost = cost;
		p->oldest_age = age;
		p->min_segno = ve->segno;
	}
skip:
	if (iter < dirty_threshold) {
		node = rb_next(node);
		goto next;
	}
}

/*
 * select candidates around source section in range of
 * [target - dirty_threshold, target + dirty_threshold]
 */
static void atssr_lookup_victim(struct f2fs_sb_info *sbi,
						struct victim_sel_policy *p)
{
	struct sit_info *sit_i = SIT_I(sbi);
	struct atgc_management *am = &sbi->am;
	struct victim_entry *ve;
	unsigned long long age;
	unsigned long long max_mtime = sit_i->dirty_max_mtime;
	unsigned long long min_mtime = sit_i->dirty_min_mtime;
	unsigned int vblocks;
	unsigned int dirty_threshold = max(am->max_candidate_count,
					am->candidate_ratio *
					am->victim_count / 100);
	unsigned int cost, iter;
	int stage = 0;

	if (max_mtime < min_mtime)
		return;
	max_mtime += 1;
next_stage:
	iter = 0;
	ve = __lookup_victim_entry(sbi, p->age);
next_node:
	if (!ve) {
		if (stage++ == 0)
			goto next_stage;
		return;
	}

	if (ve->mtime >= max_mtime || ve->mtime < min_mtime)
		goto skip_node;

	age = max_mtime - ve->mtime;

	vblocks = get_seg_entry(sbi, ve->segno)->ckpt_valid_blocks;
	f2fs_bug_on(sbi, !vblocks);

	/* rare case */
	if (vblocks == BLKS_PER_SEG(sbi))
		goto skip_node;

	iter++;

	age = max_mtime - abs(p->age - age);
	cost = UINT_MAX - vblocks;

	if (cost < p->min_cost ||
			(cost == p->min_cost && age > p->oldest_age)) {
		p->min_cost = cost;
		p->oldest_age = age;
		p->min_segno = ve->segno;
	}
skip_node:
	if (iter < dirty_threshold) {
		ve = rb_entry(stage == 0 ? rb_prev(&ve->rb_node) :
					rb_next(&ve->rb_node),
					struct victim_entry, rb_node);
		goto next_node;
	}

	if (stage++ == 0)
		goto next_stage;
}
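
/*
 * Dispatcher for age-based selection: GC_AT walks the whole tree from the
 * oldest entry (atgc_lookup_victim), while AT_SSR starts at the entry whose
 * mtime is closest to p->age and scans outwards, older entries first and
 * then younger ones (atssr_lookup_victim).
 */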
static void lookup_victim_by_age(struct f2fs_sb_info *sbi,
						struct victim_sel_policy *p)
{
	f2fs_bug_on(sbi, !f2fs_check_victim_tree(sbi, &sbi->am.root));

	if (p->gc_mode == GC_AT)
		atgc_lookup_victim(sbi, p);
	else if (p->alloc_mode == AT_SSR)
		atssr_lookup_victim(sbi, p);
	else
		f2fs_bug_on(sbi, 1);
}

static void release_victim_entry(struct f2fs_sb_info *sbi)
{
	struct atgc_management *am = &sbi->am;
	struct victim_entry *ve, *tmp;

	list_for_each_entry_safe(ve, tmp, &am->victim_list, list) {
		list_del(&ve->list);
		kmem_cache_free(victim_entry_slab, ve);
		am->victim_count--;
	}

	am->root = RB_ROOT_CACHED;

	f2fs_bug_on(sbi, am->victim_count);
	f2fs_bug_on(sbi, !list_empty(&am->victim_list));
}

static bool f2fs_pin_section(struct f2fs_sb_info *sbi, unsigned int segno)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);

	if (!dirty_i->enable_pin_section)
		return false;
	if (!test_and_set_bit(secno, dirty_i->pinned_secmap))
		dirty_i->pinned_secmap_cnt++;
	return true;
}

static bool f2fs_pinned_section_exists(struct dirty_seglist_info *dirty_i)
{
	return dirty_i->pinned_secmap_cnt;
}

static bool f2fs_section_is_pinned(struct dirty_seglist_info *dirty_i,
						unsigned int secno)
{
	return dirty_i->enable_pin_section &&
		f2fs_pinned_section_exists(dirty_i) &&
		test_bit(secno, dirty_i->pinned_secmap);
}

static void f2fs_unpin_all_sections(struct f2fs_sb_info *sbi, bool enable)
{
	unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));

	if (f2fs_pinned_section_exists(DIRTY_I(sbi))) {
		memset(DIRTY_I(sbi)->pinned_secmap, 0, bitmap_size);
		DIRTY_I(sbi)->pinned_secmap_cnt = 0;
	}
	DIRTY_I(sbi)->enable_pin_section = enable;
}
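
/*
 * Pinned-file policy: background GC never touches blocks of a pinned file
 * (-EBUSY). Foreground GC records the section in pinned_secmap so that it
 * is skipped for the rest of this GC cycle, or falls back to
 * f2fs_pin_file_control() when section pinning is disabled, and returns
 * -EAGAIN in either case so the caller moves on to another victim.
 */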
static int f2fs_gc_pinned_control(struct inode *inode, int gc_type,
							unsigned int segno)
{
	if (!f2fs_is_pinned_file(inode))
		return 0;
	if (gc_type != FG_GC)
		return -EBUSY;
	if (!f2fs_pin_section(F2FS_I_SB(inode), segno))
		f2fs_pin_file_control(inode, true);
	return -EAGAIN;
}

/*
 * This function is called from two paths.
 * One is garbage collection and the other is SSR segment selection.
 * When it is called during GC, it just gets a victim segment
 * and it does not remove it from dirty seglist.
 * When it is called from SSR segment selection, it finds a segment
 * which has minimum valid blocks and removes it from dirty seglist.
 */
int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result,
			int gc_type, int type, char alloc_mode,
			unsigned long long age, bool one_time)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	struct sit_info *sm = SIT_I(sbi);
	struct victim_sel_policy p = {0};
	unsigned int secno, last_victim;
	unsigned int last_segment;
	unsigned int nsearched;
	unsigned int valid_thresh_ratio = 100;
	bool is_atgc;
	int ret = 0;

	mutex_lock(&dirty_i->seglist_lock);
	last_segment = MAIN_SECS(sbi) * SEGS_PER_SEC(sbi);

	p.alloc_mode = alloc_mode;
	p.age = age;
	p.age_threshold = sbi->am.age_threshold;
	if (one_time) {
		p.one_time_gc = one_time;
		if (has_enough_free_secs(sbi, 0, NR_PERSISTENT_LOG))
			valid_thresh_ratio = sbi->gc_thread->valid_thresh_ratio;
	}

retry:
	select_policy(sbi, gc_type, type, &p);
	p.min_segno = NULL_SEGNO;
	p.oldest_age = 0;
	p.min_cost = get_max_cost(sbi, &p);

	is_atgc = (p.gc_mode == GC_AT || p.alloc_mode == AT_SSR);
	nsearched = 0;

	if (is_atgc)
		SIT_I(sbi)->dirty_min_mtime = ULLONG_MAX;

	if (*result != NULL_SEGNO) {
		if (!get_valid_blocks(sbi, *result, false)) {
			ret = -ENODATA;
			goto out;
		}

		if (sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) {
			ret = -EBUSY;
			goto out;
		}
		if (gc_type == FG_GC)
			clear_bit(GET_SEC_FROM_SEG(sbi, *result), dirty_i->victim_secmap);
		p.min_segno = *result;
		goto got_result;
	}

	ret = -ENODATA;
	if (p.max_search == 0)
		goto out;

	if (__is_large_section(sbi) && p.alloc_mode == LFS) {
		if (sbi->next_victim_seg[BG_GC] != NULL_SEGNO) {
			p.min_segno = sbi->next_victim_seg[BG_GC];
			*result = p.min_segno;
			sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
			goto got_result;
		}
		if (gc_type == FG_GC &&
				sbi->next_victim_seg[FG_GC] != NULL_SEGNO) {
			p.min_segno = sbi->next_victim_seg[FG_GC];
			*result = p.min_segno;
			sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
			goto got_result;
		}
	}

	last_victim = sm->last_victim[p.gc_mode];
	if (p.alloc_mode == LFS && gc_type == FG_GC) {
		p.min_segno = check_bg_victims(sbi);
		if (p.min_segno != NULL_SEGNO)
			goto got_it;
	}

	while (1) {
		unsigned long cost, *dirty_bitmap;
		unsigned int unit_no, segno;

		dirty_bitmap = p.dirty_bitmap;
		unit_no = find_next_bit(dirty_bitmap,
				last_segment / p.ofs_unit,
				p.offset / p.ofs_unit);
		segno = unit_no * p.ofs_unit;
		if (segno >= last_segment) {
			if (sm->last_victim[p.gc_mode]) {
				last_segment =
					sm->last_victim[p.gc_mode];
				sm->last_victim[p.gc_mode] = 0;
				p.offset = 0;
				continue;
			}
			break;
		}

		p.offset = segno + p.ofs_unit;
		nsearched++;

#ifdef CONFIG_F2FS_CHECK_FS
		/*
		 * skip selecting an invalid segno (i.e. one that previously
		 * failed the block validity check during GC) to avoid an
		 * endless GC loop in such cases.
		 */
		if (test_bit(segno, sm->invalid_segmap))
			goto next;
#endif

		secno = GET_SEC_FROM_SEG(sbi, segno);

		if (sec_usage_check(sbi, secno))
			goto next;

		/* Don't touch checkpointed data */
		if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
			if (p.alloc_mode == LFS) {
				/*
				 * LFS is set to find source section during GC.
				 * The victim should have no checkpointed data.
				 */
				if (get_ckpt_valid_blocks(sbi, segno, true))
					goto next;
			} else {
				/*
				 * SSR | AT_SSR are set to find a target
				 * segment for writes, which may be filled by
				 * checkpointed and newly written blocks.
				 */
				if (!f2fs_segment_has_free_slot(sbi, segno))
					goto next;
			}
		}

		if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
			goto next;

		if (gc_type == FG_GC && f2fs_section_is_pinned(dirty_i, secno))
			goto next;

		if (is_atgc) {
			add_victim_entry(sbi, &p, segno);
			goto next;
		}

		cost = get_gc_cost(sbi, segno, &p, valid_thresh_ratio);

		if (p.min_cost > cost) {
			p.min_segno = segno;
			p.min_cost = cost;
		}
next:
		if (nsearched >= p.max_search) {
			if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
				sm->last_victim[p.gc_mode] =
					last_victim + p.ofs_unit;
			else
				sm->last_victim[p.gc_mode] = segno + p.ofs_unit;
			sm->last_victim[p.gc_mode] %=
				(MAIN_SECS(sbi) * SEGS_PER_SEC(sbi));
			break;
		}
	}

	/* get victim for GC_AT/AT_SSR */
	if (is_atgc) {
		lookup_victim_by_age(sbi, &p);
		release_victim_entry(sbi);
	}

	if (is_atgc && p.min_segno == NULL_SEGNO &&
			sm->elapsed_time < p.age_threshold) {
		p.age_threshold = 0;
		goto retry;
	}

	if (p.min_segno != NULL_SEGNO) {
got_it:
		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
got_result:
		if (p.alloc_mode == LFS) {
			secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
			if (gc_type == FG_GC)
				sbi->cur_victim_sec = secno;
			else
				set_bit(secno, dirty_i->victim_secmap);
		}
		ret = 0;

	}
out:
	if (p.min_segno != NULL_SEGNO)
		trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
				sbi->cur_victim_sec,
				prefree_segments(sbi), free_segments(sbi));
	mutex_unlock(&dirty_i->seglist_lock);

	return ret;
}

static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
{
	struct inode_entry *ie;

	ie = radix_tree_lookup(&gc_list->iroot, ino);
	if (ie)
		return ie->inode;
	return NULL;
}

static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
{
	struct inode_entry *new_ie;

	if (inode == find_gc_inode(gc_list, inode->i_ino)) {
		iput(inode);
		return;
	}
	new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab,
					GFP_NOFS, true, NULL);
	new_ie->inode = inode;

	f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
	list_add_tail(&new_ie->list, &gc_list->ilist);
}

static void put_gc_inode(struct gc_inode_list *gc_list)
{
	struct inode_entry *ie, *next_ie;

	list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
		radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
		iput(ie->inode);
		list_del(&ie->list);
		kmem_cache_free(f2fs_inode_entry_slab, ie);
	}
}

static int check_valid_map(struct f2fs_sb_info *sbi,
				unsigned int segno, int offset)
{
	struct sit_info *sit_i = SIT_I(sbi);
	struct seg_entry *sentry;
	int ret;

	down_read(&sit_i->sentry_lock);
	sentry = get_seg_entry(sbi, segno);
	ret = f2fs_test_bit(offset, sentry->cur_valid_map);
	up_read(&sit_i->sentry_lock);
	return ret;
}

/*
 * This function compares the node address recorded in the summary with the
 * one in the NAT. If they match, the node is migrated with cold status;
 * otherwise (a stale node) it is ignored.
 */
static int gc_node_segment(struct f2fs_sb_info *sbi,
		struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
	struct f2fs_summary *entry;
	block_t start_addr;
	int off;
	int phase = 0;
	bool fggc = (gc_type == FG_GC);
	int submitted = 0;
	unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno);

	start_addr = START_BLOCK(sbi, segno);

next_step:
	entry = sum;

	if (fggc && phase == 2)
		atomic_inc(&sbi->wb_sync_req[NODE]);

	for (off = 0; off < usable_blks_in_seg; off++, entry++) {
		nid_t nid = le32_to_cpu(entry->nid);
		struct folio *node_folio;
		struct node_info ni;
		int err;

		/* stop BG_GC if there are not enough free sections. */
		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
			return submitted;

		if (check_valid_map(sbi, segno, off) == 0)
			continue;

		if (phase == 0) {
			f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
							META_NAT, true);
			continue;
		}

		if (phase == 1) {
			f2fs_ra_node_page(sbi, nid);
			continue;
		}

		/* phase == 2 */
		node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR);
		if (IS_ERR(node_folio))
			continue;

		/* block may become invalid during f2fs_get_node_folio */
		if (check_valid_map(sbi, segno, off) == 0) {
			f2fs_folio_put(node_folio, true);
			continue;
		}

		if (f2fs_get_node_info(sbi, nid, &ni, false)) {
			f2fs_folio_put(node_folio, true);
			continue;
		}

		if (ni.blk_addr != start_addr + off) {
			f2fs_folio_put(node_folio, true);
			continue;
		}

		err = f2fs_move_node_folio(node_folio, gc_type);
		if (!err && gc_type == FG_GC)
			submitted++;
		stat_inc_node_blk_count(sbi, 1, gc_type);
	}

	if (++phase < 3)
		goto next_step;

	if (fggc)
		atomic_dec(&sbi->wb_sync_req[NODE]);
	return submitted;
}

/*
 * Calculate the start block index that the given node offset maps to.
 * Be careful: callers must pass node offsets of direct node blocks only.
 * Passing an offset of any other node block type, such as an indirect or
 * double indirect node block, is a caller bug.
 */
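/*
 * Example, assuming the common 4KiB layout for a plain inode (roughly
 * ADDRS_PER_INODE == 923 and ADDRS_PER_BLOCK == NIDS_PER_BLOCK == 1018):
 * node_ofs 1 (the first direct node) maps to bidx 0 and start block 923,
 * node_ofs 2 maps to bidx 1 and start block 1941, while node_ofs 3 is an
 * indirect node and must not be passed in here.
 */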
block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
{
	unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
	unsigned int bidx;

	if (node_ofs == 0)
		return 0;

	if (node_ofs <= 2) {
		bidx = node_ofs - 1;
	} else if (node_ofs <= indirect_blks) {
		int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);

		bidx = node_ofs - 2 - dec;
	} else {
		int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);

		bidx = node_ofs - 5 - dec;
	}
	return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode);
}

static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
		struct node_info *dni, block_t blkaddr, unsigned int *nofs)
{
	struct folio *node_folio;
	nid_t nid;
	unsigned int ofs_in_node, max_addrs, base;
	block_t source_blkaddr;

	nid = le32_to_cpu(sum->nid);
	ofs_in_node = le16_to_cpu(sum->ofs_in_node);

	node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR);
	if (IS_ERR(node_folio))
		return false;

	if (f2fs_get_node_info(sbi, nid, dni, false)) {
		f2fs_folio_put(node_folio, true);
		return false;
	}

	if (sum->version != dni->version) {
		f2fs_warn(sbi, "%s: valid data with mismatched node version.",
			  __func__);
		set_sbi_flag(sbi, SBI_NEED_FSCK);
	}

	if (f2fs_check_nid_range(sbi, dni->ino)) {
		f2fs_folio_put(node_folio, true);
		return false;
	}

	if (IS_INODE(node_folio)) {
		base = offset_in_addr(F2FS_INODE(node_folio));
		max_addrs = DEF_ADDRS_PER_INODE;
	} else {
		base = 0;
		max_addrs = DEF_ADDRS_PER_BLOCK;
	}

	if (base + ofs_in_node >= max_addrs) {
		f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u",
			base, ofs_in_node, max_addrs, dni->ino, dni->nid);
		f2fs_folio_put(node_folio, true);
		return false;
	}

	*nofs = ofs_of_node(node_folio);
	source_blkaddr = data_blkaddr(NULL, node_folio, ofs_in_node);
	f2fs_folio_put(node_folio, true);

	if (source_blkaddr != blkaddr) {
#ifdef CONFIG_F2FS_CHECK_FS
		unsigned int segno = GET_SEGNO(sbi, blkaddr);
		unsigned long offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);

		if (unlikely(check_valid_map(sbi, segno, offset))) {
			if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) {
				f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u",
					 blkaddr, source_blkaddr, segno);
				set_sbi_flag(sbi, SBI_NEED_FSCK);
			}
		}
#endif
		return false;
	}
	return true;
}
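
/*
 * Readahead helper for the meta-mapped GC path (inodes for which
 * f2fs_meta_inode_gc_required() is true, e.g. encrypted files): read the
 * victim block into META_MAPPING so that the later move_data_block() call
 * can copy it without going through the normal (decrypting) read path.
 */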
static int ra_data_block(struct inode *inode, pgoff_t index)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	struct address_space *mapping = f2fs_is_cow_file(inode) ?
				F2FS_I(inode)->atomic_inode->i_mapping :
				inode->i_mapping;
	struct dnode_of_data dn;
	struct folio *folio, *efolio;
	struct f2fs_io_info fio = {
		.sbi = sbi,
		.ino = inode->i_ino,
		.type = DATA,
		.temp = COLD,
		.op = REQ_OP_READ,
		.op_flags = 0,
		.encrypted_page = NULL,
		.in_list = 0,
	};
	int err;

	folio = f2fs_grab_cache_folio(mapping, index, true);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (f2fs_lookup_read_extent_cache_block(inode, index,
						&dn.data_blkaddr)) {
		if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
						DATA_GENERIC_ENHANCE_READ))) {
			err = -EFSCORRUPTED;
			goto put_folio;
		}
		goto got_it;
	}

	set_new_dnode(&dn, inode, NULL, NULL, 0);
	err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
	if (err)
		goto put_folio;
	f2fs_put_dnode(&dn);

	if (!__is_valid_data_blkaddr(dn.data_blkaddr)) {
		err = -ENOENT;
		goto put_folio;
	}
	if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
						DATA_GENERIC_ENHANCE))) {
		err = -EFSCORRUPTED;
		goto put_folio;
	}
got_it:
	/* read folio */
	fio.folio = folio;
	fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;

	/*
	 * don't cache encrypted data into the meta inode until the previous
	 * dirty data has been written back, to avoid racing between GC and
	 * flush.
	 */
	f2fs_folio_wait_writeback(folio, DATA, true, true);

	f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);

	efolio = f2fs_filemap_get_folio(META_MAPPING(sbi), dn.data_blkaddr,
					FGP_LOCK | FGP_CREAT, GFP_NOFS);
	if (IS_ERR(efolio)) {
		err = PTR_ERR(efolio);
		goto put_folio;
	}

	fio.encrypted_page = &efolio->page;

	err = f2fs_submit_page_bio(&fio);
	if (err)
		goto put_encrypted_page;
	f2fs_put_page(fio.encrypted_page, false);
	f2fs_folio_put(folio, true);

	f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE);
	f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE);

	return 0;
put_encrypted_page:
	f2fs_put_page(fio.encrypted_page, true);
put_folio:
	f2fs_folio_put(folio, true);
	return err;
}

/*
 * Move data block via META_MAPPING while keeping locked data page.
 * This can be used to move blocks, aka LBAs, directly on disk.
 */
static int move_data_block(struct inode *inode, block_t bidx,
				int gc_type, unsigned int segno, int off)
{
	struct address_space *mapping = f2fs_is_cow_file(inode) ?
				F2FS_I(inode)->atomic_inode->i_mapping :
				inode->i_mapping;
	struct f2fs_io_info fio = {
		.sbi = F2FS_I_SB(inode),
		.ino = inode->i_ino,
		.type = DATA,
		.temp = COLD,
		.op = REQ_OP_READ,
		.op_flags = 0,
		.encrypted_page = NULL,
		.in_list = 0,
	};
	struct dnode_of_data dn;
	struct f2fs_summary sum;
	struct node_info ni;
	struct folio *folio, *mfolio, *efolio;
	block_t newaddr;
	int err = 0;
	bool lfs_mode = f2fs_lfs_mode(fio.sbi);
	int type = fio.sbi->am.atgc_enabled && (gc_type == BG_GC) &&
				(fio.sbi->gc_mode != GC_URGENT_HIGH) ?
				CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA;

	/* do not read out */
	folio = f2fs_grab_cache_folio(mapping, bidx, false);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
		err = -ENOENT;
		goto out;
	}

	err = f2fs_gc_pinned_control(inode, gc_type, segno);
	if (err)
		goto out;

	set_new_dnode(&dn, inode, NULL, NULL, 0);
	err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
	if (err)
		goto out;

	if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
		folio_clear_uptodate(folio);
		err = -ENOENT;
		goto put_out;
	}

	/*
	 * don't cache encrypted data into the meta inode until the previous
	 * dirty data has been written back, to avoid racing between GC and
	 * flush.
	 */
	f2fs_folio_wait_writeback(folio, DATA, true, true);

	f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);

	err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false);
	if (err)
		goto put_out;

	/* read page */
	fio.folio = folio;
	fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;

	if (lfs_mode)
		f2fs_down_write(&fio.sbi->io_order_lock);

	mfolio = f2fs_grab_cache_folio(META_MAPPING(fio.sbi),
					fio.old_blkaddr, false);
	if (IS_ERR(mfolio)) {
		err = PTR_ERR(mfolio);
		goto up_out;
	}

	fio.encrypted_page = folio_file_page(mfolio, fio.old_blkaddr);

	/* read source block in mfolio */
	if (!folio_test_uptodate(mfolio)) {
		err = f2fs_submit_page_bio(&fio);
		if (err) {
			f2fs_folio_put(mfolio, true);
			goto up_out;
		}

		f2fs_update_iostat(fio.sbi, inode, FS_DATA_READ_IO,
							F2FS_BLKSIZE);
		f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO,
							F2FS_BLKSIZE);

		folio_lock(mfolio);
		if (unlikely(!is_meta_folio(mfolio) ||
			     !folio_test_uptodate(mfolio))) {
			err = -EIO;
			f2fs_folio_put(mfolio, true);
			goto up_out;
		}
	}

	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);

	/* allocate block address */
	err = f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
					&sum, type, NULL);
	if (err) {
		f2fs_folio_put(mfolio, true);
		/* filesystem should shutdown, no need to recover the block */
		goto up_out;
	}

	efolio = f2fs_filemap_get_folio(META_MAPPING(fio.sbi), newaddr,
					FGP_LOCK | FGP_CREAT, GFP_NOFS);
	if (IS_ERR(efolio)) {
		err = PTR_ERR(efolio);
		f2fs_folio_put(mfolio, true);
		goto recover_block;
	}

	fio.encrypted_page = &efolio->page;

	/* write target block */
	f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true);
	memcpy(page_address(fio.encrypted_page),
				folio_address(mfolio), PAGE_SIZE);
	f2fs_folio_put(mfolio, true);

	f2fs_invalidate_internal_cache(fio.sbi, fio.old_blkaddr, 1);

	set_page_dirty(fio.encrypted_page);
	if (clear_page_dirty_for_io(fio.encrypted_page))
		dec_page_count(fio.sbi, F2FS_DIRTY_META);

	set_page_writeback(fio.encrypted_page);

	fio.op = REQ_OP_WRITE;
	fio.op_flags = REQ_SYNC;
	fio.new_blkaddr = newaddr;
	f2fs_submit_page_write(&fio);

	f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE);

	f2fs_update_data_blkaddr(&dn, newaddr);
	set_inode_flag(inode, FI_APPEND_WRITE);

	f2fs_put_page(fio.encrypted_page, true);
recover_block:
	if (err)
		f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
							true, true, true);
up_out:
	if (lfs_mode)
		f2fs_up_write(&fio.sbi->io_order_lock);
put_out:
	f2fs_put_dnode(&dn);
out:
	f2fs_folio_put(folio, true);
	return err;
}
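
/*
 * move_data_page() is the non-meta-mapped counterpart of move_data_block():
 * background GC only redirties the folio and tags it as being GC'ed so the
 * regular writeback path migrates it later, while foreground GC writes the
 * folio out immediately (REQ_SYNC) via f2fs_do_write_data_page().
 */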
static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
						unsigned int segno, int off)
{
	struct folio *folio;
	int err = 0;

	folio = f2fs_get_lock_data_folio(inode, bidx, true);
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
		err = -ENOENT;
		goto out;
	}

	err = f2fs_gc_pinned_control(inode, gc_type, segno);
	if (err)
		goto out;

	if (gc_type == BG_GC) {
		if (folio_test_writeback(folio)) {
			err = -EAGAIN;
			goto out;
		}
		folio_mark_dirty(folio);
		folio_set_f2fs_gcing(folio);
	} else {
		struct f2fs_io_info fio = {
			.sbi = F2FS_I_SB(inode),
			.ino = inode->i_ino,
			.type = DATA,
			.temp = COLD,
			.op = REQ_OP_WRITE,
			.op_flags = REQ_SYNC,
			.old_blkaddr = NULL_ADDR,
			.folio = folio,
			.encrypted_page = NULL,
			.need_lock = LOCK_REQ,
			.io_type = FS_GC_DATA_IO,
		};
		bool is_dirty = folio_test_dirty(folio);

retry:
		f2fs_folio_wait_writeback(folio, DATA, true, true);

		folio_mark_dirty(folio);
		if (folio_clear_dirty_for_io(folio)) {
			inode_dec_dirty_pages(inode);
			f2fs_remove_dirty_inode(inode);
		}

		folio_set_f2fs_gcing(folio);

		err = f2fs_do_write_data_page(&fio);
		if (err) {
			folio_clear_f2fs_gcing(folio);
			if (err == -ENOMEM) {
				memalloc_retry_wait(GFP_NOFS);
				goto retry;
			}
			if (is_dirty)
				folio_mark_dirty(folio);
		}
	}
out:
	f2fs_folio_put(folio, true);
	return err;
}

/*
 * This function tries to get the parent node of a victim data block and
 * checks whether the block is still valid. If it is, the block is copied
 * with cold status and the parent node is updated.
 * If the parent node is not valid or the data block address differs, the
 * victim data block is ignored.
 */
static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
		struct gc_inode_list *gc_list, unsigned int segno, int gc_type,
		bool force_migrate)
{
	struct super_block *sb = sbi->sb;
	struct f2fs_summary *entry;
	block_t start_addr;
	int off;
	int phase = 0;
	int submitted = 0;
	unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno);

	start_addr = START_BLOCK(sbi, segno);

next_step:
	entry = sum;

	for (off = 0; off < usable_blks_in_seg; off++, entry++) {
		struct inode *inode;
		struct node_info dni; /* dnode info for the data */
		unsigned int ofs_in_node, nofs;
		block_t start_bidx;
		nid_t nid = le32_to_cpu(entry->nid);

		/*
		 * stop BG_GC if there are not enough free sections.
		 * Also stop GC if the segment has become fully valid due to
		 * a race condition with SSR block allocation.
		 */
		if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
			(!force_migrate && get_valid_blocks(sbi, segno, true) ==
							CAP_BLKS_PER_SEC(sbi)))
			return submitted;

		if (check_valid_map(sbi, segno, off) == 0)
			continue;

		if (phase == 0) {
			f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
							META_NAT, true);
			continue;
		}

		if (phase == 1) {
			f2fs_ra_node_page(sbi, nid);
			continue;
		}

		/* Get an inode by ino with checking validity */
		if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
			continue;

		if (phase == 2) {
			f2fs_ra_node_page(sbi, dni.ino);
			continue;
		}

		ofs_in_node = le16_to_cpu(entry->ofs_in_node);

		if (phase == 3) {
			struct folio *data_folio;
			int err;

			inode = f2fs_iget(sb, dni.ino);
			if (IS_ERR(inode))
				continue;

			if (is_bad_inode(inode) ||
					special_file(inode->i_mode)) {
				iput(inode);
				continue;
			}

			if (f2fs_has_inline_data(inode)) {
				iput(inode);
				set_sbi_flag(sbi, SBI_NEED_FSCK);
				f2fs_err_ratelimited(sbi,
					"inode %lx has both inline_data flag and "
					"data block, nid=%u, ofs_in_node=%u",
					inode->i_ino, dni.nid, ofs_in_node);
				continue;
			}

			err = f2fs_gc_pinned_control(inode, gc_type, segno);
			if (err == -EAGAIN) {
				iput(inode);
				return submitted;
			}

			if (!f2fs_down_write_trylock(
				&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
				iput(inode);
				sbi->skipped_gc_rwsem++;
				continue;
			}

			start_bidx = f2fs_start_bidx_of_node(nofs, inode) +
								ofs_in_node;

			if (f2fs_meta_inode_gc_required(inode)) {
				int err = ra_data_block(inode, start_bidx);

				f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
				if (err) {
					iput(inode);
					continue;
				}
				add_gc_inode(gc_list, inode);
				continue;
			}

			data_folio = f2fs_get_read_data_folio(inode, start_bidx,
							REQ_RAHEAD, true, NULL);
			f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
			if (IS_ERR(data_folio)) {
				iput(inode);
				continue;
			}

			f2fs_folio_put(data_folio, false);
			add_gc_inode(gc_list, inode);
			continue;
		}

		/* phase 4 */
		inode = find_gc_inode(gc_list, dni.ino);
		if (inode) {
			struct f2fs_inode_info *fi = F2FS_I(inode);
			bool locked = false;
			int err;

			if (S_ISREG(inode->i_mode)) {
				if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[WRITE])) {
					sbi->skipped_gc_rwsem++;
					continue;
				}
				if (!f2fs_down_write_trylock(
						&fi->i_gc_rwsem[READ])) {
					sbi->skipped_gc_rwsem++;
					f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
					continue;
				}
				locked = true;

				/* wait for all inflight aio data */
				inode_dio_wait(inode);
			}

			start_bidx = f2fs_start_bidx_of_node(nofs, inode)
								+ ofs_in_node;
			if (f2fs_meta_inode_gc_required(inode))
				err = move_data_block(inode, start_bidx,
							gc_type, segno, off);
			else
				err = move_data_page(inode, start_bidx, gc_type,
								segno, off);

			if (!err && (gc_type == FG_GC ||
					f2fs_meta_inode_gc_required(inode)))
				submitted++;

			if (locked) {
				f2fs_up_write(&fi->i_gc_rwsem[READ]);
				f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
			}

			stat_inc_data_blk_count(sbi, 1, gc_type);
		}
	}

	if (++phase < 5)
		goto next_step;

	return submitted;
}

static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
			int gc_type, bool one_time)
{
	struct sit_info *sit_i = SIT_I(sbi);
	int ret;

	down_write(&sit_i->sentry_lock);
	ret = f2fs_get_victim(sbi, victim, gc_type, NO_CHECK_TYPE,
				LFS, 0, one_time);
	up_write(&sit_i->sentry_lock);
	return ret;
}

static int do_garbage_collect(struct f2fs_sb_info *sbi,
				unsigned int start_segno,
				struct gc_inode_list *gc_list, int gc_type,
				bool force_migrate, bool one_time)
{
	struct blk_plug plug;
	unsigned int segno = start_segno;
	unsigned int end_segno = start_segno + SEGS_PER_SEC(sbi);
	unsigned int sec_end_segno;
	int seg_freed = 0, migrated = 0;
	unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
						SUM_TYPE_DATA : SUM_TYPE_NODE;
	unsigned char data_type = (type == SUM_TYPE_DATA) ? DATA : NODE;
	int submitted = 0, sum_blk_cnt;

	if (__is_large_section(sbi)) {
		sec_end_segno = rounddown(end_segno, SEGS_PER_SEC(sbi));

		/*
		 * zone-capacity can be less than zone-size in zoned devices,
		 * resulting in fewer usable segments in the zone than
		 * expected, so calculate the end segno in the zone that can
		 * be garbage collected.
		 */
		if (f2fs_sb_has_blkzoned(sbi))
			sec_end_segno -= SEGS_PER_SEC(sbi) -
					f2fs_usable_segs_in_sec(sbi);

		if (gc_type == BG_GC || one_time) {
			unsigned int window_granularity =
				sbi->migration_window_granularity;

			if (f2fs_sb_has_blkzoned(sbi) &&
					!has_enough_free_blocks(sbi,
					sbi->gc_thread->boost_zoned_gc_percent))
				window_granularity *=
					sbi->gc_thread->boost_gc_multiple;

			end_segno = start_segno + window_granularity;
		}

		if (end_segno > sec_end_segno)
			end_segno = sec_end_segno;
	}

	sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type);

	segno = rounddown(segno, SUMS_PER_BLOCK);
	sum_blk_cnt = DIV_ROUND_UP(end_segno - segno, SUMS_PER_BLOCK);
	/* readahead multiple SSA blocks that have contiguous addresses */
	if (__is_large_section(sbi))
		f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
					sum_blk_cnt, META_SSA, true);

	/* reference all summary pages */
	while (segno < end_segno) {
		struct folio *sum_folio = f2fs_get_sum_folio(sbi, segno);

		segno += SUMS_PER_BLOCK;
		if (IS_ERR(sum_folio)) {
			int err = PTR_ERR(sum_folio);

			end_segno = segno - SUMS_PER_BLOCK;
			segno = rounddown(start_segno, SUMS_PER_BLOCK);
			while (segno < end_segno) {
				sum_folio = filemap_get_folio(META_MAPPING(sbi),
						GET_SUM_BLOCK(sbi, segno));
				folio_put_refs(sum_folio, 2);
				segno += SUMS_PER_BLOCK;
			}
			return err;
		}
		folio_unlock(sum_folio);
	}

	blk_start_plug(&plug);

	segno = start_segno;
	while (segno < end_segno) {
		unsigned int cur_segno;

		/* find segment summary of victim */
		struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi),
					GET_SUM_BLOCK(sbi, segno));
		unsigned int block_end_segno = rounddown(segno, SUMS_PER_BLOCK)
					+ SUMS_PER_BLOCK;

		if (block_end_segno > end_segno)
			block_end_segno = end_segno;

		if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) {
			f2fs_err(sbi, "%s: segment %u is used by log",
				 __func__, segno);
			f2fs_bug_on(sbi, 1);
			goto next_block;
		}

		if (!folio_test_uptodate(sum_folio) ||
				unlikely(f2fs_cp_error(sbi)))
			goto next_block;

		for (cur_segno = segno; cur_segno < block_end_segno;
							cur_segno++) {
			struct f2fs_summary_block *sum;

			if (get_valid_blocks(sbi, cur_segno, false) == 0)
				goto freed;
			if (gc_type == BG_GC && __is_large_section(sbi) &&
					migrated >= sbi->migration_granularity)
				continue;

			sum = SUM_BLK_PAGE_ADDR(sum_folio, cur_segno);
			if (type != GET_SUM_TYPE((&sum->footer))) {
				f2fs_err(sbi, "Inconsistent segment (%u) type "
					 "[%d, %d] in SSA and SIT",
					 cur_segno, type,
					 GET_SUM_TYPE((&sum->footer)));
				f2fs_stop_checkpoint(sbi, false,
					STOP_CP_REASON_CORRUPTED_SUMMARY);
				continue;
			}

			/*
			 * this is to avoid deadlock:
			 * - lock_page(sum_page)         - f2fs_replace_block
			 *  - check_valid_map()            - down_write(sentry_lock)
			 *   - down_read(sentry_lock)     - change_curseg()
			 *                                  - lock_page(sum_page)
			 */
			if (type == SUM_TYPE_NODE)
				submitted += gc_node_segment(sbi, sum->entries,
							cur_segno, gc_type);
			else
				submitted += gc_data_segment(sbi, sum->entries,
							gc_list, cur_segno,
							gc_type, force_migrate);

			stat_inc_gc_seg_count(sbi, data_type, gc_type);
			sbi->gc_reclaimed_segs[sbi->gc_mode]++;
			migrated++;

freed:
			if (gc_type == FG_GC &&
				get_valid_blocks(sbi, cur_segno, false) == 0)
				seg_freed++;

			if (__is_large_section(sbi))
				sbi->next_victim_seg[gc_type] =
					(cur_segno + 1 < sec_end_segno) ?
						cur_segno + 1 : NULL_SEGNO;
		}
next_block:
		folio_put_refs(sum_folio, 2);
		segno = block_end_segno;
	}

	if (submitted)
		f2fs_submit_merged_write(sbi, data_type);

	blk_finish_plug(&plug);

	if (migrated)
		stat_inc_gc_sec_count(sbi, data_type, gc_type);

	return seg_freed;
}

int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control)
{
	int gc_type = gc_control->init_gc_type;
	unsigned int segno = gc_control->victim_segno;
	int sec_freed = 0, seg_freed = 0, total_freed = 0, total_sec_freed = 0;
	int ret = 0;
	struct cp_control cpc;
	struct gc_inode_list gc_list = {
		.ilist = LIST_HEAD_INIT(gc_list.ilist),
		.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
	};
	unsigned int skipped_round = 0, round = 0;
	unsigned int upper_secs;

	trace_f2fs_gc_begin(sbi->sb, gc_type, gc_control->no_bg_gc,
				gc_control->nr_free_secs,
				get_pages(sbi, F2FS_DIRTY_NODES),
				get_pages(sbi, F2FS_DIRTY_DENTS),
				get_pages(sbi, F2FS_DIRTY_IMETA),
				free_sections(sbi),
				free_segments(sbi),
				reserved_segments(sbi),
				prefree_segments(sbi));

	cpc.reason = __get_cp_reason(sbi);
gc_more:
	sbi->skipped_gc_rwsem = 0;
	if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
		ret = -EINVAL;
		goto stop;
	}
	if (unlikely(f2fs_cp_error(sbi))) {
		ret = -EIO;
		goto stop;
	}

	/* Let's run FG_GC, if we don't have enough space. */
	if (has_not_enough_free_secs(sbi, 0, 0)) {
		gc_type = FG_GC;
		gc_control->one_time = false;

		/*
		 * For example, if there are many prefree_segments below given
		 * threshold, we can make them free by checkpoint. Then, we
		 * secure free segments which don't need FGGC any more.
		 */
		if (prefree_segments(sbi)) {
			stat_inc_cp_call_count(sbi, TOTAL_CALL);
			ret = f2fs_write_checkpoint(sbi, &cpc);
			if (ret)
				goto stop;
			/* Reset due to checkpoint */
			sec_freed = 0;
		}
	}

	/* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
	if (gc_type == BG_GC && gc_control->no_bg_gc) {
		ret = -EINVAL;
		goto stop;
	}
retry:
	ret = __get_victim(sbi, &segno, gc_type, gc_control->one_time);
	if (ret) {
		/* allow searching for a victim in sections that have pinned data */
		if (ret == -ENODATA && gc_type == FG_GC &&
				f2fs_pinned_section_exists(DIRTY_I(sbi))) {
			f2fs_unpin_all_sections(sbi, false);
			goto retry;
		}
		goto stop;
	}

	seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type,
				gc_control->should_migrate_blocks,
				gc_control->one_time);
	if (seg_freed < 0)
		goto stop;

	total_freed += seg_freed;

	if (seg_freed == f2fs_usable_segs_in_sec(sbi)) {
		sec_freed++;
		total_sec_freed++;
	}

	if (gc_control->one_time)
		goto stop;

	if (gc_type == FG_GC) {
		sbi->cur_victim_sec = NULL_SEGNO;

		if (has_enough_free_secs(sbi, sec_freed, 0)) {
			if (!gc_control->no_bg_gc &&
			    total_sec_freed < gc_control->nr_free_secs)
				goto go_gc_more;
			goto stop;
		}
		if (sbi->skipped_gc_rwsem)
			skipped_round++;
		round++;
		if (skipped_round > MAX_SKIP_GC_COUNT &&
				skipped_round * 2 >= round) {
			stat_inc_cp_call_count(sbi, TOTAL_CALL);
			ret = f2fs_write_checkpoint(sbi, &cpc);
			goto stop;
		}
	} else if (has_enough_free_secs(sbi, 0, 0)) {
		goto stop;
	}

	__get_secs_required(sbi, NULL, &upper_secs, NULL);

	/*
	 * Write checkpoint to reclaim prefree segments.
	 * We need three extra sections for the writer's data/node/dentry.
	 */
	if (free_sections(sbi) <= upper_secs + NR_GC_CHECKPOINT_SECS &&
				prefree_segments(sbi)) {
		stat_inc_cp_call_count(sbi, TOTAL_CALL);
		ret = f2fs_write_checkpoint(sbi, &cpc);
		if (ret)
			goto stop;
		/* Reset due to checkpoint */
		sec_freed = 0;
	}
go_gc_more:
	segno = NULL_SEGNO;
	goto gc_more;

stop:
	SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
	SIT_I(sbi)->last_victim[FLUSH_DEVICE] = gc_control->victim_segno;

	if (gc_type == FG_GC)
		f2fs_unpin_all_sections(sbi, true);

	trace_f2fs_gc_end(sbi->sb, ret, total_freed, total_sec_freed,
				get_pages(sbi, F2FS_DIRTY_NODES),
				get_pages(sbi, F2FS_DIRTY_DENTS),
				get_pages(sbi, F2FS_DIRTY_IMETA),
				free_sections(sbi),
				free_segments(sbi),
				reserved_segments(sbi),
				prefree_segments(sbi));

	f2fs_up_write(&sbi->gc_lock);

	put_gc_inode(&gc_list);

	if (gc_control->err_gc_skipped && !ret)
		ret = total_sec_freed ? 0 : -EAGAIN;
	return ret;
}

int __init f2fs_create_garbage_collection_cache(void)
{
	victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry",
					sizeof(struct victim_entry));
	return victim_entry_slab ? 0 : -ENOMEM;
}
int f2fs_gc_range(struct f2fs_sb_info *sbi,
		unsigned int start_seg, unsigned int end_seg,
		bool dry_run, unsigned int dry_run_sections)
{
	unsigned int segno;
	unsigned int gc_secs = dry_run_sections;

	if (unlikely(f2fs_cp_error(sbi)))
		return -EIO;

	for (segno = start_seg; segno <= end_seg; segno += SEGS_PER_SEC(sbi)) {
		struct gc_inode_list gc_list = {
			.ilist = LIST_HEAD_INIT(gc_list.ilist),
			.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
		};

		/*
		 * avoid migrating an empty section, as it can be allocated by
		 * a log in parallel.
		 */
		if (!get_valid_blocks(sbi, segno, true))
			continue;

		if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno)))
			continue;

		do_garbage_collect(sbi, segno, &gc_list, FG_GC, true, false);
		put_gc_inode(&gc_list);

		if (!dry_run && get_valid_blocks(sbi, segno, true))
			return -EAGAIN;
		if (dry_run && dry_run_sections &&
			!get_valid_blocks(sbi, segno, true) && --gc_secs == 0)
			break;

		if (fatal_signal_pending(current))
			return -ERESTARTSYS;
	}

	return 0;
}

static int free_segment_range(struct f2fs_sb_info *sbi,
				unsigned int secs, bool dry_run)
{
	unsigned int next_inuse, start, end;
	struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
	int gc_mode, gc_type;
	int err = 0;
	int type;

	/* Force block allocation for GC */
	MAIN_SECS(sbi) -= secs;
	start = MAIN_SECS(sbi) * SEGS_PER_SEC(sbi);
	end = MAIN_SEGS(sbi) - 1;

	mutex_lock(&DIRTY_I(sbi)->seglist_lock);
	for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++)
		if (SIT_I(sbi)->last_victim[gc_mode] >= start)
			SIT_I(sbi)->last_victim[gc_mode] = 0;

	for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++)
		if (sbi->next_victim_seg[gc_type] >= start)
			sbi->next_victim_seg[gc_type] = NULL_SEGNO;
	mutex_unlock(&DIRTY_I(sbi)->seglist_lock);

	/* Move out cursegs from the target range */
	for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++) {
		err = f2fs_allocate_segment_for_resize(sbi, type, start, end);
		if (err)
			goto out;
	}

	/* do GC to move out valid blocks in the range */
	err = f2fs_gc_range(sbi, start, end, dry_run, 0);
	if (err || dry_run)
		goto out;

	stat_inc_cp_call_count(sbi, TOTAL_CALL);
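	/*
	 * Checkpoint here to persist the curseg moves and GC results above,
	 * so that the in-use scan below sees the emptied range.
	 */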
	err = f2fs_write_checkpoint(sbi, &cpc);
	if (err)
		goto out;

	next_inuse = find_next_inuse(FREE_I(sbi), end + 1, start);
	if (next_inuse <= end) {
		f2fs_err(sbi, "segno %u should be free but still inuse!",
			 next_inuse);
		f2fs_bug_on(sbi, 1);
	}
out:
	MAIN_SECS(sbi) += secs;
	return err;
}

static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
{
	struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi);
	int section_count;
	int segment_count;
	int segment_count_main;
	long long block_count;
	int segs = secs * SEGS_PER_SEC(sbi);

	f2fs_down_write(&sbi->sb_lock);

	section_count = le32_to_cpu(raw_sb->section_count);
	segment_count = le32_to_cpu(raw_sb->segment_count);
	segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
	block_count = le64_to_cpu(raw_sb->block_count);

	raw_sb->section_count = cpu_to_le32(section_count + secs);
	raw_sb->segment_count = cpu_to_le32(segment_count + segs);
	raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
	raw_sb->block_count = cpu_to_le64(block_count +
			(long long)SEGS_TO_BLKS(sbi, segs));
	if (f2fs_is_multi_device(sbi)) {
		int last_dev = sbi->s_ndevs - 1;
		int dev_segs =
			le32_to_cpu(raw_sb->devs[last_dev].total_segments);

		raw_sb->devs[last_dev].total_segments =
						cpu_to_le32(dev_segs + segs);
	}

	f2fs_up_write(&sbi->sb_lock);
}

static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
{
	int segs = secs * SEGS_PER_SEC(sbi);
	long long blks = SEGS_TO_BLKS(sbi, segs);
	long long user_block_count =
				le64_to_cpu(F2FS_CKPT(sbi)->user_block_count);

	SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs;
	MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs;
	MAIN_SECS(sbi) += secs;
	if (sbi->allocate_section_hint > MAIN_SECS(sbi))
		sbi->allocate_section_hint = MAIN_SECS(sbi);
	FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs;
	FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs;
	F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks);

	if (f2fs_is_multi_device(sbi)) {
		int last_dev = sbi->s_ndevs - 1;

		sbi->allocate_section_hint = FDEV(0).total_segments /
							SEGS_PER_SEC(sbi);

		FDEV(last_dev).total_segments =
				(int)FDEV(last_dev).total_segments + segs;
		FDEV(last_dev).end_blk =
				(long long)FDEV(last_dev).end_blk + blks;
#ifdef CONFIG_BLK_DEV_ZONED
		FDEV(last_dev).nr_blkz = FDEV(last_dev).nr_blkz +
					div_u64(blks, sbi->blocks_per_blkz);
#endif
	}
}

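/*
 * f2fs_resize_fs() shrinks the filesystem to block_count blocks; growing
 * is rejected with -EINVAL, and the new size must be section-aligned.
 * The range being removed is first emptied in a dry run under gc_lock and
 * f2fs_lock_op(), then the filesystem is frozen and the shrink is done for
 * real: free the range, update the on-disk super block, update the
 * in-memory metadata, and write a checkpoint.  A failure in the real pass
 * rolls the metadata back and marks the filesystem with SBI_NEED_FSCK.
 */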
int f2fs_resize_fs(struct file *filp, __u64 block_count)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp));
	__u64 old_block_count, shrunk_blocks;
	struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
	unsigned int secs;
	int err = 0;
	__u32 rem;

	old_block_count = le64_to_cpu(F2FS_RAW_SUPER(sbi)->block_count);
	if (block_count > old_block_count)
		return -EINVAL;

	if (f2fs_is_multi_device(sbi)) {
		int last_dev = sbi->s_ndevs - 1;
		__u64 last_segs = FDEV(last_dev).total_segments;

		if (block_count + SEGS_TO_BLKS(sbi, last_segs) <=
								old_block_count)
			return -EINVAL;
	}

	/* the new fs size should align to the section size */
	div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem);
	if (rem)
		return -EINVAL;

	if (block_count == old_block_count)
		return 0;

	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
		f2fs_err(sbi, "Should run fsck to repair first.");
		return -EFSCORRUPTED;
	}

	if (test_opt(sbi, DISABLE_CHECKPOINT)) {
		f2fs_err(sbi, "Checkpoint should be enabled.");
		return -EINVAL;
	}

	err = mnt_want_write_file(filp);
	if (err)
		return err;

	shrunk_blocks = old_block_count - block_count;
	secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi));

	/* stop other GC */
	if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
		err = -EAGAIN;
		goto out_drop_write;
	}

	/* stop CP to protect MAIN_SEC in free_segment_range */
	f2fs_lock_op(sbi);

	spin_lock(&sbi->stat_lock);
	if (shrunk_blocks + valid_user_blocks(sbi) +
		sbi->current_reserved_blocks + sbi->unusable_block_count +
		F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count)
		err = -ENOSPC;
	spin_unlock(&sbi->stat_lock);

	if (err)
		goto out_unlock;

	err = free_segment_range(sbi, secs, true);

out_unlock:
	f2fs_unlock_op(sbi);
	f2fs_up_write(&sbi->gc_lock);
out_drop_write:
	mnt_drop_write_file(filp);
	if (err)
		return err;

	err = freeze_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL);
	if (err)
		return err;

	if (f2fs_readonly(sbi->sb)) {
		err = thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL);
		if (err)
			return err;
		return -EROFS;
	}

	f2fs_down_write(&sbi->gc_lock);
	f2fs_down_write(&sbi->cp_global_sem);

	spin_lock(&sbi->stat_lock);
	if (shrunk_blocks + valid_user_blocks(sbi) +
		sbi->current_reserved_blocks + sbi->unusable_block_count +
		F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count)
		err = -ENOSPC;
	else
		sbi->user_block_count -= shrunk_blocks;
	spin_unlock(&sbi->stat_lock);
	if (err)
		goto out_err;

	set_sbi_flag(sbi, SBI_IS_RESIZEFS);
	err = free_segment_range(sbi, secs, false);
	if (err)
		goto recover_out;

	update_sb_metadata(sbi, -secs);

	err = f2fs_commit_super(sbi, false);
	if (err) {
		update_sb_metadata(sbi, secs);
		goto recover_out;
	}

	update_fs_metadata(sbi, -secs);
	clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
	set_sbi_flag(sbi, SBI_IS_DIRTY);

	stat_inc_cp_call_count(sbi, TOTAL_CALL);
	err = f2fs_write_checkpoint(sbi, &cpc);
	if (err) {
		update_fs_metadata(sbi, secs);
		update_sb_metadata(sbi, secs);
		f2fs_commit_super(sbi, false);
	}
recover_out:
	clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
	if (err) {
		set_sbi_flag(sbi, SBI_NEED_FSCK);
		f2fs_err(sbi, "resize_fs failed, should run fsck to repair!");

		spin_lock(&sbi->stat_lock);
		sbi->user_block_count += shrunk_blocks;
		spin_unlock(&sbi->stat_lock);
	}
out_err:
	f2fs_up_write(&sbi->cp_global_sem);
	f2fs_up_write(&sbi->gc_lock);
	thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL);
	return err;
}