/*
 * fs/f2fs/gc.c
 *
 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
 *             http://www.samsung.com/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/f2fs_fs.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/freezer.h>

#include "f2fs.h"
#include "node.h"
#include "segment.h"
#include "gc.h"
#include <trace/events/f2fs.h>

static int gc_thread_func(void *data)
{
	struct f2fs_sb_info *sbi = data;
	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
	wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
	unsigned int wait_ms;

	wait_ms = gc_th->min_sleep_time;

	set_freezable();
	do {
		wait_event_interruptible_timeout(*wq,
				kthread_should_stop() || freezing(current) ||
				gc_th->gc_wake,
				msecs_to_jiffies(wait_ms));

		/* give it a try one time */
		if (gc_th->gc_wake)
			gc_th->gc_wake = 0;

		if (try_to_freeze())
			continue;
		if (kthread_should_stop())
			break;

		if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
			increase_sleep_time(gc_th, &wait_ms);
			continue;
		}

#ifdef CONFIG_F2FS_FAULT_INJECTION
		if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
			f2fs_show_injection_info(FAULT_CHECKPOINT);
			f2fs_stop_checkpoint(sbi, false);
		}
#endif

		if (!sb_start_write_trylock(sbi->sb))
			continue;

		/*
		 * [GC triggering condition]
		 * 0. GC is not conducted currently.
		 * 1. There are enough dirty segments.
		 * 2. IO subsystem is idle by checking the # of writeback pages.
		 * 3. IO subsystem is idle by checking the # of requests in
		 *    bdev's request list.
		 *
		 * Note) We have to avoid triggering GCs frequently, because
		 * some segments may be invalidated soon after by user update
		 * or deletion. So we'd like to wait some time to collect
		 * dirty segments.
		 */
		if (sbi->gc_mode == GC_URGENT) {
			wait_ms = gc_th->urgent_sleep_time;
			mutex_lock(&sbi->gc_mutex);
			goto do_gc;
		}

		if (!mutex_trylock(&sbi->gc_mutex))
			goto next;

		if (!is_idle(sbi)) {
			increase_sleep_time(gc_th, &wait_ms);
			mutex_unlock(&sbi->gc_mutex);
			goto next;
		}

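		/*
		 * Adapt the polling interval: sleep less once enough invalid
		 * blocks have accumulated, and back off while the filesystem
		 * still looks clean.
		 */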
		if (has_enough_invalid_blocks(sbi))
			decrease_sleep_time(gc_th, &wait_ms);
		else
			increase_sleep_time(gc_th, &wait_ms);
do_gc:
		stat_inc_bggc_count(sbi);

		/* if return value is not zero, no victim was selected */
		if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
			wait_ms = gc_th->no_gc_sleep_time;

		trace_f2fs_background_gc(sbi->sb, wait_ms,
				prefree_segments(sbi), free_segments(sbi));

		/* balancing f2fs's metadata periodically */
		f2fs_balance_fs_bg(sbi);
next:
		sb_end_write(sbi->sb);

	} while (!kthread_should_stop());
	return 0;
}

int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
{
	struct f2fs_gc_kthread *gc_th;
	dev_t dev = sbi->sb->s_bdev->bd_dev;
	int err = 0;

	gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
	if (!gc_th) {
		err = -ENOMEM;
		goto out;
	}

	gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
	gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
	gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
	gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;

	gc_th->gc_wake = 0;

	sbi->gc_thread = gc_th;
	init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
	sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
			"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
	if (IS_ERR(gc_th->f2fs_gc_task)) {
		err = PTR_ERR(gc_th->f2fs_gc_task);
		kfree(gc_th);
		sbi->gc_thread = NULL;
	}
out:
	return err;
}

void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
{
	struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
	if (!gc_th)
		return;
	kthread_stop(gc_th->f2fs_gc_task);
	kfree(gc_th);
	sbi->gc_thread = NULL;
}

static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
{
	int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY;

	switch (sbi->gc_mode) {
	case GC_IDLE_CB:
		gc_mode = GC_CB;
		break;
	case GC_IDLE_GREEDY:
	case GC_URGENT:
		gc_mode = GC_GREEDY;
		break;
	}
	return gc_mode;
}

static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
			int type, struct victim_sel_policy *p)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);

	if (p->alloc_mode == SSR) {
		p->gc_mode = GC_GREEDY;
		p->dirty_segmap = dirty_i->dirty_segmap[type];
		p->max_search = dirty_i->nr_dirty[type];
		p->ofs_unit = 1;
	} else {
		p->gc_mode = select_gc_type(sbi, gc_type);
		p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
		p->max_search = dirty_i->nr_dirty[DIRTY];
		p->ofs_unit = sbi->segs_per_sec;
	}

	/* we need to check every dirty segment in the FG_GC case */
	if (gc_type != FG_GC &&
			(sbi->gc_mode != GC_URGENT) &&
			p->max_search > sbi->max_victim_search)
		p->max_search = sbi->max_victim_search;

	/* let's select the beginning hot/small space first in no_heap mode */
	if (test_opt(sbi, NOHEAP) &&
		(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
		p->offset = 0;
	else
		p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
}

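/*
 * The value returned here seeds p->min_cost for the victim search: a
 * candidate is only taken when its cost drops below the current minimum,
 * so this is the per-policy upper bound on acceptable costs.
 */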
static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
				struct victim_sel_policy *p)
{
	/* SSR allocates in a segment unit */
	if (p->alloc_mode == SSR)
		return sbi->blocks_per_seg;
	if (p->gc_mode == GC_GREEDY)
		return 2 * sbi->blocks_per_seg * p->ofs_unit;
	else if (p->gc_mode == GC_CB)
		return UINT_MAX;
	else /* No other gc_mode */
		return 0;
}

static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	unsigned int secno;

	/*
	 * If the gc_type is FG_GC, we can select victim segments
	 * selected by background GC before.
	 * Those segments are guaranteed to have few valid blocks.
	 */
	for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
		if (sec_usage_check(sbi, secno))
			continue;
		clear_bit(secno, dirty_i->victim_secmap);
		return GET_SEG_FROM_SEC(sbi, secno);
	}
	return NULL_SEGNO;
}

static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
{
	struct sit_info *sit_i = SIT_I(sbi);
	unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
	unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
	unsigned long long mtime = 0;
	unsigned int vblocks;
	unsigned char age = 0;
	unsigned char u;
	unsigned int i;

	for (i = 0; i < sbi->segs_per_sec; i++)
		mtime += get_seg_entry(sbi, start + i)->mtime;
	vblocks = get_valid_blocks(sbi, segno, true);

	mtime = div_u64(mtime, sbi->segs_per_sec);
	vblocks = div_u64(vblocks, sbi->segs_per_sec);

	u = (vblocks * 100) >> sbi->log_blocks_per_seg;

	/* Handle the case where the system time was changed by the user */
	if (mtime < sit_i->min_mtime)
		sit_i->min_mtime = mtime;
	if (mtime > sit_i->max_mtime)
		sit_i->max_mtime = mtime;
	if (sit_i->max_mtime != sit_i->min_mtime)
		age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
				sit_i->max_mtime - sit_i->min_mtime);

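	/*
	 * Classic cost-benefit policy: the benefit of cleaning this section
	 * is age * (100 - u) and the cost is (100 + u), where u is the
	 * utilization in percent.  Subtracting from UINT_MAX makes the
	 * section with the best benefit/cost ratio the one with the
	 * smallest GC cost.
	 */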
	return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}

static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
			unsigned int segno, struct victim_sel_policy *p)
{
	if (p->alloc_mode == SSR)
		return get_seg_entry(sbi, segno)->ckpt_valid_blocks;

	/* alloc_mode == LFS */
	if (p->gc_mode == GC_GREEDY)
		return get_valid_blocks(sbi, segno, true);
	else
		return get_cb_cost(sbi, segno);
}

static unsigned int count_bits(const unsigned long *addr,
				unsigned int offset, unsigned int len)
{
	unsigned int end = offset + len, sum = 0;

	while (offset < end) {
		if (test_bit(offset++, addr))
			++sum;
	}
	return sum;
}

/*
 * This function is called from two paths.
 * One is garbage collection and the other is SSR segment selection.
 * When it is called during GC, it just gets a victim segment
 * and does not remove it from the dirty seglist.
 * When it is called from SSR segment selection, it finds the segment
 * with the minimum number of valid blocks and removes it from the
 * dirty seglist.
 */
static int get_victim_by_default(struct f2fs_sb_info *sbi,
		unsigned int *result, int gc_type, int type, char alloc_mode)
{
	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
	struct sit_info *sm = SIT_I(sbi);
	struct victim_sel_policy p;
	unsigned int secno, last_victim;
	unsigned int last_segment = MAIN_SEGS(sbi);
	unsigned int nsearched = 0;

	mutex_lock(&dirty_i->seglist_lock);

	p.alloc_mode = alloc_mode;
	select_policy(sbi, gc_type, type, &p);

	p.min_segno = NULL_SEGNO;
	p.min_cost = get_max_cost(sbi, &p);

	if (*result != NULL_SEGNO) {
		if (IS_DATASEG(get_seg_entry(sbi, *result)->type) &&
			get_valid_blocks(sbi, *result, false) &&
			!sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
			p.min_segno = *result;
		goto out;
	}

	if (p.max_search == 0)
		goto out;

	last_victim = sm->last_victim[p.gc_mode];
	if (p.alloc_mode == LFS && gc_type == FG_GC) {
		p.min_segno = check_bg_victims(sbi);
		if (p.min_segno != NULL_SEGNO)
			goto got_it;
	}

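	/*
	 * Scan the dirty segmap starting from last_victim[gc_mode].  When
	 * the scan reaches the end of the main area, wrap around to segment
	 * 0 and rescan up to the old last_victim, so the search is circular.
	 */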
	while (1) {
		unsigned long cost;
		unsigned int segno;

		segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
		if (segno >= last_segment) {
			if (sm->last_victim[p.gc_mode]) {
				last_segment =
					sm->last_victim[p.gc_mode];
				sm->last_victim[p.gc_mode] = 0;
				p.offset = 0;
				continue;
			}
			break;
		}

		p.offset = segno + p.ofs_unit;
		if (p.ofs_unit > 1) {
			p.offset -= segno % p.ofs_unit;
			nsearched += count_bits(p.dirty_segmap,
						p.offset - p.ofs_unit,
						p.ofs_unit);
		} else {
			nsearched++;
		}

		secno = GET_SEC_FROM_SEG(sbi, segno);

		if (sec_usage_check(sbi, secno))
			goto next;
		if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
			goto next;

		cost = get_gc_cost(sbi, segno, &p);

		if (p.min_cost > cost) {
			p.min_segno = segno;
			p.min_cost = cost;
		}
next:
		if (nsearched >= p.max_search) {
			if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
				sm->last_victim[p.gc_mode] = last_victim + 1;
			else
				sm->last_victim[p.gc_mode] = segno + 1;
			sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi);
			break;
		}
	}
	if (p.min_segno != NULL_SEGNO) {
got_it:
		if (p.alloc_mode == LFS) {
			secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
			if (gc_type == FG_GC)
				sbi->cur_victim_sec = secno;
			else
				set_bit(secno, dirty_i->victim_secmap);
		}
		*result = (p.min_segno / p.ofs_unit) * p.ofs_unit;

		trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
				sbi->cur_victim_sec,
				prefree_segments(sbi), free_segments(sbi));
	}
out:
	mutex_unlock(&dirty_i->seglist_lock);

	return (p.min_segno == NULL_SEGNO) ? 0 : 1;
}

static const struct victim_selection default_v_ops = {
	.get_victim = get_victim_by_default,
};

static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
{
	struct inode_entry *ie;

	ie = radix_tree_lookup(&gc_list->iroot, ino);
	if (ie)
		return ie->inode;
	return NULL;
}

static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
{
	struct inode_entry *new_ie;

	if (inode == find_gc_inode(gc_list, inode->i_ino)) {
		iput(inode);
		return;
	}
	new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS);
	new_ie->inode = inode;

	f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
	list_add_tail(&new_ie->list, &gc_list->ilist);
}

static void put_gc_inode(struct gc_inode_list *gc_list)
{
	struct inode_entry *ie, *next_ie;
	list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
		radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
		iput(ie->inode);
		list_del(&ie->list);
		kmem_cache_free(f2fs_inode_entry_slab, ie);
	}
}

static int check_valid_map(struct f2fs_sb_info *sbi,
				unsigned int segno, int offset)
{
	struct sit_info *sit_i = SIT_I(sbi);
	struct seg_entry *sentry;
	int ret;

	down_read(&sit_i->sentry_lock);
	sentry = get_seg_entry(sbi, segno);
	ret = f2fs_test_bit(offset, sentry->cur_valid_map);
	up_read(&sit_i->sentry_lock);
	return ret;
}

/*
 * This function compares the node address recorded in the summary with the
 * one in the NAT. If the address is valid, the node is migrated with cold
 * status; otherwise (an invalid node) it is ignored.
 */
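/*
 * The victim segment is walked three times: phase 0 issues readahead for
 * the NAT blocks of all valid entries, phase 1 reads ahead the node pages
 * themselves, and phase 2 re-checks validity and migrates the node pages.
 */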
static void gc_node_segment(struct f2fs_sb_info *sbi,
		struct f2fs_summary *sum, unsigned int segno, int gc_type)
{
	struct f2fs_summary *entry;
	block_t start_addr;
	int off;
	int phase = 0;
	bool fggc = (gc_type == FG_GC);

	start_addr = START_BLOCK(sbi, segno);

next_step:
	entry = sum;

	if (fggc && phase == 2)
		atomic_inc(&sbi->wb_sync_req[NODE]);

	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
		nid_t nid = le32_to_cpu(entry->nid);
		struct page *node_page;
		struct node_info ni;

		/* stop BG_GC if there are not enough free sections. */
		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
			return;

		if (check_valid_map(sbi, segno, off) == 0)
			continue;

		if (phase == 0) {
			f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
							META_NAT, true);
			continue;
		}

		if (phase == 1) {
			f2fs_ra_node_page(sbi, nid);
			continue;
		}

		/* phase == 2 */
		node_page = f2fs_get_node_page(sbi, nid);
		if (IS_ERR(node_page))
			continue;

		/* block may become invalid during f2fs_get_node_page */
		if (check_valid_map(sbi, segno, off) == 0) {
			f2fs_put_page(node_page, 1);
			continue;
		}

		f2fs_get_node_info(sbi, nid, &ni);
		if (ni.blk_addr != start_addr + off) {
			f2fs_put_page(node_page, 1);
			continue;
		}

		f2fs_move_node_page(node_page, gc_type);
		stat_inc_node_blk_count(sbi, 1, gc_type);
	}

	if (++phase < 3)
		goto next_step;

	if (fggc)
		atomic_dec(&sbi->wb_sync_req[NODE]);
}

/*
 * Calculate the start block index that corresponds to the given node offset.
 * Be careful: the caller should pass only node offsets that indicate direct
 * node blocks. If a node offset pointing to another type of node block, such
 * as an indirect or double indirect node block, is given, it is a caller's
 * bug.
 */
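/*
 * Node offsets are assigned in file-offset order: 0 is the inode itself,
 * 1 and 2 are the two direct nodes, followed by the two indirect trees and
 * finally the double indirect tree.  The "dec" terms below subtract the
 * indirect/double indirect node blocks counted so far, so that bidx ends up
 * counting direct node blocks only.
 */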
block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
{
	unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
	unsigned int bidx;

	if (node_ofs == 0)
		return 0;

	if (node_ofs <= 2) {
		bidx = node_ofs - 1;
	} else if (node_ofs <= indirect_blks) {
		int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
		bidx = node_ofs - 2 - dec;
	} else {
		int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
		bidx = node_ofs - 5 - dec;
	}
	return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode);
}

static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
		struct node_info *dni, block_t blkaddr, unsigned int *nofs)
{
	struct page *node_page;
	nid_t nid;
	unsigned int ofs_in_node;
	block_t source_blkaddr;

	nid = le32_to_cpu(sum->nid);
	ofs_in_node = le16_to_cpu(sum->ofs_in_node);

	node_page = f2fs_get_node_page(sbi, nid);
	if (IS_ERR(node_page))
		return false;

	f2fs_get_node_info(sbi, nid, dni);

	if (sum->version != dni->version) {
		f2fs_msg(sbi->sb, KERN_WARNING,
				"%s: valid data with mismatched node version.",
				__func__);
		set_sbi_flag(sbi, SBI_NEED_FSCK);
	}

	*nofs = ofs_of_node(node_page);
	source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
	f2fs_put_page(node_page, 1);

	if (source_blkaddr != blkaddr)
		return false;
	return true;
}

/*
 * Move data block via META_MAPPING while keeping locked data page.
 * This can be used to move blocks, aka LBAs, directly on disk.
 */
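/*
 * Roughly: a new block address is allocated in the cold data log, the old
 * block is read into a META_MAPPING page indexed by the new address, that
 * page is written out to the new address, and finally the dnode is updated.
 * On failure the allocation is reverted via f2fs_do_replace_block().
 */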
static void move_data_block(struct inode *inode, block_t bidx,
					int gc_type, unsigned int segno, int off)
{
	struct f2fs_io_info fio = {
		.sbi = F2FS_I_SB(inode),
		.ino = inode->i_ino,
		.type = DATA,
		.temp = COLD,
		.op = REQ_OP_READ,
		.op_flags = 0,
		.encrypted_page = NULL,
		.in_list = false,
		.retry = false,
	};
	struct dnode_of_data dn;
	struct f2fs_summary sum;
	struct node_info ni;
	struct page *page;
	block_t newaddr;
	int err;
	bool lfs_mode = test_opt(fio.sbi, LFS);

	/* do not read out */
	page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
	if (!page)
		return;

	if (!check_valid_map(F2FS_I_SB(inode), segno, off))
		goto out;

	if (f2fs_is_atomic_file(inode)) {
		F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
		F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
		goto out;
	}

	if (f2fs_is_pinned_file(inode)) {
		f2fs_pin_file_control(inode, true);
		goto out;
	}

	set_new_dnode(&dn, inode, NULL, NULL, 0);
	err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
	if (err)
		goto out;

	if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
		ClearPageUptodate(page);
		goto put_out;
	}

	/*
	 * don't cache encrypted data into meta inode until previous dirty
	 * data has been written back, to avoid racing between GC and flush.
	 */
	f2fs_wait_on_page_writeback(page, DATA, true);

	f2fs_get_node_info(fio.sbi, dn.nid, &ni);
	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);

	/* read page */
	fio.page = page;
	fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;

	if (lfs_mode)
		down_write(&fio.sbi->io_order_lock);

	f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
					&sum, CURSEG_COLD_DATA, NULL, false);

	fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
				newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
	if (!fio.encrypted_page) {
		err = -ENOMEM;
		goto recover_block;
	}

	err = f2fs_submit_page_bio(&fio);
	if (err)
		goto put_page_out;

	/* write page */
	lock_page(fio.encrypted_page);

	if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) {
		err = -EIO;
		goto put_page_out;
	}
	if (unlikely(!PageUptodate(fio.encrypted_page))) {
		err = -EIO;
		goto put_page_out;
	}

	set_page_dirty(fio.encrypted_page);
	f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true);
	if (clear_page_dirty_for_io(fio.encrypted_page))
		dec_page_count(fio.sbi, F2FS_DIRTY_META);

	set_page_writeback(fio.encrypted_page);
	ClearPageError(page);

	/* allocate block address */
	f2fs_wait_on_page_writeback(dn.node_page, NODE, true);

	fio.op = REQ_OP_WRITE;
	fio.op_flags = REQ_SYNC;
	fio.new_blkaddr = newaddr;
	f2fs_submit_page_write(&fio);
	if (fio.retry) {
		if (PageWriteback(fio.encrypted_page))
			end_page_writeback(fio.encrypted_page);
		goto put_page_out;
	}

	f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);

	f2fs_update_data_blkaddr(&dn, newaddr);
	set_inode_flag(inode, FI_APPEND_WRITE);
	if (page->index == 0)
		set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
put_page_out:
	f2fs_put_page(fio.encrypted_page, 1);
recover_block:
	if (lfs_mode)
		up_write(&fio.sbi->io_order_lock);
	if (err)
		f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
								true, true);
put_out:
	f2fs_put_dnode(&dn);
out:
	f2fs_put_page(page, 1);
}

static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
							unsigned int segno, int off)
{
	struct page *page;

	page = f2fs_get_lock_data_page(inode, bidx, true);
	if (IS_ERR(page))
		return;

	if (!check_valid_map(F2FS_I_SB(inode), segno, off))
		goto out;

	if (f2fs_is_atomic_file(inode)) {
		F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
		F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
		goto out;
	}
	if (f2fs_is_pinned_file(inode)) {
		if (gc_type == FG_GC)
			f2fs_pin_file_control(inode, true);
		goto out;
	}

	if (gc_type == BG_GC) {
		if (PageWriteback(page))
			goto out;
		set_page_dirty(page);
		set_cold_data(page);
	} else {
		struct f2fs_io_info fio = {
			.sbi = F2FS_I_SB(inode),
			.ino = inode->i_ino,
			.type = DATA,
			.temp = COLD,
			.op = REQ_OP_WRITE,
			.op_flags = REQ_SYNC,
			.old_blkaddr = NULL_ADDR,
			.page = page,
			.encrypted_page = NULL,
			.need_lock = LOCK_REQ,
			.io_type = FS_GC_DATA_IO,
		};
		bool is_dirty = PageDirty(page);
		int err;

retry:
		set_page_dirty(page);
		f2fs_wait_on_page_writeback(page, DATA, true);
		if (clear_page_dirty_for_io(page)) {
			inode_dec_dirty_pages(inode);
			f2fs_remove_dirty_inode(inode);
		}

		set_cold_data(page);

		err = f2fs_do_write_data_page(&fio);
		if (err) {
			clear_cold_data(page);
			if (err == -ENOMEM) {
				congestion_wait(BLK_RW_ASYNC, HZ/50);
				goto retry;
			}
			if (is_dirty)
				set_page_dirty(page);
		}
	}
out:
	f2fs_put_page(page, 1);
}

/*
 * This function tries to get the parent node of a victim data block and
 * checks the block's validity. If the block is valid, it is copied with cold
 * status and the parent node is updated.
 * If the parent node is not valid or the data block address is different,
 * the victim data block is ignored.
 */
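/*
 * The victim segment is walked five times: phases 0-2 issue readahead for
 * the NAT blocks, the dnode pages and the inode pages, phase 3 opens the
 * inodes and caches their data pages, and phase 4 actually moves the data
 * blocks.
 */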
static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
		struct gc_inode_list *gc_list, unsigned int segno, int gc_type)
{
	struct super_block *sb = sbi->sb;
	struct f2fs_summary *entry;
	block_t start_addr;
	int off;
	int phase = 0;

	start_addr = START_BLOCK(sbi, segno);

next_step:
	entry = sum;

	for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
		struct page *data_page;
		struct inode *inode;
		struct node_info dni; /* dnode info for the data */
		unsigned int ofs_in_node, nofs;
		block_t start_bidx;
		nid_t nid = le32_to_cpu(entry->nid);

		/* stop BG_GC if there are not enough free sections. */
		if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
			return;

		if (check_valid_map(sbi, segno, off) == 0)
			continue;

		if (phase == 0) {
			f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
							META_NAT, true);
			continue;
		}

		if (phase == 1) {
			f2fs_ra_node_page(sbi, nid);
			continue;
		}

		/* Get an inode by ino with checking validity */
		if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
			continue;

		if (phase == 2) {
			f2fs_ra_node_page(sbi, dni.ino);
			continue;
		}

		ofs_in_node = le16_to_cpu(entry->ofs_in_node);

		if (phase == 3) {
			inode = f2fs_iget(sb, dni.ino);
			if (IS_ERR(inode) || is_bad_inode(inode))
				continue;

			/* if inode uses special I/O path, let's go phase 4 */
			if (f2fs_post_read_required(inode)) {
				add_gc_inode(gc_list, inode);
				continue;
			}

			if (!down_write_trylock(
				&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
				iput(inode);
				continue;
			}

			start_bidx = f2fs_start_bidx_of_node(nofs, inode);
			data_page = f2fs_get_read_data_page(inode,
					start_bidx + ofs_in_node, REQ_RAHEAD,
					true);
			up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
			if (IS_ERR(data_page)) {
				iput(inode);
				continue;
			}

			f2fs_put_page(data_page, 0);
			add_gc_inode(gc_list, inode);
			continue;
		}

		/* phase 4 */
		inode = find_gc_inode(gc_list, dni.ino);
		if (inode) {
			struct f2fs_inode_info *fi = F2FS_I(inode);
			bool locked = false;

			if (S_ISREG(inode->i_mode)) {
				if (!down_write_trylock(&fi->i_gc_rwsem[READ]))
					continue;
				if (!down_write_trylock(
						&fi->i_gc_rwsem[WRITE])) {
					up_write(&fi->i_gc_rwsem[READ]);
					continue;
				}
				locked = true;

				/* wait for all inflight aio data */
				inode_dio_wait(inode);
			}

			start_bidx = f2fs_start_bidx_of_node(nofs, inode)
								+ ofs_in_node;
			if (f2fs_post_read_required(inode))
				move_data_block(inode, start_bidx, gc_type,
								segno, off);
			else
				move_data_page(inode, start_bidx, gc_type,
								segno, off);

			if (locked) {
				up_write(&fi->i_gc_rwsem[WRITE]);
				up_write(&fi->i_gc_rwsem[READ]);
			}

			stat_inc_data_blk_count(sbi, 1, gc_type);
		}
	}

	if (++phase < 5)
		goto next_step;
}

static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
			int gc_type)
{
	struct sit_info *sit_i = SIT_I(sbi);
	int ret;

	down_write(&sit_i->sentry_lock);
	ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
					      NO_CHECK_TYPE, LFS);
	up_write(&sit_i->sentry_lock);
	return ret;
}

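/*
 * GC one section: take a reference on the SSA summary page of every segment
 * in the section, then collect each segment in turn.  The return value is
 * the number of segments whose valid block count dropped to zero, which is
 * only counted for FG_GC.
 */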
static int do_garbage_collect(struct f2fs_sb_info *sbi,
				unsigned int start_segno,
				struct gc_inode_list *gc_list, int gc_type)
{
	struct page *sum_page;
	struct f2fs_summary_block *sum;
	struct blk_plug plug;
	unsigned int segno = start_segno;
	unsigned int end_segno = start_segno + sbi->segs_per_sec;
	int seg_freed = 0;
	unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
						SUM_TYPE_DATA : SUM_TYPE_NODE;

	/* readahead multiple SSA blocks that have contiguous addresses */
	if (sbi->segs_per_sec > 1)
		f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
					sbi->segs_per_sec, META_SSA, true);

	/* reference all summary pages */
	while (segno < end_segno) {
		sum_page = f2fs_get_sum_page(sbi, segno++);
		unlock_page(sum_page);
	}

	blk_start_plug(&plug);

	for (segno = start_segno; segno < end_segno; segno++) {

		/* find segment summary of victim */
		sum_page = find_get_page(META_MAPPING(sbi),
					GET_SUM_BLOCK(sbi, segno));
		f2fs_put_page(sum_page, 0);

		if (get_valid_blocks(sbi, segno, false) == 0 ||
				!PageUptodate(sum_page) ||
				unlikely(f2fs_cp_error(sbi)))
			goto next;

		sum = page_address(sum_page);
		f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer)));

		/*
		 * this is to avoid deadlock:
		 * - lock_page(sum_page)         - f2fs_replace_block
		 *  - check_valid_map()            - down_write(sentry_lock)
		 *   - down_read(sentry_lock)     - change_curseg()
		 *                                  - lock_page(sum_page)
		 */
		if (type == SUM_TYPE_NODE)
			gc_node_segment(sbi, sum->entries, segno, gc_type);
		else
			gc_data_segment(sbi, sum->entries, gc_list, segno,
								gc_type);

		stat_inc_seg_count(sbi, type, gc_type);

		if (gc_type == FG_GC &&
				get_valid_blocks(sbi, segno, false) == 0)
			seg_freed++;
next:
		f2fs_put_page(sum_page, 0);
	}

	if (gc_type == FG_GC)
		f2fs_submit_merged_write(sbi,
				(type == SUM_TYPE_NODE) ? NODE : DATA);

	blk_finish_plug(&plug);

	stat_inc_call_count(sbi->stat_info);

	return seg_freed;
}

int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
			bool background, unsigned int segno)
{
	int gc_type = sync ? FG_GC : BG_GC;
	int sec_freed = 0, seg_freed = 0, total_freed = 0;
	int ret = 0;
	struct cp_control cpc;
	unsigned int init_segno = segno;
	struct gc_inode_list gc_list = {
		.ilist = LIST_HEAD_INIT(gc_list.ilist),
		.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
	};
	unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC];
	unsigned int skipped_round = 0, round = 0;

	trace_f2fs_gc_begin(sbi->sb, sync, background,
				get_pages(sbi, F2FS_DIRTY_NODES),
				get_pages(sbi, F2FS_DIRTY_DENTS),
				get_pages(sbi, F2FS_DIRTY_IMETA),
				free_sections(sbi),
				free_segments(sbi),
				reserved_segments(sbi),
				prefree_segments(sbi));

	cpc.reason = __get_cp_reason(sbi);
gc_more:
	if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
		ret = -EINVAL;
		goto stop;
	}
	if (unlikely(f2fs_cp_error(sbi))) {
		ret = -EIO;
		goto stop;
	}

	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) {
		/*
		 * For example, if there are many prefree_segments below given
		 * threshold, we can make them free by checkpoint. Then, we
		 * secure free segments which don't need FG_GC any more.
		 */
		if (prefree_segments(sbi)) {
			ret = f2fs_write_checkpoint(sbi, &cpc);
			if (ret)
				goto stop;
		}
		if (has_not_enough_free_secs(sbi, 0, 0))
			gc_type = FG_GC;
	}

	/* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
	if (gc_type == BG_GC && !background) {
		ret = -EINVAL;
		goto stop;
	}
	if (!__get_victim(sbi, &segno, gc_type)) {
		ret = -ENODATA;
		goto stop;
	}

	seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
	if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
		sec_freed++;
	total_freed += seg_freed;

	if (gc_type == FG_GC) {
		if (sbi->skipped_atomic_files[FG_GC] > last_skipped)
			skipped_round++;
		last_skipped = sbi->skipped_atomic_files[FG_GC];
		round++;
	}

	if (gc_type == FG_GC)
		sbi->cur_victim_sec = NULL_SEGNO;

	if (!sync) {
		if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
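			/*
			 * If FG_GC keeps being blocked because its victims
			 * hold blocks of atomic-write files, drop the
			 * in-memory pages of those files so GC can make
			 * forward progress.
			 */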
			if (skipped_round > MAX_SKIP_ATOMIC_COUNT &&
						skipped_round * 2 >= round)
				f2fs_drop_inmem_pages_all(sbi, true);
			segno = NULL_SEGNO;
			goto gc_more;
		}

		if (gc_type == FG_GC)
			ret = f2fs_write_checkpoint(sbi, &cpc);
	}
stop:
	SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
	SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;

	trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
				get_pages(sbi, F2FS_DIRTY_NODES),
				get_pages(sbi, F2FS_DIRTY_DENTS),
				get_pages(sbi, F2FS_DIRTY_IMETA),
				free_sections(sbi),
				free_segments(sbi),
				reserved_segments(sbi),
				prefree_segments(sbi));

	mutex_unlock(&sbi->gc_mutex);

	put_gc_inode(&gc_list);

	if (sync)
		ret = sec_freed ? 0 : -EAGAIN;
	return ret;
}

void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
{
	DIRTY_I(sbi)->v_ops = &default_v_ops;

	sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;

	/* give warm/cold data area from slower device */
	if (sbi->s_ndevs && sbi->segs_per_sec == 1)
		SIT_I(sbi)->last_victim[ALLOC_NEXT] =
				GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
}