1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_sb.h" 14 #include "xfs_mount.h" 15 #include "xfs_inode.h" 16 #include "xfs_dir2.h" 17 #include "xfs_ialloc.h" 18 #include "xfs_alloc.h" 19 #include "xfs_rtalloc.h" 20 #include "xfs_bmap.h" 21 #include "xfs_trans.h" 22 #include "xfs_trans_priv.h" 23 #include "xfs_log.h" 24 #include "xfs_error.h" 25 #include "xfs_quota.h" 26 #include "xfs_fsops.h" 27 #include "xfs_icache.h" 28 #include "xfs_sysfs.h" 29 #include "xfs_rmap_btree.h" 30 #include "xfs_refcount_btree.h" 31 #include "xfs_reflink.h" 32 #include "xfs_extent_busy.h" 33 #include "xfs_health.h" 34 #include "xfs_trace.h" 35 #include "xfs_ag.h" 36 37 static DEFINE_MUTEX(xfs_uuid_table_mutex); 38 static int xfs_uuid_table_size; 39 static uuid_t *xfs_uuid_table; 40 41 void 42 xfs_uuid_table_free(void) 43 { 44 if (xfs_uuid_table_size == 0) 45 return; 46 kmem_free(xfs_uuid_table); 47 xfs_uuid_table = NULL; 48 xfs_uuid_table_size = 0; 49 } 50 51 /* 52 * See if the UUID is unique among mounted XFS filesystems. 53 * Mount fails if UUID is nil or a FS with the same UUID is already mounted. 54 */ 55 STATIC int 56 xfs_uuid_mount( 57 struct xfs_mount *mp) 58 { 59 uuid_t *uuid = &mp->m_sb.sb_uuid; 60 int hole, i; 61 62 /* Publish UUID in struct super_block */ 63 uuid_copy(&mp->m_super->s_uuid, uuid); 64 65 if (mp->m_flags & XFS_MOUNT_NOUUID) 66 return 0; 67 68 if (uuid_is_null(uuid)) { 69 xfs_warn(mp, "Filesystem has null UUID - can't mount"); 70 return -EINVAL; 71 } 72 73 mutex_lock(&xfs_uuid_table_mutex); 74 for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { 75 if (uuid_is_null(&xfs_uuid_table[i])) { 76 hole = i; 77 continue; 78 } 79 if (uuid_equal(uuid, &xfs_uuid_table[i])) 80 goto out_duplicate; 81 } 82 83 if (hole < 0) { 84 xfs_uuid_table = krealloc(xfs_uuid_table, 85 (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), 86 GFP_KERNEL | __GFP_NOFAIL); 87 hole = xfs_uuid_table_size++; 88 } 89 xfs_uuid_table[hole] = *uuid; 90 mutex_unlock(&xfs_uuid_table_mutex); 91 92 return 0; 93 94 out_duplicate: 95 mutex_unlock(&xfs_uuid_table_mutex); 96 xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); 97 return -EINVAL; 98 } 99 100 STATIC void 101 xfs_uuid_unmount( 102 struct xfs_mount *mp) 103 { 104 uuid_t *uuid = &mp->m_sb.sb_uuid; 105 int i; 106 107 if (mp->m_flags & XFS_MOUNT_NOUUID) 108 return; 109 110 mutex_lock(&xfs_uuid_table_mutex); 111 for (i = 0; i < xfs_uuid_table_size; i++) { 112 if (uuid_is_null(&xfs_uuid_table[i])) 113 continue; 114 if (!uuid_equal(uuid, &xfs_uuid_table[i])) 115 continue; 116 memset(&xfs_uuid_table[i], 0, sizeof(uuid_t)); 117 break; 118 } 119 ASSERT(i < xfs_uuid_table_size); 120 mutex_unlock(&xfs_uuid_table_mutex); 121 } 122 123 /* 124 * Check size of device based on the (data/realtime) block count. 125 * Note: this check is used by the growfs code as well as mount. 126 */ 127 int 128 xfs_sb_validate_fsb_count( 129 xfs_sb_t *sbp, 130 uint64_t nblocks) 131 { 132 ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); 133 ASSERT(sbp->sb_blocklog >= BBSHIFT); 134 135 /* Limited by ULONG_MAX of page cache index */ 136 if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) 137 return -EFBIG; 138 return 0; 139 } 140 141 /* 142 * xfs_readsb 143 * 144 * Does the initial read of the superblock. 145 */ 146 int 147 xfs_readsb( 148 struct xfs_mount *mp, 149 int flags) 150 { 151 unsigned int sector_size; 152 struct xfs_buf *bp; 153 struct xfs_sb *sbp = &mp->m_sb; 154 int error; 155 int loud = !(flags & XFS_MFSI_QUIET); 156 const struct xfs_buf_ops *buf_ops; 157 158 ASSERT(mp->m_sb_bp == NULL); 159 ASSERT(mp->m_ddev_targp != NULL); 160 161 /* 162 * For the initial read, we must guess at the sector 163 * size based on the block device. It's enough to 164 * get the sb_sectsize out of the superblock and 165 * then reread with the proper length. 166 * We don't verify it yet, because it may not be complete. 167 */ 168 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); 169 buf_ops = NULL; 170 171 /* 172 * Allocate a (locked) buffer to hold the superblock. This will be kept 173 * around at all times to optimize access to the superblock. Therefore, 174 * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count 175 * elevated. 176 */ 177 reread: 178 error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, 179 BTOBB(sector_size), XBF_NO_IOACCT, &bp, 180 buf_ops); 181 if (error) { 182 if (loud) 183 xfs_warn(mp, "SB validate failed with error %d.", error); 184 /* bad CRC means corrupted metadata */ 185 if (error == -EFSBADCRC) 186 error = -EFSCORRUPTED; 187 return error; 188 } 189 190 /* 191 * Initialize the mount structure from the superblock. 192 */ 193 xfs_sb_from_disk(sbp, bp->b_addr); 194 195 /* 196 * If we haven't validated the superblock, do so now before we try 197 * to check the sector size and reread the superblock appropriately. 198 */ 199 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 200 if (loud) 201 xfs_warn(mp, "Invalid superblock magic number"); 202 error = -EINVAL; 203 goto release_buf; 204 } 205 206 /* 207 * We must be able to do sector-sized and sector-aligned IO. 208 */ 209 if (sector_size > sbp->sb_sectsize) { 210 if (loud) 211 xfs_warn(mp, "device supports %u byte sectors (not %u)", 212 sector_size, sbp->sb_sectsize); 213 error = -ENOSYS; 214 goto release_buf; 215 } 216 217 if (buf_ops == NULL) { 218 /* 219 * Re-read the superblock so the buffer is correctly sized, 220 * and properly verified. 221 */ 222 xfs_buf_relse(bp); 223 sector_size = sbp->sb_sectsize; 224 buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops; 225 goto reread; 226 } 227 228 xfs_reinit_percpu_counters(mp); 229 230 /* no need to be quiet anymore, so reset the buf ops */ 231 bp->b_ops = &xfs_sb_buf_ops; 232 233 mp->m_sb_bp = bp; 234 xfs_buf_unlock(bp); 235 return 0; 236 237 release_buf: 238 xfs_buf_relse(bp); 239 return error; 240 } 241 242 /* 243 * If the sunit/swidth change would move the precomputed root inode value, we 244 * must reject the ondisk change because repair will stumble over that. 245 * However, we allow the mount to proceed because we never rejected this 246 * combination before. Returns true to update the sb, false otherwise. 247 */ 248 static inline int 249 xfs_check_new_dalign( 250 struct xfs_mount *mp, 251 int new_dalign, 252 bool *update_sb) 253 { 254 struct xfs_sb *sbp = &mp->m_sb; 255 xfs_ino_t calc_ino; 256 257 calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign); 258 trace_xfs_check_new_dalign(mp, new_dalign, calc_ino); 259 260 if (sbp->sb_rootino == calc_ino) { 261 *update_sb = true; 262 return 0; 263 } 264 265 xfs_warn(mp, 266 "Cannot change stripe alignment; would require moving root inode."); 267 268 /* 269 * XXX: Next time we add a new incompat feature, this should start 270 * returning -EINVAL to fail the mount. Until then, spit out a warning 271 * that we're ignoring the administrator's instructions. 272 */ 273 xfs_warn(mp, "Skipping superblock stripe alignment update."); 274 *update_sb = false; 275 return 0; 276 } 277 278 /* 279 * If we were provided with new sunit/swidth values as mount options, make sure 280 * that they pass basic alignment and superblock feature checks, and convert 281 * them into the same units (FSB) that everything else expects. This step 282 * /must/ be done before computing the inode geometry. 283 */ 284 STATIC int 285 xfs_validate_new_dalign( 286 struct xfs_mount *mp) 287 { 288 if (mp->m_dalign == 0) 289 return 0; 290 291 /* 292 * If stripe unit and stripe width are not multiples 293 * of the fs blocksize turn off alignment. 294 */ 295 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || 296 (BBTOB(mp->m_swidth) & mp->m_blockmask)) { 297 xfs_warn(mp, 298 "alignment check failed: sunit/swidth vs. blocksize(%d)", 299 mp->m_sb.sb_blocksize); 300 return -EINVAL; 301 } else { 302 /* 303 * Convert the stripe unit and width to FSBs. 304 */ 305 mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); 306 if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { 307 xfs_warn(mp, 308 "alignment check failed: sunit/swidth vs. agsize(%d)", 309 mp->m_sb.sb_agblocks); 310 return -EINVAL; 311 } else if (mp->m_dalign) { 312 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 313 } else { 314 xfs_warn(mp, 315 "alignment check failed: sunit(%d) less than bsize(%d)", 316 mp->m_dalign, mp->m_sb.sb_blocksize); 317 return -EINVAL; 318 } 319 } 320 321 if (!xfs_sb_version_hasdalign(&mp->m_sb)) { 322 xfs_warn(mp, 323 "cannot change alignment: superblock does not support data alignment"); 324 return -EINVAL; 325 } 326 327 return 0; 328 } 329 330 /* Update alignment values based on mount options and sb values. */ 331 STATIC int 332 xfs_update_alignment( 333 struct xfs_mount *mp) 334 { 335 struct xfs_sb *sbp = &mp->m_sb; 336 337 if (mp->m_dalign) { 338 bool update_sb; 339 int error; 340 341 if (sbp->sb_unit == mp->m_dalign && 342 sbp->sb_width == mp->m_swidth) 343 return 0; 344 345 error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb); 346 if (error || !update_sb) 347 return error; 348 349 sbp->sb_unit = mp->m_dalign; 350 sbp->sb_width = mp->m_swidth; 351 mp->m_update_sb = true; 352 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && 353 xfs_sb_version_hasdalign(&mp->m_sb)) { 354 mp->m_dalign = sbp->sb_unit; 355 mp->m_swidth = sbp->sb_width; 356 } 357 358 return 0; 359 } 360 361 /* 362 * precalculate the low space thresholds for dynamic speculative preallocation. 363 */ 364 void 365 xfs_set_low_space_thresholds( 366 struct xfs_mount *mp) 367 { 368 int i; 369 370 for (i = 0; i < XFS_LOWSP_MAX; i++) { 371 uint64_t space = mp->m_sb.sb_dblocks; 372 373 do_div(space, 100); 374 mp->m_low_space[i] = space * (i + 1); 375 } 376 } 377 378 /* 379 * Check that the data (and log if separate) is an ok size. 380 */ 381 STATIC int 382 xfs_check_sizes( 383 struct xfs_mount *mp) 384 { 385 struct xfs_buf *bp; 386 xfs_daddr_t d; 387 int error; 388 389 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 390 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 391 xfs_warn(mp, "filesystem size mismatch detected"); 392 return -EFBIG; 393 } 394 error = xfs_buf_read_uncached(mp->m_ddev_targp, 395 d - XFS_FSS_TO_BB(mp, 1), 396 XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); 397 if (error) { 398 xfs_warn(mp, "last sector read failed"); 399 return error; 400 } 401 xfs_buf_relse(bp); 402 403 if (mp->m_logdev_targp == mp->m_ddev_targp) 404 return 0; 405 406 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 407 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 408 xfs_warn(mp, "log size mismatch detected"); 409 return -EFBIG; 410 } 411 error = xfs_buf_read_uncached(mp->m_logdev_targp, 412 d - XFS_FSB_TO_BB(mp, 1), 413 XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); 414 if (error) { 415 xfs_warn(mp, "log device read failed"); 416 return error; 417 } 418 xfs_buf_relse(bp); 419 return 0; 420 } 421 422 /* 423 * Clear the quotaflags in memory and in the superblock. 424 */ 425 int 426 xfs_mount_reset_sbqflags( 427 struct xfs_mount *mp) 428 { 429 mp->m_qflags = 0; 430 431 /* It is OK to look at sb_qflags in the mount path without m_sb_lock. */ 432 if (mp->m_sb.sb_qflags == 0) 433 return 0; 434 spin_lock(&mp->m_sb_lock); 435 mp->m_sb.sb_qflags = 0; 436 spin_unlock(&mp->m_sb_lock); 437 438 if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) 439 return 0; 440 441 return xfs_sync_sb(mp, false); 442 } 443 444 uint64_t 445 xfs_default_resblks(xfs_mount_t *mp) 446 { 447 uint64_t resblks; 448 449 /* 450 * We default to 5% or 8192 fsbs of space reserved, whichever is 451 * smaller. This is intended to cover concurrent allocation 452 * transactions when we initially hit enospc. These each require a 4 453 * block reservation. Hence by default we cover roughly 2000 concurrent 454 * allocation reservations. 455 */ 456 resblks = mp->m_sb.sb_dblocks; 457 do_div(resblks, 20); 458 resblks = min_t(uint64_t, resblks, 8192); 459 return resblks; 460 } 461 462 /* Ensure the summary counts are correct. */ 463 STATIC int 464 xfs_check_summary_counts( 465 struct xfs_mount *mp) 466 { 467 /* 468 * The AG0 superblock verifier rejects in-progress filesystems, 469 * so we should never see the flag set this far into mounting. 470 */ 471 if (mp->m_sb.sb_inprogress) { 472 xfs_err(mp, "sb_inprogress set after log recovery??"); 473 WARN_ON(1); 474 return -EFSCORRUPTED; 475 } 476 477 /* 478 * Now the log is mounted, we know if it was an unclean shutdown or 479 * not. If it was, with the first phase of recovery has completed, we 480 * have consistent AG blocks on disk. We have not recovered EFIs yet, 481 * but they are recovered transactionally in the second recovery phase 482 * later. 483 * 484 * If the log was clean when we mounted, we can check the summary 485 * counters. If any of them are obviously incorrect, we can recompute 486 * them from the AGF headers in the next step. 487 */ 488 if (XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && 489 (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks || 490 !xfs_verify_icount(mp, mp->m_sb.sb_icount) || 491 mp->m_sb.sb_ifree > mp->m_sb.sb_icount)) 492 xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); 493 494 /* 495 * We can safely re-initialise incore superblock counters from the 496 * per-ag data. These may not be correct if the filesystem was not 497 * cleanly unmounted, so we waited for recovery to finish before doing 498 * this. 499 * 500 * If the filesystem was cleanly unmounted or the previous check did 501 * not flag anything weird, then we can trust the values in the 502 * superblock to be correct and we don't need to do anything here. 503 * Otherwise, recalculate the summary counters. 504 */ 505 if ((!xfs_sb_version_haslazysbcount(&mp->m_sb) || 506 XFS_LAST_UNMOUNT_WAS_CLEAN(mp)) && 507 !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) 508 return 0; 509 510 return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); 511 } 512 513 /* 514 * Flush and reclaim dirty inodes in preparation for unmount. Inodes and 515 * internal inode structures can be sitting in the CIL and AIL at this point, 516 * so we need to unpin them, write them back and/or reclaim them before unmount 517 * can proceed. 518 * 519 * An inode cluster that has been freed can have its buffer still pinned in 520 * memory because the transaction is still sitting in a iclog. The stale inodes 521 * on that buffer will be pinned to the buffer until the transaction hits the 522 * disk and the callbacks run. Pushing the AIL will skip the stale inodes and 523 * may never see the pinned buffer, so nothing will push out the iclog and 524 * unpin the buffer. 525 * 526 * Hence we need to force the log to unpin everything first. However, log 527 * forces don't wait for the discards they issue to complete, so we have to 528 * explicitly wait for them to complete here as well. 529 * 530 * Then we can tell the world we are unmounting so that error handling knows 531 * that the filesystem is going away and we should error out anything that we 532 * have been retrying in the background. This will prevent never-ending 533 * retries in AIL pushing from hanging the unmount. 534 * 535 * Finally, we can push the AIL to clean all the remaining dirty objects, then 536 * reclaim the remaining inodes that are still in memory at this point in time. 537 */ 538 static void 539 xfs_unmount_flush_inodes( 540 struct xfs_mount *mp) 541 { 542 xfs_log_force(mp, XFS_LOG_SYNC); 543 xfs_extent_busy_wait_all(mp); 544 flush_workqueue(xfs_discard_wq); 545 546 mp->m_flags |= XFS_MOUNT_UNMOUNTING; 547 548 xfs_ail_push_all_sync(mp->m_ail); 549 cancel_delayed_work_sync(&mp->m_reclaim_work); 550 xfs_reclaim_inodes(mp); 551 xfs_health_unmount(mp); 552 } 553 554 static void 555 xfs_mount_setup_inode_geom( 556 struct xfs_mount *mp) 557 { 558 struct xfs_ino_geometry *igeo = M_IGEO(mp); 559 560 igeo->attr_fork_offset = xfs_bmap_compute_attr_offset(mp); 561 ASSERT(igeo->attr_fork_offset < XFS_LITINO(mp)); 562 563 xfs_ialloc_setup_geometry(mp); 564 } 565 566 /* 567 * This function does the following on an initial mount of a file system: 568 * - reads the superblock from disk and init the mount struct 569 * - if we're a 32-bit kernel, do a size check on the superblock 570 * so we don't mount terabyte filesystems 571 * - init mount struct realtime fields 572 * - allocate inode hash table for fs 573 * - init directory manager 574 * - perform recovery and init the log manager 575 */ 576 int 577 xfs_mountfs( 578 struct xfs_mount *mp) 579 { 580 struct xfs_sb *sbp = &(mp->m_sb); 581 struct xfs_inode *rip; 582 struct xfs_ino_geometry *igeo = M_IGEO(mp); 583 uint64_t resblks; 584 uint quotamount = 0; 585 uint quotaflags = 0; 586 int error = 0; 587 588 xfs_sb_mount_common(mp, sbp); 589 590 /* 591 * Check for a mismatched features2 values. Older kernels read & wrote 592 * into the wrong sb offset for sb_features2 on some platforms due to 593 * xfs_sb_t not being 64bit size aligned when sb_features2 was added, 594 * which made older superblock reading/writing routines swap it as a 595 * 64-bit value. 596 * 597 * For backwards compatibility, we make both slots equal. 598 * 599 * If we detect a mismatched field, we OR the set bits into the existing 600 * features2 field in case it has already been modified; we don't want 601 * to lose any features. We then update the bad location with the ORed 602 * value so that older kernels will see any features2 flags. The 603 * superblock writeback code ensures the new sb_features2 is copied to 604 * sb_bad_features2 before it is logged or written to disk. 605 */ 606 if (xfs_sb_has_mismatched_features2(sbp)) { 607 xfs_warn(mp, "correcting sb_features alignment problem"); 608 sbp->sb_features2 |= sbp->sb_bad_features2; 609 mp->m_update_sb = true; 610 611 /* 612 * Re-check for ATTR2 in case it was found in bad_features2 613 * slot. 614 */ 615 if (xfs_sb_version_hasattr2(&mp->m_sb) && 616 !(mp->m_flags & XFS_MOUNT_NOATTR2)) 617 mp->m_flags |= XFS_MOUNT_ATTR2; 618 } 619 620 if (xfs_sb_version_hasattr2(&mp->m_sb) && 621 (mp->m_flags & XFS_MOUNT_NOATTR2)) { 622 xfs_sb_version_removeattr2(&mp->m_sb); 623 mp->m_update_sb = true; 624 625 /* update sb_versionnum for the clearing of the morebits */ 626 if (!sbp->sb_features2) 627 mp->m_update_sb = true; 628 } 629 630 /* always use v2 inodes by default now */ 631 if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { 632 mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; 633 mp->m_update_sb = true; 634 } 635 636 /* 637 * If we were given new sunit/swidth options, do some basic validation 638 * checks and convert the incore dalign and swidth values to the 639 * same units (FSB) that everything else uses. This /must/ happen 640 * before computing the inode geometry. 641 */ 642 error = xfs_validate_new_dalign(mp); 643 if (error) 644 goto out; 645 646 xfs_alloc_compute_maxlevels(mp); 647 xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); 648 xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); 649 xfs_mount_setup_inode_geom(mp); 650 xfs_rmapbt_compute_maxlevels(mp); 651 xfs_refcountbt_compute_maxlevels(mp); 652 653 /* 654 * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks 655 * is NOT aligned turn off m_dalign since allocator alignment is within 656 * an ag, therefore ag has to be aligned at stripe boundary. Note that 657 * we must compute the free space and rmap btree geometry before doing 658 * this. 659 */ 660 error = xfs_update_alignment(mp); 661 if (error) 662 goto out; 663 664 /* enable fail_at_unmount as default */ 665 mp->m_fail_unmount = true; 666 667 error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, 668 NULL, mp->m_super->s_id); 669 if (error) 670 goto out; 671 672 error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, 673 &mp->m_kobj, "stats"); 674 if (error) 675 goto out_remove_sysfs; 676 677 error = xfs_error_sysfs_init(mp); 678 if (error) 679 goto out_del_stats; 680 681 error = xfs_errortag_init(mp); 682 if (error) 683 goto out_remove_error_sysfs; 684 685 error = xfs_uuid_mount(mp); 686 if (error) 687 goto out_remove_errortag; 688 689 /* 690 * Update the preferred write size based on the information from the 691 * on-disk superblock. 692 */ 693 mp->m_allocsize_log = 694 max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log); 695 mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog); 696 697 /* set the low space thresholds for dynamic preallocation */ 698 xfs_set_low_space_thresholds(mp); 699 700 /* 701 * If enabled, sparse inode chunk alignment is expected to match the 702 * cluster size. Full inode chunk alignment must match the chunk size, 703 * but that is checked on sb read verification... 704 */ 705 if (xfs_sb_version_hassparseinodes(&mp->m_sb) && 706 mp->m_sb.sb_spino_align != 707 XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) { 708 xfs_warn(mp, 709 "Sparse inode block alignment (%u) must match cluster size (%llu).", 710 mp->m_sb.sb_spino_align, 711 XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)); 712 error = -EINVAL; 713 goto out_remove_uuid; 714 } 715 716 /* 717 * Check that the data (and log if separate) is an ok size. 718 */ 719 error = xfs_check_sizes(mp); 720 if (error) 721 goto out_remove_uuid; 722 723 /* 724 * Initialize realtime fields in the mount structure 725 */ 726 error = xfs_rtmount_init(mp); 727 if (error) { 728 xfs_warn(mp, "RT mount failed"); 729 goto out_remove_uuid; 730 } 731 732 /* 733 * Copies the low order bits of the timestamp and the randomly 734 * set "sequence" number out of a UUID. 735 */ 736 mp->m_fixedfsid[0] = 737 (get_unaligned_be16(&sbp->sb_uuid.b[8]) << 16) | 738 get_unaligned_be16(&sbp->sb_uuid.b[4]); 739 mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]); 740 741 error = xfs_da_mount(mp); 742 if (error) { 743 xfs_warn(mp, "Failed dir/attr init: %d", error); 744 goto out_remove_uuid; 745 } 746 747 /* 748 * Initialize the precomputed transaction reservations values. 749 */ 750 xfs_trans_init(mp); 751 752 /* 753 * Allocate and initialize the per-ag data. 754 */ 755 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 756 if (error) { 757 xfs_warn(mp, "Failed per-ag init: %d", error); 758 goto out_free_dir; 759 } 760 761 if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) { 762 xfs_warn(mp, "no log defined"); 763 error = -EFSCORRUPTED; 764 goto out_free_perag; 765 } 766 767 /* 768 * Log's mount-time initialization. The first part of recovery can place 769 * some items on the AIL, to be handled when recovery is finished or 770 * cancelled. 771 */ 772 error = xfs_log_mount(mp, mp->m_logdev_targp, 773 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), 774 XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); 775 if (error) { 776 xfs_warn(mp, "log mount failed"); 777 goto out_fail_wait; 778 } 779 780 /* Make sure the summary counts are ok. */ 781 error = xfs_check_summary_counts(mp); 782 if (error) 783 goto out_log_dealloc; 784 785 /* 786 * Get and sanity-check the root inode. 787 * Save the pointer to it in the mount structure. 788 */ 789 error = xfs_iget(mp, NULL, sbp->sb_rootino, XFS_IGET_UNTRUSTED, 790 XFS_ILOCK_EXCL, &rip); 791 if (error) { 792 xfs_warn(mp, 793 "Failed to read root inode 0x%llx, error %d", 794 sbp->sb_rootino, -error); 795 goto out_log_dealloc; 796 } 797 798 ASSERT(rip != NULL); 799 800 if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) { 801 xfs_warn(mp, "corrupted root inode %llu: not a directory", 802 (unsigned long long)rip->i_ino); 803 xfs_iunlock(rip, XFS_ILOCK_EXCL); 804 error = -EFSCORRUPTED; 805 goto out_rele_rip; 806 } 807 mp->m_rootip = rip; /* save it */ 808 809 xfs_iunlock(rip, XFS_ILOCK_EXCL); 810 811 /* 812 * Initialize realtime inode pointers in the mount structure 813 */ 814 error = xfs_rtmount_inodes(mp); 815 if (error) { 816 /* 817 * Free up the root inode. 818 */ 819 xfs_warn(mp, "failed to read RT inodes"); 820 goto out_rele_rip; 821 } 822 823 /* 824 * If this is a read-only mount defer the superblock updates until 825 * the next remount into writeable mode. Otherwise we would never 826 * perform the update e.g. for the root filesystem. 827 */ 828 if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 829 error = xfs_sync_sb(mp, false); 830 if (error) { 831 xfs_warn(mp, "failed to write sb changes"); 832 goto out_rtunmount; 833 } 834 } 835 836 /* 837 * Initialise the XFS quota management subsystem for this mount 838 */ 839 if (XFS_IS_QUOTA_RUNNING(mp)) { 840 error = xfs_qm_newmount(mp, "amount, "aflags); 841 if (error) 842 goto out_rtunmount; 843 } else { 844 ASSERT(!XFS_IS_QUOTA_ON(mp)); 845 846 /* 847 * If a file system had quotas running earlier, but decided to 848 * mount without -o uquota/pquota/gquota options, revoke the 849 * quotachecked license. 850 */ 851 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { 852 xfs_notice(mp, "resetting quota flags"); 853 error = xfs_mount_reset_sbqflags(mp); 854 if (error) 855 goto out_rtunmount; 856 } 857 } 858 859 /* 860 * Finish recovering the file system. This part needed to be delayed 861 * until after the root and real-time bitmap inodes were consistently 862 * read in. Temporarily create per-AG space reservations for metadata 863 * btree shape changes because space freeing transactions (for inode 864 * inactivation) require the per-AG reservation in lieu of reserving 865 * blocks. 866 */ 867 error = xfs_fs_reserve_ag_blocks(mp); 868 if (error && error == -ENOSPC) 869 xfs_warn(mp, 870 "ENOSPC reserving per-AG metadata pool, log recovery may fail."); 871 error = xfs_log_mount_finish(mp); 872 xfs_fs_unreserve_ag_blocks(mp); 873 if (error) { 874 xfs_warn(mp, "log mount finish failed"); 875 goto out_rtunmount; 876 } 877 878 /* 879 * Now the log is fully replayed, we can transition to full read-only 880 * mode for read-only mounts. This will sync all the metadata and clean 881 * the log so that the recovery we just performed does not have to be 882 * replayed again on the next mount. 883 * 884 * We use the same quiesce mechanism as the rw->ro remount, as they are 885 * semantically identical operations. 886 */ 887 if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) == 888 XFS_MOUNT_RDONLY) { 889 xfs_log_clean(mp); 890 } 891 892 /* 893 * Complete the quota initialisation, post-log-replay component. 894 */ 895 if (quotamount) { 896 ASSERT(mp->m_qflags == 0); 897 mp->m_qflags = quotaflags; 898 899 xfs_qm_mount_quotas(mp); 900 } 901 902 /* 903 * Now we are mounted, reserve a small amount of unused space for 904 * privileged transactions. This is needed so that transaction 905 * space required for critical operations can dip into this pool 906 * when at ENOSPC. This is needed for operations like create with 907 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations 908 * are not allowed to use this reserved space. 909 * 910 * This may drive us straight to ENOSPC on mount, but that implies 911 * we were already there on the last unmount. Warn if this occurs. 912 */ 913 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 914 resblks = xfs_default_resblks(mp); 915 error = xfs_reserve_blocks(mp, &resblks, NULL); 916 if (error) 917 xfs_warn(mp, 918 "Unable to allocate reserve blocks. Continuing without reserve pool."); 919 920 /* Recover any CoW blocks that never got remapped. */ 921 error = xfs_reflink_recover_cow(mp); 922 if (error) { 923 xfs_err(mp, 924 "Error %d recovering leftover CoW allocations.", error); 925 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 926 goto out_quota; 927 } 928 929 /* Reserve AG blocks for future btree expansion. */ 930 error = xfs_fs_reserve_ag_blocks(mp); 931 if (error && error != -ENOSPC) 932 goto out_agresv; 933 } 934 935 return 0; 936 937 out_agresv: 938 xfs_fs_unreserve_ag_blocks(mp); 939 out_quota: 940 xfs_qm_unmount_quotas(mp); 941 out_rtunmount: 942 xfs_rtunmount_inodes(mp); 943 out_rele_rip: 944 xfs_irele(rip); 945 /* Clean out dquots that might be in memory after quotacheck. */ 946 xfs_qm_unmount(mp); 947 /* 948 * Flush all inode reclamation work and flush the log. 949 * We have to do this /after/ rtunmount and qm_unmount because those 950 * two will have scheduled delayed reclaim for the rt/quota inodes. 951 * 952 * This is slightly different from the unmountfs call sequence 953 * because we could be tearing down a partially set up mount. In 954 * particular, if log_mount_finish fails we bail out without calling 955 * qm_unmount_quotas and therefore rely on qm_unmount to release the 956 * quota inodes. 957 */ 958 xfs_unmount_flush_inodes(mp); 959 out_log_dealloc: 960 xfs_log_mount_cancel(mp); 961 out_fail_wait: 962 if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) 963 xfs_buftarg_drain(mp->m_logdev_targp); 964 xfs_buftarg_drain(mp->m_ddev_targp); 965 out_free_perag: 966 xfs_free_perag(mp); 967 out_free_dir: 968 xfs_da_unmount(mp); 969 out_remove_uuid: 970 xfs_uuid_unmount(mp); 971 out_remove_errortag: 972 xfs_errortag_del(mp); 973 out_remove_error_sysfs: 974 xfs_error_sysfs_del(mp); 975 out_del_stats: 976 xfs_sysfs_del(&mp->m_stats.xs_kobj); 977 out_remove_sysfs: 978 xfs_sysfs_del(&mp->m_kobj); 979 out: 980 return error; 981 } 982 983 /* 984 * This flushes out the inodes,dquots and the superblock, unmounts the 985 * log and makes sure that incore structures are freed. 986 */ 987 void 988 xfs_unmountfs( 989 struct xfs_mount *mp) 990 { 991 uint64_t resblks; 992 int error; 993 994 xfs_blockgc_stop(mp); 995 xfs_fs_unreserve_ag_blocks(mp); 996 xfs_qm_unmount_quotas(mp); 997 xfs_rtunmount_inodes(mp); 998 xfs_irele(mp->m_rootip); 999 1000 xfs_unmount_flush_inodes(mp); 1001 1002 xfs_qm_unmount(mp); 1003 1004 /* 1005 * Unreserve any blocks we have so that when we unmount we don't account 1006 * the reserved free space as used. This is really only necessary for 1007 * lazy superblock counting because it trusts the incore superblock 1008 * counters to be absolutely correct on clean unmount. 1009 * 1010 * We don't bother correcting this elsewhere for lazy superblock 1011 * counting because on mount of an unclean filesystem we reconstruct the 1012 * correct counter value and this is irrelevant. 1013 * 1014 * For non-lazy counter filesystems, this doesn't matter at all because 1015 * we only every apply deltas to the superblock and hence the incore 1016 * value does not matter.... 1017 */ 1018 resblks = 0; 1019 error = xfs_reserve_blocks(mp, &resblks, NULL); 1020 if (error) 1021 xfs_warn(mp, "Unable to free reserved block pool. " 1022 "Freespace may not be correct on next mount."); 1023 1024 xfs_log_unmount(mp); 1025 xfs_da_unmount(mp); 1026 xfs_uuid_unmount(mp); 1027 1028 #if defined(DEBUG) 1029 xfs_errortag_clearall(mp); 1030 #endif 1031 xfs_free_perag(mp); 1032 1033 xfs_errortag_del(mp); 1034 xfs_error_sysfs_del(mp); 1035 xfs_sysfs_del(&mp->m_stats.xs_kobj); 1036 xfs_sysfs_del(&mp->m_kobj); 1037 } 1038 1039 /* 1040 * Determine whether modifications can proceed. The caller specifies the minimum 1041 * freeze level for which modifications should not be allowed. This allows 1042 * certain operations to proceed while the freeze sequence is in progress, if 1043 * necessary. 1044 */ 1045 bool 1046 xfs_fs_writable( 1047 struct xfs_mount *mp, 1048 int level) 1049 { 1050 ASSERT(level > SB_UNFROZEN); 1051 if ((mp->m_super->s_writers.frozen >= level) || 1052 XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY)) 1053 return false; 1054 1055 return true; 1056 } 1057 1058 /* 1059 * Deltas for the block count can vary from 1 to very large, but lock contention 1060 * only occurs on frequent small block count updates such as in the delayed 1061 * allocation path for buffered writes (page a time updates). Hence we set 1062 * a large batch count (1024) to minimise global counter updates except when 1063 * we get near to ENOSPC and we have to be very accurate with our updates. 1064 */ 1065 #define XFS_FDBLOCKS_BATCH 1024 1066 int 1067 xfs_mod_fdblocks( 1068 struct xfs_mount *mp, 1069 int64_t delta, 1070 bool rsvd) 1071 { 1072 int64_t lcounter; 1073 long long res_used; 1074 s32 batch; 1075 uint64_t set_aside; 1076 1077 if (delta > 0) { 1078 /* 1079 * If the reserve pool is depleted, put blocks back into it 1080 * first. Most of the time the pool is full. 1081 */ 1082 if (likely(mp->m_resblks == mp->m_resblks_avail)) { 1083 percpu_counter_add(&mp->m_fdblocks, delta); 1084 return 0; 1085 } 1086 1087 spin_lock(&mp->m_sb_lock); 1088 res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); 1089 1090 if (res_used > delta) { 1091 mp->m_resblks_avail += delta; 1092 } else { 1093 delta -= res_used; 1094 mp->m_resblks_avail = mp->m_resblks; 1095 percpu_counter_add(&mp->m_fdblocks, delta); 1096 } 1097 spin_unlock(&mp->m_sb_lock); 1098 return 0; 1099 } 1100 1101 /* 1102 * Taking blocks away, need to be more accurate the closer we 1103 * are to zero. 1104 * 1105 * If the counter has a value of less than 2 * max batch size, 1106 * then make everything serialise as we are real close to 1107 * ENOSPC. 1108 */ 1109 if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH, 1110 XFS_FDBLOCKS_BATCH) < 0) 1111 batch = 1; 1112 else 1113 batch = XFS_FDBLOCKS_BATCH; 1114 1115 /* 1116 * Set aside allocbt blocks because these blocks are tracked as free 1117 * space but not available for allocation. Technically this means that a 1118 * single reservation cannot consume all remaining free space, but the 1119 * ratio of allocbt blocks to usable free blocks should be rather small. 1120 * The tradeoff without this is that filesystems that maintain high 1121 * perag block reservations can over reserve physical block availability 1122 * and fail physical allocation, which leads to much more serious 1123 * problems (i.e. transaction abort, pagecache discards, etc.) than 1124 * slightly premature -ENOSPC. 1125 */ 1126 set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); 1127 percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); 1128 if (__percpu_counter_compare(&mp->m_fdblocks, set_aside, 1129 XFS_FDBLOCKS_BATCH) >= 0) { 1130 /* we had space! */ 1131 return 0; 1132 } 1133 1134 /* 1135 * lock up the sb for dipping into reserves before releasing the space 1136 * that took us to ENOSPC. 1137 */ 1138 spin_lock(&mp->m_sb_lock); 1139 percpu_counter_add(&mp->m_fdblocks, -delta); 1140 if (!rsvd) 1141 goto fdblocks_enospc; 1142 1143 lcounter = (long long)mp->m_resblks_avail + delta; 1144 if (lcounter >= 0) { 1145 mp->m_resblks_avail = lcounter; 1146 spin_unlock(&mp->m_sb_lock); 1147 return 0; 1148 } 1149 xfs_warn_once(mp, 1150 "Reserve blocks depleted! Consider increasing reserve pool size."); 1151 1152 fdblocks_enospc: 1153 spin_unlock(&mp->m_sb_lock); 1154 return -ENOSPC; 1155 } 1156 1157 int 1158 xfs_mod_frextents( 1159 struct xfs_mount *mp, 1160 int64_t delta) 1161 { 1162 int64_t lcounter; 1163 int ret = 0; 1164 1165 spin_lock(&mp->m_sb_lock); 1166 lcounter = mp->m_sb.sb_frextents + delta; 1167 if (lcounter < 0) 1168 ret = -ENOSPC; 1169 else 1170 mp->m_sb.sb_frextents = lcounter; 1171 spin_unlock(&mp->m_sb_lock); 1172 return ret; 1173 } 1174 1175 /* 1176 * Used to free the superblock along various error paths. 1177 */ 1178 void 1179 xfs_freesb( 1180 struct xfs_mount *mp) 1181 { 1182 struct xfs_buf *bp = mp->m_sb_bp; 1183 1184 xfs_buf_lock(bp); 1185 mp->m_sb_bp = NULL; 1186 xfs_buf_relse(bp); 1187 } 1188 1189 /* 1190 * If the underlying (data/log/rt) device is readonly, there are some 1191 * operations that cannot proceed. 1192 */ 1193 int 1194 xfs_dev_is_read_only( 1195 struct xfs_mount *mp, 1196 char *message) 1197 { 1198 if (xfs_readonly_buftarg(mp->m_ddev_targp) || 1199 xfs_readonly_buftarg(mp->m_logdev_targp) || 1200 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 1201 xfs_notice(mp, "%s required on read-only device.", message); 1202 xfs_notice(mp, "write access unavailable, cannot proceed."); 1203 return -EROFS; 1204 } 1205 return 0; 1206 } 1207 1208 /* Force the summary counters to be recalculated at next mount. */ 1209 void 1210 xfs_force_summary_recalc( 1211 struct xfs_mount *mp) 1212 { 1213 if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) 1214 return; 1215 1216 xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); 1217 } 1218 1219 /* 1220 * Update the in-core delayed block counter. 1221 * 1222 * We prefer to update the counter without having to take a spinlock for every 1223 * counter update (i.e. batching). Each change to delayed allocation 1224 * reservations can change can easily exceed the default percpu counter 1225 * batching, so we use a larger batch factor here. 1226 * 1227 * Note that we don't currently have any callers requiring fast summation 1228 * (e.g. percpu_counter_read) so we can use a big batch value here. 1229 */ 1230 #define XFS_DELALLOC_BATCH (4096) 1231 void 1232 xfs_mod_delalloc( 1233 struct xfs_mount *mp, 1234 int64_t delta) 1235 { 1236 percpu_counter_add_batch(&mp->m_delalloc_blks, delta, 1237 XFS_DELALLOC_BATCH); 1238 } 1239