// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
#include "xfs_fsops.h"
#include "xfs_trans_space.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"

/*
 * Write new AG headers to disk. Non-transactional, but need to be
 * written and completed prior to the growfs transaction being logged.
 * To do this, we use a delayed write buffer list and wait for
 * submission and IO completion of the list as a whole. This allows the
 * IO subsystem to merge all the AG headers in a single AG into a single
 * IO and hide most of the latency of the IO from us.
 *
 * This also means that if we get an error whilst building the buffer
 * list to write, we can cancel the entire list without having written
 * anything.
 */
static int
xfs_resizefs_init_new_ags(
	struct xfs_trans	*tp,
	struct aghdr_init_data	*id,
	xfs_agnumber_t		oagcount,
	xfs_agnumber_t		nagcount,
	xfs_rfsblock_t		delta,
	bool			*lastag_extended)
{
	struct xfs_mount	*mp = tp->t_mountp;
	xfs_rfsblock_t		nb = mp->m_sb.sb_dblocks + delta;
	int			error;

	*lastag_extended = false;

	INIT_LIST_HEAD(&id->buffer_list);
	for (id->agno = nagcount - 1;
	     id->agno >= oagcount;
	     id->agno--, delta -= id->agsize) {

		if (id->agno == nagcount - 1)
			id->agsize = nb - (id->agno *
					(xfs_rfsblock_t)mp->m_sb.sb_agblocks);
		else
			id->agsize = mp->m_sb.sb_agblocks;

		error = xfs_ag_init_headers(mp, id);
		if (error) {
			xfs_buf_delwri_cancel(&id->buffer_list);
			return error;
		}
	}

	error = xfs_buf_delwri_submit(&id->buffer_list);
	if (error)
		return error;

	if (delta) {
		*lastag_extended = true;
		error = xfs_ag_extend_space(mp, tp, id, delta);
	}
	return error;
}

/*
 * growfs operations
 */
static int
xfs_growfs_data_private(
	struct xfs_mount	*mp,		/* mount point for filesystem */
	struct xfs_growfs_data	*in)		/* growfs data input struct */
{
	struct xfs_buf		*bp;
	int			error;
	xfs_agnumber_t		nagcount;
	xfs_agnumber_t		nagimax = 0;
	xfs_rfsblock_t		nb, nb_div, nb_mod;
	int64_t			delta;
	bool			lastag_extended;
	xfs_agnumber_t		oagcount;
	struct xfs_trans	*tp;
	struct aghdr_init_data	id = {};

	nb = in->newblocks;
	error = xfs_sb_validate_fsb_count(&mp->m_sb, nb);
	if (error)
		return error;

	if (nb > mp->m_sb.sb_dblocks) {
		error = xfs_buf_read_uncached(mp->m_ddev_targp,
				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
				XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
		if (error)
			return error;
		xfs_buf_relse(bp);
	}

	nb_div = nb;
	nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks);
	nagcount = nb_div + (nb_mod != 0);
	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
		nagcount--;
		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
	}
	delta = nb - mp->m_sb.sb_dblocks;
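	/*
	 * Worked example of the trimming above, assuming XFS_MIN_AG_BLOCKS
	 * is 64 as defined in xfs_format.h: a request of
	 * nb = 2 * sb_agblocks + 32 gives nb_div = 2 and nb_mod = 32.
	 * Because the trailing 32-block runt AG would be smaller than
	 * XFS_MIN_AG_BLOCKS, nagcount is trimmed back to 2 and nb back to
	 * 2 * sb_agblocks, so delta can end up smaller than the size
	 * change the caller asked for.
	 */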
	/*
	 * Reject filesystems with a single AG because they are not
	 * supported, and reject a shrink operation that would cause a
	 * filesystem to become unsupported.
	 */
	if (delta < 0 && nagcount < 2)
		return -EINVAL;

	oagcount = mp->m_sb.sb_agcount;

	/* allocate the new per-ag structures */
	if (nagcount > oagcount) {
		error = xfs_initialize_perag(mp, nagcount, &nagimax);
		if (error)
			return error;
	} else if (nagcount < oagcount) {
		/* TODO: shrinking the filesystem by whole AGs is not yet supported */
		return -EINVAL;
	}

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
			(delta > 0 ? XFS_GROWFS_SPACE_RES(mp) : -delta), 0,
			XFS_TRANS_RESERVE, &tp);
	if (error)
		return error;

	if (delta > 0) {
		error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount,
						  delta, &lastag_extended);
	} else {
		static struct ratelimit_state shrink_warning =
			RATELIMIT_STATE_INIT("shrink_warning", 86400 * HZ, 1);
		ratelimit_set_flags(&shrink_warning, RATELIMIT_MSG_ON_RELEASE);

		if (__ratelimit(&shrink_warning))
			xfs_alert(mp,
	"EXPERIMENTAL online shrink feature in use. Use at your own risk!");

		error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta);
	}
	if (error)
		goto out_trans_cancel;

	/*
	 * Update changed superblock fields transactionally. These are not
	 * seen by the rest of the world until the transaction commit applies
	 * them atomically to the superblock.
	 */
	if (nagcount > oagcount)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
	if (delta)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, delta);
	if (id.nfree)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);

	/*
	 * Sync sb counters now to reflect the updated values. This is
	 * particularly important for shrink because the write verifier
	 * will fail if sb_fdblocks is ever larger than sb_dblocks.
	 */
	if (xfs_sb_version_haslazysbcount(&mp->m_sb))
		xfs_log_sb(tp);

	xfs_trans_set_sync(tp);
	error = xfs_trans_commit(tp);
	if (error)
		return error;

	/* New allocation groups fully initialized, so update mount struct */
	if (nagimax)
		mp->m_maxagi = nagimax;
	xfs_set_low_space_thresholds(mp);
	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);

	if (delta > 0) {
		/*
		 * If we expanded the last AG, free the per-AG reservation
		 * so we can reinitialize it with the new size.
		 */
		if (lastag_extended) {
			struct xfs_perag	*pag;

			pag = xfs_perag_get(mp, id.agno);
			error = xfs_ag_resv_free(pag);
			xfs_perag_put(pag);
			if (error)
				return error;
		}
		/*
		 * Reserve AG metadata blocks. ENOSPC here does not mean there
		 * was a growfs failure, just that there still isn't space for
		 * new user data after the grow has been run.
		 */
		error = xfs_fs_reserve_ag_blocks(mp);
		if (error == -ENOSPC)
			error = 0;
	}
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}

static int
xfs_growfs_log_private(
	struct xfs_mount	*mp,	/* mount point for filesystem */
	struct xfs_growfs_log	*in)	/* growfs log input struct */
{
	xfs_extlen_t		nb;

	nb = in->newblocks;
	if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
		return -EINVAL;
	if (nb == mp->m_sb.sb_logblocks &&
	    in->isint == (mp->m_sb.sb_logstart != 0))
		return -EINVAL;
	/*
	 * Moving the log is hard: we would need new interfaces to sync the
	 * log first and hold off all activity while moving it. The new log
	 * could be shorter or longer than the old one in the same space, or
	 * we could transform an internal log into an external one (or vice
	 * versa).
	 */
	return -ENOSYS;
}
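
/*
 * Change sb_imax_pct, the maximum percentage of filesystem space that may
 * be used by inode allocations. For example, imaxpct = 25 on a 100GB
 * filesystem limits inode chunks to roughly 25GB; the derived
 * M_IGEO(mp)->maxicount limit is recomputed in xfs_growfs_data() once the
 * grow has completed.
 */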
static int
xfs_growfs_imaxpct(
	struct xfs_mount	*mp,
	__u32			imaxpct)
{
	struct xfs_trans	*tp;
	int			dpct;
	int			error;

	if (imaxpct > 100)
		return -EINVAL;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
	if (error)
		return error;

	dpct = imaxpct - mp->m_sb.sb_imax_pct;
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
	xfs_trans_set_sync(tp);
	return xfs_trans_commit(tp);
}

/*
 * Protected versions of the growfs functions; they acquire and release locks
 * on the mount point. Exported through the ioctls XFS_IOC_FSGROWFSDATA,
 * XFS_IOC_FSGROWFSLOG and XFS_IOC_FSGROWFSRT.
 */
int
xfs_growfs_data(
	struct xfs_mount	*mp,
	struct xfs_growfs_data	*in)
{
	int			error = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (!mutex_trylock(&mp->m_growlock))
		return -EWOULDBLOCK;

	/* update imaxpct separately to the physical grow of the filesystem */
	if (in->imaxpct != mp->m_sb.sb_imax_pct) {
		error = xfs_growfs_imaxpct(mp, in->imaxpct);
		if (error)
			goto out_error;
	}

	if (in->newblocks != mp->m_sb.sb_dblocks) {
		error = xfs_growfs_data_private(mp, in);
		if (error)
			goto out_error;
	}

	/* Post growfs calculations needed to reflect new state in operations */
	if (mp->m_sb.sb_imax_pct) {
		uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
		do_div(icount, 100);
		M_IGEO(mp)->maxicount = XFS_FSB_TO_INO(mp, icount);
	} else
		M_IGEO(mp)->maxicount = 0;

	/* Update secondary superblocks now the physical grow has completed */
	error = xfs_update_secondary_sbs(mp);

out_error:
	/*
	 * Increment the generation unconditionally, the error could be from
	 * updating the secondary superblocks, in which case the new size
	 * is live already.
	 */
	mp->m_generation++;
	mutex_unlock(&mp->m_growlock);
	return error;
}

int
xfs_growfs_log(
	struct xfs_mount	*mp,
	struct xfs_growfs_log	*in)
{
	int			error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (!mutex_trylock(&mp->m_growlock))
		return -EWOULDBLOCK;
	error = xfs_growfs_log_private(mp, in);
	mutex_unlock(&mp->m_growlock);
	return error;
}

/*
 * exported through ioctl XFS_IOC_FSCOUNTS
 */
void
xfs_fs_counts(
	struct xfs_mount	*mp,
	xfs_fsop_counts_t	*cnt)
{
	cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
	cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
	cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
						mp->m_alloc_set_aside;
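
	/*
	 * The per-cpu counter reads above are lockless and may race with
	 * concurrent updates, so the counts returned to userspace are a
	 * best-effort snapshot rather than an exact value. Only
	 * sb_frextents is serialised against updates, via m_sb_lock below.
	 */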
	spin_lock(&mp->m_sb_lock);
	cnt->freertx = mp->m_sb.sb_frextents;
	spin_unlock(&mp->m_sb_lock);
}

/*
 * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
 *
 * xfs_reserve_blocks is called to set m_resblks
 * in the in-core mount table. The number of unused reserved blocks
 * is kept in m_resblks_avail.
 *
 * Reserve the requested number of blocks if available. Otherwise return
 * as many as possible to satisfy the request. The actual number
 * reserved is returned in outval.
 *
 * A null inval pointer indicates that only the current reserved blocks
 * available should be returned; no settings are changed.
 */
int
xfs_reserve_blocks(
	struct xfs_mount	*mp,
	uint64_t		*inval,
	xfs_fsop_resblks_t	*outval)
{
	int64_t			lcounter, delta;
	int64_t			fdblks_delta = 0;
	uint64_t		request;
	int64_t			free;
	int			error = 0;

	/* If inval is null, report current values and return */
	if (!inval) {
		if (!outval)
			return -EINVAL;
		outval->resblks = mp->m_resblks;
		outval->resblks_avail = mp->m_resblks_avail;
		return 0;
	}

	request = *inval;

	/*
	 * With per-cpu counters, this becomes an interesting problem. We need
	 * to work out if we are freeing or allocating blocks first, then we
	 * can do the modification as necessary.
	 *
	 * We do this under the m_sb_lock so that if we are near ENOSPC, we will
	 * hold out any changes while we work out what to do. This means that
	 * the amount of free space can change while we do this, so we need to
	 * retry if we end up trying to reserve more space than is available.
	 */
	spin_lock(&mp->m_sb_lock);

	/*
	 * If our previous reservation was larger than the current value,
	 * then move any unused blocks back to the free pool. Modify the resblks
	 * counters directly since we shouldn't have any problems unreserving
	 * space.
	 */
	if (mp->m_resblks > request) {
		lcounter = mp->m_resblks_avail - request;
		if (lcounter > 0) {		/* release unused blocks */
			fdblks_delta = lcounter;
			mp->m_resblks_avail -= lcounter;
		}
		mp->m_resblks = request;
		if (fdblks_delta) {
			spin_unlock(&mp->m_sb_lock);
			error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
			spin_lock(&mp->m_sb_lock);
		}

		goto out;
	}

	/*
	 * If the request is larger than the current reservation, reserve the
	 * blocks before we update the reserve counters. Sample m_fdblocks and
	 * perform a partial reservation if the request exceeds free space.
	 */
	error = -ENOSPC;
	do {
		free = percpu_counter_sum(&mp->m_fdblocks) -
						mp->m_alloc_set_aside;
		if (free <= 0)
			break;

		delta = request - mp->m_resblks;
		lcounter = free - delta;
		if (lcounter < 0)
			/* We can't satisfy the request, just get what we can */
			fdblks_delta = free;
		else
			fdblks_delta = delta;

		/*
		 * We'll either succeed in getting space from the free block
		 * count or we'll get an ENOSPC. If we get an ENOSPC, it means
		 * things changed while we were calculating fdblks_delta and so
		 * we should try again to see if there is anything left to
		 * reserve.
		 *
		 * Don't set the reserved flag here - we don't want to reserve
		 * the extra reserve blocks from the reserve itself.
		 */
		spin_unlock(&mp->m_sb_lock);
		error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
		spin_lock(&mp->m_sb_lock);
	} while (error == -ENOSPC);

	/*
	 * Update the reserve counters if blocks have been successfully
	 * allocated.
	 */
	if (!error && fdblks_delta) {
		mp->m_resblks += fdblks_delta;
		mp->m_resblks_avail += fdblks_delta;
	}

out:
	if (outval) {
		outval->resblks = mp->m_resblks;
		outval->resblks_avail = mp->m_resblks_avail;
	}

	spin_unlock(&mp->m_sb_lock);
	return error;
}
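
/*
 * Shutdown ioctl (XFS_IOC_GOINGDOWN). The flags select how much state is
 * pushed to stable storage before the shutdown: DEFAULT freezes the block
 * device first so that both dirty data and the log are flushed, LOGFLUSH
 * forces only the log out before shutting down, and NOLOGFLUSH marks the
 * log as errored and shuts down without flushing anything at all.
 */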
int
xfs_fs_goingdown(
	struct xfs_mount	*mp,
	uint32_t		inflags)
{
	switch (inflags) {
	case XFS_FSOP_GOING_FLAGS_DEFAULT: {
		if (!freeze_bdev(mp->m_super->s_bdev)) {
			xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
			thaw_bdev(mp->m_super->s_bdev);
		}
		break;
	}
	case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
		break;
	case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
		xfs_force_shutdown(mp,
				SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

/*
 * Force a shutdown of the filesystem instantly while keeping the filesystem
 * consistent. We don't do an unmount here; just shutdown the shop, make sure
 * that absolutely nothing persistent happens to this filesystem after this
 * point.
 */
void
xfs_do_force_shutdown(
	struct xfs_mount	*mp,
	int			flags,
	char			*fname,
	int			lnnum)
{
	bool			logerror = flags & SHUTDOWN_LOG_IO_ERROR;

	/*
	 * No need to duplicate efforts.
	 */
	if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
		return;

	/*
	 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
	 * queue up anybody new on the log reservations, and wakes up
	 * everybody who's sleeping on log reservations to tell them
	 * the bad news.
	 */
	if (xfs_log_force_umount(mp, logerror))
		return;

	if (flags & SHUTDOWN_FORCE_UMOUNT) {
		xfs_alert(mp,
"User initiated shutdown received. Shutting down filesystem");
		return;
	}

	xfs_notice(mp,
"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT,
		__func__, flags, lnnum, fname, __return_address);

	if (flags & SHUTDOWN_CORRUPT_INCORE) {
		xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
"Corruption of in-memory data detected. Shutting down filesystem");
		if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
			xfs_stack_trace();
	} else if (logerror) {
		xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
			"Log I/O Error Detected. Shutting down filesystem");
	} else {
		xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
			"I/O Error Detected. Shutting down filesystem");
	}

	xfs_alert(mp,
		"Please unmount the filesystem and rectify the problem(s)");
}

/*
 * Reserve free space for per-AG metadata.
 */
int
xfs_fs_reserve_ag_blocks(
	struct xfs_mount	*mp)
{
	xfs_agnumber_t		agno;
	struct xfs_perag	*pag;
	int			error = 0;
	int			err2;

	mp->m_finobt_nores = false;
	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		pag = xfs_perag_get(mp, agno);
		err2 = xfs_ag_resv_init(pag, NULL);
		xfs_perag_put(pag);
		if (err2 && !error)
			error = err2;
	}
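
	/*
	 * ENOSPC just means no reservation could be made right now;
	 * anything else suggests the per-AG reservation accounting is no
	 * longer trustworthy, so treat it as in-core corruption and shut
	 * down.
	 */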
	if (error && error != -ENOSPC) {
		xfs_warn(mp,
	"Error %d reserving per-AG metadata reserve pool.", error);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
	}

	return error;
}

/*
 * Free space reserved for per-AG metadata.
 */
int
xfs_fs_unreserve_ag_blocks(
	struct xfs_mount	*mp)
{
	xfs_agnumber_t		agno;
	struct xfs_perag	*pag;
	int			error = 0;
	int			err2;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		pag = xfs_perag_get(mp, agno);
		err2 = xfs_ag_resv_free(pag);
		xfs_perag_put(pag);
		if (err2 && !error)
			error = err2;
	}

	if (error)
		xfs_warn(mp,
	"Error %d freeing per-AG metadata reserve pool.", error);

	return error;
}