1 /*- 2 * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause) 3 * 4 * Copyright (c) 2002 Networks Associates Technology, Inc. 5 * All rights reserved. 6 * 7 * This software was developed for the FreeBSD Project by Marshall 8 * Kirk McKusick and Network Associates Laboratories, the Security 9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 11 * research program 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1982, 1986, 1989, 1993 35 * The Regents of the University of California. All rights reserved. 
36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. Neither the name of the University nor the names of its contributors 46 * may be used to endorse or promote products derived from this software 47 * without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * SUCH DAMAGE. 
60 * 61 * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 62 */ 63 64 #include <sys/cdefs.h> 65 __FBSDID("$FreeBSD$"); 66 67 #include "opt_quota.h" 68 69 #include <sys/param.h> 70 #include <sys/systm.h> 71 #include <sys/bio.h> 72 #include <sys/buf.h> 73 #include <sys/capsicum.h> 74 #include <sys/conf.h> 75 #include <sys/fcntl.h> 76 #include <sys/file.h> 77 #include <sys/filedesc.h> 78 #include <sys/gsb_crc32.h> 79 #include <sys/kernel.h> 80 #include <sys/mount.h> 81 #include <sys/priv.h> 82 #include <sys/proc.h> 83 #include <sys/stat.h> 84 #include <sys/syscallsubr.h> 85 #include <sys/sysctl.h> 86 #include <sys/syslog.h> 87 #include <sys/taskqueue.h> 88 #include <sys/vnode.h> 89 90 #include <security/audit/audit.h> 91 92 #include <geom/geom.h> 93 #include <geom/geom_vfs.h> 94 95 #include <ufs/ufs/dir.h> 96 #include <ufs/ufs/extattr.h> 97 #include <ufs/ufs/quota.h> 98 #include <ufs/ufs/inode.h> 99 #include <ufs/ufs/ufs_extern.h> 100 #include <ufs/ufs/ufsmount.h> 101 102 #include <ufs/ffs/fs.h> 103 #include <ufs/ffs/ffs_extern.h> 104 #include <ufs/ffs/softdep.h> 105 106 typedef ufs2_daddr_t allocfcn_t(struct inode *ip, uint64_t cg, 107 ufs2_daddr_t bpref, int size, int rsize); 108 109 static ufs2_daddr_t ffs_alloccg(struct inode *, uint64_t, ufs2_daddr_t, int, 110 int); 111 static ufs2_daddr_t 112 ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); 113 static void ffs_blkfree_cg(struct ufsmount *, struct fs *, 114 struct vnode *, ufs2_daddr_t, long, ino_t, 115 struct workhead *); 116 #ifdef INVARIANTS 117 static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); 118 #endif 119 static ufs2_daddr_t ffs_clusteralloc(struct inode *, uint64_t, ufs2_daddr_t, 120 int); 121 static ino_t ffs_dirpref(struct inode *); 122 static ufs2_daddr_t ffs_fragextend(struct inode *, uint64_t, ufs2_daddr_t, 123 int, int); 124 static ufs2_daddr_t ffs_hashalloc(struct inode *, uint64_t, ufs2_daddr_t, 125 int, int, allocfcn_t *); 126 static ufs2_daddr_t ffs_nodealloccg(struct inode *, 
uint64_t, ufs2_daddr_t, int, 127 int); 128 static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); 129 static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); 130 static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); 131 static void ffs_ckhash_cg(struct buf *); 132 133 /* 134 * Allocate a block in the filesystem. 135 * 136 * The size of the requested block is given, which must be some 137 * multiple of fs_fsize and <= fs_bsize. 138 * A preference may be optionally specified. If a preference is given 139 * the following hierarchy is used to allocate a block: 140 * 1) allocate the requested block. 141 * 2) allocate a rotationally optimal block in the same cylinder. 142 * 3) allocate a block in the same cylinder group. 143 * 4) quadratically rehash into other cylinder groups, until an 144 * available block is located. 145 * If no block preference is given the following hierarchy is used 146 * to allocate a block: 147 * 1) allocate a block in the cylinder group that contains the 148 * inode for the file. 149 * 2) quadratically rehash into other cylinder groups, until an 150 * available block is located. 
151 */ 152 int 153 ffs_alloc(struct inode *ip, 154 ufs2_daddr_t lbn, 155 ufs2_daddr_t bpref, 156 int size, 157 int flags, 158 struct ucred *cred, 159 ufs2_daddr_t *bnp) 160 { 161 struct fs *fs; 162 struct ufsmount *ump; 163 ufs2_daddr_t bno; 164 uint64_t cg, reclaimed; 165 int64_t delta; 166 #ifdef QUOTA 167 int error; 168 #endif 169 170 *bnp = 0; 171 ump = ITOUMP(ip); 172 fs = ump->um_fs; 173 mtx_assert(UFS_MTX(ump), MA_OWNED); 174 #ifdef INVARIANTS 175 if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) { 176 printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", 177 devtoname(ump->um_dev), (long)fs->fs_bsize, size, 178 fs->fs_fsmnt); 179 panic("ffs_alloc: bad size"); 180 } 181 if (cred == NOCRED) 182 panic("ffs_alloc: missing credential"); 183 #endif /* INVARIANTS */ 184 reclaimed = 0; 185 retry: 186 #ifdef QUOTA 187 UFS_UNLOCK(ump); 188 error = chkdq(ip, btodb(size), cred, 0); 189 if (error) 190 return (error); 191 UFS_LOCK(ump); 192 #endif 193 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 194 goto nospace; 195 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && 196 freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) 197 goto nospace; 198 if (bpref >= fs->fs_size) 199 bpref = 0; 200 if (bpref == 0) 201 cg = ino_to_cg(fs, ip->i_number); 202 else 203 cg = dtog(fs, bpref); 204 bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); 205 if (bno > 0) { 206 delta = btodb(size); 207 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 208 if (flags & IO_EXT) 209 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 210 else 211 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 212 *bnp = bno; 213 return (0); 214 } 215 nospace: 216 #ifdef QUOTA 217 UFS_UNLOCK(ump); 218 /* 219 * Restore user's disk quota because allocation failed. 
220 */ 221 (void) chkdq(ip, -btodb(size), cred, FORCE); 222 UFS_LOCK(ump); 223 #endif 224 if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 225 reclaimed = 1; 226 softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT); 227 goto retry; 228 } 229 if (ffs_fsfail_cleanup_locked(ump, 0)) { 230 UFS_UNLOCK(ump); 231 return (ENXIO); 232 } 233 if (reclaimed > 0 && 234 ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 235 UFS_UNLOCK(ump); 236 ffs_fserr(fs, ip->i_number, "filesystem full"); 237 uprintf("\n%s: write failed, filesystem is full\n", 238 fs->fs_fsmnt); 239 } else { 240 UFS_UNLOCK(ump); 241 } 242 return (ENOSPC); 243 } 244 245 /* 246 * Reallocate a fragment to a bigger size 247 * 248 * The number and size of the old block is given, and a preference 249 * and new size is also specified. The allocator attempts to extend 250 * the original block. Failing that, the regular block allocator is 251 * invoked to get an appropriate block. 252 */ 253 int 254 ffs_realloccg(struct inode *ip, 255 ufs2_daddr_t lbprev, 256 ufs2_daddr_t bprev, 257 ufs2_daddr_t bpref, 258 int osize, 259 int nsize, 260 int flags, 261 struct ucred *cred, 262 struct buf **bpp) 263 { 264 struct vnode *vp; 265 struct fs *fs; 266 struct buf *bp; 267 struct ufsmount *ump; 268 uint64_t cg, request, reclaimed; 269 int error, gbflags; 270 ufs2_daddr_t bno; 271 int64_t delta; 272 273 vp = ITOV(ip); 274 ump = ITOUMP(ip); 275 fs = ump->um_fs; 276 bp = NULL; 277 gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; 278 #ifdef WITNESS 279 gbflags |= IS_SNAPSHOT(ip) ? 
GB_NOWITNESS : 0; 280 #endif 281 282 mtx_assert(UFS_MTX(ump), MA_OWNED); 283 #ifdef INVARIANTS 284 if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 285 panic("ffs_realloccg: allocation on suspended filesystem"); 286 if ((uint64_t)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || 287 (uint64_t)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { 288 printf( 289 "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", 290 devtoname(ump->um_dev), (long)fs->fs_bsize, osize, 291 nsize, fs->fs_fsmnt); 292 panic("ffs_realloccg: bad size"); 293 } 294 if (cred == NOCRED) 295 panic("ffs_realloccg: missing credential"); 296 #endif /* INVARIANTS */ 297 reclaimed = 0; 298 retry: 299 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && 300 freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) { 301 goto nospace; 302 } 303 if (bprev == 0) { 304 printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", 305 devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev, 306 fs->fs_fsmnt); 307 panic("ffs_realloccg: bad bprev"); 308 } 309 UFS_UNLOCK(ump); 310 /* 311 * Allocate the extra space in the buffer. 312 */ 313 error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); 314 if (error) { 315 return (error); 316 } 317 318 if (bp->b_blkno == bp->b_lblkno) { 319 if (lbprev >= UFS_NDADDR) 320 panic("ffs_realloccg: lbprev out of range"); 321 bp->b_blkno = fsbtodb(fs, bprev); 322 } 323 324 #ifdef QUOTA 325 error = chkdq(ip, btodb(nsize - osize), cred, 0); 326 if (error) { 327 brelse(bp); 328 return (error); 329 } 330 #endif 331 /* 332 * Check for extension in the existing location. 
333 */ 334 *bpp = NULL; 335 cg = dtog(fs, bprev); 336 UFS_LOCK(ump); 337 bno = ffs_fragextend(ip, cg, bprev, osize, nsize); 338 if (bno) { 339 if (bp->b_blkno != fsbtodb(fs, bno)) 340 panic("ffs_realloccg: bad blockno"); 341 delta = btodb(nsize - osize); 342 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 343 if (flags & IO_EXT) 344 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 345 else 346 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 347 allocbuf(bp, nsize); 348 bp->b_flags |= B_DONE; 349 vfs_bio_bzero_buf(bp, osize, nsize - osize); 350 if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 351 vfs_bio_set_valid(bp, osize, nsize - osize); 352 *bpp = bp; 353 return (0); 354 } 355 /* 356 * Allocate a new disk location. 357 */ 358 if (bpref >= fs->fs_size) 359 bpref = 0; 360 switch ((int)fs->fs_optim) { 361 case FS_OPTSPACE: 362 /* 363 * Allocate an exact sized fragment. Although this makes 364 * best use of space, we will waste time relocating it if 365 * the file continues to grow. If the fragmentation is 366 * less than half of the minimum free reserve, we choose 367 * to begin optimizing for time. 368 */ 369 request = nsize; 370 if (fs->fs_minfree <= 5 || 371 fs->fs_cstotal.cs_nffree > 372 (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) 373 break; 374 log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", 375 fs->fs_fsmnt); 376 fs->fs_optim = FS_OPTTIME; 377 break; 378 case FS_OPTTIME: 379 /* 380 * At this point we have discovered a file that is trying to 381 * grow a small fragment to a larger fragment. To save time, 382 * we allocate a full sized block, then free the unused portion. 383 * If the file continues to grow, the `ffs_fragextend' call 384 * above will be able to grow it in place without further 385 * copying. If aberrant programs cause disk fragmentation to 386 * grow within 2% of the free reserve, we choose to begin 387 * optimizing for space. 
388 */ 389 request = fs->fs_bsize; 390 if (fs->fs_cstotal.cs_nffree < 391 (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) 392 break; 393 log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", 394 fs->fs_fsmnt); 395 fs->fs_optim = FS_OPTSPACE; 396 break; 397 default: 398 printf("dev = %s, optim = %ld, fs = %s\n", 399 devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt); 400 panic("ffs_realloccg: bad optim"); 401 /* NOTREACHED */ 402 } 403 bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); 404 if (bno > 0) { 405 bp->b_blkno = fsbtodb(fs, bno); 406 if (!DOINGSOFTDEP(vp)) 407 /* 408 * The usual case is that a smaller fragment that 409 * was just allocated has been replaced with a bigger 410 * fragment or a full-size block. If it is marked as 411 * B_DELWRI, the current contents have not been written 412 * to disk. It is possible that the block was written 413 * earlier, but very uncommon. If the block has never 414 * been written, there is no need to send a BIO_DELETE 415 * for it when it is freed. The gain from avoiding the 416 * TRIMs for the common case of unwritten blocks far 417 * exceeds the cost of the write amplification for the 418 * uncommon case of failing to send a TRIM for a block 419 * that had been written. 420 */ 421 ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, 422 ip->i_number, vp->v_type, NULL, 423 (bp->b_flags & B_DELWRI) != 0 ? 424 NOTRIM_KEY : SINGLETON_KEY); 425 delta = btodb(nsize - osize); 426 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 427 if (flags & IO_EXT) 428 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 429 else 430 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 431 allocbuf(bp, nsize); 432 bp->b_flags |= B_DONE; 433 vfs_bio_bzero_buf(bp, osize, nsize - osize); 434 if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 435 vfs_bio_set_valid(bp, osize, nsize - osize); 436 *bpp = bp; 437 return (0); 438 } 439 #ifdef QUOTA 440 UFS_UNLOCK(ump); 441 /* 442 * Restore user's disk quota because allocation failed. 
443 */ 444 (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); 445 UFS_LOCK(ump); 446 #endif 447 nospace: 448 /* 449 * no space available 450 */ 451 if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 452 reclaimed = 1; 453 UFS_UNLOCK(ump); 454 if (bp) { 455 brelse(bp); 456 bp = NULL; 457 } 458 UFS_LOCK(ump); 459 softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); 460 goto retry; 461 } 462 if (bp) 463 brelse(bp); 464 if (ffs_fsfail_cleanup_locked(ump, 0)) { 465 UFS_UNLOCK(ump); 466 return (ENXIO); 467 } 468 if (reclaimed > 0 && 469 ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 470 UFS_UNLOCK(ump); 471 ffs_fserr(fs, ip->i_number, "filesystem full"); 472 uprintf("\n%s: write failed, filesystem is full\n", 473 fs->fs_fsmnt); 474 } else { 475 UFS_UNLOCK(ump); 476 } 477 return (ENOSPC); 478 } 479 480 /* 481 * Reallocate a sequence of blocks into a contiguous sequence of blocks. 482 * 483 * The vnode and an array of buffer pointers for a range of sequential 484 * logical blocks to be made contiguous is given. The allocator attempts 485 * to find a range of sequential blocks starting as close as possible 486 * from the end of the allocation for the logical block immediately 487 * preceding the current range. If successful, the physical block numbers 488 * in the buffer pointers and in the inode are changed to reflect the new 489 * allocation. If unsuccessful, the allocation is left unchanged. The 490 * success in doing the reallocation is returned. Note that the error 491 * return is not reflected back to the user. Rather the previous block 492 * allocation will be used. 
493 */ 494 495 SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 496 "FFS filesystem"); 497 498 static int doasyncfree = 1; 499 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, 500 "do not force synchronous writes when blocks are reallocated"); 501 502 static int doreallocblks = 1; 503 SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, 504 "enable block reallocation"); 505 506 static int dotrimcons = 1; 507 SYSCTL_INT(_vfs_ffs, OID_AUTO, dotrimcons, CTLFLAG_RWTUN, &dotrimcons, 0, 508 "enable BIO_DELETE / TRIM consolidation"); 509 510 static int maxclustersearch = 10; 511 SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch, 512 0, "max number of cylinder group to search for contigous blocks"); 513 514 #ifdef DIAGNOSTIC 515 static int prtrealloc = 0; 516 SYSCTL_INT(_debug, OID_AUTO, ffs_prtrealloc, CTLFLAG_RW, &prtrealloc, 0, 517 "print out FFS filesystem block reallocation operations"); 518 #endif 519 520 int 521 ffs_reallocblks( 522 struct vop_reallocblks_args /* { 523 struct vnode *a_vp; 524 struct cluster_save *a_buflist; 525 } */ *ap) 526 { 527 struct ufsmount *ump; 528 int error; 529 530 /* 531 * We used to skip reallocating the blocks of a file into a 532 * contiguous sequence if the underlying flash device requested 533 * BIO_DELETE notifications, because devices that benefit from 534 * BIO_DELETE also benefit from not moving the data. However, 535 * the destination for the data is usually moved before the data 536 * is written to the initially allocated location, so we rarely 537 * suffer the penalty of extra writes. With the addition of the 538 * consolidation of contiguous blocks into single BIO_DELETE 539 * operations, having fewer but larger contiguous blocks reduces 540 * the number of (slow and expensive) BIO_DELETE operations. So 541 * when doing BIO_DELETE consolidation, we do block reallocation. 542 * 543 * Skip if reallocblks has been disabled globally. 
544 */ 545 ump = ap->a_vp->v_mount->mnt_data; 546 if ((((ump->um_flags) & UM_CANDELETE) != 0 && dotrimcons == 0) || 547 doreallocblks == 0) 548 return (ENOSPC); 549 550 /* 551 * We can't wait in softdep prealloc as it may fsync and recurse 552 * here. Instead we simply fail to reallocate blocks if this 553 * rare condition arises. 554 */ 555 if (DOINGSUJ(ap->a_vp)) 556 if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) 557 return (ENOSPC); 558 vn_seqc_write_begin(ap->a_vp); 559 error = ump->um_fstype == UFS1 ? ffs_reallocblks_ufs1(ap) : 560 ffs_reallocblks_ufs2(ap); 561 vn_seqc_write_end(ap->a_vp); 562 return (error); 563 } 564 565 static int 566 ffs_reallocblks_ufs1( 567 struct vop_reallocblks_args /* { 568 struct vnode *a_vp; 569 struct cluster_save *a_buflist; 570 } */ *ap) 571 { 572 struct fs *fs; 573 struct inode *ip; 574 struct vnode *vp; 575 struct buf *sbp, *ebp, *bp; 576 ufs1_daddr_t *bap, *sbap, *ebap; 577 struct cluster_save *buflist; 578 struct ufsmount *ump; 579 ufs_lbn_t start_lbn, end_lbn; 580 ufs1_daddr_t soff, newblk, blkno; 581 ufs2_daddr_t pref; 582 struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; 583 int i, cg, len, start_lvl, end_lvl, ssize; 584 585 vp = ap->a_vp; 586 ip = VTOI(vp); 587 ump = ITOUMP(ip); 588 fs = ump->um_fs; 589 /* 590 * If we are not tracking block clusters or if we have less than 4% 591 * free blocks left, then do not attempt to cluster. Running with 592 * less than 5% free block reserve is not recommended and those that 593 * choose to do so do not expect to have good file layout. 
594 */ 595 if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 596 return (ENOSPC); 597 buflist = ap->a_buflist; 598 len = buflist->bs_nchildren; 599 start_lbn = buflist->bs_children[0]->b_lblkno; 600 end_lbn = start_lbn + len - 1; 601 #ifdef INVARIANTS 602 for (i = 0; i < len; i++) 603 if (!ffs_checkblk(ip, 604 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 605 panic("ffs_reallocblks: unallocated block 1"); 606 for (i = 1; i < len; i++) 607 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 608 panic("ffs_reallocblks: non-logical cluster"); 609 blkno = buflist->bs_children[0]->b_blkno; 610 ssize = fsbtodb(fs, fs->fs_frag); 611 for (i = 1; i < len - 1; i++) 612 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 613 panic("ffs_reallocblks: non-physical cluster %d", i); 614 #endif 615 /* 616 * If the cluster crosses the boundary for the first indirect 617 * block, leave space for the indirect block. Indirect blocks 618 * are initially laid out in a position after the last direct 619 * block. Block reallocation would usually destroy locality by 620 * moving the indirect block out of the way to make room for 621 * data blocks if we didn't compensate here. We should also do 622 * this for other indirect block boundaries, but it is only 623 * important for the first one. 624 */ 625 if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) 626 return (ENOSPC); 627 /* 628 * If the latest allocation is in a new cylinder group, assume that 629 * the filesystem has decided to move and do not force it back to 630 * the previous cylinder group. 631 */ 632 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 633 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 634 return (ENOSPC); 635 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 636 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 637 return (ENOSPC); 638 /* 639 * Get the starting offset and block map for the first block. 
640 */ 641 if (start_lvl == 0) { 642 sbap = &ip->i_din1->di_db[0]; 643 soff = start_lbn; 644 } else { 645 idp = &start_ap[start_lvl - 1]; 646 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 647 brelse(sbp); 648 return (ENOSPC); 649 } 650 sbap = (ufs1_daddr_t *)sbp->b_data; 651 soff = idp->in_off; 652 } 653 /* 654 * If the block range spans two block maps, get the second map. 655 */ 656 ebap = NULL; 657 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 658 ssize = len; 659 } else { 660 #ifdef INVARIANTS 661 if (start_lvl > 0 && 662 start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 663 panic("ffs_reallocblk: start == end"); 664 #endif 665 ssize = len - (idp->in_off + 1); 666 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 667 goto fail; 668 ebap = (ufs1_daddr_t *)ebp->b_data; 669 } 670 /* 671 * Find the preferred location for the cluster. If we have not 672 * previously failed at this endeavor, then follow our standard 673 * preference calculation. If we have failed at it, then pick up 674 * where we last ended our search. 675 */ 676 UFS_LOCK(ump); 677 if (ip->i_nextclustercg == -1) 678 pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); 679 else 680 pref = cgdata(fs, ip->i_nextclustercg); 681 /* 682 * Search the block map looking for an allocation of the desired size. 683 * To avoid wasting too much time, we limit the number of cylinder 684 * groups that we will search. 685 */ 686 cg = dtog(fs, pref); 687 for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { 688 if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) 689 break; 690 cg += 1; 691 if (cg >= fs->fs_ncg) 692 cg = 0; 693 } 694 /* 695 * If we have failed in our search, record where we gave up for 696 * next time. Otherwise, fall back to our usual search citerion. 697 */ 698 if (newblk == 0) { 699 ip->i_nextclustercg = cg; 700 UFS_UNLOCK(ump); 701 goto fail; 702 } 703 ip->i_nextclustercg = -1; 704 /* 705 * We have found a new contiguous block. 
706 * 707 * First we have to replace the old block pointers with the new 708 * block pointers in the inode and indirect blocks associated 709 * with the file. 710 */ 711 #ifdef DIAGNOSTIC 712 if (prtrealloc) 713 printf("realloc: ino %ju, lbns %jd-%jd\n\told:", 714 (uintmax_t)ip->i_number, 715 (intmax_t)start_lbn, (intmax_t)end_lbn); 716 #endif 717 blkno = newblk; 718 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 719 if (i == ssize) { 720 bap = ebap; 721 soff = -i; 722 } 723 #ifdef INVARIANTS 724 if (!ffs_checkblk(ip, 725 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 726 panic("ffs_reallocblks: unallocated block 2"); 727 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 728 panic("ffs_reallocblks: alloc mismatch"); 729 #endif 730 #ifdef DIAGNOSTIC 731 if (prtrealloc) 732 printf(" %d,", *bap); 733 #endif 734 if (DOINGSOFTDEP(vp)) { 735 if (sbap == &ip->i_din1->di_db[0] && i < ssize) 736 softdep_setup_allocdirect(ip, start_lbn + i, 737 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 738 buflist->bs_children[i]); 739 else 740 softdep_setup_allocindir_page(ip, start_lbn + i, 741 i < ssize ? sbp : ebp, soff + i, blkno, 742 *bap, buflist->bs_children[i]); 743 } 744 *bap++ = blkno; 745 } 746 /* 747 * Next we must write out the modified inode and indirect blocks. 748 * For strict correctness, the writes should be synchronous since 749 * the old block values may have been written to disk. In practise 750 * they are almost never written, but if we are concerned about 751 * strict correctness, the `doasyncfree' flag should be set to zero. 752 * 753 * The test on `doasyncfree' should be changed to test a flag 754 * that shows whether the associated buffers and inodes have 755 * been written. The flag should be set when the cluster is 756 * started and cleared whenever the buffer or inode is flushed. 757 * We can then check below to see if it is set, and do the 758 * synchronous write only when it has been cleared. 
759 */ 760 if (sbap != &ip->i_din1->di_db[0]) { 761 if (doasyncfree) 762 bdwrite(sbp); 763 else 764 bwrite(sbp); 765 } else { 766 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 767 if (!doasyncfree) 768 ffs_update(vp, 1); 769 } 770 if (ssize < len) { 771 if (doasyncfree) 772 bdwrite(ebp); 773 else 774 bwrite(ebp); 775 } 776 /* 777 * Last, free the old blocks and assign the new blocks to the buffers. 778 */ 779 #ifdef DIAGNOSTIC 780 if (prtrealloc) 781 printf("\n\tnew:"); 782 #endif 783 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 784 bp = buflist->bs_children[i]; 785 if (!DOINGSOFTDEP(vp)) 786 /* 787 * The usual case is that a set of N-contiguous blocks 788 * that was just allocated has been replaced with a 789 * set of N+1-contiguous blocks. If they are marked as 790 * B_DELWRI, the current contents have not been written 791 * to disk. It is possible that the blocks were written 792 * earlier, but very uncommon. If the blocks have never 793 * been written, there is no need to send a BIO_DELETE 794 * for them when they are freed. The gain from avoiding 795 * the TRIMs for the common case of unwritten blocks 796 * far exceeds the cost of the write amplification for 797 * the uncommon case of failing to send a TRIM for the 798 * blocks that had been written. 799 */ 800 ffs_blkfree(ump, fs, ump->um_devvp, 801 dbtofsb(fs, bp->b_blkno), 802 fs->fs_bsize, ip->i_number, vp->v_type, NULL, 803 (bp->b_flags & B_DELWRI) != 0 ? 
804 NOTRIM_KEY : SINGLETON_KEY); 805 bp->b_blkno = fsbtodb(fs, blkno); 806 #ifdef INVARIANTS 807 if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) 808 panic("ffs_reallocblks: unallocated block 3"); 809 #endif 810 #ifdef DIAGNOSTIC 811 if (prtrealloc) 812 printf(" %d,", blkno); 813 #endif 814 } 815 #ifdef DIAGNOSTIC 816 if (prtrealloc) { 817 prtrealloc--; 818 printf("\n"); 819 } 820 #endif 821 return (0); 822 823 fail: 824 if (ssize < len) 825 brelse(ebp); 826 if (sbap != &ip->i_din1->di_db[0]) 827 brelse(sbp); 828 return (ENOSPC); 829 } 830 831 static int 832 ffs_reallocblks_ufs2( 833 struct vop_reallocblks_args /* { 834 struct vnode *a_vp; 835 struct cluster_save *a_buflist; 836 } */ *ap) 837 { 838 struct fs *fs; 839 struct inode *ip; 840 struct vnode *vp; 841 struct buf *sbp, *ebp, *bp; 842 ufs2_daddr_t *bap, *sbap, *ebap; 843 struct cluster_save *buflist; 844 struct ufsmount *ump; 845 ufs_lbn_t start_lbn, end_lbn; 846 ufs2_daddr_t soff, newblk, blkno, pref; 847 struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; 848 int i, cg, len, start_lvl, end_lvl, ssize; 849 850 vp = ap->a_vp; 851 ip = VTOI(vp); 852 ump = ITOUMP(ip); 853 fs = ump->um_fs; 854 /* 855 * If we are not tracking block clusters or if we have less than 4% 856 * free blocks left, then do not attempt to cluster. Running with 857 * less than 5% free block reserve is not recommended and those that 858 * choose to do so do not expect to have good file layout. 
859 */ 860 if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 861 return (ENOSPC); 862 buflist = ap->a_buflist; 863 len = buflist->bs_nchildren; 864 start_lbn = buflist->bs_children[0]->b_lblkno; 865 end_lbn = start_lbn + len - 1; 866 #ifdef INVARIANTS 867 for (i = 0; i < len; i++) 868 if (!ffs_checkblk(ip, 869 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 870 panic("ffs_reallocblks: unallocated block 1"); 871 for (i = 1; i < len; i++) 872 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 873 panic("ffs_reallocblks: non-logical cluster"); 874 blkno = buflist->bs_children[0]->b_blkno; 875 ssize = fsbtodb(fs, fs->fs_frag); 876 for (i = 1; i < len - 1; i++) 877 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 878 panic("ffs_reallocblks: non-physical cluster %d", i); 879 #endif 880 /* 881 * If the cluster crosses the boundary for the first indirect 882 * block, do not move anything in it. Indirect blocks are 883 * usually initially laid out in a position between the data 884 * blocks. Block reallocation would usually destroy locality by 885 * moving the indirect block out of the way to make room for 886 * data blocks if we didn't compensate here. We should also do 887 * this for other indirect block boundaries, but it is only 888 * important for the first one. 889 */ 890 if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) 891 return (ENOSPC); 892 /* 893 * If the latest allocation is in a new cylinder group, assume that 894 * the filesystem has decided to move and do not force it back to 895 * the previous cylinder group. 896 */ 897 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 898 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 899 return (ENOSPC); 900 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 901 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 902 return (ENOSPC); 903 /* 904 * Get the starting offset and block map for the first block. 
905 */ 906 if (start_lvl == 0) { 907 sbap = &ip->i_din2->di_db[0]; 908 soff = start_lbn; 909 } else { 910 idp = &start_ap[start_lvl - 1]; 911 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 912 brelse(sbp); 913 return (ENOSPC); 914 } 915 sbap = (ufs2_daddr_t *)sbp->b_data; 916 soff = idp->in_off; 917 } 918 /* 919 * If the block range spans two block maps, get the second map. 920 */ 921 ebap = NULL; 922 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 923 ssize = len; 924 } else { 925 #ifdef INVARIANTS 926 if (start_lvl > 0 && 927 start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 928 panic("ffs_reallocblk: start == end"); 929 #endif 930 ssize = len - (idp->in_off + 1); 931 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 932 goto fail; 933 ebap = (ufs2_daddr_t *)ebp->b_data; 934 } 935 /* 936 * Find the preferred location for the cluster. If we have not 937 * previously failed at this endeavor, then follow our standard 938 * preference calculation. If we have failed at it, then pick up 939 * where we last ended our search. 940 */ 941 UFS_LOCK(ump); 942 if (ip->i_nextclustercg == -1) 943 pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); 944 else 945 pref = cgdata(fs, ip->i_nextclustercg); 946 /* 947 * Search the block map looking for an allocation of the desired size. 948 * To avoid wasting too much time, we limit the number of cylinder 949 * groups that we will search. 950 */ 951 cg = dtog(fs, pref); 952 for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { 953 if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) 954 break; 955 cg += 1; 956 if (cg >= fs->fs_ncg) 957 cg = 0; 958 } 959 /* 960 * If we have failed in our search, record where we gave up for 961 * next time. Otherwise, fall back to our usual search citerion. 962 */ 963 if (newblk == 0) { 964 ip->i_nextclustercg = cg; 965 UFS_UNLOCK(ump); 966 goto fail; 967 } 968 ip->i_nextclustercg = -1; 969 /* 970 * We have found a new contiguous block. 
971 * 972 * First we have to replace the old block pointers with the new 973 * block pointers in the inode and indirect blocks associated 974 * with the file. 975 */ 976 #ifdef DIAGNOSTIC 977 if (prtrealloc) 978 printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, 979 (intmax_t)start_lbn, (intmax_t)end_lbn); 980 #endif 981 blkno = newblk; 982 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 983 if (i == ssize) { 984 bap = ebap; 985 soff = -i; 986 } 987 #ifdef INVARIANTS 988 if (!ffs_checkblk(ip, 989 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 990 panic("ffs_reallocblks: unallocated block 2"); 991 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 992 panic("ffs_reallocblks: alloc mismatch"); 993 #endif 994 #ifdef DIAGNOSTIC 995 if (prtrealloc) 996 printf(" %jd,", (intmax_t)*bap); 997 #endif 998 if (DOINGSOFTDEP(vp)) { 999 if (sbap == &ip->i_din2->di_db[0] && i < ssize) 1000 softdep_setup_allocdirect(ip, start_lbn + i, 1001 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 1002 buflist->bs_children[i]); 1003 else 1004 softdep_setup_allocindir_page(ip, start_lbn + i, 1005 i < ssize ? sbp : ebp, soff + i, blkno, 1006 *bap, buflist->bs_children[i]); 1007 } 1008 *bap++ = blkno; 1009 } 1010 /* 1011 * Next we must write out the modified inode and indirect blocks. 1012 * For strict correctness, the writes should be synchronous since 1013 * the old block values may have been written to disk. In practise 1014 * they are almost never written, but if we are concerned about 1015 * strict correctness, the `doasyncfree' flag should be set to zero. 1016 * 1017 * The test on `doasyncfree' should be changed to test a flag 1018 * that shows whether the associated buffers and inodes have 1019 * been written. The flag should be set when the cluster is 1020 * started and cleared whenever the buffer or inode is flushed. 
1021 * We can then check below to see if it is set, and do the 1022 * synchronous write only when it has been cleared. 1023 */ 1024 if (sbap != &ip->i_din2->di_db[0]) { 1025 if (doasyncfree) 1026 bdwrite(sbp); 1027 else 1028 bwrite(sbp); 1029 } else { 1030 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 1031 if (!doasyncfree) 1032 ffs_update(vp, 1); 1033 } 1034 if (ssize < len) { 1035 if (doasyncfree) 1036 bdwrite(ebp); 1037 else 1038 bwrite(ebp); 1039 } 1040 /* 1041 * Last, free the old blocks and assign the new blocks to the buffers. 1042 */ 1043 #ifdef DIAGNOSTIC 1044 if (prtrealloc) 1045 printf("\n\tnew:"); 1046 #endif 1047 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 1048 bp = buflist->bs_children[i]; 1049 if (!DOINGSOFTDEP(vp)) 1050 /* 1051 * The usual case is that a set of N-contiguous blocks 1052 * that was just allocated has been replaced with a 1053 * set of N+1-contiguous blocks. If they are marked as 1054 * B_DELWRI, the current contents have not been written 1055 * to disk. It is possible that the blocks were written 1056 * earlier, but very uncommon. If the blocks have never 1057 * been written, there is no need to send a BIO_DELETE 1058 * for them when they are freed. The gain from avoiding 1059 * the TRIMs for the common case of unwritten blocks 1060 * far exceeds the cost of the write amplification for 1061 * the uncommon case of failing to send a TRIM for the 1062 * blocks that had been written. 1063 */ 1064 ffs_blkfree(ump, fs, ump->um_devvp, 1065 dbtofsb(fs, bp->b_blkno), 1066 fs->fs_bsize, ip->i_number, vp->v_type, NULL, 1067 (bp->b_flags & B_DELWRI) != 0 ? 
1068 NOTRIM_KEY : SINGLETON_KEY); 1069 bp->b_blkno = fsbtodb(fs, blkno); 1070 #ifdef INVARIANTS 1071 if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) 1072 panic("ffs_reallocblks: unallocated block 3"); 1073 #endif 1074 #ifdef DIAGNOSTIC 1075 if (prtrealloc) 1076 printf(" %jd,", (intmax_t)blkno); 1077 #endif 1078 } 1079 #ifdef DIAGNOSTIC 1080 if (prtrealloc) { 1081 prtrealloc--; 1082 printf("\n"); 1083 } 1084 #endif 1085 return (0); 1086 1087 fail: 1088 if (ssize < len) 1089 brelse(ebp); 1090 if (sbap != &ip->i_din2->di_db[0]) 1091 brelse(sbp); 1092 return (ENOSPC); 1093 } 1094 1095 /* 1096 * Allocate an inode in the filesystem. 1097 * 1098 * If allocating a directory, use ffs_dirpref to select the inode. 1099 * If allocating in a directory, the following hierarchy is followed: 1100 * 1) allocate the preferred inode. 1101 * 2) allocate an inode in the same cylinder group. 1102 * 3) quadratically rehash into other cylinder groups, until an 1103 * available inode is located. 1104 * If no inode preference is given the following hierarchy is used 1105 * to allocate an inode: 1106 * 1) allocate an inode in cylinder group 0. 1107 * 2) quadratically rehash into other cylinder groups, until an 1108 * available inode is located. 
1109 */ 1110 int 1111 ffs_valloc(struct vnode *pvp, 1112 int mode, 1113 struct ucred *cred, 1114 struct vnode **vpp) 1115 { 1116 struct inode *pip; 1117 struct fs *fs; 1118 struct inode *ip; 1119 struct timespec ts; 1120 struct ufsmount *ump; 1121 ino_t ino, ipref; 1122 uint64_t cg; 1123 int error, reclaimed; 1124 1125 *vpp = NULL; 1126 pip = VTOI(pvp); 1127 ump = ITOUMP(pip); 1128 fs = ump->um_fs; 1129 1130 UFS_LOCK(ump); 1131 reclaimed = 0; 1132 retry: 1133 if (fs->fs_cstotal.cs_nifree == 0) 1134 goto noinodes; 1135 1136 if ((mode & IFMT) == IFDIR) 1137 ipref = ffs_dirpref(pip); 1138 else 1139 ipref = pip->i_number; 1140 if (ipref >= fs->fs_ncg * fs->fs_ipg) 1141 ipref = 0; 1142 cg = ino_to_cg(fs, ipref); 1143 /* 1144 * Track number of dirs created one after another 1145 * in a same cg without intervening by files. 1146 */ 1147 if ((mode & IFMT) == IFDIR) { 1148 if (fs->fs_contigdirs[cg] < 255) 1149 fs->fs_contigdirs[cg]++; 1150 } else { 1151 if (fs->fs_contigdirs[cg] > 0) 1152 fs->fs_contigdirs[cg]--; 1153 } 1154 ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 1155 (allocfcn_t *)ffs_nodealloccg); 1156 if (ino == 0) 1157 goto noinodes; 1158 /* 1159 * Get rid of the cached old vnode, force allocation of a new vnode 1160 * for this inode. If this fails, release the allocated ino and 1161 * return the error. 1162 */ 1163 if ((error = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp, 1164 FFSV_FORCEINSMQ | FFSV_REPLACE | FFSV_NEWINODE)) != 0) { 1165 ffs_vfree(pvp, ino, mode); 1166 return (error); 1167 } 1168 /* 1169 * We got an inode, so check mode and panic if it is already allocated. 
1170 */ 1171 ip = VTOI(*vpp); 1172 if (ip->i_mode) { 1173 printf("mode = 0%o, inum = %ju, fs = %s\n", 1174 ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt); 1175 panic("ffs_valloc: dup alloc"); 1176 } 1177 if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ 1178 printf("free inode %s/%ju had %ld blocks\n", 1179 fs->fs_fsmnt, (intmax_t)ino, (long)DIP(ip, i_blocks)); 1180 DIP_SET(ip, i_blocks, 0); 1181 } 1182 ip->i_flags = 0; 1183 DIP_SET(ip, i_flags, 0); 1184 if ((mode & IFMT) == IFDIR) 1185 DIP_SET(ip, i_dirdepth, DIP(pip, i_dirdepth) + 1); 1186 /* 1187 * Set up a new generation number for this inode. 1188 */ 1189 while (ip->i_gen == 0 || ++ip->i_gen == 0) 1190 ip->i_gen = arc4random(); 1191 DIP_SET(ip, i_gen, ip->i_gen); 1192 if (fs->fs_magic == FS_UFS2_MAGIC) { 1193 vfs_timestamp(&ts); 1194 ip->i_din2->di_birthtime = ts.tv_sec; 1195 ip->i_din2->di_birthnsec = ts.tv_nsec; 1196 } 1197 ip->i_flag = 0; 1198 (*vpp)->v_vflag = 0; 1199 (*vpp)->v_type = VNON; 1200 if (fs->fs_magic == FS_UFS2_MAGIC) { 1201 (*vpp)->v_op = &ffs_vnodeops2; 1202 UFS_INODE_SET_FLAG(ip, IN_UFS2); 1203 } else { 1204 (*vpp)->v_op = &ffs_vnodeops1; 1205 } 1206 return (0); 1207 noinodes: 1208 if (reclaimed == 0) { 1209 reclaimed = 1; 1210 softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT); 1211 goto retry; 1212 } 1213 if (ffs_fsfail_cleanup_locked(ump, 0)) { 1214 UFS_UNLOCK(ump); 1215 return (ENXIO); 1216 } 1217 if (ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 1218 UFS_UNLOCK(ump); 1219 ffs_fserr(fs, pip->i_number, "out of inodes"); 1220 uprintf("\n%s: create/symlink failed, no inodes free\n", 1221 fs->fs_fsmnt); 1222 } else { 1223 UFS_UNLOCK(ump); 1224 } 1225 return (ENOSPC); 1226 } 1227 1228 /* 1229 * Find a cylinder group to place a directory. 
1230 * 1231 * The policy implemented by this algorithm is to allocate a 1232 * directory inode in the same cylinder group as its parent 1233 * directory, but also to reserve space for its files inodes 1234 * and data. Restrict the number of directories which may be 1235 * allocated one after another in the same cylinder group 1236 * without intervening allocation of files. 1237 * 1238 * If we allocate a first level directory then force allocation 1239 * in another cylinder group. 1240 */ 1241 static ino_t 1242 ffs_dirpref(struct inode *pip) 1243 { 1244 struct fs *fs; 1245 int cg, prefcg, curcg, dirsize, cgsize; 1246 int depth, range, start, end, numdirs, power, numerator, denominator; 1247 uint64_t avgifree, avgbfree, avgndir, curdirsize; 1248 uint64_t minifree, minbfree, maxndir; 1249 uint64_t maxcontigdirs; 1250 1251 mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED); 1252 fs = ITOFS(pip); 1253 1254 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 1255 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1256 avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; 1257 1258 /* 1259 * Select a preferred cylinder group to place a new directory. 1260 * If we are near the root of the filesystem we aim to spread 1261 * them out as much as possible. As we descend deeper from the 1262 * root we cluster them closer together around their parent as 1263 * we expect them to be more closely interactive. Higher-level 1264 * directories like usr/src/sys and usr/src/bin should be 1265 * separated while the directories in these areas are more 1266 * likely to be accessed together so should be closer. 1267 * 1268 * We pick a range of cylinder groups around the cylinder group 1269 * of the directory in which we are being created. The size of 1270 * the range for our search is based on our depth from the root 1271 * of our filesystem. We then probe that range based on how many 1272 * directories are already present. 
The first new directory is at 1273 * 1/2 (middle) of the range; the second is in the first 1/4 of the 1274 * range, then at 3/4, 1/8, 3/8, 5/8, 7/8, 1/16, 3/16, 5/16, etc. 1275 */ 1276 depth = DIP(pip, i_dirdepth); 1277 range = fs->fs_ncg / (1 << depth); 1278 curcg = ino_to_cg(fs, pip->i_number); 1279 start = curcg - (range / 2); 1280 if (start < 0) 1281 start += fs->fs_ncg; 1282 end = curcg + (range / 2); 1283 if (end >= fs->fs_ncg) 1284 end -= fs->fs_ncg; 1285 numdirs = pip->i_effnlink - 1; 1286 power = fls(numdirs); 1287 numerator = (numdirs & ~(1 << (power - 1))) * 2 + 1; 1288 denominator = 1 << power; 1289 prefcg = (curcg - (range / 2) + (range * numerator / denominator)); 1290 if (prefcg < 0) 1291 prefcg += fs->fs_ncg; 1292 if (prefcg >= fs->fs_ncg) 1293 prefcg -= fs->fs_ncg; 1294 /* 1295 * If this filesystem is not tracking directory depths, 1296 * revert to the old algorithm. 1297 */ 1298 if (depth == 0 && pip->i_number != UFS_ROOTINO) 1299 prefcg = curcg; 1300 1301 /* 1302 * Count various limits which used for 1303 * optimal allocation of a directory inode. 1304 */ 1305 maxndir = min(avgndir + (1 << depth), fs->fs_ipg); 1306 minifree = avgifree - avgifree / 4; 1307 if (minifree < 1) 1308 minifree = 1; 1309 minbfree = avgbfree - avgbfree / 4; 1310 if (minbfree < 1) 1311 minbfree = 1; 1312 cgsize = fs->fs_fsize * fs->fs_fpg; 1313 dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; 1314 curdirsize = avgndir ? 
(cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; 1315 if (dirsize < curdirsize) 1316 dirsize = curdirsize; 1317 if (dirsize <= 0) 1318 maxcontigdirs = 0; /* dirsize overflowed */ 1319 else 1320 maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); 1321 if (fs->fs_avgfpdir > 0) 1322 maxcontigdirs = min(maxcontigdirs, 1323 fs->fs_ipg / fs->fs_avgfpdir); 1324 if (maxcontigdirs == 0) 1325 maxcontigdirs = 1; 1326 1327 /* 1328 * Limit number of dirs in one cg and reserve space for 1329 * regular files, but only if we have no deficit in 1330 * inodes or space. 1331 * 1332 * We are trying to find a suitable cylinder group nearby 1333 * our preferred cylinder group to place a new directory. 1334 * We scan from our preferred cylinder group forward looking 1335 * for a cylinder group that meets our criterion. If we get 1336 * to the final cylinder group and do not find anything, 1337 * we start scanning forwards from the beginning of the 1338 * filesystem. While it might seem sensible to start scanning 1339 * backwards or even to alternate looking forward and backward, 1340 * this approach fails badly when the filesystem is nearly full. 1341 * Specifically, we first search all the areas that have no space 1342 * and finally try the one preceding that. We repeat this on 1343 * every request and in the case of the final block end up 1344 * searching the entire filesystem. By jumping to the front 1345 * of the filesystem, our future forward searches always look 1346 * in new cylinder groups so finds every possible block after 1347 * one pass over the filesystem. 
1348 */ 1349 for (cg = prefcg; cg < fs->fs_ncg; cg++) 1350 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 1351 fs->fs_cs(fs, cg).cs_nifree >= minifree && 1352 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 1353 if (fs->fs_contigdirs[cg] < maxcontigdirs) 1354 return ((ino_t)(fs->fs_ipg * cg)); 1355 } 1356 for (cg = 0; cg < prefcg; cg++) 1357 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 1358 fs->fs_cs(fs, cg).cs_nifree >= minifree && 1359 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 1360 if (fs->fs_contigdirs[cg] < maxcontigdirs) 1361 return ((ino_t)(fs->fs_ipg * cg)); 1362 } 1363 /* 1364 * This is a backstop when we have deficit in space. 1365 */ 1366 for (cg = prefcg; cg < fs->fs_ncg; cg++) 1367 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 1368 return ((ino_t)(fs->fs_ipg * cg)); 1369 for (cg = 0; cg < prefcg; cg++) 1370 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 1371 break; 1372 return ((ino_t)(fs->fs_ipg * cg)); 1373 } 1374 1375 /* 1376 * Select the desired position for the next block in a file. The file is 1377 * logically divided into sections. The first section is composed of the 1378 * direct blocks and the next fs_maxbpg blocks. Each additional section 1379 * contains fs_maxbpg blocks. 1380 * 1381 * If no blocks have been allocated in the first section, the policy is to 1382 * request a block in the same cylinder group as the inode that describes 1383 * the file. The first indirect is allocated immediately following the last 1384 * direct block and the data blocks for the first indirect immediately 1385 * follow it. 1386 * 1387 * If no blocks have been allocated in any other section, the indirect 1388 * block(s) are allocated in the same cylinder group as its inode in an 1389 * area reserved immediately following the inode blocks. The policy for 1390 * the data blocks is to place them in a cylinder group with a greater than 1391 * average number of free blocks. An appropriate cylinder group is found 1392 * by using a rotor that sweeps the cylinder groups. 
When a new group of 1393 * blocks is needed, the sweep begins in the cylinder group following the 1394 * cylinder group from which the previous allocation was made. The sweep 1395 * continues until a cylinder group with greater than the average number 1396 * of free blocks is found. If the allocation is for the first block in an 1397 * indirect block or the previous block is a hole, then the information on 1398 * the previous allocation is unavailable; here a best guess is made based 1399 * on the logical block number being allocated. 1400 * 1401 * If a section is already partially allocated, the policy is to 1402 * allocate blocks contiguously within the section if possible. 1403 */ 1404 ufs2_daddr_t 1405 ffs_blkpref_ufs1(struct inode *ip, 1406 ufs_lbn_t lbn, 1407 int indx, 1408 ufs1_daddr_t *bap) 1409 { 1410 struct fs *fs; 1411 uint64_t cg, inocg; 1412 uint64_t avgbfree, startcg; 1413 ufs2_daddr_t pref, prevbn; 1414 1415 KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1416 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1417 fs = ITOFS(ip); 1418 /* 1419 * Allocation of indirect blocks is indicated by passing negative 1420 * values in indx: -1 for single indirect, -2 for double indirect, 1421 * -3 for triple indirect. As noted below, we attempt to allocate 1422 * the first indirect inline with the file data. For all later 1423 * indirect blocks, the data is often allocated in other cylinder 1424 * groups. However to speed random file access and to speed up 1425 * fsck, the filesystem reserves the first fs_metaspace blocks 1426 * (typically half of fs_minfree) of the data area of each cylinder 1427 * group to hold these later indirect blocks. 1428 */ 1429 inocg = ino_to_cg(fs, ip->i_number); 1430 if (indx < 0) { 1431 /* 1432 * Our preference for indirect blocks is the zone at the 1433 * beginning of the inode's cylinder group data area that 1434 * we try to reserve for indirect blocks. 
1435 */ 1436 pref = cgmeta(fs, inocg); 1437 /* 1438 * If we are allocating the first indirect block, try to 1439 * place it immediately following the last direct block. 1440 */ 1441 if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && 1442 ip->i_din1->di_db[UFS_NDADDR - 1] != 0) 1443 pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag; 1444 return (pref); 1445 } 1446 /* 1447 * If we are allocating the first data block in the first indirect 1448 * block and the indirect has been allocated in the data block area, 1449 * try to place it immediately following the indirect block. 1450 */ 1451 if (lbn == UFS_NDADDR) { 1452 pref = ip->i_din1->di_ib[0]; 1453 if (pref != 0 && pref >= cgdata(fs, inocg) && 1454 pref < cgbase(fs, inocg + 1)) 1455 return (pref + fs->fs_frag); 1456 } 1457 /* 1458 * If we are at the beginning of a file, or we have already allocated 1459 * the maximum number of blocks per cylinder group, or we do not 1460 * have a block allocated immediately preceding us, then we need 1461 * to decide where to start allocating new blocks. 1462 */ 1463 if (indx == 0) { 1464 prevbn = 0; 1465 } else { 1466 prevbn = bap[indx - 1]; 1467 if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, 1468 fs->fs_bsize) != 0) 1469 prevbn = 0; 1470 } 1471 if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { 1472 /* 1473 * If we are allocating a directory data block, we want 1474 * to place it in the metadata area. 1475 */ 1476 if ((ip->i_mode & IFMT) == IFDIR) 1477 return (cgmeta(fs, inocg)); 1478 /* 1479 * Until we fill all the direct and all the first indirect's 1480 * blocks, we try to allocate in the data area of the inode's 1481 * cylinder group. 1482 */ 1483 if (lbn < UFS_NDADDR + NINDIR(fs)) 1484 return (cgdata(fs, inocg)); 1485 /* 1486 * Find a cylinder with greater than average number of 1487 * unused data blocks. 
1488 */ 1489 if (indx == 0 || prevbn == 0) 1490 startcg = inocg + lbn / fs->fs_maxbpg; 1491 else 1492 startcg = dtog(fs, prevbn) + 1; 1493 startcg %= fs->fs_ncg; 1494 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1495 for (cg = startcg; cg < fs->fs_ncg; cg++) 1496 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1497 fs->fs_cgrotor = cg; 1498 return (cgdata(fs, cg)); 1499 } 1500 for (cg = 0; cg <= startcg; cg++) 1501 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1502 fs->fs_cgrotor = cg; 1503 return (cgdata(fs, cg)); 1504 } 1505 return (0); 1506 } 1507 /* 1508 * Otherwise, we just always try to lay things out contiguously. 1509 */ 1510 return (prevbn + fs->fs_frag); 1511 } 1512 1513 /* 1514 * Same as above, but for UFS2 1515 */ 1516 ufs2_daddr_t 1517 ffs_blkpref_ufs2(struct inode *ip, 1518 ufs_lbn_t lbn, 1519 int indx, 1520 ufs2_daddr_t *bap) 1521 { 1522 struct fs *fs; 1523 uint64_t cg, inocg; 1524 uint64_t avgbfree, startcg; 1525 ufs2_daddr_t pref, prevbn; 1526 1527 KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1528 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1529 fs = ITOFS(ip); 1530 /* 1531 * Allocation of indirect blocks is indicated by passing negative 1532 * values in indx: -1 for single indirect, -2 for double indirect, 1533 * -3 for triple indirect. As noted below, we attempt to allocate 1534 * the first indirect inline with the file data. For all later 1535 * indirect blocks, the data is often allocated in other cylinder 1536 * groups. However to speed random file access and to speed up 1537 * fsck, the filesystem reserves the first fs_metaspace blocks 1538 * (typically half of fs_minfree) of the data area of each cylinder 1539 * group to hold these later indirect blocks. 1540 */ 1541 inocg = ino_to_cg(fs, ip->i_number); 1542 if (indx < 0) { 1543 /* 1544 * Our preference for indirect blocks is the zone at the 1545 * beginning of the inode's cylinder group data area that 1546 * we try to reserve for indirect blocks. 
1547 */ 1548 pref = cgmeta(fs, inocg); 1549 /* 1550 * If we are allocating the first indirect block, try to 1551 * place it immediately following the last direct block. 1552 */ 1553 if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && 1554 ip->i_din2->di_db[UFS_NDADDR - 1] != 0) 1555 pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag; 1556 return (pref); 1557 } 1558 /* 1559 * If we are allocating the first data block in the first indirect 1560 * block and the indirect has been allocated in the data block area, 1561 * try to place it immediately following the indirect block. 1562 */ 1563 if (lbn == UFS_NDADDR) { 1564 pref = ip->i_din2->di_ib[0]; 1565 if (pref != 0 && pref >= cgdata(fs, inocg) && 1566 pref < cgbase(fs, inocg + 1)) 1567 return (pref + fs->fs_frag); 1568 } 1569 /* 1570 * If we are at the beginning of a file, or we have already allocated 1571 * the maximum number of blocks per cylinder group, or we do not 1572 * have a block allocated immediately preceding us, then we need 1573 * to decide where to start allocating new blocks. 1574 */ 1575 if (indx == 0) { 1576 prevbn = 0; 1577 } else { 1578 prevbn = bap[indx - 1]; 1579 if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, 1580 fs->fs_bsize) != 0) 1581 prevbn = 0; 1582 } 1583 if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { 1584 /* 1585 * If we are allocating a directory data block, we want 1586 * to place it in the metadata area. 1587 */ 1588 if ((ip->i_mode & IFMT) == IFDIR) 1589 return (cgmeta(fs, inocg)); 1590 /* 1591 * Until we fill all the direct and all the first indirect's 1592 * blocks, we try to allocate in the data area of the inode's 1593 * cylinder group. 1594 */ 1595 if (lbn < UFS_NDADDR + NINDIR(fs)) 1596 return (cgdata(fs, inocg)); 1597 /* 1598 * Find a cylinder with greater than average number of 1599 * unused data blocks. 
1600 */ 1601 if (indx == 0 || prevbn == 0) 1602 startcg = inocg + lbn / fs->fs_maxbpg; 1603 else 1604 startcg = dtog(fs, prevbn) + 1; 1605 startcg %= fs->fs_ncg; 1606 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1607 for (cg = startcg; cg < fs->fs_ncg; cg++) 1608 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1609 fs->fs_cgrotor = cg; 1610 return (cgdata(fs, cg)); 1611 } 1612 for (cg = 0; cg <= startcg; cg++) 1613 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1614 fs->fs_cgrotor = cg; 1615 return (cgdata(fs, cg)); 1616 } 1617 return (0); 1618 } 1619 /* 1620 * Otherwise, we just always try to lay things out contiguously. 1621 */ 1622 return (prevbn + fs->fs_frag); 1623 } 1624 1625 /* 1626 * Implement the cylinder overflow algorithm. 1627 * 1628 * The policy implemented by this algorithm is: 1629 * 1) allocate the block in its requested cylinder group. 1630 * 2) quadratically rehash on the cylinder group number. 1631 * 3) brute force search for a free block. 1632 * 1633 * Must be called with the UFS lock held. Will release the lock on success 1634 * and return with it held on failure. 1635 */ 1636 /*VARARGS5*/ 1637 static ufs2_daddr_t 1638 ffs_hashalloc(struct inode *ip, 1639 uint64_t cg, 1640 ufs2_daddr_t pref, 1641 int size, /* Search size for data blocks, mode for inodes */ 1642 int rsize, /* Real allocated size. 
*/ 1643 allocfcn_t *allocator) 1644 { 1645 struct fs *fs; 1646 ufs2_daddr_t result; 1647 uint64_t i, icg = cg; 1648 1649 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1650 #ifdef INVARIANTS 1651 if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 1652 panic("ffs_hashalloc: allocation on suspended filesystem"); 1653 #endif 1654 fs = ITOFS(ip); 1655 /* 1656 * 1: preferred cylinder group 1657 */ 1658 result = (*allocator)(ip, cg, pref, size, rsize); 1659 if (result) 1660 return (result); 1661 /* 1662 * 2: quadratic rehash 1663 */ 1664 for (i = 1; i < fs->fs_ncg; i *= 2) { 1665 cg += i; 1666 if (cg >= fs->fs_ncg) 1667 cg -= fs->fs_ncg; 1668 result = (*allocator)(ip, cg, 0, size, rsize); 1669 if (result) 1670 return (result); 1671 } 1672 /* 1673 * 3: brute force search 1674 * Note that we start at i == 2, since 0 was checked initially, 1675 * and 1 is always checked in the quadratic rehash. 1676 */ 1677 cg = (icg + 2) % fs->fs_ncg; 1678 for (i = 2; i < fs->fs_ncg; i++) { 1679 result = (*allocator)(ip, cg, 0, size, rsize); 1680 if (result) 1681 return (result); 1682 cg++; 1683 if (cg == fs->fs_ncg) 1684 cg = 0; 1685 } 1686 return (0); 1687 } 1688 1689 /* 1690 * Determine whether a fragment can be extended. 1691 * 1692 * Check to see if the necessary fragments are available, and 1693 * if they are, allocate them. 
1694 */ 1695 static ufs2_daddr_t 1696 ffs_fragextend(struct inode *ip, 1697 uint64_t cg, 1698 ufs2_daddr_t bprev, 1699 int osize, 1700 int nsize) 1701 { 1702 struct fs *fs; 1703 struct cg *cgp; 1704 struct buf *bp; 1705 struct ufsmount *ump; 1706 int nffree; 1707 long bno; 1708 int frags, bbase; 1709 int i, error; 1710 uint8_t *blksfree; 1711 1712 ump = ITOUMP(ip); 1713 fs = ump->um_fs; 1714 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) 1715 return (0); 1716 frags = numfrags(fs, nsize); 1717 bbase = fragnum(fs, bprev); 1718 if (bbase > fragnum(fs, (bprev + frags - 1))) { 1719 /* cannot extend across a block boundary */ 1720 return (0); 1721 } 1722 UFS_UNLOCK(ump); 1723 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) 1724 goto fail; 1725 bno = dtogd(fs, bprev); 1726 blksfree = cg_blksfree(cgp); 1727 for (i = numfrags(fs, osize); i < frags; i++) 1728 if (isclr(blksfree, bno + i)) 1729 goto fail; 1730 /* 1731 * the current fragment can be extended 1732 * deduct the count on fragment being extended into 1733 * increase the count on the remaining fragment (if any) 1734 * allocate the extended piece 1735 */ 1736 for (i = frags; i < fs->fs_frag - bbase; i++) 1737 if (isclr(blksfree, bno + i)) 1738 break; 1739 cgp->cg_frsum[i - numfrags(fs, osize)]--; 1740 if (i != frags) 1741 cgp->cg_frsum[i - frags]++; 1742 for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) { 1743 clrbit(blksfree, bno + i); 1744 cgp->cg_cs.cs_nffree--; 1745 nffree++; 1746 } 1747 UFS_LOCK(ump); 1748 fs->fs_cstotal.cs_nffree -= nffree; 1749 fs->fs_cs(fs, cg).cs_nffree -= nffree; 1750 fs->fs_fmod = 1; 1751 ACTIVECLEAR(fs, cg); 1752 UFS_UNLOCK(ump); 1753 if (DOINGSOFTDEP(ITOV(ip))) 1754 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, 1755 frags, numfrags(fs, osize)); 1756 bdwrite(bp); 1757 return (bprev); 1758 1759 fail: 1760 brelse(bp); 1761 UFS_LOCK(ump); 1762 return (0); 1763 1764 } 1765 1766 /* 1767 * Determine whether a block can be allocated. 
1768 * 1769 * Check to see if a block of the appropriate size is available, 1770 * and if it is, allocate it. 1771 */ 1772 static ufs2_daddr_t 1773 ffs_alloccg(struct inode *ip, 1774 uint64_t cg, 1775 ufs2_daddr_t bpref, 1776 int size, 1777 int rsize) 1778 { 1779 struct fs *fs; 1780 struct cg *cgp; 1781 struct buf *bp; 1782 struct ufsmount *ump; 1783 ufs1_daddr_t bno; 1784 ufs2_daddr_t blkno; 1785 int i, allocsiz, error, frags; 1786 uint8_t *blksfree; 1787 1788 ump = ITOUMP(ip); 1789 fs = ump->um_fs; 1790 if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) 1791 return (0); 1792 UFS_UNLOCK(ump); 1793 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0 || 1794 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) 1795 goto fail; 1796 if (size == fs->fs_bsize) { 1797 UFS_LOCK(ump); 1798 blkno = ffs_alloccgblk(ip, bp, bpref, rsize); 1799 ACTIVECLEAR(fs, cg); 1800 UFS_UNLOCK(ump); 1801 bdwrite(bp); 1802 return (blkno); 1803 } 1804 /* 1805 * check to see if any fragments are already available 1806 * allocsiz is the size which will be allocated, hacking 1807 * it down to a smaller size if necessary 1808 */ 1809 blksfree = cg_blksfree(cgp); 1810 frags = numfrags(fs, size); 1811 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) 1812 if (cgp->cg_frsum[allocsiz] != 0) 1813 break; 1814 if (allocsiz == fs->fs_frag) { 1815 /* 1816 * no fragments were available, so a block will be 1817 * allocated, and hacked up 1818 */ 1819 if (cgp->cg_cs.cs_nbfree == 0) 1820 goto fail; 1821 UFS_LOCK(ump); 1822 blkno = ffs_alloccgblk(ip, bp, bpref, rsize); 1823 ACTIVECLEAR(fs, cg); 1824 UFS_UNLOCK(ump); 1825 bdwrite(bp); 1826 return (blkno); 1827 } 1828 KASSERT(size == rsize, 1829 ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); 1830 bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); 1831 if (bno < 0) 1832 goto fail; 1833 for (i = 0; i < frags; i++) 1834 clrbit(blksfree, bno + i); 1835 cgp->cg_cs.cs_nffree -= frags; 1836 cgp->cg_frsum[allocsiz]--; 1837 if (frags 
!= allocsiz) 1838 cgp->cg_frsum[allocsiz - frags]++; 1839 UFS_LOCK(ump); 1840 fs->fs_cstotal.cs_nffree -= frags; 1841 fs->fs_cs(fs, cg).cs_nffree -= frags; 1842 fs->fs_fmod = 1; 1843 blkno = cgbase(fs, cg) + bno; 1844 ACTIVECLEAR(fs, cg); 1845 UFS_UNLOCK(ump); 1846 if (DOINGSOFTDEP(ITOV(ip))) 1847 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); 1848 bdwrite(bp); 1849 return (blkno); 1850 1851 fail: 1852 brelse(bp); 1853 UFS_LOCK(ump); 1854 return (0); 1855 } 1856 1857 /* 1858 * Allocate a block in a cylinder group. 1859 * 1860 * This algorithm implements the following policy: 1861 * 1) allocate the requested block. 1862 * 2) allocate a rotationally optimal block in the same cylinder. 1863 * 3) allocate the next available block on the block rotor for the 1864 * specified cylinder group. 1865 * Note that this routine only allocates fs_bsize blocks; these 1866 * blocks may be fragmented by the routine that allocates them. 1867 */ 1868 static ufs2_daddr_t 1869 ffs_alloccgblk(struct inode *ip, 1870 struct buf *bp, 1871 ufs2_daddr_t bpref, 1872 int size) 1873 { 1874 struct fs *fs; 1875 struct cg *cgp; 1876 struct ufsmount *ump; 1877 ufs1_daddr_t bno; 1878 ufs2_daddr_t blkno; 1879 uint8_t *blksfree; 1880 int i, cgbpref; 1881 1882 ump = ITOUMP(ip); 1883 fs = ump->um_fs; 1884 mtx_assert(UFS_MTX(ump), MA_OWNED); 1885 cgp = (struct cg *)bp->b_data; 1886 blksfree = cg_blksfree(cgp); 1887 if (bpref == 0) { 1888 bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag; 1889 } else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) { 1890 /* map bpref to correct zone in this cg */ 1891 if (bpref < cgdata(fs, cgbpref)) 1892 bpref = cgmeta(fs, cgp->cg_cgx); 1893 else 1894 bpref = cgdata(fs, cgp->cg_cgx); 1895 } 1896 /* 1897 * if the requested block is available, use it 1898 */ 1899 bno = dtogd(fs, blknum(fs, bpref)); 1900 if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) 1901 goto gotit; 1902 /* 1903 * Take the next available block in this cylinder group. 
1904 */ 1905 bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); 1906 if (bno < 0) 1907 return (0); 1908 /* Update cg_rotor only if allocated from the data zone */ 1909 if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx))) 1910 cgp->cg_rotor = bno; 1911 gotit: 1912 blkno = fragstoblks(fs, bno); 1913 ffs_clrblock(fs, blksfree, (long)blkno); 1914 ffs_clusteracct(fs, cgp, blkno, -1); 1915 cgp->cg_cs.cs_nbfree--; 1916 fs->fs_cstotal.cs_nbfree--; 1917 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; 1918 fs->fs_fmod = 1; 1919 blkno = cgbase(fs, cgp->cg_cgx) + bno; 1920 /* 1921 * If the caller didn't want the whole block free the frags here. 1922 */ 1923 size = numfrags(fs, size); 1924 if (size != fs->fs_frag) { 1925 bno = dtogd(fs, blkno); 1926 for (i = size; i < fs->fs_frag; i++) 1927 setbit(blksfree, bno + i); 1928 i = fs->fs_frag - size; 1929 cgp->cg_cs.cs_nffree += i; 1930 fs->fs_cstotal.cs_nffree += i; 1931 fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; 1932 fs->fs_fmod = 1; 1933 cgp->cg_frsum[i]++; 1934 } 1935 /* XXX Fixme. */ 1936 UFS_UNLOCK(ump); 1937 if (DOINGSOFTDEP(ITOV(ip))) 1938 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0); 1939 UFS_LOCK(ump); 1940 return (blkno); 1941 } 1942 1943 /* 1944 * Determine whether a cluster can be allocated. 1945 * 1946 * We do not currently check for optimal rotational layout if there 1947 * are multiple choices in the same cylinder group. Instead we just 1948 * take the first one that we find following bpref. 
 */
/*
 * Arguments:
 *   ip    - inode the cluster is for; supplies the mount.
 *   cg    - cylinder group number to search.
 *   bpref - preferred filesystem block address; when it lies in another
 *           cylinder group the search starts at cgdata(fs, cg).
 *   len   - cluster length wanted, in filesystem blocks.
 *
 * Returns the filesystem address of the first block of the cluster, or
 * 0 when none was allocated.
 *
 * Locking: the function releases the UFS mutex before reading the
 * cylinder group.  Every "return (0)" path leaves the mutex held; the
 * successful path returns with it released.
 */
static ufs2_daddr_t
ffs_clusteralloc(struct inode *ip,
	uint64_t cg,
	ufs2_daddr_t bpref,
	int len)
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	struct ufsmount *ump;
	int i, run, bit, map, got, error;
	ufs2_daddr_t bno;
	uint8_t *mapp;
	int32_t *lp;
	uint8_t *blksfree;

	ump = ITOUMP(ip);
	fs = ump->um_fs;
	/* Cheap rejection from the in-core per-cg maximum cluster size. */
	if (fs->fs_maxcluster[cg] < len)
		return (0);
	UFS_UNLOCK(ump);
	if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) {
		UFS_LOCK(ump);
		return (0);
	}
	/*
	 * Check to see if a cluster of the needed size (or bigger) is
	 * available in this cylinder group.
	 */
	lp = &cg_clustersum(cgp)[len];
	for (i = len; i <= fs->fs_contigsumsize; i++)
		if (*lp++ > 0)
			break;
	if (i > fs->fs_contigsumsize) {
		/*
		 * This is the first time looking for a cluster in this
		 * cylinder group. Update the cluster summary information
		 * to reflect the true maximum sized cluster so that
		 * future cluster allocation requests can avoid reading
		 * the cylinder group map only to find no clusters.
		 */
		lp = &cg_clustersum(cgp)[len - 1];
		for (i = len - 1; i > 0; i--)
			if (*lp-- > 0)
				break;
		UFS_LOCK(ump);
		fs->fs_maxcluster[cg] = i;
		brelse(bp);
		return (0);
	}
	/*
	 * Search the cluster map to find a big enough cluster.
	 * We take the first one that we find, even if it is larger
	 * than we need as we prefer to get one close to the previous
	 * block allocation. We do not search before the current
	 * preference point as we do not want to allocate a block
	 * that is allocated before the previous one (as we will
	 * then have to wait for another pass of the elevator
	 * algorithm before it will be read). We prefer to fail and
	 * be recalled to try an allocation in the next cylinder group.
	 */
	if (dtog(fs, bpref) != cg)
		bpref = cgdata(fs, cg);
	else
		bpref = blknum(fs, bpref);
	/* bpref is now a block index relative to this cylinder group. */
	bpref = fragstoblks(fs, dtogd(fs, bpref));
	mapp = &cg_clustersfree(cgp)[bpref / NBBY];
	map = *mapp++;
	bit = 1 << (bpref % NBBY);
	/*
	 * Walk the map one bit at a time; "run" counts consecutive set
	 * bits and "got" is the index of the bit being examined.
	 */
	for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) {
		if ((map & bit) == 0) {
			run = 0;
		} else {
			run++;
			if (run == len)
				break;
		}
		/* Advance to the next bit, loading a new byte as needed. */
		if ((got & (NBBY - 1)) != (NBBY - 1)) {
			bit <<= 1;
		} else {
			map = *mapp++;
			bit = 1;
		}
	}
	if (got >= cgp->cg_nclusterblks) {
		UFS_LOCK(ump);
		brelse(bp);
		return (0);
	}
	/*
	 * Allocate the cluster that we have found.
	 */
	blksfree = cg_blksfree(cgp);
	/* The run covers blocks got - run + 1 through got. */
	for (i = 1; i <= len; i++)
		if (!ffs_isblock(fs, blksfree, got - run + i))
			panic("ffs_clusteralloc: map mismatch");
	bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1);
	if (dtog(fs, bno) != cg)
		panic("ffs_clusteralloc: allocated out of group");
	/* len is reused as the cluster length in fragments. */
	len = blkstofrags(fs, len);
	UFS_LOCK(ump);
	for (i = 0; i < len; i += fs->fs_frag)
		if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i)
			panic("ffs_clusteralloc: lost block");
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	bdwrite(bp);
	return (bno);
}

/*
 * Get the buffer for the inode block holding inode (cg * fs_ipg +
 * cginoblk), i.e. the inode at offset cginoblk within cylinder group
 * cg.  The buffer is fs_bsize bytes on the inode's device vnode and
 * gbflags is passed through to getblk().  Returns whatever getblk()
 * returns.
 */
static inline struct buf *
getinobuf(struct inode *ip,
	uint64_t cg,
	uint32_t cginoblk,
	int gbflags)
{
	struct fs *fs;

	fs = ITOFS(ip);
	return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs,
	    cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0,
	    gbflags));
}

/*
 * Synchronous inode initialization is needed only when barrier writes do not
 * work as advertised, and will impose a heavy cost on file creation in a newly
 * created filesystem.
2078 */ 2079 static int doasyncinodeinit = 1; 2080 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN, 2081 &doasyncinodeinit, 0, 2082 "Perform inode block initialization using asynchronous writes"); 2083 2084 /* 2085 * Determine whether an inode can be allocated. 2086 * 2087 * Check to see if an inode is available, and if it is, 2088 * allocate it using the following policy: 2089 * 1) allocate the requested inode. 2090 * 2) allocate the next available inode after the requested 2091 * inode in the specified cylinder group. 2092 */ 2093 static ufs2_daddr_t 2094 ffs_nodealloccg(struct inode *ip, 2095 uint64_t cg, 2096 ufs2_daddr_t ipref, 2097 int mode, 2098 int unused) 2099 { 2100 struct fs *fs; 2101 struct cg *cgp; 2102 struct buf *bp, *ibp; 2103 struct ufsmount *ump; 2104 uint8_t *inosused, *loc; 2105 struct ufs2_dinode *dp2; 2106 int error, start, len, i; 2107 uint32_t old_initediblk; 2108 2109 ump = ITOUMP(ip); 2110 fs = ump->um_fs; 2111 check_nifree: 2112 if (fs->fs_cs(fs, cg).cs_nifree == 0) 2113 return (0); 2114 UFS_UNLOCK(ump); 2115 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { 2116 UFS_LOCK(ump); 2117 return (0); 2118 } 2119 restart: 2120 if (cgp->cg_cs.cs_nifree == 0) { 2121 brelse(bp); 2122 UFS_LOCK(ump); 2123 return (0); 2124 } 2125 inosused = cg_inosused(cgp); 2126 if (ipref) { 2127 ipref %= fs->fs_ipg; 2128 if (isclr(inosused, ipref)) 2129 goto gotit; 2130 } 2131 start = cgp->cg_irotor / NBBY; 2132 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); 2133 loc = memcchr(&inosused[start], 0xff, len); 2134 if (loc == NULL) { 2135 len = start + 1; 2136 start = 0; 2137 loc = memcchr(&inosused[start], 0xff, len); 2138 if (loc == NULL) { 2139 printf("cg = %ju, irotor = %ld, fs = %s\n", 2140 (intmax_t)cg, (long)cgp->cg_irotor, fs->fs_fsmnt); 2141 panic("ffs_nodealloccg: map corrupted"); 2142 /* NOTREACHED */ 2143 } 2144 } 2145 ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1; 2146 gotit: 2147 /* 2148 * Check to see if we need to 
initialize more inodes. 2149 */ 2150 if (fs->fs_magic == FS_UFS2_MAGIC && 2151 ipref + INOPB(fs) > cgp->cg_initediblk && 2152 cgp->cg_initediblk < cgp->cg_niblk) { 2153 old_initediblk = cgp->cg_initediblk; 2154 2155 /* 2156 * Free the cylinder group lock before writing the 2157 * initialized inode block. Entering the 2158 * babarrierwrite() with the cylinder group lock 2159 * causes lock order violation between the lock and 2160 * snaplk. 2161 * 2162 * Another thread can decide to initialize the same 2163 * inode block, but whichever thread first gets the 2164 * cylinder group lock after writing the newly 2165 * allocated inode block will update it and the other 2166 * will realize that it has lost and leave the 2167 * cylinder group unchanged. 2168 */ 2169 ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); 2170 brelse(bp); 2171 if (ibp == NULL) { 2172 /* 2173 * The inode block buffer is already owned by 2174 * another thread, which must initialize it. 2175 * Wait on the buffer to allow another thread 2176 * to finish the updates, with dropped cg 2177 * buffer lock, then retry. 2178 */ 2179 ibp = getinobuf(ip, cg, old_initediblk, 0); 2180 brelse(ibp); 2181 UFS_LOCK(ump); 2182 goto check_nifree; 2183 } 2184 bzero(ibp->b_data, (int)fs->fs_bsize); 2185 dp2 = (struct ufs2_dinode *)(ibp->b_data); 2186 for (i = 0; i < INOPB(fs); i++) { 2187 while (dp2->di_gen == 0) 2188 dp2->di_gen = arc4random(); 2189 dp2++; 2190 } 2191 2192 /* 2193 * Rather than adding a soft updates dependency to ensure 2194 * that the new inode block is written before it is claimed 2195 * by the cylinder group map, we just do a barrier write 2196 * here. The barrier write will ensure that the inode block 2197 * gets written before the updated cylinder group map can be 2198 * written. The barrier write should only slow down bulk 2199 * loading of newly created filesystems. 
2200 */ 2201 if (doasyncinodeinit) 2202 babarrierwrite(ibp); 2203 else 2204 bwrite(ibp); 2205 2206 /* 2207 * After the inode block is written, try to update the 2208 * cg initediblk pointer. If another thread beat us 2209 * to it, then leave it unchanged as the other thread 2210 * has already set it correctly. 2211 */ 2212 error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp); 2213 UFS_LOCK(ump); 2214 ACTIVECLEAR(fs, cg); 2215 UFS_UNLOCK(ump); 2216 if (error != 0) 2217 return (error); 2218 if (cgp->cg_initediblk == old_initediblk) 2219 cgp->cg_initediblk += INOPB(fs); 2220 goto restart; 2221 } 2222 cgp->cg_irotor = ipref; 2223 UFS_LOCK(ump); 2224 ACTIVECLEAR(fs, cg); 2225 setbit(inosused, ipref); 2226 cgp->cg_cs.cs_nifree--; 2227 fs->fs_cstotal.cs_nifree--; 2228 fs->fs_cs(fs, cg).cs_nifree--; 2229 fs->fs_fmod = 1; 2230 if ((mode & IFMT) == IFDIR) { 2231 cgp->cg_cs.cs_ndir++; 2232 fs->fs_cstotal.cs_ndir++; 2233 fs->fs_cs(fs, cg).cs_ndir++; 2234 } 2235 UFS_UNLOCK(ump); 2236 if (DOINGSOFTDEP(ITOV(ip))) 2237 softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode); 2238 bdwrite(bp); 2239 return ((ino_t)(cg * fs->fs_ipg + ipref)); 2240 } 2241 2242 /* 2243 * Free a block or fragment. 2244 * 2245 * The specified block or fragment is placed back in the 2246 * free map. If a fragment is deallocated, a possible 2247 * block reassembly is checked. 
 */
/*
 * Arguments:
 *   ump   - UFS mount whose mutex protects the summary counters.
 *   fs    - superblock of the filesystem being updated.
 *   devvp - device vnode (VCHR) or snapshot vnode (VREG); any other
 *           vnode type makes the function return without doing anything.
 *   bno   - filesystem address of the first fragment to free.
 *   size  - number of bytes to free.
 *   inum  - inode number, used only in diagnostics.
 *   dephd - work list passed to softdep_setup_blkfree().
 */
static void
ffs_blkfree_cg(struct ufsmount *ump,
	struct fs *fs,
	struct vnode *devvp,
	ufs2_daddr_t bno,
	long size,
	ino_t inum,
	struct workhead *dephd)
{
	struct mount *mp;
	struct cg *cgp;
	struct buf *bp;
	daddr_t dbn;
	ufs1_daddr_t fragno, cgbno;
	int i, blk, frags, bbase, error;
	uint64_t cg;
	uint8_t *blksfree;
	struct cdev *dev;

	cg = dtog(fs, bno);
	if (devvp->v_type == VREG) {
		/* devvp is a snapshot */
		MPASS(devvp->v_mount->mnt_data == ump);
		dev = ump->um_devvp->v_rdev;
	} else if (devvp->v_type == VCHR) {
		/*
		 * devvp is a normal disk device
		 * XXXKIB: devvp is not locked there, v_rdev access depends on
		 * busy mount, which prevents mntfs devvp from reclamation.
		 */
		dev = devvp->v_rdev;
	} else
		return;
#ifdef INVARIANTS
	/*
	 * The request must be at most one block, a whole number of
	 * fragments, and must not run past the end of its block.
	 */
	if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
	    fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
		printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n",
		    devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize,
		    size, fs->fs_fsmnt);
		panic("ffs_blkfree_cg: bad size");
	}
#endif
	if ((uint64_t)bno >= fs->fs_size) {
		printf("bad block %jd, ino %ju\n", (intmax_t)bno,
		    (intmax_t)inum);
		ffs_fserr(fs, inum, "bad block");
		return;
	}
	if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) {
		if (!MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR)
			return;
		/*
		 * Would like to just downgrade to read-only. Until that
		 * capability is available, just toss the cylinder group
		 * update and mark the filesystem as needing to run fsck.
		 */
		fs->fs_flags |= FS_NEEDSFSCK;
		/*
		 * NOTE(review): devvp->v_type is VCHR here because the
		 * test above returned otherwise, so the VREG arm below
		 * is not taken.
		 */
		if (devvp->v_type == VREG)
			dbn = fragstoblks(fs, cgtod(fs, cg));
		else
			dbn = fsbtodb(fs, cgtod(fs, cg));
		error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp);
		KASSERT(error == 0, ("getblkx failed"));
		softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
		    numfrags(fs, size), dephd, true);
		bp->b_flags |= B_RELBUF | B_NOCACHE;
		bp->b_flags &= ~B_CACHE;
		bawrite(bp);
		return;
	}
	cgbno = dtogd(fs, bno);
	blksfree = cg_blksfree(cgp);
	UFS_LOCK(ump);
	if (size == fs->fs_bsize) {
		/* Freeing a whole block. */
		fragno = fragstoblks(fs, cgbno);
		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
			if (devvp->v_type == VREG) {
				UFS_UNLOCK(ump);
				/* devvp is a snapshot */
				brelse(bp);
				return;
			}
			printf("dev = %s, block = %jd, fs = %s\n",
			    devtoname(dev), (intmax_t)bno, fs->fs_fsmnt);
			panic("ffs_blkfree_cg: freeing free block");
		}
		ffs_setblock(fs, blksfree, fragno);
		ffs_clusteracct(fs, cgp, fragno, 1);
		cgp->cg_cs.cs_nbfree++;
		fs->fs_cstotal.cs_nbfree++;
		fs->fs_cs(fs, cg).cs_nbfree++;
	} else {
		/* Freeing fragments; bbase is the first frag of the block. */
		bbase = cgbno - fragnum(fs, cgbno);
		/*
		 * decrement the counts associated with the old frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
		/*
		 * deallocate the fragment
		 */
		frags = numfrags(fs, size);
		for (i = 0; i < frags; i++) {
			if (isset(blksfree, cgbno + i)) {
				printf("dev = %s, block = %jd, fs = %s\n",
				    devtoname(dev), (intmax_t)(bno + i),
				    fs->fs_fsmnt);
				panic("ffs_blkfree_cg: freeing free frag");
			}
			setbit(blksfree, cgbno + i);
		}
		/* After the loop i equals frags. */
		cgp->cg_cs.cs_nffree += i;
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		/*
		 * add back in counts associated with the new frags
		 */
		blk = blkmap(fs, blksfree, bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
		/*
		 * if a complete block has been reassembled, account for it
		 */
		fragno = fragstoblks(fs, bbase);
		if (ffs_isblock(fs, blksfree, fragno)) {
			cgp->cg_cs.cs_nffree -= fs->fs_frag;
			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
			ffs_clusteracct(fs, cgp, fragno, 1);
			cgp->cg_cs.cs_nbfree++;
			fs->fs_cstotal.cs_nbfree++;
			fs->fs_cs(fs, cg).cs_nbfree++;
		}
	}
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	mp = UFSTOVFS(ump);
	if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR)
		softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
		    numfrags(fs, size), dephd, false);
	bdwrite(bp);
}

/*
 * Structures and routines associated with trim management.
 *
 * The following requests are passed to trim_lookup to indicate
 * the actions that should be taken.
 */
#define	NEW	1	/* if found, error else allocate and hash it */
#define	OLD	2	/* if not found, error, else return it */
#define	REPLACE	3	/* if not found, error else unhash and reallocate it */
#define	DONE	4	/* if not found, error else unhash and return it */
#define	SINGLE	5	/* don't look up, just allocate it and don't hash it */

MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures");

/* Select the hash chain for a key by masking with um_trimlisthashsize. */
#define	TRIMLIST_HASH(ump, key) \
	(&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize])

/*
 * These structures describe each of the block free requests aggregated
 * together to make up a trim request.
 */
struct trim_blkreq {
	TAILQ_ENTRY(trim_blkreq) blkreqlist;	/* link on a request's blklist */
	ufs2_daddr_t bno;			/* first fragment to free */
	long size;				/* bytes to free */
	struct workhead *pdephd;		/* NULL or points at dephd */
	struct workhead dephd;			/* work list taken from caller */
};

/*
 * Description of a trim request.
2423 */ 2424 struct ffs_blkfree_trim_params { 2425 TAILQ_HEAD(, trim_blkreq) blklist; 2426 LIST_ENTRY(ffs_blkfree_trim_params) hashlist; 2427 struct task task; 2428 struct ufsmount *ump; 2429 struct vnode *devvp; 2430 ino_t inum; 2431 ufs2_daddr_t bno; 2432 long size; 2433 long key; 2434 }; 2435 2436 static void ffs_blkfree_trim_completed(struct buf *); 2437 static void ffs_blkfree_trim_task(void *ctx, int pending __unused); 2438 static struct ffs_blkfree_trim_params *trim_lookup(struct ufsmount *, 2439 struct vnode *, ufs2_daddr_t, long, ino_t, uint64_t, int); 2440 static void ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *); 2441 2442 /* 2443 * Called on trim completion to start a task to free the associated block(s). 2444 */ 2445 static void 2446 ffs_blkfree_trim_completed(struct buf *bp) 2447 { 2448 struct ffs_blkfree_trim_params *tp; 2449 2450 tp = bp->b_fsprivate1; 2451 free(bp, M_TRIM); 2452 TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); 2453 taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); 2454 } 2455 2456 /* 2457 * Trim completion task that free associated block(s). 2458 */ 2459 static void 2460 ffs_blkfree_trim_task(void *ctx, int pending) 2461 { 2462 struct ffs_blkfree_trim_params *tp; 2463 struct trim_blkreq *blkelm; 2464 struct ufsmount *ump; 2465 2466 tp = ctx; 2467 ump = tp->ump; 2468 while ((blkelm = TAILQ_FIRST(&tp->blklist)) != NULL) { 2469 ffs_blkfree_cg(ump, ump->um_fs, tp->devvp, blkelm->bno, 2470 blkelm->size, tp->inum, blkelm->pdephd); 2471 TAILQ_REMOVE(&tp->blklist, blkelm, blkreqlist); 2472 free(blkelm, M_TRIM); 2473 } 2474 vn_finished_secondary_write(UFSTOVFS(ump)); 2475 UFS_LOCK(ump); 2476 ump->um_trim_inflight -= 1; 2477 ump->um_trim_inflight_blks -= numfrags(ump->um_fs, tp->size); 2478 UFS_UNLOCK(ump); 2479 free(tp, M_TRIM); 2480 } 2481 2482 /* 2483 * Lookup a trim request by inode number. 2484 * Allocate if requested (NEW, REPLACE, SINGLE). 
 */
/*
 * A new request is always allocated up front (before the UFS mutex is
 * taken) and released again when the alloctype turns out not to need
 * it (OLD, DONE).
 *
 * Arguments:
 *   ump, devvp, bno, size, inum, key - stored in a newly created
 *       request; key also selects the hash chain for lookups.
 *   alloctype - one of NEW, OLD, REPLACE, DONE, SINGLE.
 *
 * Returns the existing request for OLD and DONE, otherwise the newly
 * allocated one.  The UFS mutex is taken and released internally for
 * every alloctype except SINGLE.
 */
static struct ffs_blkfree_trim_params *
trim_lookup(struct ufsmount *ump,
	struct vnode *devvp,
	ufs2_daddr_t bno,
	long size,
	ino_t inum,
	uint64_t key,
	int alloctype)
{
	struct trimlist_hashhead *tphashhead;
	struct ffs_blkfree_trim_params *tp, *ntp;

	ntp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK);
	if (alloctype != SINGLE) {
		KASSERT(key >= FIRST_VALID_KEY, ("trim_lookup: invalid key"));
		UFS_LOCK(ump);
		tphashhead = TRIMLIST_HASH(ump, key);
		/* tp is NULL after the loop when the key is not hashed. */
		LIST_FOREACH(tp, tphashhead, hashlist)
			if (key == tp->key)
				break;
	}
	/* SINGLE matches no case below; tp is not examined for it. */
	switch (alloctype) {
	case NEW:
		KASSERT(tp == NULL, ("trim_lookup: found trim"));
		break;
	case OLD:
		KASSERT(tp != NULL,
		    ("trim_lookup: missing call to ffs_blkrelease_start()"));
		UFS_UNLOCK(ump);
		free(ntp, M_TRIM);
		return (tp);
	case REPLACE:
		KASSERT(tp != NULL, ("trim_lookup: missing REPLACE trim"));
		LIST_REMOVE(tp, hashlist);
		/* tp will be freed by caller */
		break;
	case DONE:
		KASSERT(tp != NULL, ("trim_lookup: missing DONE trim"));
		LIST_REMOVE(tp, hashlist);
		UFS_UNLOCK(ump);
		free(ntp, M_TRIM);
		return (tp);
	}
	/* NEW, REPLACE and SINGLE: fill in and return the new request. */
	TAILQ_INIT(&ntp->blklist);
	ntp->ump = ump;
	ntp->devvp = devvp;
	ntp->bno = bno;
	ntp->size = size;
	ntp->inum = inum;
	ntp->key = key;
	if (alloctype != SINGLE) {
		LIST_INSERT_HEAD(tphashhead, ntp, hashlist);
		UFS_UNLOCK(ump);
	}
	return (ntp);
}

/*
 * Dispatch a trim request.
 *
 * Builds a BIO_DELETE buffer covering tp->bno for tp->size bytes,
 * updates the mount's trim statistics, starts a secondary write and
 * hands the buffer to g_vfs_strategy().  Completion is reported through
 * ffs_blkfree_trim_completed().
 */
static void
ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *tp)
{
	struct ufsmount *ump;
	struct mount *mp;
	struct buf *bp;

	/*
	 * Postpone the set of the free bit in the cg bitmap until the
	 * BIO_DELETE is completed.  Otherwise, due to disk queue
	 * reordering, TRIM might be issued after we reuse the block
	 * and write some new data into it.
	 */
	ump = tp->ump;
	bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
	bp->b_iocmd = BIO_DELETE;
	bp->b_iooffset = dbtob(fsbtodb(ump->um_fs, tp->bno));
	bp->b_iodone = ffs_blkfree_trim_completed;
	bp->b_bcount = tp->size;
	bp->b_fsprivate1 = tp;
	UFS_LOCK(ump);
	ump->um_trim_total += 1;
	ump->um_trim_inflight += 1;
	ump->um_trim_inflight_blks += numfrags(ump->um_fs, tp->size);
	ump->um_trim_total_blks += numfrags(ump->um_fs, tp->size);
	UFS_UNLOCK(ump);

	mp = UFSTOVFS(ump);
	vn_start_secondary_write(NULL, &mp, 0);
	g_vfs_strategy(ump->um_bo, bp);
}

/*
 * Allocate a new key to use to identify a range of blocks.
 *
 * Returns SINGLETON_KEY when the mount lacks UM_CANDELETE or dotrimcons
 * is zero.  Otherwise returns a fresh key (at least FIRST_VALID_KEY)
 * for which an empty request has been hashed with trim_lookup(NEW).
 */
uint64_t
ffs_blkrelease_start(struct ufsmount *ump,
	struct vnode *devvp,
	ino_t inum)
{
	static u_long masterkey;
	uint64_t key;

	if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0)
		return (SINGLETON_KEY);
	/* Skip counter values that fall below FIRST_VALID_KEY. */
	do {
		key = atomic_fetchadd_long(&masterkey, 1);
	} while (key < FIRST_VALID_KEY);
	(void) trim_lookup(ump, devvp, 0, 0, inum, key, NEW);
	return (key);
}

/*
 * Deallocate a key that has been used to identify a range of blocks.
 */
void
ffs_blkrelease_finish(struct ufsmount *ump, uint64_t key)
{
	struct ffs_blkfree_trim_params *tp;

	if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0)
		return;
	/*
	 * If the vfs.ffs.dotrimcons sysctl option is enabled while
	 * a file deletion is active, specifically after a call
	 * to ffs_blkrelease_start() but before the call to
	 * ffs_blkrelease_finish(), ffs_blkrelease_start() will
	 * have handed out SINGLETON_KEY rather than starting a
	 * collection sequence. Thus if we get a SINGLETON_KEY
	 * passed to ffs_blkrelease_finish(), we just return rather
	 * than trying to finish the nonexistent sequence.
	 */
	if (key == SINGLETON_KEY) {
#ifdef INVARIANTS
		printf("%s: vfs.ffs.dotrimcons enabled on active filesystem\n",
		    ump->um_mountp->mnt_stat.f_mntonname);
#endif
		return;
	}
	/*
	 * We are done with sending blocks using this key. Look up the key
	 * using the DONE alloctype (in tp) to request that it be unhashed
	 * as we will not be adding to it. If the key has never been used,
	 * tp->size will be zero, so we can just free tp. Otherwise the call
	 * to ffs_blkfree_sendtrim(tp) causes the block range described by
	 * tp to be issued (and then tp to be freed).
	 */
	tp = trim_lookup(ump, NULL, 0, 0, 0, key, DONE);
	if (tp->size == 0)
		free(tp, M_TRIM);
	else
		ffs_blkfree_sendtrim(tp);
}

/*
 * Setup to free a block or fragment.
 *
 * Check for snapshots that might want to claim the block.
 * If trims are requested, prepare a trim request. Attempt to
 * aggregate consecutive blocks into a single trim request.
 *
 * Arguments:
 *   ump, fs - mount and superblock of the filesystem.
 *   devvp   - device vnode (VCHR) or snapshot vnode (VREG).
 *   bno     - filesystem address of the first fragment to free.
 *   size    - number of bytes to free.
 *   inum    - inode number the block belonged to.
 *   vtype   - vnode type, passed only to ffs_snapblkfree().
 *   dephd   - work list for the free, or NULL.  When a trim is
 *             queued, its contents are moved into the request.
 *   key     - NOTRIM_KEY, SINGLETON_KEY, or a key obtained from
 *             ffs_blkrelease_start().
 */
void
ffs_blkfree(struct ufsmount *ump,
	struct fs *fs,
	struct vnode *devvp,
	ufs2_daddr_t bno,
	long size,
	ino_t inum,
	__enum_uint8(vtype) vtype,
	struct workhead *dephd,
	uint64_t key)
{
	struct ffs_blkfree_trim_params *tp, *ntp;
	struct trim_blkreq *blkelm;

	/*
	 * Check to see if a snapshot wants to claim the block.
	 * Check that devvp is a normal disk device, not a snapshot,
	 * it has a snapshot(s) associated with it, and one of the
	 * snapshots wants to claim the block.
	 */
	if (devvp->v_type == VCHR &&
	    (devvp->v_vflag & VV_COPYONWRITE) &&
	    ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) {
		return;
	}
	/*
	 * Nothing to delay if TRIM is not required for this block or TRIM
	 * is disabled or the operation is performed on a snapshot.
	 */
	if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) ||
	    devvp->v_type == VREG) {
		ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
		return;
	}
	/* Record this free so it can be applied once the trim is done. */
	blkelm = malloc(sizeof(struct trim_blkreq), M_TRIM, M_WAITOK);
	blkelm->bno = bno;
	blkelm->size = size;
	if (dephd == NULL) {
		blkelm->pdephd = NULL;
	} else {
		/* Swap the caller's list with the element's empty list. */
		LIST_INIT(&blkelm->dephd);
		LIST_SWAP(dephd, &blkelm->dephd, worklist, wk_list);
		blkelm->pdephd = &blkelm->dephd;
	}
	if (key == SINGLETON_KEY) {
		/*
		 * Just a single non-contiguous piece. Use the SINGLE
		 * alloctype to return a trim request that will not be
		 * hashed for future lookup.
		 */
		tp = trim_lookup(ump, devvp, bno, size, inum, key, SINGLE);
		TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist);
		ffs_blkfree_sendtrim(tp);
		return;
	}
	/*
	 * The callers of this function are not tracking whether or not
	 * the blocks are contiguous. They are just saying that they
	 * are freeing a set of blocks. It is this code that determines
	 * the pieces of that range that are actually contiguous.
	 *
	 * Calling ffs_blkrelease_start() will have created an entry
	 * that we will use.
	 */
	tp = trim_lookup(ump, devvp, bno, size, inum, key, OLD);
	if (tp->size == 0) {
		/*
		 * First block of a potential range, set block and size
		 * for the trim block.
		 */
		tp->bno = bno;
		tp->size = size;
		TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist);
		return;
	}
	/*
	 * If this block is a continuation of the range (either
	 * follows at the end or precedes in the front) then we
	 * add it to the front or back of the list and return.
	 *
	 * If it is not a continuation of the trim that we were
	 * building, using the REPLACE alloctype, we request that
	 * the old trim request (still in tp) be unhashed and a
	 * new range started (in ntp). The ffs_blkfree_sendtrim(tp)
	 * call causes the block range described by tp to be issued
	 * (and then tp to be freed).
	 */
	if (bno + numfrags(fs, size) == tp->bno) {
		TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist);
		tp->bno = bno;
		tp->size += size;
		return;
	} else if (bno == tp->bno + numfrags(fs, tp->size)) {
		TAILQ_INSERT_TAIL(&tp->blklist, blkelm, blkreqlist);
		tp->size += size;
		return;
	}
	ntp = trim_lookup(ump, devvp, bno, size, inum, key, REPLACE);
	TAILQ_INSERT_HEAD(&ntp->blklist, blkelm, blkreqlist);
	ffs_blkfree_sendtrim(tp);
}

#ifdef INVARIANTS
/*
 * Verify allocation of a block or fragment. Returns true if block or
 * fragment is allocated, false if it is free.
2753 */ 2754 static int 2755 ffs_checkblk(struct inode *ip, 2756 ufs2_daddr_t bno, 2757 long size) 2758 { 2759 struct fs *fs; 2760 struct cg *cgp; 2761 struct buf *bp; 2762 ufs1_daddr_t cgbno; 2763 int i, error, frags, free; 2764 uint8_t *blksfree; 2765 2766 fs = ITOFS(ip); 2767 if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) { 2768 printf("bsize = %ld, size = %ld, fs = %s\n", 2769 (long)fs->fs_bsize, size, fs->fs_fsmnt); 2770 panic("ffs_checkblk: bad size"); 2771 } 2772 if ((uint64_t)bno >= fs->fs_size) 2773 panic("ffs_checkblk: bad block %jd", (intmax_t)bno); 2774 error = ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), 0, &bp, &cgp); 2775 if (error) 2776 panic("ffs_checkblk: cylinder group read failed"); 2777 blksfree = cg_blksfree(cgp); 2778 cgbno = dtogd(fs, bno); 2779 if (size == fs->fs_bsize) { 2780 free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); 2781 } else { 2782 frags = numfrags(fs, size); 2783 for (free = 0, i = 0; i < frags; i++) 2784 if (isset(blksfree, cgbno + i)) 2785 free++; 2786 if (free != 0 && free != frags) 2787 panic("ffs_checkblk: partially free fragment"); 2788 } 2789 brelse(bp); 2790 return (!free); 2791 } 2792 #endif /* INVARIANTS */ 2793 2794 /* 2795 * Free an inode. 2796 */ 2797 int 2798 ffs_vfree(struct vnode *pvp, 2799 ino_t ino, 2800 int mode) 2801 { 2802 struct ufsmount *ump; 2803 2804 if (DOINGSOFTDEP(pvp)) { 2805 softdep_freefile(pvp, ino, mode); 2806 return (0); 2807 } 2808 ump = VFSTOUFS(pvp->v_mount); 2809 return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL)); 2810 } 2811 2812 /* 2813 * Do the actual free operation. 2814 * The specified inode is placed back in the free map. 
 */
/*
 * Arguments:
 *   ump   - UFS mount whose mutex protects the summary counters.
 *   fs    - superblock of the filesystem being updated.
 *   devvp - device vnode (VCHR) or snapshot vnode (VREG); any other
 *           vnode type makes the function return 0 without doing
 *           anything.
 *   ino   - inode number to free; panics when out of range.
 *   mode  - file mode; only tested for IFDIR to maintain the directory
 *           counts.
 *   wkhd  - work list passed to softdep_setup_inofree().
 *
 * Returns 0 on success, or a value from the cylinder group read
 * failure path (see the notes there).
 */
int
ffs_freefile(struct ufsmount *ump,
	struct fs *fs,
	struct vnode *devvp,
	ino_t ino,
	int mode,
	struct workhead *wkhd)
{
	struct cg *cgp;
	struct buf *bp;
	daddr_t dbn;
	int error;
	uint64_t cg;
	uint8_t *inosused;
	struct cdev *dev;
	ino_t cgino;

	cg = ino_to_cg(fs, ino);
	if (devvp->v_type == VREG) {
		/* devvp is a snapshot */
		MPASS(devvp->v_mount->mnt_data == ump);
		dev = ump->um_devvp->v_rdev;
	} else if (devvp->v_type == VCHR) {
		/* devvp is a normal disk device */
		dev = devvp->v_rdev;
	} else {
		/* NOTE(review): bp is not read again on this path. */
		bp = NULL;
		return (0);
	}
	if (ino >= fs->fs_ipg * fs->fs_ncg)
		panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s",
		    devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt);
	if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) {
		if (!MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR)
			return (error);
		/*
		 * Would like to just downgrade to read-only. Until that
		 * capability is available, just toss the cylinder group
		 * update and mark the filesystem as needing to run fsck.
		 */
		fs->fs_flags |= FS_NEEDSFSCK;
		/*
		 * NOTE(review): devvp->v_type is VCHR here because the
		 * test above returned otherwise, so the VREG arm below
		 * is not taken.
		 */
		if (devvp->v_type == VREG)
			dbn = fragstoblks(fs, cgtod(fs, cg));
		else
			dbn = fsbtodb(fs, cgtod(fs, cg));
		/*
		 * NOTE(review): error is overwritten here, so the value
		 * returned below is getblkx()'s result rather than the
		 * ffs_getcg() failure - confirm this is intended.
		 */
		error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp);
		KASSERT(error == 0, ("getblkx failed"));
		softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd, true);
		bp->b_flags |= B_RELBUF | B_NOCACHE;
		bp->b_flags &= ~B_CACHE;
		bawrite(bp);
		return (error);
	}
	inosused = cg_inosused(cgp);
	cgino = ino % fs->fs_ipg;
	if (isclr(inosused, cgino)) {
		printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev),
		    (uintmax_t)ino, fs->fs_fsmnt);
		/* Only fatal when the filesystem is not read-only. */
		if (fs->fs_ronly == 0)
			panic("ffs_freefile: freeing free inode");
	}
	clrbit(inosused, cgino);
	/* Pull the inode rotor back so the freed inode is found again. */
	if (cgino < cgp->cg_irotor)
		cgp->cg_irotor = cgino;
	cgp->cg_cs.cs_nifree++;
	UFS_LOCK(ump);
	fs->fs_cstotal.cs_nifree++;
	fs->fs_cs(fs, cg).cs_nifree++;
	if ((mode & IFMT) == IFDIR) {
		cgp->cg_cs.cs_ndir--;
		fs->fs_cstotal.cs_ndir--;
		fs->fs_cs(fs, cg).cs_ndir--;
	}
	fs->fs_fmod = 1;
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ump);
	if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR)
		softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd, false);
	bdwrite(bp);
	return (0);
}

/*
 * Check to see if a file is free.
 * Used to check for allocated files in snapshots.
 * Return 1 if file is free.
 */
/*
 * Also returns 1 when devvp is neither a VREG nor a VCHR vnode, when
 * ino is out of range for the filesystem, or when the cylinder group
 * cannot be read.
 */
int
ffs_checkfreefile(struct fs *fs,
	struct vnode *devvp,
	ino_t ino)
{
	struct cg *cgp;
	struct buf *bp;
	int ret, error;
	uint64_t cg;
	uint8_t *inosused;

	cg = ino_to_cg(fs, ino);
	if ((devvp->v_type != VREG) && (devvp->v_type != VCHR))
		return (1);
	if (ino >= fs->fs_ipg * fs->fs_ncg)
		return (1);
	if ((error = ffs_getcg(fs, devvp, cg, 0, &bp, &cgp)) != 0)
		return (1);
	inosused = cg_inosused(cgp);
	/* Reduce to the inode's index within its cylinder group. */
	ino %= fs->fs_ipg;
	ret = isclr(inosused, ino);
	brelse(bp);
	return (ret);
}

/*
 * Find a block of the specified size in the specified cylinder group.
 *
 * It is a panic if a request is made to find a block if none are
 * available.
 *
 * Arguments:
 *   fs       - superblock of the filesystem.
 *   cgp      - cylinder group to search; its cg_frotor is updated.
 *   bpref    - preferred filesystem address; when zero the search
 *              starts at cg_frotor instead.
 *   allocsiz - number of contiguous fragments wanted.
 *
 * Returns the cylinder-group-relative fragment number of the run found.
 */
static ufs1_daddr_t
ffs_mapsearch(struct fs *fs,
	struct cg *cgp,
	ufs2_daddr_t bpref,
	int allocsiz)
{
	ufs1_daddr_t bno;
	int start, len, loc, i;
	int blk, field, subfield, pos;
	uint8_t *blksfree;

	/*
	 * find the fragment by searching through the free block
	 * map for an appropriate bit pattern
	 */
	if (bpref)
		start = dtogd(fs, bpref) / NBBY;
	else
		start = cgp->cg_frotor / NBBY;
	blksfree = cg_blksfree(cgp);
	/* First pass: from the starting byte to the end of the map. */
	len = howmany(fs->fs_fpg, NBBY) - start;
	loc = scanc((uint64_t)len, (uint8_t *)&blksfree[start],
		fragtbl[fs->fs_frag],
		(uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
	if (loc == 0) {
		/* Second pass: from byte 0 through the starting byte. */
		len = start + 1;
		start = 0;
		loc = scanc((uint64_t)len, (uint8_t *)&blksfree[0],
			fragtbl[fs->fs_frag],
			(uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
		if (loc == 0) {
			printf("start = %d, len = %d, fs = %s\n",
			    start, len, fs->fs_fsmnt);
			panic("ffs_alloccg: map corrupted");
			/* NOTREACHED */
		}
	}
	/*
	 * presumably loc counts the bytes left unscanned, which would
	 * make this the first fragment of the matching byte - verify
	 * against scanc().
	 */
	bno = (start + len - loc) * NBBY;
	cgp->cg_frotor = bno;
	/*
	 * found the byte in the map
	 * sift through the bits to find the selected frag
	 */
	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
		blk = blkmap(fs, blksfree, bno);
		blk <<= 1;
		field = around[allocsiz];
		subfield = inside[allocsiz];
		/* Slide the pattern across every position in the block. */
		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
			if ((blk & field) == subfield)
				return (bno + pos);
			field <<= 1;
			subfield <<= 1;
		}
	}
	printf("bno = %ju, fs = %s\n", (intmax_t)bno, fs->fs_fsmnt);
	panic("ffs_alloccg: block not in map");
	return (-1);
}

/*
 * Fetch and verify a cylinder group.
 *
 * Arguments:
 *   fs    - superblock of the filesystem.
 *   devvp - vnode to read from; a VCHR vnode is addressed in device
 *           blocks, any other vnode in filesystem blocks.
 *   cg    - cylinder group number.
 *   flags - flags for breadn_flags(); GB_CKHASH is added when the
 *           filesystem has CK_CYLGRP set.
 *   bpp   - receives the buffer on success, NULL otherwise.
 *   cgpp  - receives a pointer to the buffer's data on success, NULL
 *           otherwise.
 *
 * Returns 0 on success, the breadn_flags() error when the read fails,
 * or EIO when the check hash, magic number or cylinder group index does
 * not match.  In the EIO cases the buffer is invalidated and released
 * here.
 */
int
ffs_getcg(struct fs *fs,
	struct vnode *devvp,
	uint64_t cg,
	int flags,
	struct buf **bpp,
	struct cg **cgpp)
{
	struct buf *bp;
	struct cg *cgp;
	struct mount *mp;
	const struct statfs *sfs;
	daddr_t blkno;
	int error;

	*bpp = NULL;
	*cgpp = NULL;
	if ((fs->fs_metackhash & CK_CYLGRP) != 0)
		flags |= GB_CKHASH;
	if (devvp->v_type == VCHR) {
		blkno = fsbtodb(fs, cgtod(fs, cg));
		mp = devvp->v_rdev->si_mountpt;
	} else {
		blkno = fragstoblks(fs, cgtod(fs, cg));
		mp = devvp->v_mount;
	}
	error = breadn_flags(devvp, blkno, blkno, (int)fs->fs_cgsize, NULL,
	    NULL, 0, NOCRED, flags, ffs_ckhash_cg, &bp);
	if (error != 0)
		return (error);
	cgp = (struct cg *)bp->b_data;
	if ((fs->fs_metackhash & CK_CYLGRP) != 0 &&
	    (bp->b_flags & B_CKHASH) != 0 &&
	    cgp->cg_ckhash != bp->b_ckhash) {
		/* The message is rate limited through ppsratecheck(). */
		if (ppsratecheck(&VFSTOUFS(mp)->um_last_integritymsg,
		    &VFSTOUFS(mp)->um_secs_integritymsg, 1)) {
			sfs = &mp->mnt_stat;
			printf("UFS %s%s (%s) cylinder checkhash failed: "
			    "cg %ju, cgp: 0x%x != bp: 0x%jx\n",
			    devvp->v_type == VCHR ? "" : "snapshot of ",
			    sfs->f_mntfromname, sfs->f_mntonname, (intmax_t)cg,
			    cgp->cg_ckhash, (uintmax_t)bp->b_ckhash);
		}
		bp->b_flags &= ~B_CKHASH;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
		return (EIO);
	}
	if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) {
		if (ppsratecheck(&VFSTOUFS(mp)->um_last_integritymsg,
		    &VFSTOUFS(mp)->um_secs_integritymsg, 1)) {
			sfs = &mp->mnt_stat;
			printf("UFS %s%s (%s)",
			    devvp->v_type == VCHR ? "" : "snapshot of ",
			    sfs->f_mntfromname, sfs->f_mntonname);
			if (!cg_chkmagic(cgp))
				printf(" cg %ju: bad magic number 0x%x should "
				    "be 0x%x\n", (intmax_t)cg, cgp->cg_magic,
				    CG_MAGIC);
			else
				printf(": wrong cylinder group cg %ju != "
				    "cgx %u\n", (intmax_t)cg, cgp->cg_cgx);
		}
		bp->b_flags &= ~B_CKHASH;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
		return (EIO);
	}
	bp->b_flags &= ~B_CKHASH;
	bp->b_xflags |= BX_BKGRDWRITE;
	/*
	 * If we are using check hashes on the cylinder group then we want
	 * to limit changing the cylinder group time to when we are actually
	 * going to write it to disk so that its check hash remains correct
	 * in memory. If the CK_CYLGRP flag is set the time is updated in
	 * ffs_bufwrite() as the buffer is queued for writing. Otherwise we
	 * update the time here as we have done historically.
	 */
	if ((fs->fs_metackhash & CK_CYLGRP) != 0)
		bp->b_xflags |= BX_CYLGRP;
	else
		cgp->cg_old_time = cgp->cg_time = time_second;
	*bpp = bp;
	*cgpp = cgp;
	return (0);
}

/*
 * Compute the check hash of the cylinder group held in bp and store it
 * in bp->b_ckhash.  The cg_ckhash field is zeroed while the CRC is
 * computed over b_bcount bytes of the buffer and restored afterwards.
 */
static void
ffs_ckhash_cg(struct buf *bp)
{
	uint32_t ckhash;
	struct cg *cgp;

	cgp = (struct cg *)bp->b_data;
	ckhash = cgp->cg_ckhash;
	cgp->cg_ckhash = 0;
	bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount);
	cgp->cg_ckhash = ckhash;
}

/*
 * Fserr prints the name of a filesystem with an error diagnostic.
 *
 * The form of the error message is:
 *	fs: error message
 *
 * The line is logged at LOG_ERR and also carries the current thread's
 * process id, command name and uid, plus the inode number inum.
 */
void
ffs_fserr(struct fs *fs,
	ino_t inum,
	char *cp)
{
	struct thread *td = curthread;	/* XXX */
	struct proc *p = td->td_proc;

	log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n",
	    p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum,
	    fs->fs_fsmnt, cp);
}

/*
 * This function provides the capability for the fsck program to
 * update an active filesystem. Sixteen operations are provided:
 *
 * adjrefcnt(inode, amt) - adjusts the reference count on the
 *	specified inode by the specified amount. Under normal
 *	operation the count should always go down. Decrementing
 *	the count to zero will cause the inode to be freed.
 * adjblkcnt(inode, amt) - adjust the number of blocks used by the
 *	inode by the specified amount.
 * adjdepth(inode, amt) - adjust the depth of the specified directory
 *	inode by the specified amount.
 * setsize(inode, size) - set the size of the inode to the
 *	specified size.
 * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) -
 *	adjust the superblock summary.
 * freedirs(inode, count) - directory inodes [inode..inode + count - 1]
 *	are marked as free. Inodes should never have to be marked
 *	as in use.
3135 * freefiles(inode, count) - file inodes [inode..inode + count - 1] 3136 * are marked as free. Inodes should never have to be marked 3137 * as in use. 3138 * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] 3139 * are marked as free. Blocks should never have to be marked 3140 * as in use. 3141 * setflags(flags, set/clear) - the fs_flags field has the specified 3142 * flags set (second parameter +1) or cleared (second parameter -1). 3143 * setcwd(dirinode) - set the current directory to dirinode in the 3144 * filesystem associated with the snapshot. 3145 * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".." 3146 * in the current directory is oldvalue then change it to newvalue. 3147 * unlink(nameptr, oldvalue) - Verify that the inode number associated 3148 * with nameptr in the current directory is oldvalue then unlink it. 3149 */ 3150 3151 static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); 3152 3153 SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, 3154 CTLFLAG_WR | CTLTYPE_STRUCT | CTLFLAG_NEEDGIANT, 3155 0, 0, sysctl_ffs_fsck, "S,fsck", 3156 "Adjust Inode Reference Count"); 3157 3158 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, 3159 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3160 "Adjust Inode Used Blocks Count"); 3161 3162 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_DEPTH, adjdepth, 3163 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3164 "Adjust Directory Inode Depth"); 3165 3166 static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize, 3167 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3168 "Set the inode size"); 3169 3170 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, 3171 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3172 "Adjust number of directories"); 3173 3174 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, 3175 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3176 "Adjust number of free blocks"); 3177 3178 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, 3179 CTLFLAG_WR | 
CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3180 "Adjust number of free inodes"); 3181 3182 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, 3183 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3184 "Adjust number of free frags"); 3185 3186 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, 3187 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3188 "Adjust number of free clusters"); 3189 3190 static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, 3191 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3192 "Free Range of Directory Inodes"); 3193 3194 static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, 3195 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3196 "Free Range of File Inodes"); 3197 3198 static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, 3199 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3200 "Free Range of Blocks"); 3201 3202 static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, 3203 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3204 "Change Filesystem Flags"); 3205 3206 static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, 3207 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3208 "Set Current Working Directory"); 3209 3210 static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, 3211 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3212 "Change Value of .. 
Entry"); 3213 3214 static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, 3215 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3216 "Unlink a Duplicate Name"); 3217 3218 #ifdef DIAGNOSTIC 3219 static int fsckcmds = 0; 3220 SYSCTL_INT(_debug, OID_AUTO, ffs_fsckcmds, CTLFLAG_RW, &fsckcmds, 0, 3221 "print out fsck_ffs-based filesystem update commands"); 3222 #endif /* DIAGNOSTIC */ 3223 3224 static int 3225 sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) 3226 { 3227 struct thread *td = curthread; 3228 struct fsck_cmd cmd; 3229 struct ufsmount *ump; 3230 struct vnode *vp, *dvp, *fdvp; 3231 struct inode *ip, *dp; 3232 struct mount *mp; 3233 struct fs *fs; 3234 struct pwd *pwd; 3235 ufs2_daddr_t blkno; 3236 long blkcnt, blksize; 3237 uint64_t key; 3238 struct file *fp; 3239 cap_rights_t rights; 3240 int filetype, error; 3241 3242 if (req->newptr == NULL || req->newlen > sizeof(cmd)) 3243 return (EBADRPC); 3244 if ((error = SYSCTL_IN(req, &cmd, sizeof(cmd))) != 0) 3245 return (error); 3246 if (cmd.version != FFS_CMD_VERSION) 3247 return (ERPCMISMATCH); 3248 if ((error = getvnode(td, cmd.handle, 3249 cap_rights_init_one(&rights, CAP_FSCK), &fp)) != 0) 3250 return (error); 3251 vp = fp->f_vnode; 3252 if (vp->v_type != VREG && vp->v_type != VDIR) { 3253 fdrop(fp, td); 3254 return (EINVAL); 3255 } 3256 vn_start_write(vp, &mp, V_WAIT); 3257 if (mp == NULL || 3258 strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { 3259 vn_finished_write(mp); 3260 fdrop(fp, td); 3261 return (EINVAL); 3262 } 3263 ump = VFSTOUFS(mp); 3264 if (mp->mnt_flag & MNT_RDONLY) { 3265 vn_finished_write(mp); 3266 fdrop(fp, td); 3267 return (EROFS); 3268 } 3269 fs = ump->um_fs; 3270 filetype = IFREG; 3271 3272 switch (oidp->oid_number) { 3273 case FFS_SET_FLAGS: 3274 #ifdef DIAGNOSTIC 3275 if (fsckcmds) 3276 printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, 3277 cmd.size > 0 ? 
"set" : "clear"); 3278 #endif /* DIAGNOSTIC */ 3279 if (cmd.size > 0) 3280 fs->fs_flags |= (long)cmd.value; 3281 else 3282 fs->fs_flags &= ~(long)cmd.value; 3283 break; 3284 3285 case FFS_ADJ_REFCNT: 3286 #ifdef DIAGNOSTIC 3287 if (fsckcmds) { 3288 printf("%s: adjust inode %jd link count by %jd\n", 3289 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3290 (intmax_t)cmd.size); 3291 } 3292 #endif /* DIAGNOSTIC */ 3293 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3294 break; 3295 ip = VTOI(vp); 3296 ip->i_nlink += cmd.size; 3297 DIP_SET(ip, i_nlink, ip->i_nlink); 3298 ip->i_effnlink += cmd.size; 3299 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); 3300 error = ffs_update(vp, 1); 3301 if (DOINGSOFTDEP(vp)) 3302 softdep_change_linkcnt(ip); 3303 vput(vp); 3304 break; 3305 3306 case FFS_ADJ_BLKCNT: 3307 #ifdef DIAGNOSTIC 3308 if (fsckcmds) { 3309 printf("%s: adjust inode %jd block count by %jd\n", 3310 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3311 (intmax_t)cmd.size); 3312 } 3313 #endif /* DIAGNOSTIC */ 3314 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3315 break; 3316 ip = VTOI(vp); 3317 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size); 3318 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); 3319 error = ffs_update(vp, 1); 3320 vput(vp); 3321 break; 3322 3323 case FFS_ADJ_DEPTH: 3324 #ifdef DIAGNOSTIC 3325 if (fsckcmds) { 3326 printf("%s: adjust directory inode %jd depth by %jd\n", 3327 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3328 (intmax_t)cmd.size); 3329 } 3330 #endif /* DIAGNOSTIC */ 3331 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3332 break; 3333 if (vp->v_type != VDIR) { 3334 vput(vp); 3335 error = ENOTDIR; 3336 break; 3337 } 3338 ip = VTOI(vp); 3339 DIP_SET(ip, i_dirdepth, DIP(ip, i_dirdepth) + cmd.size); 3340 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); 3341 error = ffs_update(vp, 1); 3342 vput(vp); 3343 break; 3344 3345 case FFS_SET_SIZE: 3346 #ifdef DIAGNOSTIC 3347 if (fsckcmds) { 
3348 printf("%s: set inode %jd size to %jd\n", 3349 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3350 (intmax_t)cmd.size); 3351 } 3352 #endif /* DIAGNOSTIC */ 3353 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3354 break; 3355 ip = VTOI(vp); 3356 DIP_SET(ip, i_size, cmd.size); 3357 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_MODIFIED); 3358 error = ffs_update(vp, 1); 3359 vput(vp); 3360 break; 3361 3362 case FFS_DIR_FREE: 3363 filetype = IFDIR; 3364 /* fall through */ 3365 3366 case FFS_FILE_FREE: 3367 #ifdef DIAGNOSTIC 3368 if (fsckcmds) { 3369 if (cmd.size == 1) 3370 printf("%s: free %s inode %ju\n", 3371 mp->mnt_stat.f_mntonname, 3372 filetype == IFDIR ? "directory" : "file", 3373 (uintmax_t)cmd.value); 3374 else 3375 printf("%s: free %s inodes %ju-%ju\n", 3376 mp->mnt_stat.f_mntonname, 3377 filetype == IFDIR ? "directory" : "file", 3378 (uintmax_t)cmd.value, 3379 (uintmax_t)(cmd.value + cmd.size - 1)); 3380 } 3381 #endif /* DIAGNOSTIC */ 3382 while (cmd.size > 0) { 3383 if ((error = ffs_freefile(ump, fs, ump->um_devvp, 3384 cmd.value, filetype, NULL))) 3385 break; 3386 cmd.size -= 1; 3387 cmd.value += 1; 3388 } 3389 break; 3390 3391 case FFS_BLK_FREE: 3392 #ifdef DIAGNOSTIC 3393 if (fsckcmds) { 3394 if (cmd.size == 1) 3395 printf("%s: free block %jd\n", 3396 mp->mnt_stat.f_mntonname, 3397 (intmax_t)cmd.value); 3398 else 3399 printf("%s: free blocks %jd-%jd\n", 3400 mp->mnt_stat.f_mntonname, 3401 (intmax_t)cmd.value, 3402 (intmax_t)cmd.value + cmd.size - 1); 3403 } 3404 #endif /* DIAGNOSTIC */ 3405 blkno = cmd.value; 3406 blkcnt = cmd.size; 3407 blksize = fs->fs_frag - (blkno % fs->fs_frag); 3408 key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO); 3409 while (blkcnt > 0) { 3410 if (blkcnt < blksize) 3411 blksize = blkcnt; 3412 ffs_blkfree(ump, fs, ump->um_devvp, blkno, 3413 blksize * fs->fs_fsize, UFS_ROOTINO, 3414 VDIR, NULL, key); 3415 blkno += blksize; 3416 blkcnt -= blksize; 3417 blksize = fs->fs_frag; 3418 } 3419 
ffs_blkrelease_finish(ump, key); 3420 break; 3421 3422 /* 3423 * Adjust superblock summaries. fsck(8) is expected to 3424 * submit deltas when necessary. 3425 */ 3426 case FFS_ADJ_NDIR: 3427 #ifdef DIAGNOSTIC 3428 if (fsckcmds) { 3429 printf("%s: adjust number of directories by %jd\n", 3430 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3431 } 3432 #endif /* DIAGNOSTIC */ 3433 fs->fs_cstotal.cs_ndir += cmd.value; 3434 break; 3435 3436 case FFS_ADJ_NBFREE: 3437 #ifdef DIAGNOSTIC 3438 if (fsckcmds) { 3439 printf("%s: adjust number of free blocks by %+jd\n", 3440 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3441 } 3442 #endif /* DIAGNOSTIC */ 3443 fs->fs_cstotal.cs_nbfree += cmd.value; 3444 break; 3445 3446 case FFS_ADJ_NIFREE: 3447 #ifdef DIAGNOSTIC 3448 if (fsckcmds) { 3449 printf("%s: adjust number of free inodes by %+jd\n", 3450 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3451 } 3452 #endif /* DIAGNOSTIC */ 3453 fs->fs_cstotal.cs_nifree += cmd.value; 3454 break; 3455 3456 case FFS_ADJ_NFFREE: 3457 #ifdef DIAGNOSTIC 3458 if (fsckcmds) { 3459 printf("%s: adjust number of free frags by %+jd\n", 3460 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3461 } 3462 #endif /* DIAGNOSTIC */ 3463 fs->fs_cstotal.cs_nffree += cmd.value; 3464 break; 3465 3466 case FFS_ADJ_NUMCLUSTERS: 3467 #ifdef DIAGNOSTIC 3468 if (fsckcmds) { 3469 printf("%s: adjust number of free clusters by %+jd\n", 3470 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3471 } 3472 #endif /* DIAGNOSTIC */ 3473 fs->fs_cstotal.cs_numclusters += cmd.value; 3474 break; 3475 3476 case FFS_SET_CWD: 3477 #ifdef DIAGNOSTIC 3478 if (fsckcmds) { 3479 printf("%s: set current directory to inode %jd\n", 3480 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3481 } 3482 #endif /* DIAGNOSTIC */ 3483 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp))) 3484 break; 3485 AUDIT_ARG_VNODE1(vp); 3486 if ((error = change_dir(vp, td)) != 0) { 3487 vput(vp); 3488 break; 3489 } 3490 VOP_UNLOCK(vp); 3491 pwd_chdir(td, 
vp); 3492 break; 3493 3494 case FFS_SET_DOTDOT: 3495 #ifdef DIAGNOSTIC 3496 if (fsckcmds) { 3497 printf("%s: change .. in cwd from %jd to %jd\n", 3498 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3499 (intmax_t)cmd.size); 3500 } 3501 #endif /* DIAGNOSTIC */ 3502 /* 3503 * First we have to get and lock the parent directory 3504 * to which ".." points. 3505 */ 3506 error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp); 3507 if (error) 3508 break; 3509 /* 3510 * Now we get and lock the child directory containing "..". 3511 */ 3512 pwd = pwd_hold(td); 3513 dvp = pwd->pwd_cdir; 3514 if ((error = vget(dvp, LK_EXCLUSIVE)) != 0) { 3515 vput(fdvp); 3516 pwd_drop(pwd); 3517 break; 3518 } 3519 dp = VTOI(dvp); 3520 SET_I_OFFSET(dp, 12); /* XXX mastertemplate.dot_reclen */ 3521 error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, 3522 DT_DIR, 0); 3523 cache_purge(fdvp); 3524 cache_purge(dvp); 3525 vput(dvp); 3526 vput(fdvp); 3527 pwd_drop(pwd); 3528 break; 3529 3530 case FFS_UNLINK: 3531 #ifdef DIAGNOSTIC 3532 if (fsckcmds) { 3533 char buf[32]; 3534 3535 if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL)) 3536 strncpy(buf, "Name_too_long", 32); 3537 printf("%s: unlink %s (inode %jd)\n", 3538 mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size); 3539 } 3540 #endif /* DIAGNOSTIC */ 3541 /* 3542 * kern_funlinkat will do its own start/finish writes and 3543 * they do not nest, so drop ours here. Setting mp == NULL 3544 * indicates that vn_finished_write is not needed down below. 3545 */ 3546 vn_finished_write(mp); 3547 mp = NULL; 3548 error = kern_funlinkat(td, AT_FDCWD, 3549 (char *)(intptr_t)cmd.value, FD_NONE, UIO_USERSPACE, 3550 0, (ino_t)cmd.size); 3551 break; 3552 3553 default: 3554 #ifdef DIAGNOSTIC 3555 if (fsckcmds) { 3556 printf("Invalid request %d from fsck\n", 3557 oidp->oid_number); 3558 } 3559 #endif /* DIAGNOSTIC */ 3560 error = EINVAL; 3561 break; 3562 } 3563 fdrop(fp, td); 3564 vn_finished_write(mp); 3565 return (error); 3566 } 3567