1 /*- 2 * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause) 3 * 4 * Copyright (c) 2002 Networks Associates Technology, Inc. 5 * All rights reserved. 6 * 7 * This software was developed for the FreeBSD Project by Marshall 8 * Kirk McKusick and Network Associates Laboratories, the Security 9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 11 * research program 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1982, 1986, 1989, 1993 35 * The Regents of the University of California. All rights reserved. 
36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. Neither the name of the University nor the names of its contributors 46 * may be used to endorse or promote products derived from this software 47 * without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * SUCH DAMAGE. 
60 * 61 * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 62 */ 63 64 #include <sys/cdefs.h> 65 __FBSDID("$FreeBSD$"); 66 67 #include "opt_quota.h" 68 69 #include <sys/param.h> 70 #include <sys/systm.h> 71 #include <sys/bio.h> 72 #include <sys/buf.h> 73 #include <sys/capsicum.h> 74 #include <sys/conf.h> 75 #include <sys/fcntl.h> 76 #include <sys/file.h> 77 #include <sys/filedesc.h> 78 #include <sys/gsb_crc32.h> 79 #include <sys/kernel.h> 80 #include <sys/mount.h> 81 #include <sys/priv.h> 82 #include <sys/proc.h> 83 #include <sys/stat.h> 84 #include <sys/syscallsubr.h> 85 #include <sys/sysctl.h> 86 #include <sys/syslog.h> 87 #include <sys/taskqueue.h> 88 #include <sys/vnode.h> 89 90 #include <security/audit/audit.h> 91 92 #include <geom/geom.h> 93 #include <geom/geom_vfs.h> 94 95 #include <ufs/ufs/dir.h> 96 #include <ufs/ufs/extattr.h> 97 #include <ufs/ufs/quota.h> 98 #include <ufs/ufs/inode.h> 99 #include <ufs/ufs/ufs_extern.h> 100 #include <ufs/ufs/ufsmount.h> 101 102 #include <ufs/ffs/fs.h> 103 #include <ufs/ffs/ffs_extern.h> 104 #include <ufs/ffs/softdep.h> 105 106 typedef ufs2_daddr_t allocfcn_t(struct inode *ip, uint64_t cg, 107 ufs2_daddr_t bpref, int size, int rsize); 108 109 static ufs2_daddr_t ffs_alloccg(struct inode *, uint64_t, ufs2_daddr_t, int, 110 int); 111 static ufs2_daddr_t 112 ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); 113 static void ffs_blkfree_cg(struct ufsmount *, struct fs *, 114 struct vnode *, ufs2_daddr_t, long, ino_t, 115 struct workhead *); 116 #ifdef INVARIANTS 117 static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); 118 #endif 119 static ufs2_daddr_t ffs_clusteralloc(struct inode *, uint64_t, ufs2_daddr_t, 120 int); 121 static ino_t ffs_dirpref(struct inode *); 122 static ufs2_daddr_t ffs_fragextend(struct inode *, uint64_t, ufs2_daddr_t, 123 int, int); 124 static ufs2_daddr_t ffs_hashalloc(struct inode *, uint64_t, ufs2_daddr_t, 125 int, int, allocfcn_t *); 126 static ufs2_daddr_t ffs_nodealloccg(struct inode *, 
uint64_t, ufs2_daddr_t, int, 127 int); 128 static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); 129 static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); 130 static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); 131 static void ffs_ckhash_cg(struct buf *); 132 133 /* 134 * Allocate a block in the filesystem. 135 * 136 * The size of the requested block is given, which must be some 137 * multiple of fs_fsize and <= fs_bsize. 138 * A preference may be optionally specified. If a preference is given 139 * the following hierarchy is used to allocate a block: 140 * 1) allocate the requested block. 141 * 2) allocate a rotationally optimal block in the same cylinder. 142 * 3) allocate a block in the same cylinder group. 143 * 4) quadratically rehash into other cylinder groups, until an 144 * available block is located. 145 * If no block preference is given the following hierarchy is used 146 * to allocate a block: 147 * 1) allocate a block in the cylinder group that contains the 148 * inode for the file. 149 * 2) quadratically rehash into other cylinder groups, until an 150 * available block is located. 
151 */ 152 int 153 ffs_alloc(struct inode *ip, 154 ufs2_daddr_t lbn, 155 ufs2_daddr_t bpref, 156 int size, 157 int flags, 158 struct ucred *cred, 159 ufs2_daddr_t *bnp) 160 { 161 struct fs *fs; 162 struct ufsmount *ump; 163 ufs2_daddr_t bno; 164 uint64_t cg, reclaimed; 165 int64_t delta; 166 #ifdef QUOTA 167 int error; 168 #endif 169 170 *bnp = 0; 171 ump = ITOUMP(ip); 172 fs = ump->um_fs; 173 mtx_assert(UFS_MTX(ump), MA_OWNED); 174 #ifdef INVARIANTS 175 if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) { 176 printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", 177 devtoname(ump->um_dev), (long)fs->fs_bsize, size, 178 fs->fs_fsmnt); 179 panic("ffs_alloc: bad size"); 180 } 181 if (cred == NOCRED) 182 panic("ffs_alloc: missing credential"); 183 #endif /* INVARIANTS */ 184 reclaimed = 0; 185 retry: 186 #ifdef QUOTA 187 UFS_UNLOCK(ump); 188 error = chkdq(ip, btodb(size), cred, 0); 189 if (error) 190 return (error); 191 UFS_LOCK(ump); 192 #endif 193 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 194 goto nospace; 195 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && 196 freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) 197 goto nospace; 198 if (bpref >= fs->fs_size) 199 bpref = 0; 200 if (bpref == 0) 201 cg = ino_to_cg(fs, ip->i_number); 202 else 203 cg = dtog(fs, bpref); 204 bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); 205 if (bno > 0) { 206 delta = btodb(size); 207 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 208 if (flags & IO_EXT) 209 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 210 else 211 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 212 *bnp = bno; 213 return (0); 214 } 215 nospace: 216 #ifdef QUOTA 217 UFS_UNLOCK(ump); 218 /* 219 * Restore user's disk quota because allocation failed. 
220 */ 221 (void) chkdq(ip, -btodb(size), cred, FORCE); 222 UFS_LOCK(ump); 223 #endif 224 if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 225 reclaimed = 1; 226 softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT); 227 goto retry; 228 } 229 if (ffs_fsfail_cleanup_locked(ump, 0)) { 230 UFS_UNLOCK(ump); 231 return (ENXIO); 232 } 233 if (reclaimed > 0 && 234 ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 235 UFS_UNLOCK(ump); 236 ffs_fserr(fs, ip->i_number, "filesystem full"); 237 uprintf("\n%s: write failed, filesystem is full\n", 238 fs->fs_fsmnt); 239 } else { 240 UFS_UNLOCK(ump); 241 } 242 return (ENOSPC); 243 } 244 245 /* 246 * Reallocate a fragment to a bigger size 247 * 248 * The number and size of the old block is given, and a preference 249 * and new size is also specified. The allocator attempts to extend 250 * the original block. Failing that, the regular block allocator is 251 * invoked to get an appropriate block. 252 */ 253 int 254 ffs_realloccg(struct inode *ip, 255 ufs2_daddr_t lbprev, 256 ufs2_daddr_t bprev, 257 ufs2_daddr_t bpref, 258 int osize, 259 int nsize, 260 int flags, 261 struct ucred *cred, 262 struct buf **bpp) 263 { 264 struct vnode *vp; 265 struct fs *fs; 266 struct buf *bp; 267 struct ufsmount *ump; 268 uint64_t cg, request, reclaimed; 269 int error, gbflags; 270 ufs2_daddr_t bno; 271 int64_t delta; 272 273 vp = ITOV(ip); 274 ump = ITOUMP(ip); 275 fs = ump->um_fs; 276 bp = NULL; 277 gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; 278 #ifdef WITNESS 279 gbflags |= IS_SNAPSHOT(ip) ? 
GB_NOWITNESS : 0; 280 #endif 281 282 mtx_assert(UFS_MTX(ump), MA_OWNED); 283 #ifdef INVARIANTS 284 if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 285 panic("ffs_realloccg: allocation on suspended filesystem"); 286 if ((uint64_t)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || 287 (uint64_t)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { 288 printf( 289 "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", 290 devtoname(ump->um_dev), (long)fs->fs_bsize, osize, 291 nsize, fs->fs_fsmnt); 292 panic("ffs_realloccg: bad size"); 293 } 294 if (cred == NOCRED) 295 panic("ffs_realloccg: missing credential"); 296 #endif /* INVARIANTS */ 297 reclaimed = 0; 298 retry: 299 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && 300 freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) { 301 goto nospace; 302 } 303 if (bprev == 0) { 304 printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", 305 devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev, 306 fs->fs_fsmnt); 307 panic("ffs_realloccg: bad bprev"); 308 } 309 UFS_UNLOCK(ump); 310 /* 311 * Allocate the extra space in the buffer. 312 */ 313 error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); 314 if (error) { 315 return (error); 316 } 317 318 if (bp->b_blkno == bp->b_lblkno) { 319 if (lbprev >= UFS_NDADDR) 320 panic("ffs_realloccg: lbprev out of range"); 321 bp->b_blkno = fsbtodb(fs, bprev); 322 } 323 324 #ifdef QUOTA 325 error = chkdq(ip, btodb(nsize - osize), cred, 0); 326 if (error) { 327 brelse(bp); 328 return (error); 329 } 330 #endif 331 /* 332 * Check for extension in the existing location. 
333 */ 334 *bpp = NULL; 335 cg = dtog(fs, bprev); 336 UFS_LOCK(ump); 337 bno = ffs_fragextend(ip, cg, bprev, osize, nsize); 338 if (bno) { 339 if (bp->b_blkno != fsbtodb(fs, bno)) 340 panic("ffs_realloccg: bad blockno"); 341 delta = btodb(nsize - osize); 342 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 343 if (flags & IO_EXT) 344 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 345 else 346 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 347 allocbuf(bp, nsize); 348 bp->b_flags |= B_DONE; 349 vfs_bio_bzero_buf(bp, osize, nsize - osize); 350 if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 351 vfs_bio_set_valid(bp, osize, nsize - osize); 352 *bpp = bp; 353 return (0); 354 } 355 /* 356 * Allocate a new disk location. 357 */ 358 if (bpref >= fs->fs_size) 359 bpref = 0; 360 switch ((int)fs->fs_optim) { 361 case FS_OPTSPACE: 362 /* 363 * Allocate an exact sized fragment. Although this makes 364 * best use of space, we will waste time relocating it if 365 * the file continues to grow. If the fragmentation is 366 * less than half of the minimum free reserve, we choose 367 * to begin optimizing for time. 368 */ 369 request = nsize; 370 if (fs->fs_minfree <= 5 || 371 fs->fs_cstotal.cs_nffree > 372 (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) 373 break; 374 log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", 375 fs->fs_fsmnt); 376 fs->fs_optim = FS_OPTTIME; 377 break; 378 case FS_OPTTIME: 379 /* 380 * At this point we have discovered a file that is trying to 381 * grow a small fragment to a larger fragment. To save time, 382 * we allocate a full sized block, then free the unused portion. 383 * If the file continues to grow, the `ffs_fragextend' call 384 * above will be able to grow it in place without further 385 * copying. If aberrant programs cause disk fragmentation to 386 * grow within 2% of the free reserve, we choose to begin 387 * optimizing for space. 
388 */ 389 request = fs->fs_bsize; 390 if (fs->fs_cstotal.cs_nffree < 391 (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) 392 break; 393 log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", 394 fs->fs_fsmnt); 395 fs->fs_optim = FS_OPTSPACE; 396 break; 397 default: 398 printf("dev = %s, optim = %ld, fs = %s\n", 399 devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt); 400 panic("ffs_realloccg: bad optim"); 401 /* NOTREACHED */ 402 } 403 bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); 404 if (bno > 0) { 405 bp->b_blkno = fsbtodb(fs, bno); 406 if (!DOINGSOFTDEP(vp)) 407 /* 408 * The usual case is that a smaller fragment that 409 * was just allocated has been replaced with a bigger 410 * fragment or a full-size block. If it is marked as 411 * B_DELWRI, the current contents have not been written 412 * to disk. It is possible that the block was written 413 * earlier, but very uncommon. If the block has never 414 * been written, there is no need to send a BIO_DELETE 415 * for it when it is freed. The gain from avoiding the 416 * TRIMs for the common case of unwritten blocks far 417 * exceeds the cost of the write amplification for the 418 * uncommon case of failing to send a TRIM for a block 419 * that had been written. 420 */ 421 ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, 422 ip->i_number, vp->v_type, NULL, 423 (bp->b_flags & B_DELWRI) != 0 ? 424 NOTRIM_KEY : SINGLETON_KEY); 425 delta = btodb(nsize - osize); 426 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 427 if (flags & IO_EXT) 428 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 429 else 430 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 431 allocbuf(bp, nsize); 432 bp->b_flags |= B_DONE; 433 vfs_bio_bzero_buf(bp, osize, nsize - osize); 434 if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 435 vfs_bio_set_valid(bp, osize, nsize - osize); 436 *bpp = bp; 437 return (0); 438 } 439 #ifdef QUOTA 440 UFS_UNLOCK(ump); 441 /* 442 * Restore user's disk quota because allocation failed. 
443 */ 444 (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); 445 UFS_LOCK(ump); 446 #endif 447 nospace: 448 /* 449 * no space available 450 */ 451 if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 452 reclaimed = 1; 453 UFS_UNLOCK(ump); 454 if (bp) { 455 brelse(bp); 456 bp = NULL; 457 } 458 UFS_LOCK(ump); 459 softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); 460 goto retry; 461 } 462 if (bp) 463 brelse(bp); 464 if (ffs_fsfail_cleanup_locked(ump, 0)) { 465 UFS_UNLOCK(ump); 466 return (ENXIO); 467 } 468 if (reclaimed > 0 && 469 ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 470 UFS_UNLOCK(ump); 471 ffs_fserr(fs, ip->i_number, "filesystem full"); 472 uprintf("\n%s: write failed, filesystem is full\n", 473 fs->fs_fsmnt); 474 } else { 475 UFS_UNLOCK(ump); 476 } 477 return (ENOSPC); 478 } 479 480 /* 481 * Reallocate a sequence of blocks into a contiguous sequence of blocks. 482 * 483 * The vnode and an array of buffer pointers for a range of sequential 484 * logical blocks to be made contiguous is given. The allocator attempts 485 * to find a range of sequential blocks starting as close as possible 486 * from the end of the allocation for the logical block immediately 487 * preceding the current range. If successful, the physical block numbers 488 * in the buffer pointers and in the inode are changed to reflect the new 489 * allocation. If unsuccessful, the allocation is left unchanged. The 490 * success in doing the reallocation is returned. Note that the error 491 * return is not reflected back to the user. Rather the previous block 492 * allocation will be used. 
493 */ 494 495 SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 496 "FFS filesystem"); 497 498 static int doasyncfree = 1; 499 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, 500 "do not force synchronous writes when blocks are reallocated"); 501 502 static int doreallocblks = 1; 503 SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, 504 "enable block reallocation"); 505 506 static int dotrimcons = 1; 507 SYSCTL_INT(_vfs_ffs, OID_AUTO, dotrimcons, CTLFLAG_RWTUN, &dotrimcons, 0, 508 "enable BIO_DELETE / TRIM consolidation"); 509 510 static int maxclustersearch = 10; 511 SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch, 512 0, "max number of cylinder group to search for contigous blocks"); 513 514 #ifdef DIAGNOSTIC 515 static int prtrealloc = 0; 516 SYSCTL_INT(_debug, OID_AUTO, ffs_prtrealloc, CTLFLAG_RW, &prtrealloc, 0, 517 "print out FFS filesystem block reallocation operations"); 518 #endif 519 520 int 521 ffs_reallocblks( 522 struct vop_reallocblks_args /* { 523 struct vnode *a_vp; 524 struct cluster_save *a_buflist; 525 } */ *ap) 526 { 527 struct ufsmount *ump; 528 int error; 529 530 /* 531 * We used to skip reallocating the blocks of a file into a 532 * contiguous sequence if the underlying flash device requested 533 * BIO_DELETE notifications, because devices that benefit from 534 * BIO_DELETE also benefit from not moving the data. However, 535 * the destination for the data is usually moved before the data 536 * is written to the initially allocated location, so we rarely 537 * suffer the penalty of extra writes. With the addition of the 538 * consolidation of contiguous blocks into single BIO_DELETE 539 * operations, having fewer but larger contiguous blocks reduces 540 * the number of (slow and expensive) BIO_DELETE operations. So 541 * when doing BIO_DELETE consolidation, we do block reallocation. 542 * 543 * Skip if reallocblks has been disabled globally. 
544 */ 545 ump = ap->a_vp->v_mount->mnt_data; 546 if ((((ump->um_flags) & UM_CANDELETE) != 0 && dotrimcons == 0) || 547 doreallocblks == 0) 548 return (ENOSPC); 549 550 /* 551 * We can't wait in softdep prealloc as it may fsync and recurse 552 * here. Instead we simply fail to reallocate blocks if this 553 * rare condition arises. 554 */ 555 if (DOINGSUJ(ap->a_vp)) 556 if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) 557 return (ENOSPC); 558 vn_seqc_write_begin(ap->a_vp); 559 error = ump->um_fstype == UFS1 ? ffs_reallocblks_ufs1(ap) : 560 ffs_reallocblks_ufs2(ap); 561 vn_seqc_write_end(ap->a_vp); 562 return (error); 563 } 564 565 static int 566 ffs_reallocblks_ufs1( 567 struct vop_reallocblks_args /* { 568 struct vnode *a_vp; 569 struct cluster_save *a_buflist; 570 } */ *ap) 571 { 572 struct fs *fs; 573 struct inode *ip; 574 struct vnode *vp; 575 struct buf *sbp, *ebp, *bp; 576 ufs1_daddr_t *bap, *sbap, *ebap; 577 struct cluster_save *buflist; 578 struct ufsmount *ump; 579 ufs_lbn_t start_lbn, end_lbn; 580 ufs1_daddr_t soff, newblk, blkno; 581 ufs2_daddr_t pref; 582 struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; 583 int i, cg, len, start_lvl, end_lvl, ssize; 584 585 vp = ap->a_vp; 586 ip = VTOI(vp); 587 ump = ITOUMP(ip); 588 fs = ump->um_fs; 589 /* 590 * If we are not tracking block clusters or if we have less than 4% 591 * free blocks left, then do not attempt to cluster. Running with 592 * less than 5% free block reserve is not recommended and those that 593 * choose to do so do not expect to have good file layout. 
594 */ 595 if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 596 return (ENOSPC); 597 buflist = ap->a_buflist; 598 len = buflist->bs_nchildren; 599 start_lbn = buflist->bs_children[0]->b_lblkno; 600 end_lbn = start_lbn + len - 1; 601 #ifdef INVARIANTS 602 for (i = 0; i < len; i++) 603 if (!ffs_checkblk(ip, 604 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 605 panic("ffs_reallocblks: unallocated block 1"); 606 for (i = 1; i < len; i++) 607 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 608 panic("ffs_reallocblks: non-logical cluster"); 609 blkno = buflist->bs_children[0]->b_blkno; 610 ssize = fsbtodb(fs, fs->fs_frag); 611 for (i = 1; i < len - 1; i++) 612 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 613 panic("ffs_reallocblks: non-physical cluster %d", i); 614 #endif 615 /* 616 * If the cluster crosses the boundary for the first indirect 617 * block, leave space for the indirect block. Indirect blocks 618 * are initially laid out in a position after the last direct 619 * block. Block reallocation would usually destroy locality by 620 * moving the indirect block out of the way to make room for 621 * data blocks if we didn't compensate here. We should also do 622 * this for other indirect block boundaries, but it is only 623 * important for the first one. 624 */ 625 if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) 626 return (ENOSPC); 627 /* 628 * If the latest allocation is in a new cylinder group, assume that 629 * the filesystem has decided to move and do not force it back to 630 * the previous cylinder group. 631 */ 632 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 633 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 634 return (ENOSPC); 635 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 636 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 637 return (ENOSPC); 638 /* 639 * Get the starting offset and block map for the first block. 
640 */ 641 if (start_lvl == 0) { 642 sbap = &ip->i_din1->di_db[0]; 643 soff = start_lbn; 644 } else { 645 idp = &start_ap[start_lvl - 1]; 646 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 647 brelse(sbp); 648 return (ENOSPC); 649 } 650 sbap = (ufs1_daddr_t *)sbp->b_data; 651 soff = idp->in_off; 652 } 653 /* 654 * If the block range spans two block maps, get the second map. 655 */ 656 ebap = NULL; 657 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 658 ssize = len; 659 } else { 660 #ifdef INVARIANTS 661 if (start_lvl > 0 && 662 start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 663 panic("ffs_reallocblk: start == end"); 664 #endif 665 ssize = len - (idp->in_off + 1); 666 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 667 goto fail; 668 ebap = (ufs1_daddr_t *)ebp->b_data; 669 } 670 /* 671 * Find the preferred location for the cluster. If we have not 672 * previously failed at this endeavor, then follow our standard 673 * preference calculation. If we have failed at it, then pick up 674 * where we last ended our search. 675 */ 676 UFS_LOCK(ump); 677 if (ip->i_nextclustercg == -1) 678 pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); 679 else 680 pref = cgdata(fs, ip->i_nextclustercg); 681 /* 682 * Search the block map looking for an allocation of the desired size. 683 * To avoid wasting too much time, we limit the number of cylinder 684 * groups that we will search. 685 */ 686 cg = dtog(fs, pref); 687 for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { 688 if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) 689 break; 690 cg += 1; 691 if (cg >= fs->fs_ncg) 692 cg = 0; 693 } 694 /* 695 * If we have failed in our search, record where we gave up for 696 * next time. Otherwise, fall back to our usual search citerion. 697 */ 698 if (newblk == 0) { 699 ip->i_nextclustercg = cg; 700 UFS_UNLOCK(ump); 701 goto fail; 702 } 703 ip->i_nextclustercg = -1; 704 /* 705 * We have found a new contiguous block. 
706 * 707 * First we have to replace the old block pointers with the new 708 * block pointers in the inode and indirect blocks associated 709 * with the file. 710 */ 711 #ifdef DIAGNOSTIC 712 if (prtrealloc) 713 printf("realloc: ino %ju, lbns %jd-%jd\n\told:", 714 (uintmax_t)ip->i_number, 715 (intmax_t)start_lbn, (intmax_t)end_lbn); 716 #endif 717 blkno = newblk; 718 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 719 if (i == ssize) { 720 bap = ebap; 721 soff = -i; 722 } 723 #ifdef INVARIANTS 724 if (!ffs_checkblk(ip, 725 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 726 panic("ffs_reallocblks: unallocated block 2"); 727 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 728 panic("ffs_reallocblks: alloc mismatch"); 729 #endif 730 #ifdef DIAGNOSTIC 731 if (prtrealloc) 732 printf(" %d,", *bap); 733 #endif 734 if (DOINGSOFTDEP(vp)) { 735 if (sbap == &ip->i_din1->di_db[0] && i < ssize) 736 softdep_setup_allocdirect(ip, start_lbn + i, 737 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 738 buflist->bs_children[i]); 739 else 740 softdep_setup_allocindir_page(ip, start_lbn + i, 741 i < ssize ? sbp : ebp, soff + i, blkno, 742 *bap, buflist->bs_children[i]); 743 } 744 *bap++ = blkno; 745 } 746 /* 747 * Next we must write out the modified inode and indirect blocks. 748 * For strict correctness, the writes should be synchronous since 749 * the old block values may have been written to disk. In practise 750 * they are almost never written, but if we are concerned about 751 * strict correctness, the `doasyncfree' flag should be set to zero. 752 * 753 * The test on `doasyncfree' should be changed to test a flag 754 * that shows whether the associated buffers and inodes have 755 * been written. The flag should be set when the cluster is 756 * started and cleared whenever the buffer or inode is flushed. 757 * We can then check below to see if it is set, and do the 758 * synchronous write only when it has been cleared. 
759 */ 760 if (sbap != &ip->i_din1->di_db[0]) { 761 if (doasyncfree) 762 bdwrite(sbp); 763 else 764 bwrite(sbp); 765 } else { 766 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 767 if (!doasyncfree) 768 ffs_update(vp, 1); 769 } 770 if (ssize < len) { 771 if (doasyncfree) 772 bdwrite(ebp); 773 else 774 bwrite(ebp); 775 } 776 /* 777 * Last, free the old blocks and assign the new blocks to the buffers. 778 */ 779 #ifdef DIAGNOSTIC 780 if (prtrealloc) 781 printf("\n\tnew:"); 782 #endif 783 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 784 bp = buflist->bs_children[i]; 785 if (!DOINGSOFTDEP(vp)) 786 /* 787 * The usual case is that a set of N-contiguous blocks 788 * that was just allocated has been replaced with a 789 * set of N+1-contiguous blocks. If they are marked as 790 * B_DELWRI, the current contents have not been written 791 * to disk. It is possible that the blocks were written 792 * earlier, but very uncommon. If the blocks have never 793 * been written, there is no need to send a BIO_DELETE 794 * for them when they are freed. The gain from avoiding 795 * the TRIMs for the common case of unwritten blocks 796 * far exceeds the cost of the write amplification for 797 * the uncommon case of failing to send a TRIM for the 798 * blocks that had been written. 799 */ 800 ffs_blkfree(ump, fs, ump->um_devvp, 801 dbtofsb(fs, bp->b_blkno), 802 fs->fs_bsize, ip->i_number, vp->v_type, NULL, 803 (bp->b_flags & B_DELWRI) != 0 ? 
804 NOTRIM_KEY : SINGLETON_KEY); 805 bp->b_blkno = fsbtodb(fs, blkno); 806 #ifdef INVARIANTS 807 if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) 808 panic("ffs_reallocblks: unallocated block 3"); 809 #endif 810 #ifdef DIAGNOSTIC 811 if (prtrealloc) 812 printf(" %d,", blkno); 813 #endif 814 } 815 #ifdef DIAGNOSTIC 816 if (prtrealloc) { 817 prtrealloc--; 818 printf("\n"); 819 } 820 #endif 821 return (0); 822 823 fail: 824 if (ssize < len) 825 brelse(ebp); 826 if (sbap != &ip->i_din1->di_db[0]) 827 brelse(sbp); 828 return (ENOSPC); 829 } 830 831 static int 832 ffs_reallocblks_ufs2( 833 struct vop_reallocblks_args /* { 834 struct vnode *a_vp; 835 struct cluster_save *a_buflist; 836 } */ *ap) 837 { 838 struct fs *fs; 839 struct inode *ip; 840 struct vnode *vp; 841 struct buf *sbp, *ebp, *bp; 842 ufs2_daddr_t *bap, *sbap, *ebap; 843 struct cluster_save *buflist; 844 struct ufsmount *ump; 845 ufs_lbn_t start_lbn, end_lbn; 846 ufs2_daddr_t soff, newblk, blkno, pref; 847 struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; 848 int i, cg, len, start_lvl, end_lvl, ssize; 849 850 vp = ap->a_vp; 851 ip = VTOI(vp); 852 ump = ITOUMP(ip); 853 fs = ump->um_fs; 854 /* 855 * If we are not tracking block clusters or if we have less than 4% 856 * free blocks left, then do not attempt to cluster. Running with 857 * less than 5% free block reserve is not recommended and those that 858 * choose to do so do not expect to have good file layout. 
859 */ 860 if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 861 return (ENOSPC); 862 buflist = ap->a_buflist; 863 len = buflist->bs_nchildren; 864 start_lbn = buflist->bs_children[0]->b_lblkno; 865 end_lbn = start_lbn + len - 1; 866 #ifdef INVARIANTS 867 for (i = 0; i < len; i++) 868 if (!ffs_checkblk(ip, 869 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 870 panic("ffs_reallocblks: unallocated block 1"); 871 for (i = 1; i < len; i++) 872 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 873 panic("ffs_reallocblks: non-logical cluster"); 874 blkno = buflist->bs_children[0]->b_blkno; 875 ssize = fsbtodb(fs, fs->fs_frag); 876 for (i = 1; i < len - 1; i++) 877 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 878 panic("ffs_reallocblks: non-physical cluster %d", i); 879 #endif 880 /* 881 * If the cluster crosses the boundary for the first indirect 882 * block, do not move anything in it. Indirect blocks are 883 * usually initially laid out in a position between the data 884 * blocks. Block reallocation would usually destroy locality by 885 * moving the indirect block out of the way to make room for 886 * data blocks if we didn't compensate here. We should also do 887 * this for other indirect block boundaries, but it is only 888 * important for the first one. 889 */ 890 if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) 891 return (ENOSPC); 892 /* 893 * If the latest allocation is in a new cylinder group, assume that 894 * the filesystem has decided to move and do not force it back to 895 * the previous cylinder group. 896 */ 897 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 898 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 899 return (ENOSPC); 900 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 901 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 902 return (ENOSPC); 903 /* 904 * Get the starting offset and block map for the first block. 
905 */ 906 if (start_lvl == 0) { 907 sbap = &ip->i_din2->di_db[0]; 908 soff = start_lbn; 909 } else { 910 idp = &start_ap[start_lvl - 1]; 911 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 912 brelse(sbp); 913 return (ENOSPC); 914 } 915 sbap = (ufs2_daddr_t *)sbp->b_data; 916 soff = idp->in_off; 917 } 918 /* 919 * If the block range spans two block maps, get the second map. 920 */ 921 ebap = NULL; 922 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 923 ssize = len; 924 } else { 925 #ifdef INVARIANTS 926 if (start_lvl > 0 && 927 start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 928 panic("ffs_reallocblk: start == end"); 929 #endif 930 ssize = len - (idp->in_off + 1); 931 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 932 goto fail; 933 ebap = (ufs2_daddr_t *)ebp->b_data; 934 } 935 /* 936 * Find the preferred location for the cluster. If we have not 937 * previously failed at this endeavor, then follow our standard 938 * preference calculation. If we have failed at it, then pick up 939 * where we last ended our search. 940 */ 941 UFS_LOCK(ump); 942 if (ip->i_nextclustercg == -1) 943 pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); 944 else 945 pref = cgdata(fs, ip->i_nextclustercg); 946 /* 947 * Search the block map looking for an allocation of the desired size. 948 * To avoid wasting too much time, we limit the number of cylinder 949 * groups that we will search. 950 */ 951 cg = dtog(fs, pref); 952 for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { 953 if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) 954 break; 955 cg += 1; 956 if (cg >= fs->fs_ncg) 957 cg = 0; 958 } 959 /* 960 * If we have failed in our search, record where we gave up for 961 * next time. Otherwise, fall back to our usual search citerion. 962 */ 963 if (newblk == 0) { 964 ip->i_nextclustercg = cg; 965 UFS_UNLOCK(ump); 966 goto fail; 967 } 968 ip->i_nextclustercg = -1; 969 /* 970 * We have found a new contiguous block. 
971 * 972 * First we have to replace the old block pointers with the new 973 * block pointers in the inode and indirect blocks associated 974 * with the file. 975 */ 976 #ifdef DIAGNOSTIC 977 if (prtrealloc) 978 printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, 979 (intmax_t)start_lbn, (intmax_t)end_lbn); 980 #endif 981 blkno = newblk; 982 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 983 if (i == ssize) { 984 bap = ebap; 985 soff = -i; 986 } 987 #ifdef INVARIANTS 988 if (!ffs_checkblk(ip, 989 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 990 panic("ffs_reallocblks: unallocated block 2"); 991 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 992 panic("ffs_reallocblks: alloc mismatch"); 993 #endif 994 #ifdef DIAGNOSTIC 995 if (prtrealloc) 996 printf(" %jd,", (intmax_t)*bap); 997 #endif 998 if (DOINGSOFTDEP(vp)) { 999 if (sbap == &ip->i_din2->di_db[0] && i < ssize) 1000 softdep_setup_allocdirect(ip, start_lbn + i, 1001 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 1002 buflist->bs_children[i]); 1003 else 1004 softdep_setup_allocindir_page(ip, start_lbn + i, 1005 i < ssize ? sbp : ebp, soff + i, blkno, 1006 *bap, buflist->bs_children[i]); 1007 } 1008 *bap++ = blkno; 1009 } 1010 /* 1011 * Next we must write out the modified inode and indirect blocks. 1012 * For strict correctness, the writes should be synchronous since 1013 * the old block values may have been written to disk. In practise 1014 * they are almost never written, but if we are concerned about 1015 * strict correctness, the `doasyncfree' flag should be set to zero. 1016 * 1017 * The test on `doasyncfree' should be changed to test a flag 1018 * that shows whether the associated buffers and inodes have 1019 * been written. The flag should be set when the cluster is 1020 * started and cleared whenever the buffer or inode is flushed. 
1021 * We can then check below to see if it is set, and do the 1022 * synchronous write only when it has been cleared. 1023 */ 1024 if (sbap != &ip->i_din2->di_db[0]) { 1025 if (doasyncfree) 1026 bdwrite(sbp); 1027 else 1028 bwrite(sbp); 1029 } else { 1030 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 1031 if (!doasyncfree) 1032 ffs_update(vp, 1); 1033 } 1034 if (ssize < len) { 1035 if (doasyncfree) 1036 bdwrite(ebp); 1037 else 1038 bwrite(ebp); 1039 } 1040 /* 1041 * Last, free the old blocks and assign the new blocks to the buffers. 1042 */ 1043 #ifdef DIAGNOSTIC 1044 if (prtrealloc) 1045 printf("\n\tnew:"); 1046 #endif 1047 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 1048 bp = buflist->bs_children[i]; 1049 if (!DOINGSOFTDEP(vp)) 1050 /* 1051 * The usual case is that a set of N-contiguous blocks 1052 * that was just allocated has been replaced with a 1053 * set of N+1-contiguous blocks. If they are marked as 1054 * B_DELWRI, the current contents have not been written 1055 * to disk. It is possible that the blocks were written 1056 * earlier, but very uncommon. If the blocks have never 1057 * been written, there is no need to send a BIO_DELETE 1058 * for them when they are freed. The gain from avoiding 1059 * the TRIMs for the common case of unwritten blocks 1060 * far exceeds the cost of the write amplification for 1061 * the uncommon case of failing to send a TRIM for the 1062 * blocks that had been written. 1063 */ 1064 ffs_blkfree(ump, fs, ump->um_devvp, 1065 dbtofsb(fs, bp->b_blkno), 1066 fs->fs_bsize, ip->i_number, vp->v_type, NULL, 1067 (bp->b_flags & B_DELWRI) != 0 ? 
1068 NOTRIM_KEY : SINGLETON_KEY); 1069 bp->b_blkno = fsbtodb(fs, blkno); 1070 #ifdef INVARIANTS 1071 if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) 1072 panic("ffs_reallocblks: unallocated block 3"); 1073 #endif 1074 #ifdef DIAGNOSTIC 1075 if (prtrealloc) 1076 printf(" %jd,", (intmax_t)blkno); 1077 #endif 1078 } 1079 #ifdef DIAGNOSTIC 1080 if (prtrealloc) { 1081 prtrealloc--; 1082 printf("\n"); 1083 } 1084 #endif 1085 return (0); 1086 1087 fail: 1088 if (ssize < len) 1089 brelse(ebp); 1090 if (sbap != &ip->i_din2->di_db[0]) 1091 brelse(sbp); 1092 return (ENOSPC); 1093 } 1094 1095 /* 1096 * Allocate an inode in the filesystem. 1097 * 1098 * If allocating a directory, use ffs_dirpref to select the inode. 1099 * If allocating in a directory, the following hierarchy is followed: 1100 * 1) allocate the preferred inode. 1101 * 2) allocate an inode in the same cylinder group. 1102 * 3) quadratically rehash into other cylinder groups, until an 1103 * available inode is located. 1104 * If no inode preference is given the following hierarchy is used 1105 * to allocate an inode: 1106 * 1) allocate an inode in cylinder group 0. 1107 * 2) quadratically rehash into other cylinder groups, until an 1108 * available inode is located. 
1109 */ 1110 int 1111 ffs_valloc(struct vnode *pvp, 1112 int mode, 1113 struct ucred *cred, 1114 struct vnode **vpp) 1115 { 1116 struct inode *pip; 1117 struct fs *fs; 1118 struct inode *ip; 1119 struct timespec ts; 1120 struct ufsmount *ump; 1121 ino_t ino, ipref; 1122 uint64_t cg; 1123 int error, reclaimed; 1124 1125 *vpp = NULL; 1126 pip = VTOI(pvp); 1127 ump = ITOUMP(pip); 1128 fs = ump->um_fs; 1129 1130 UFS_LOCK(ump); 1131 reclaimed = 0; 1132 retry: 1133 if (fs->fs_cstotal.cs_nifree == 0) 1134 goto noinodes; 1135 1136 if ((mode & IFMT) == IFDIR) 1137 ipref = ffs_dirpref(pip); 1138 else 1139 ipref = pip->i_number; 1140 if (ipref >= fs->fs_ncg * fs->fs_ipg) 1141 ipref = 0; 1142 cg = ino_to_cg(fs, ipref); 1143 /* 1144 * Track number of dirs created one after another 1145 * in a same cg without intervening by files. 1146 */ 1147 if ((mode & IFMT) == IFDIR) { 1148 if (fs->fs_contigdirs[cg] < 255) 1149 fs->fs_contigdirs[cg]++; 1150 } else { 1151 if (fs->fs_contigdirs[cg] > 0) 1152 fs->fs_contigdirs[cg]--; 1153 } 1154 ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 1155 (allocfcn_t *)ffs_nodealloccg); 1156 if (ino == 0) 1157 goto noinodes; 1158 /* 1159 * Get rid of the cached old vnode, force allocation of a new vnode 1160 * for this inode. If this fails, release the allocated ino and 1161 * return the error. 1162 */ 1163 if ((error = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp, 1164 FFSV_FORCEINSMQ | FFSV_REPLACE | FFSV_NEWINODE)) != 0) { 1165 ffs_vfree(pvp, ino, mode); 1166 return (error); 1167 } 1168 /* 1169 * We got an inode, so check mode and panic if it is already allocated. 
1170 */ 1171 ip = VTOI(*vpp); 1172 if (ip->i_mode) { 1173 printf("mode = 0%o, inum = %ju, fs = %s\n", 1174 ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt); 1175 panic("ffs_valloc: dup alloc"); 1176 } 1177 if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ 1178 printf("free inode %s/%ju had %ld blocks\n", 1179 fs->fs_fsmnt, (intmax_t)ino, (long)DIP(ip, i_blocks)); 1180 DIP_SET(ip, i_blocks, 0); 1181 } 1182 ip->i_flags = 0; 1183 DIP_SET(ip, i_flags, 0); 1184 if ((mode & IFMT) == IFDIR) 1185 DIP_SET(ip, i_dirdepth, DIP(pip, i_dirdepth) + 1); 1186 /* 1187 * Set up a new generation number for this inode. 1188 */ 1189 while (ip->i_gen == 0 || ++ip->i_gen == 0) 1190 ip->i_gen = arc4random(); 1191 DIP_SET(ip, i_gen, ip->i_gen); 1192 if (fs->fs_magic == FS_UFS2_MAGIC) { 1193 vfs_timestamp(&ts); 1194 ip->i_din2->di_birthtime = ts.tv_sec; 1195 ip->i_din2->di_birthnsec = ts.tv_nsec; 1196 } 1197 ip->i_flag = 0; 1198 (*vpp)->v_vflag = 0; 1199 (*vpp)->v_type = VNON; 1200 if (fs->fs_magic == FS_UFS2_MAGIC) { 1201 (*vpp)->v_op = &ffs_vnodeops2; 1202 UFS_INODE_SET_FLAG(ip, IN_UFS2); 1203 } else { 1204 (*vpp)->v_op = &ffs_vnodeops1; 1205 } 1206 return (0); 1207 noinodes: 1208 if (reclaimed == 0) { 1209 reclaimed = 1; 1210 softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT); 1211 goto retry; 1212 } 1213 if (ffs_fsfail_cleanup_locked(ump, 0)) { 1214 UFS_UNLOCK(ump); 1215 return (ENXIO); 1216 } 1217 if (ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 1218 UFS_UNLOCK(ump); 1219 ffs_fserr(fs, pip->i_number, "out of inodes"); 1220 uprintf("\n%s: create/symlink failed, no inodes free\n", 1221 fs->fs_fsmnt); 1222 } else { 1223 UFS_UNLOCK(ump); 1224 } 1225 return (ENOSPC); 1226 } 1227 1228 /* 1229 * Find a cylinder group to place a directory. 
1230 * 1231 * The policy implemented by this algorithm is to allocate a 1232 * directory inode in the same cylinder group as its parent 1233 * directory, but also to reserve space for its files inodes 1234 * and data. Restrict the number of directories which may be 1235 * allocated one after another in the same cylinder group 1236 * without intervening allocation of files. 1237 * 1238 * If we allocate a first level directory then force allocation 1239 * in another cylinder group. 1240 */ 1241 static ino_t 1242 ffs_dirpref(struct inode *pip) 1243 { 1244 struct fs *fs; 1245 int cg, prefcg, curcg, dirsize, cgsize; 1246 int depth, range, start, end, numdirs, power, numerator, denominator; 1247 uint64_t avgifree, avgbfree, avgndir, curdirsize; 1248 uint64_t minifree, minbfree, maxndir; 1249 uint64_t maxcontigdirs; 1250 1251 mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED); 1252 fs = ITOFS(pip); 1253 1254 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 1255 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1256 avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; 1257 1258 /* 1259 * Select a preferred cylinder group to place a new directory. 1260 * If we are near the root of the filesystem we aim to spread 1261 * them out as much as possible. As we descend deeper from the 1262 * root we cluster them closer together around their parent as 1263 * we expect them to be more closely interactive. Higher-level 1264 * directories like usr/src/sys and usr/src/bin should be 1265 * separated while the directories in these areas are more 1266 * likely to be accessed together so should be closer. 1267 * 1268 * We pick a range of cylinder groups around the cylinder group 1269 * of the directory in which we are being created. The size of 1270 * the range for our search is based on our depth from the root 1271 * of our filesystem. We then probe that range based on how many 1272 * directories are already present. 
The first new directory is at 1273 * 1/2 (middle) of the range; the second is in the first 1/4 of the 1274 * range, then at 3/4, 1/8, 3/8, 5/8, 7/8, 1/16, 3/16, 5/16, etc. 1275 */ 1276 depth = DIP(pip, i_dirdepth); 1277 range = fs->fs_ncg / (1 << depth); 1278 curcg = ino_to_cg(fs, pip->i_number); 1279 start = curcg - (range / 2); 1280 if (start < 0) 1281 start += fs->fs_ncg; 1282 end = curcg + (range / 2); 1283 if (end >= fs->fs_ncg) 1284 end -= fs->fs_ncg; 1285 numdirs = pip->i_effnlink - 1; 1286 power = fls(numdirs); 1287 numerator = (numdirs & ~(1 << (power - 1))) * 2 + 1; 1288 denominator = 1 << power; 1289 prefcg = (curcg - (range / 2) + (range * numerator / denominator)); 1290 if (prefcg < 0) 1291 prefcg += fs->fs_ncg; 1292 if (prefcg >= fs->fs_ncg) 1293 prefcg -= fs->fs_ncg; 1294 /* 1295 * If this filesystem is not tracking directory depths, 1296 * revert to the old algorithm. 1297 */ 1298 if (depth == 0 && pip->i_number != UFS_ROOTINO) 1299 prefcg = curcg; 1300 1301 /* 1302 * Count various limits which used for 1303 * optimal allocation of a directory inode. 1304 */ 1305 maxndir = min(avgndir + (1 << depth), fs->fs_ipg); 1306 minifree = avgifree - avgifree / 4; 1307 if (minifree < 1) 1308 minifree = 1; 1309 minbfree = avgbfree - avgbfree / 4; 1310 if (minbfree < 1) 1311 minbfree = 1; 1312 cgsize = fs->fs_fsize * fs->fs_fpg; 1313 dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; 1314 curdirsize = avgndir ? 
(cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; 1315 if (dirsize < curdirsize) 1316 dirsize = curdirsize; 1317 if (dirsize <= 0) 1318 maxcontigdirs = 0; /* dirsize overflowed */ 1319 else 1320 maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); 1321 if (fs->fs_avgfpdir > 0) 1322 maxcontigdirs = min(maxcontigdirs, 1323 fs->fs_ipg / fs->fs_avgfpdir); 1324 if (maxcontigdirs == 0) 1325 maxcontigdirs = 1; 1326 1327 /* 1328 * Limit number of dirs in one cg and reserve space for 1329 * regular files, but only if we have no deficit in 1330 * inodes or space. 1331 * 1332 * We are trying to find a suitable cylinder group nearby 1333 * our preferred cylinder group to place a new directory. 1334 * We scan from our preferred cylinder group forward looking 1335 * for a cylinder group that meets our criterion. If we get 1336 * to the final cylinder group and do not find anything, 1337 * we start scanning forwards from the beginning of the 1338 * filesystem. While it might seem sensible to start scanning 1339 * backwards or even to alternate looking forward and backward, 1340 * this approach fails badly when the filesystem is nearly full. 1341 * Specifically, we first search all the areas that have no space 1342 * and finally try the one preceding that. We repeat this on 1343 * every request and in the case of the final block end up 1344 * searching the entire filesystem. By jumping to the front 1345 * of the filesystem, our future forward searches always look 1346 * in new cylinder groups so finds every possible block after 1347 * one pass over the filesystem. 
1348 */ 1349 for (cg = prefcg; cg < fs->fs_ncg; cg++) 1350 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 1351 fs->fs_cs(fs, cg).cs_nifree >= minifree && 1352 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 1353 if (fs->fs_contigdirs[cg] < maxcontigdirs) 1354 return ((ino_t)(fs->fs_ipg * cg)); 1355 } 1356 for (cg = 0; cg < prefcg; cg++) 1357 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 1358 fs->fs_cs(fs, cg).cs_nifree >= minifree && 1359 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 1360 if (fs->fs_contigdirs[cg] < maxcontigdirs) 1361 return ((ino_t)(fs->fs_ipg * cg)); 1362 } 1363 /* 1364 * This is a backstop when we have deficit in space. 1365 */ 1366 for (cg = prefcg; cg < fs->fs_ncg; cg++) 1367 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 1368 return ((ino_t)(fs->fs_ipg * cg)); 1369 for (cg = 0; cg < prefcg; cg++) 1370 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 1371 break; 1372 return ((ino_t)(fs->fs_ipg * cg)); 1373 } 1374 1375 /* 1376 * Select the desired position for the next block in a file. The file is 1377 * logically divided into sections. The first section is composed of the 1378 * direct blocks and the next fs_maxbpg blocks. Each additional section 1379 * contains fs_maxbpg blocks. 1380 * 1381 * If no blocks have been allocated in the first section, the policy is to 1382 * request a block in the same cylinder group as the inode that describes 1383 * the file. The first indirect is allocated immediately following the last 1384 * direct block and the data blocks for the first indirect immediately 1385 * follow it. 1386 * 1387 * If no blocks have been allocated in any other section, the indirect 1388 * block(s) are allocated in the same cylinder group as its inode in an 1389 * area reserved immediately following the inode blocks. The policy for 1390 * the data blocks is to place them in a cylinder group with a greater than 1391 * average number of free blocks. An appropriate cylinder group is found 1392 * by using a rotor that sweeps the cylinder groups. 
When a new group of 1393 * blocks is needed, the sweep begins in the cylinder group following the 1394 * cylinder group from which the previous allocation was made. The sweep 1395 * continues until a cylinder group with greater than the average number 1396 * of free blocks is found. If the allocation is for the first block in an 1397 * indirect block or the previous block is a hole, then the information on 1398 * the previous allocation is unavailable; here a best guess is made based 1399 * on the logical block number being allocated. 1400 * 1401 * If a section is already partially allocated, the policy is to 1402 * allocate blocks contiguously within the section if possible. 1403 */ 1404 ufs2_daddr_t 1405 ffs_blkpref_ufs1(struct inode *ip, 1406 ufs_lbn_t lbn, 1407 int indx, 1408 ufs1_daddr_t *bap) 1409 { 1410 struct fs *fs; 1411 uint64_t cg, inocg; 1412 uint64_t avgbfree, startcg; 1413 ufs2_daddr_t pref, prevbn; 1414 1415 KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1416 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1417 fs = ITOFS(ip); 1418 /* 1419 * Allocation of indirect blocks is indicated by passing negative 1420 * values in indx: -1 for single indirect, -2 for double indirect, 1421 * -3 for triple indirect. As noted below, we attempt to allocate 1422 * the first indirect inline with the file data. For all later 1423 * indirect blocks, the data is often allocated in other cylinder 1424 * groups. However to speed random file access and to speed up 1425 * fsck, the filesystem reserves the first fs_metaspace blocks 1426 * (typically half of fs_minfree) of the data area of each cylinder 1427 * group to hold these later indirect blocks. 1428 */ 1429 inocg = ino_to_cg(fs, ip->i_number); 1430 if (indx < 0) { 1431 /* 1432 * Our preference for indirect blocks is the zone at the 1433 * beginning of the inode's cylinder group data area that 1434 * we try to reserve for indirect blocks. 
1435 */ 1436 pref = cgmeta(fs, inocg); 1437 /* 1438 * If we are allocating the first indirect block, try to 1439 * place it immediately following the last direct block. 1440 */ 1441 if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && 1442 ip->i_din1->di_db[UFS_NDADDR - 1] != 0) 1443 pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag; 1444 return (pref); 1445 } 1446 /* 1447 * If we are allocating the first data block in the first indirect 1448 * block and the indirect has been allocated in the data block area, 1449 * try to place it immediately following the indirect block. 1450 */ 1451 if (lbn == UFS_NDADDR) { 1452 pref = ip->i_din1->di_ib[0]; 1453 if (pref != 0 && pref >= cgdata(fs, inocg) && 1454 pref < cgbase(fs, inocg + 1)) 1455 return (pref + fs->fs_frag); 1456 } 1457 /* 1458 * If we are at the beginning of a file, or we have already allocated 1459 * the maximum number of blocks per cylinder group, or we do not 1460 * have a block allocated immediately preceding us, then we need 1461 * to decide where to start allocating new blocks. 1462 */ 1463 if (indx == 0) { 1464 prevbn = 0; 1465 } else { 1466 prevbn = bap[indx - 1]; 1467 if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, 1468 fs->fs_bsize) != 0) 1469 prevbn = 0; 1470 } 1471 if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { 1472 /* 1473 * If we are allocating a directory data block, we want 1474 * to place it in the metadata area. 1475 */ 1476 if ((ip->i_mode & IFMT) == IFDIR) 1477 return (cgmeta(fs, inocg)); 1478 /* 1479 * Until we fill all the direct and all the first indirect's 1480 * blocks, we try to allocate in the data area of the inode's 1481 * cylinder group. 1482 */ 1483 if (lbn < UFS_NDADDR + NINDIR(fs)) 1484 return (cgdata(fs, inocg)); 1485 /* 1486 * Find a cylinder with greater than average number of 1487 * unused data blocks. 
1488 */ 1489 if (indx == 0 || prevbn == 0) 1490 startcg = inocg + lbn / fs->fs_maxbpg; 1491 else 1492 startcg = dtog(fs, prevbn) + 1; 1493 startcg %= fs->fs_ncg; 1494 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1495 for (cg = startcg; cg < fs->fs_ncg; cg++) 1496 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1497 fs->fs_cgrotor = cg; 1498 return (cgdata(fs, cg)); 1499 } 1500 for (cg = 0; cg <= startcg; cg++) 1501 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1502 fs->fs_cgrotor = cg; 1503 return (cgdata(fs, cg)); 1504 } 1505 return (0); 1506 } 1507 /* 1508 * Otherwise, we just always try to lay things out contiguously. 1509 */ 1510 return (prevbn + fs->fs_frag); 1511 } 1512 1513 /* 1514 * Same as above, but for UFS2 1515 */ 1516 ufs2_daddr_t 1517 ffs_blkpref_ufs2(struct inode *ip, 1518 ufs_lbn_t lbn, 1519 int indx, 1520 ufs2_daddr_t *bap) 1521 { 1522 struct fs *fs; 1523 uint64_t cg, inocg; 1524 uint64_t avgbfree, startcg; 1525 ufs2_daddr_t pref, prevbn; 1526 1527 KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1528 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1529 fs = ITOFS(ip); 1530 /* 1531 * Allocation of indirect blocks is indicated by passing negative 1532 * values in indx: -1 for single indirect, -2 for double indirect, 1533 * -3 for triple indirect. As noted below, we attempt to allocate 1534 * the first indirect inline with the file data. For all later 1535 * indirect blocks, the data is often allocated in other cylinder 1536 * groups. However to speed random file access and to speed up 1537 * fsck, the filesystem reserves the first fs_metaspace blocks 1538 * (typically half of fs_minfree) of the data area of each cylinder 1539 * group to hold these later indirect blocks. 1540 */ 1541 inocg = ino_to_cg(fs, ip->i_number); 1542 if (indx < 0) { 1543 /* 1544 * Our preference for indirect blocks is the zone at the 1545 * beginning of the inode's cylinder group data area that 1546 * we try to reserve for indirect blocks. 
1547 */ 1548 pref = cgmeta(fs, inocg); 1549 /* 1550 * If we are allocating the first indirect block, try to 1551 * place it immediately following the last direct block. 1552 */ 1553 if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && 1554 ip->i_din2->di_db[UFS_NDADDR - 1] != 0) 1555 pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag; 1556 return (pref); 1557 } 1558 /* 1559 * If we are allocating the first data block in the first indirect 1560 * block and the indirect has been allocated in the data block area, 1561 * try to place it immediately following the indirect block. 1562 */ 1563 if (lbn == UFS_NDADDR) { 1564 pref = ip->i_din2->di_ib[0]; 1565 if (pref != 0 && pref >= cgdata(fs, inocg) && 1566 pref < cgbase(fs, inocg + 1)) 1567 return (pref + fs->fs_frag); 1568 } 1569 /* 1570 * If we are at the beginning of a file, or we have already allocated 1571 * the maximum number of blocks per cylinder group, or we do not 1572 * have a block allocated immediately preceding us, then we need 1573 * to decide where to start allocating new blocks. 1574 */ 1575 if (indx == 0) { 1576 prevbn = 0; 1577 } else { 1578 prevbn = bap[indx - 1]; 1579 if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, 1580 fs->fs_bsize) != 0) 1581 prevbn = 0; 1582 } 1583 if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { 1584 /* 1585 * If we are allocating a directory data block, we want 1586 * to place it in the metadata area. 1587 */ 1588 if ((ip->i_mode & IFMT) == IFDIR) 1589 return (cgmeta(fs, inocg)); 1590 /* 1591 * Until we fill all the direct and all the first indirect's 1592 * blocks, we try to allocate in the data area of the inode's 1593 * cylinder group. 1594 */ 1595 if (lbn < UFS_NDADDR + NINDIR(fs)) 1596 return (cgdata(fs, inocg)); 1597 /* 1598 * Find a cylinder with greater than average number of 1599 * unused data blocks. 
1600 */ 1601 if (indx == 0 || prevbn == 0) 1602 startcg = inocg + lbn / fs->fs_maxbpg; 1603 else 1604 startcg = dtog(fs, prevbn) + 1; 1605 startcg %= fs->fs_ncg; 1606 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1607 for (cg = startcg; cg < fs->fs_ncg; cg++) 1608 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1609 fs->fs_cgrotor = cg; 1610 return (cgdata(fs, cg)); 1611 } 1612 for (cg = 0; cg <= startcg; cg++) 1613 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1614 fs->fs_cgrotor = cg; 1615 return (cgdata(fs, cg)); 1616 } 1617 return (0); 1618 } 1619 /* 1620 * Otherwise, we just always try to lay things out contiguously. 1621 */ 1622 return (prevbn + fs->fs_frag); 1623 } 1624 1625 /* 1626 * Implement the cylinder overflow algorithm. 1627 * 1628 * The policy implemented by this algorithm is: 1629 * 1) allocate the block in its requested cylinder group. 1630 * 2) quadratically rehash on the cylinder group number. 1631 * 3) brute force search for a free block. 1632 * 1633 * Must be called with the UFS lock held. Will release the lock on success 1634 * and return with it held on failure. 1635 */ 1636 /*VARARGS5*/ 1637 static ufs2_daddr_t 1638 ffs_hashalloc(struct inode *ip, 1639 uint64_t cg, 1640 ufs2_daddr_t pref, 1641 int size, /* Search size for data blocks, mode for inodes */ 1642 int rsize, /* Real allocated size. 
*/ 1643 allocfcn_t *allocator) 1644 { 1645 struct fs *fs; 1646 ufs2_daddr_t result; 1647 uint64_t i, icg = cg; 1648 1649 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1650 #ifdef INVARIANTS 1651 if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 1652 panic("ffs_hashalloc: allocation on suspended filesystem"); 1653 #endif 1654 fs = ITOFS(ip); 1655 /* 1656 * 1: preferred cylinder group 1657 */ 1658 result = (*allocator)(ip, cg, pref, size, rsize); 1659 if (result) 1660 return (result); 1661 /* 1662 * 2: quadratic rehash 1663 */ 1664 for (i = 1; i < fs->fs_ncg; i *= 2) { 1665 cg += i; 1666 if (cg >= fs->fs_ncg) 1667 cg -= fs->fs_ncg; 1668 result = (*allocator)(ip, cg, 0, size, rsize); 1669 if (result) 1670 return (result); 1671 } 1672 /* 1673 * 3: brute force search 1674 * Note that we start at i == 2, since 0 was checked initially, 1675 * and 1 is always checked in the quadratic rehash. 1676 */ 1677 cg = (icg + 2) % fs->fs_ncg; 1678 for (i = 2; i < fs->fs_ncg; i++) { 1679 result = (*allocator)(ip, cg, 0, size, rsize); 1680 if (result) 1681 return (result); 1682 cg++; 1683 if (cg == fs->fs_ncg) 1684 cg = 0; 1685 } 1686 return (0); 1687 } 1688 1689 /* 1690 * Determine whether a fragment can be extended. 1691 * 1692 * Check to see if the necessary fragments are available, and 1693 * if they are, allocate them. 
1694 */ 1695 static ufs2_daddr_t 1696 ffs_fragextend(struct inode *ip, 1697 uint64_t cg, 1698 ufs2_daddr_t bprev, 1699 int osize, 1700 int nsize) 1701 { 1702 struct fs *fs; 1703 struct cg *cgp; 1704 struct buf *bp; 1705 struct ufsmount *ump; 1706 int nffree; 1707 long bno; 1708 int frags, bbase; 1709 int i, error; 1710 uint8_t *blksfree; 1711 1712 ump = ITOUMP(ip); 1713 fs = ump->um_fs; 1714 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) 1715 return (0); 1716 frags = numfrags(fs, nsize); 1717 bbase = fragnum(fs, bprev); 1718 if (bbase > fragnum(fs, (bprev + frags - 1))) { 1719 /* cannot extend across a block boundary */ 1720 return (0); 1721 } 1722 UFS_UNLOCK(ump); 1723 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) 1724 goto fail; 1725 bno = dtogd(fs, bprev); 1726 blksfree = cg_blksfree(cgp); 1727 for (i = numfrags(fs, osize); i < frags; i++) 1728 if (isclr(blksfree, bno + i)) 1729 goto fail; 1730 /* 1731 * the current fragment can be extended 1732 * deduct the count on fragment being extended into 1733 * increase the count on the remaining fragment (if any) 1734 * allocate the extended piece 1735 */ 1736 for (i = frags; i < fs->fs_frag - bbase; i++) 1737 if (isclr(blksfree, bno + i)) 1738 break; 1739 cgp->cg_frsum[i - numfrags(fs, osize)]--; 1740 if (i != frags) 1741 cgp->cg_frsum[i - frags]++; 1742 for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) { 1743 clrbit(blksfree, bno + i); 1744 cgp->cg_cs.cs_nffree--; 1745 nffree++; 1746 } 1747 UFS_LOCK(ump); 1748 fs->fs_cstotal.cs_nffree -= nffree; 1749 fs->fs_cs(fs, cg).cs_nffree -= nffree; 1750 fs->fs_fmod = 1; 1751 ACTIVECLEAR(fs, cg); 1752 UFS_UNLOCK(ump); 1753 if (DOINGSOFTDEP(ITOV(ip))) 1754 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, 1755 frags, numfrags(fs, osize)); 1756 bdwrite(bp); 1757 return (bprev); 1758 1759 fail: 1760 brelse(bp); 1761 UFS_LOCK(ump); 1762 return (0); 1763 1764 } 1765 1766 /* 1767 * Determine whether a block can be allocated. 
1768 * 1769 * Check to see if a block of the appropriate size is available, 1770 * and if it is, allocate it. 1771 */ 1772 static ufs2_daddr_t 1773 ffs_alloccg(struct inode *ip, 1774 uint64_t cg, 1775 ufs2_daddr_t bpref, 1776 int size, 1777 int rsize) 1778 { 1779 struct fs *fs; 1780 struct cg *cgp; 1781 struct buf *bp; 1782 struct ufsmount *ump; 1783 ufs1_daddr_t bno; 1784 ufs2_daddr_t blkno; 1785 int i, allocsiz, error, frags; 1786 uint8_t *blksfree; 1787 1788 ump = ITOUMP(ip); 1789 fs = ump->um_fs; 1790 if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) 1791 return (0); 1792 UFS_UNLOCK(ump); 1793 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0 || 1794 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) 1795 goto fail; 1796 if (size == fs->fs_bsize) { 1797 UFS_LOCK(ump); 1798 blkno = ffs_alloccgblk(ip, bp, bpref, rsize); 1799 ACTIVECLEAR(fs, cg); 1800 UFS_UNLOCK(ump); 1801 bdwrite(bp); 1802 return (blkno); 1803 } 1804 /* 1805 * check to see if any fragments are already available 1806 * allocsiz is the size which will be allocated, hacking 1807 * it down to a smaller size if necessary 1808 */ 1809 blksfree = cg_blksfree(cgp); 1810 frags = numfrags(fs, size); 1811 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) 1812 if (cgp->cg_frsum[allocsiz] != 0) 1813 break; 1814 if (allocsiz == fs->fs_frag) { 1815 /* 1816 * no fragments were available, so a block will be 1817 * allocated, and hacked up 1818 */ 1819 if (cgp->cg_cs.cs_nbfree == 0) 1820 goto fail; 1821 UFS_LOCK(ump); 1822 blkno = ffs_alloccgblk(ip, bp, bpref, rsize); 1823 ACTIVECLEAR(fs, cg); 1824 UFS_UNLOCK(ump); 1825 bdwrite(bp); 1826 return (blkno); 1827 } 1828 KASSERT(size == rsize, 1829 ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); 1830 bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); 1831 if (bno < 0) 1832 goto fail; 1833 for (i = 0; i < frags; i++) 1834 clrbit(blksfree, bno + i); 1835 cgp->cg_cs.cs_nffree -= frags; 1836 cgp->cg_frsum[allocsiz]--; 1837 if (frags 
!= allocsiz) 1838 cgp->cg_frsum[allocsiz - frags]++; 1839 UFS_LOCK(ump); 1840 fs->fs_cstotal.cs_nffree -= frags; 1841 fs->fs_cs(fs, cg).cs_nffree -= frags; 1842 fs->fs_fmod = 1; 1843 blkno = cgbase(fs, cg) + bno; 1844 ACTIVECLEAR(fs, cg); 1845 UFS_UNLOCK(ump); 1846 if (DOINGSOFTDEP(ITOV(ip))) 1847 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); 1848 bdwrite(bp); 1849 return (blkno); 1850 1851 fail: 1852 brelse(bp); 1853 UFS_LOCK(ump); 1854 return (0); 1855 } 1856 1857 /* 1858 * Allocate a block in a cylinder group. 1859 * 1860 * This algorithm implements the following policy: 1861 * 1) allocate the requested block. 1862 * 2) allocate a rotationally optimal block in the same cylinder. 1863 * 3) allocate the next available block on the block rotor for the 1864 * specified cylinder group. 1865 * Note that this routine only allocates fs_bsize blocks; these 1866 * blocks may be fragmented by the routine that allocates them. 1867 */ 1868 static ufs2_daddr_t 1869 ffs_alloccgblk(struct inode *ip, 1870 struct buf *bp, 1871 ufs2_daddr_t bpref, 1872 int size) 1873 { 1874 struct fs *fs; 1875 struct cg *cgp; 1876 struct ufsmount *ump; 1877 ufs1_daddr_t bno; 1878 ufs2_daddr_t blkno; 1879 uint8_t *blksfree; 1880 int i, cgbpref; 1881 1882 ump = ITOUMP(ip); 1883 fs = ump->um_fs; 1884 mtx_assert(UFS_MTX(ump), MA_OWNED); 1885 cgp = (struct cg *)bp->b_data; 1886 blksfree = cg_blksfree(cgp); 1887 if (bpref == 0) { 1888 bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag; 1889 } else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) { 1890 /* map bpref to correct zone in this cg */ 1891 if (bpref < cgdata(fs, cgbpref)) 1892 bpref = cgmeta(fs, cgp->cg_cgx); 1893 else 1894 bpref = cgdata(fs, cgp->cg_cgx); 1895 } 1896 /* 1897 * if the requested block is available, use it 1898 */ 1899 bno = dtogd(fs, blknum(fs, bpref)); 1900 if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) 1901 goto gotit; 1902 /* 1903 * Take the next available block in this cylinder group. 
1904 */ 1905 bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); 1906 if (bno < 0) 1907 return (0); 1908 /* Update cg_rotor only if allocated from the data zone */ 1909 if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx))) 1910 cgp->cg_rotor = bno; 1911 gotit: 1912 blkno = fragstoblks(fs, bno); 1913 ffs_clrblock(fs, blksfree, (long)blkno); 1914 ffs_clusteracct(fs, cgp, blkno, -1); 1915 cgp->cg_cs.cs_nbfree--; 1916 fs->fs_cstotal.cs_nbfree--; 1917 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; 1918 fs->fs_fmod = 1; 1919 blkno = cgbase(fs, cgp->cg_cgx) + bno; 1920 /* 1921 * If the caller didn't want the whole block free the frags here. 1922 */ 1923 size = numfrags(fs, size); 1924 if (size != fs->fs_frag) { 1925 bno = dtogd(fs, blkno); 1926 for (i = size; i < fs->fs_frag; i++) 1927 setbit(blksfree, bno + i); 1928 i = fs->fs_frag - size; 1929 cgp->cg_cs.cs_nffree += i; 1930 fs->fs_cstotal.cs_nffree += i; 1931 fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; 1932 fs->fs_fmod = 1; 1933 cgp->cg_frsum[i]++; 1934 } 1935 /* XXX Fixme. */ 1936 UFS_UNLOCK(ump); 1937 if (DOINGSOFTDEP(ITOV(ip))) 1938 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0); 1939 UFS_LOCK(ump); 1940 return (blkno); 1941 } 1942 1943 /* 1944 * Determine whether a cluster can be allocated. 1945 * 1946 * We do not currently check for optimal rotational layout if there 1947 * are multiple choices in the same cylinder group. Instead we just 1948 * take the first one that we find following bpref. 
1949 */ 1950 static ufs2_daddr_t 1951 ffs_clusteralloc(struct inode *ip, 1952 uint64_t cg, 1953 ufs2_daddr_t bpref, 1954 int len) 1955 { 1956 struct fs *fs; 1957 struct cg *cgp; 1958 struct buf *bp; 1959 struct ufsmount *ump; 1960 int i, run, bit, map, got, error; 1961 ufs2_daddr_t bno; 1962 uint8_t *mapp; 1963 int32_t *lp; 1964 uint8_t *blksfree; 1965 1966 ump = ITOUMP(ip); 1967 fs = ump->um_fs; 1968 if (fs->fs_maxcluster[cg] < len) 1969 return (0); 1970 UFS_UNLOCK(ump); 1971 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { 1972 UFS_LOCK(ump); 1973 return (0); 1974 } 1975 /* 1976 * Check to see if a cluster of the needed size (or bigger) is 1977 * available in this cylinder group. 1978 */ 1979 lp = &cg_clustersum(cgp)[len]; 1980 for (i = len; i <= fs->fs_contigsumsize; i++) 1981 if (*lp++ > 0) 1982 break; 1983 if (i > fs->fs_contigsumsize) { 1984 /* 1985 * This is the first time looking for a cluster in this 1986 * cylinder group. Update the cluster summary information 1987 * to reflect the true maximum sized cluster so that 1988 * future cluster allocation requests can avoid reading 1989 * the cylinder group map only to find no clusters. 1990 */ 1991 lp = &cg_clustersum(cgp)[len - 1]; 1992 for (i = len - 1; i > 0; i--) 1993 if (*lp-- > 0) 1994 break; 1995 UFS_LOCK(ump); 1996 fs->fs_maxcluster[cg] = i; 1997 brelse(bp); 1998 return (0); 1999 } 2000 /* 2001 * Search the cluster map to find a big enough cluster. 2002 * We take the first one that we find, even if it is larger 2003 * than we need as we prefer to get one close to the previous 2004 * block allocation. We do not search before the current 2005 * preference point as we do not want to allocate a block 2006 * that is allocated before the previous one (as we will 2007 * then have to wait for another pass of the elevator 2008 * algorithm before it will be read). We prefer to fail and 2009 * be recalled to try an allocation in the next cylinder group. 
2010 */ 2011 if (dtog(fs, bpref) != cg) 2012 bpref = cgdata(fs, cg); 2013 else 2014 bpref = blknum(fs, bpref); 2015 bpref = fragstoblks(fs, dtogd(fs, bpref)); 2016 mapp = &cg_clustersfree(cgp)[bpref / NBBY]; 2017 map = *mapp++; 2018 bit = 1 << (bpref % NBBY); 2019 for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { 2020 if ((map & bit) == 0) { 2021 run = 0; 2022 } else { 2023 run++; 2024 if (run == len) 2025 break; 2026 } 2027 if ((got & (NBBY - 1)) != (NBBY - 1)) { 2028 bit <<= 1; 2029 } else { 2030 map = *mapp++; 2031 bit = 1; 2032 } 2033 } 2034 if (got >= cgp->cg_nclusterblks) { 2035 UFS_LOCK(ump); 2036 brelse(bp); 2037 return (0); 2038 } 2039 /* 2040 * Allocate the cluster that we have found. 2041 */ 2042 blksfree = cg_blksfree(cgp); 2043 for (i = 1; i <= len; i++) 2044 if (!ffs_isblock(fs, blksfree, got - run + i)) 2045 panic("ffs_clusteralloc: map mismatch"); 2046 bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1); 2047 if (dtog(fs, bno) != cg) 2048 panic("ffs_clusteralloc: allocated out of group"); 2049 len = blkstofrags(fs, len); 2050 UFS_LOCK(ump); 2051 for (i = 0; i < len; i += fs->fs_frag) 2052 if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) 2053 panic("ffs_clusteralloc: lost block"); 2054 ACTIVECLEAR(fs, cg); 2055 UFS_UNLOCK(ump); 2056 bdwrite(bp); 2057 return (bno); 2058 } 2059 2060 static inline struct buf * 2061 getinobuf(struct inode *ip, 2062 uint64_t cg, 2063 uint32_t cginoblk, 2064 int gbflags) 2065 { 2066 struct fs *fs; 2067 2068 fs = ITOFS(ip); 2069 return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, 2070 cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0, 2071 gbflags)); 2072 } 2073 2074 /* 2075 * Synchronous inode initialization is needed only when barrier writes do not 2076 * work as advertised, and will impose a heavy cost on file creation in a newly 2077 * created filesystem. 
2078 */ 2079 static int doasyncinodeinit = 1; 2080 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN, 2081 &doasyncinodeinit, 0, 2082 "Perform inode block initialization using asynchronous writes"); 2083 2084 /* 2085 * Determine whether an inode can be allocated. 2086 * 2087 * Check to see if an inode is available, and if it is, 2088 * allocate it using the following policy: 2089 * 1) allocate the requested inode. 2090 * 2) allocate the next available inode after the requested 2091 * inode in the specified cylinder group. 2092 */ 2093 static ufs2_daddr_t 2094 ffs_nodealloccg(struct inode *ip, 2095 uint64_t cg, 2096 ufs2_daddr_t ipref, 2097 int mode, 2098 int unused) 2099 { 2100 struct fs *fs; 2101 struct cg *cgp; 2102 struct buf *bp, *ibp; 2103 struct ufsmount *ump; 2104 uint8_t *inosused, *loc; 2105 struct ufs2_dinode *dp2; 2106 int error, start, len, i; 2107 uint32_t old_initediblk; 2108 2109 ump = ITOUMP(ip); 2110 fs = ump->um_fs; 2111 check_nifree: 2112 if (fs->fs_cs(fs, cg).cs_nifree == 0) 2113 return (0); 2114 UFS_UNLOCK(ump); 2115 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { 2116 UFS_LOCK(ump); 2117 return (0); 2118 } 2119 restart: 2120 if (cgp->cg_cs.cs_nifree == 0) { 2121 brelse(bp); 2122 UFS_LOCK(ump); 2123 return (0); 2124 } 2125 inosused = cg_inosused(cgp); 2126 if (ipref) { 2127 ipref %= fs->fs_ipg; 2128 if (isclr(inosused, ipref)) 2129 goto gotit; 2130 } 2131 start = cgp->cg_irotor / NBBY; 2132 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); 2133 loc = memcchr(&inosused[start], 0xff, len); 2134 if (loc == NULL) { 2135 len = start + 1; 2136 start = 0; 2137 loc = memcchr(&inosused[start], 0xff, len); 2138 if (loc == NULL) { 2139 printf("cg = %ju, irotor = %ld, fs = %s\n", 2140 (intmax_t)cg, (long)cgp->cg_irotor, fs->fs_fsmnt); 2141 panic("ffs_nodealloccg: map corrupted"); 2142 /* NOTREACHED */ 2143 } 2144 } 2145 ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1; 2146 gotit: 2147 /* 2148 * Check to see if we need to 
initialize more inodes. 2149 */ 2150 if (fs->fs_magic == FS_UFS2_MAGIC && 2151 ipref + INOPB(fs) > cgp->cg_initediblk && 2152 cgp->cg_initediblk < cgp->cg_niblk) { 2153 old_initediblk = cgp->cg_initediblk; 2154 2155 /* 2156 * Free the cylinder group lock before writing the 2157 * initialized inode block. Entering the 2158 * babarrierwrite() with the cylinder group lock 2159 * causes lock order violation between the lock and 2160 * snaplk. 2161 * 2162 * Another thread can decide to initialize the same 2163 * inode block, but whichever thread first gets the 2164 * cylinder group lock after writing the newly 2165 * allocated inode block will update it and the other 2166 * will realize that it has lost and leave the 2167 * cylinder group unchanged. 2168 */ 2169 ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); 2170 brelse(bp); 2171 if (ibp == NULL) { 2172 /* 2173 * The inode block buffer is already owned by 2174 * another thread, which must initialize it. 2175 * Wait on the buffer to allow another thread 2176 * to finish the updates, with dropped cg 2177 * buffer lock, then retry. 2178 */ 2179 ibp = getinobuf(ip, cg, old_initediblk, 0); 2180 brelse(ibp); 2181 UFS_LOCK(ump); 2182 goto check_nifree; 2183 } 2184 bzero(ibp->b_data, (int)fs->fs_bsize); 2185 dp2 = (struct ufs2_dinode *)(ibp->b_data); 2186 for (i = 0; i < INOPB(fs); i++) { 2187 while (dp2->di_gen == 0) 2188 dp2->di_gen = arc4random(); 2189 dp2++; 2190 } 2191 2192 /* 2193 * Rather than adding a soft updates dependency to ensure 2194 * that the new inode block is written before it is claimed 2195 * by the cylinder group map, we just do a barrier write 2196 * here. The barrier write will ensure that the inode block 2197 * gets written before the updated cylinder group map can be 2198 * written. The barrier write should only slow down bulk 2199 * loading of newly created filesystems. 
2200 */ 2201 if (doasyncinodeinit) 2202 babarrierwrite(ibp); 2203 else 2204 bwrite(ibp); 2205 2206 /* 2207 * After the inode block is written, try to update the 2208 * cg initediblk pointer. If another thread beat us 2209 * to it, then leave it unchanged as the other thread 2210 * has already set it correctly. 2211 */ 2212 error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp); 2213 UFS_LOCK(ump); 2214 ACTIVECLEAR(fs, cg); 2215 UFS_UNLOCK(ump); 2216 if (error != 0) 2217 return (error); 2218 if (cgp->cg_initediblk == old_initediblk) 2219 cgp->cg_initediblk += INOPB(fs); 2220 goto restart; 2221 } 2222 cgp->cg_irotor = ipref; 2223 UFS_LOCK(ump); 2224 ACTIVECLEAR(fs, cg); 2225 setbit(inosused, ipref); 2226 cgp->cg_cs.cs_nifree--; 2227 fs->fs_cstotal.cs_nifree--; 2228 fs->fs_cs(fs, cg).cs_nifree--; 2229 fs->fs_fmod = 1; 2230 if ((mode & IFMT) == IFDIR) { 2231 cgp->cg_cs.cs_ndir++; 2232 fs->fs_cstotal.cs_ndir++; 2233 fs->fs_cs(fs, cg).cs_ndir++; 2234 } 2235 UFS_UNLOCK(ump); 2236 if (DOINGSOFTDEP(ITOV(ip))) 2237 softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode); 2238 bdwrite(bp); 2239 return ((ino_t)(cg * fs->fs_ipg + ipref)); 2240 } 2241 2242 /* 2243 * Free a block or fragment. 2244 * 2245 * The specified block or fragment is placed back in the 2246 * free map. If a fragment is deallocated, a possible 2247 * block reassembly is checked. 
2248 */ 2249 static void 2250 ffs_blkfree_cg(struct ufsmount *ump, 2251 struct fs *fs, 2252 struct vnode *devvp, 2253 ufs2_daddr_t bno, 2254 long size, 2255 ino_t inum, 2256 struct workhead *dephd) 2257 { 2258 struct mount *mp; 2259 struct cg *cgp; 2260 struct buf *bp; 2261 daddr_t dbn; 2262 ufs1_daddr_t fragno, cgbno; 2263 int i, blk, frags, bbase, error; 2264 uint64_t cg; 2265 uint8_t *blksfree; 2266 struct cdev *dev; 2267 2268 cg = dtog(fs, bno); 2269 if (devvp->v_type == VREG) { 2270 /* devvp is a snapshot */ 2271 MPASS(devvp->v_mount->mnt_data == ump); 2272 dev = ump->um_devvp->v_rdev; 2273 } else if (devvp->v_type == VCHR) { 2274 /* 2275 * devvp is a normal disk device 2276 * XXXKIB: devvp is not locked there, v_rdev access depends on 2277 * busy mount, which prevents mntfs devvp from reclamation. 2278 */ 2279 dev = devvp->v_rdev; 2280 } else 2281 return; 2282 #ifdef INVARIANTS 2283 if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0 || 2284 fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { 2285 printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n", 2286 devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize, 2287 size, fs->fs_fsmnt); 2288 panic("ffs_blkfree_cg: bad size"); 2289 } 2290 #endif 2291 if ((uint64_t)bno >= fs->fs_size) { 2292 printf("bad block %jd, ino %ju\n", (intmax_t)bno, 2293 (intmax_t)inum); 2294 ffs_fserr(fs, inum, "bad block"); 2295 return; 2296 } 2297 if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { 2298 if (!ffs_fsfail_cleanup(ump, error) || 2299 !MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) 2300 return; 2301 if (devvp->v_type == VREG) 2302 dbn = fragstoblks(fs, cgtod(fs, cg)); 2303 else 2304 dbn = fsbtodb(fs, cgtod(fs, cg)); 2305 error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); 2306 KASSERT(error == 0, ("getblkx failed")); 2307 softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, 2308 numfrags(fs, size), dephd); 2309 bp->b_flags |= B_RELBUF | B_NOCACHE; 2310 bp->b_flags &= 
~B_CACHE; 2311 bawrite(bp); 2312 return; 2313 } 2314 cgbno = dtogd(fs, bno); 2315 blksfree = cg_blksfree(cgp); 2316 UFS_LOCK(ump); 2317 if (size == fs->fs_bsize) { 2318 fragno = fragstoblks(fs, cgbno); 2319 if (!ffs_isfreeblock(fs, blksfree, fragno)) { 2320 if (devvp->v_type == VREG) { 2321 UFS_UNLOCK(ump); 2322 /* devvp is a snapshot */ 2323 brelse(bp); 2324 return; 2325 } 2326 printf("dev = %s, block = %jd, fs = %s\n", 2327 devtoname(dev), (intmax_t)bno, fs->fs_fsmnt); 2328 panic("ffs_blkfree_cg: freeing free block"); 2329 } 2330 ffs_setblock(fs, blksfree, fragno); 2331 ffs_clusteracct(fs, cgp, fragno, 1); 2332 cgp->cg_cs.cs_nbfree++; 2333 fs->fs_cstotal.cs_nbfree++; 2334 fs->fs_cs(fs, cg).cs_nbfree++; 2335 } else { 2336 bbase = cgbno - fragnum(fs, cgbno); 2337 /* 2338 * decrement the counts associated with the old frags 2339 */ 2340 blk = blkmap(fs, blksfree, bbase); 2341 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 2342 /* 2343 * deallocate the fragment 2344 */ 2345 frags = numfrags(fs, size); 2346 for (i = 0; i < frags; i++) { 2347 if (isset(blksfree, cgbno + i)) { 2348 printf("dev = %s, block = %jd, fs = %s\n", 2349 devtoname(dev), (intmax_t)(bno + i), 2350 fs->fs_fsmnt); 2351 panic("ffs_blkfree_cg: freeing free frag"); 2352 } 2353 setbit(blksfree, cgbno + i); 2354 } 2355 cgp->cg_cs.cs_nffree += i; 2356 fs->fs_cstotal.cs_nffree += i; 2357 fs->fs_cs(fs, cg).cs_nffree += i; 2358 /* 2359 * add back in counts associated with the new frags 2360 */ 2361 blk = blkmap(fs, blksfree, bbase); 2362 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 2363 /* 2364 * if a complete block has been reassembled, account for it 2365 */ 2366 fragno = fragstoblks(fs, bbase); 2367 if (ffs_isblock(fs, blksfree, fragno)) { 2368 cgp->cg_cs.cs_nffree -= fs->fs_frag; 2369 fs->fs_cstotal.cs_nffree -= fs->fs_frag; 2370 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; 2371 ffs_clusteracct(fs, cgp, fragno, 1); 2372 cgp->cg_cs.cs_nbfree++; 2373 fs->fs_cstotal.cs_nbfree++; 2374 fs->fs_cs(fs, cg).cs_nbfree++; 
2375 } 2376 } 2377 fs->fs_fmod = 1; 2378 ACTIVECLEAR(fs, cg); 2379 UFS_UNLOCK(ump); 2380 mp = UFSTOVFS(ump); 2381 if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR) 2382 softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, 2383 numfrags(fs, size), dephd); 2384 bdwrite(bp); 2385 } 2386 2387 /* 2388 * Structures and routines associated with trim management. 2389 * 2390 * The following requests are passed to trim_lookup to indicate 2391 * the actions that should be taken. 2392 */ 2393 #define NEW 1 /* if found, error else allocate and hash it */ 2394 #define OLD 2 /* if not found, error, else return it */ 2395 #define REPLACE 3 /* if not found, error else unhash and reallocate it */ 2396 #define DONE 4 /* if not found, error else unhash and return it */ 2397 #define SINGLE 5 /* don't look up, just allocate it and don't hash it */ 2398 2399 MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures"); 2400 2401 #define TRIMLIST_HASH(ump, key) \ 2402 (&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize]) 2403 2404 /* 2405 * These structures describe each of the block free requests aggregated 2406 * together to make up a trim request. 2407 */ 2408 struct trim_blkreq { 2409 TAILQ_ENTRY(trim_blkreq) blkreqlist; 2410 ufs2_daddr_t bno; 2411 long size; 2412 struct workhead *pdephd; 2413 struct workhead dephd; 2414 }; 2415 2416 /* 2417 * Description of a trim request. 
2418 */ 2419 struct ffs_blkfree_trim_params { 2420 TAILQ_HEAD(, trim_blkreq) blklist; 2421 LIST_ENTRY(ffs_blkfree_trim_params) hashlist; 2422 struct task task; 2423 struct ufsmount *ump; 2424 struct vnode *devvp; 2425 ino_t inum; 2426 ufs2_daddr_t bno; 2427 long size; 2428 long key; 2429 }; 2430 2431 static void ffs_blkfree_trim_completed(struct buf *); 2432 static void ffs_blkfree_trim_task(void *ctx, int pending __unused); 2433 static struct ffs_blkfree_trim_params *trim_lookup(struct ufsmount *, 2434 struct vnode *, ufs2_daddr_t, long, ino_t, uint64_t, int); 2435 static void ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *); 2436 2437 /* 2438 * Called on trim completion to start a task to free the associated block(s). 2439 */ 2440 static void 2441 ffs_blkfree_trim_completed(struct buf *bp) 2442 { 2443 struct ffs_blkfree_trim_params *tp; 2444 2445 tp = bp->b_fsprivate1; 2446 free(bp, M_TRIM); 2447 TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); 2448 taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); 2449 } 2450 2451 /* 2452 * Trim completion task that free associated block(s). 2453 */ 2454 static void 2455 ffs_blkfree_trim_task(void *ctx, int pending) 2456 { 2457 struct ffs_blkfree_trim_params *tp; 2458 struct trim_blkreq *blkelm; 2459 struct ufsmount *ump; 2460 2461 tp = ctx; 2462 ump = tp->ump; 2463 while ((blkelm = TAILQ_FIRST(&tp->blklist)) != NULL) { 2464 ffs_blkfree_cg(ump, ump->um_fs, tp->devvp, blkelm->bno, 2465 blkelm->size, tp->inum, blkelm->pdephd); 2466 TAILQ_REMOVE(&tp->blklist, blkelm, blkreqlist); 2467 free(blkelm, M_TRIM); 2468 } 2469 vn_finished_secondary_write(UFSTOVFS(ump)); 2470 UFS_LOCK(ump); 2471 ump->um_trim_inflight -= 1; 2472 ump->um_trim_inflight_blks -= numfrags(ump->um_fs, tp->size); 2473 UFS_UNLOCK(ump); 2474 free(tp, M_TRIM); 2475 } 2476 2477 /* 2478 * Lookup a trim request by inode number. 2479 * Allocate if requested (NEW, REPLACE, SINGLE). 
2480 */ 2481 static struct ffs_blkfree_trim_params * 2482 trim_lookup(struct ufsmount *ump, 2483 struct vnode *devvp, 2484 ufs2_daddr_t bno, 2485 long size, 2486 ino_t inum, 2487 uint64_t key, 2488 int alloctype) 2489 { 2490 struct trimlist_hashhead *tphashhead; 2491 struct ffs_blkfree_trim_params *tp, *ntp; 2492 2493 ntp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK); 2494 if (alloctype != SINGLE) { 2495 KASSERT(key >= FIRST_VALID_KEY, ("trim_lookup: invalid key")); 2496 UFS_LOCK(ump); 2497 tphashhead = TRIMLIST_HASH(ump, key); 2498 LIST_FOREACH(tp, tphashhead, hashlist) 2499 if (key == tp->key) 2500 break; 2501 } 2502 switch (alloctype) { 2503 case NEW: 2504 KASSERT(tp == NULL, ("trim_lookup: found trim")); 2505 break; 2506 case OLD: 2507 KASSERT(tp != NULL, 2508 ("trim_lookup: missing call to ffs_blkrelease_start()")); 2509 UFS_UNLOCK(ump); 2510 free(ntp, M_TRIM); 2511 return (tp); 2512 case REPLACE: 2513 KASSERT(tp != NULL, ("trim_lookup: missing REPLACE trim")); 2514 LIST_REMOVE(tp, hashlist); 2515 /* tp will be freed by caller */ 2516 break; 2517 case DONE: 2518 KASSERT(tp != NULL, ("trim_lookup: missing DONE trim")); 2519 LIST_REMOVE(tp, hashlist); 2520 UFS_UNLOCK(ump); 2521 free(ntp, M_TRIM); 2522 return (tp); 2523 } 2524 TAILQ_INIT(&ntp->blklist); 2525 ntp->ump = ump; 2526 ntp->devvp = devvp; 2527 ntp->bno = bno; 2528 ntp->size = size; 2529 ntp->inum = inum; 2530 ntp->key = key; 2531 if (alloctype != SINGLE) { 2532 LIST_INSERT_HEAD(tphashhead, ntp, hashlist); 2533 UFS_UNLOCK(ump); 2534 } 2535 return (ntp); 2536 } 2537 2538 /* 2539 * Dispatch a trim request. 2540 */ 2541 static void 2542 ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *tp) 2543 { 2544 struct ufsmount *ump; 2545 struct mount *mp; 2546 struct buf *bp; 2547 2548 /* 2549 * Postpone the set of the free bit in the cg bitmap until the 2550 * BIO_DELETE is completed. 
Otherwise, due to disk queue 2551 * reordering, TRIM might be issued after we reuse the block 2552 * and write some new data into it. 2553 */ 2554 ump = tp->ump; 2555 bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO); 2556 bp->b_iocmd = BIO_DELETE; 2557 bp->b_iooffset = dbtob(fsbtodb(ump->um_fs, tp->bno)); 2558 bp->b_iodone = ffs_blkfree_trim_completed; 2559 bp->b_bcount = tp->size; 2560 bp->b_fsprivate1 = tp; 2561 UFS_LOCK(ump); 2562 ump->um_trim_total += 1; 2563 ump->um_trim_inflight += 1; 2564 ump->um_trim_inflight_blks += numfrags(ump->um_fs, tp->size); 2565 ump->um_trim_total_blks += numfrags(ump->um_fs, tp->size); 2566 UFS_UNLOCK(ump); 2567 2568 mp = UFSTOVFS(ump); 2569 vn_start_secondary_write(NULL, &mp, 0); 2570 g_vfs_strategy(ump->um_bo, bp); 2571 } 2572 2573 /* 2574 * Allocate a new key to use to identify a range of blocks. 2575 */ 2576 uint64_t 2577 ffs_blkrelease_start(struct ufsmount *ump, 2578 struct vnode *devvp, 2579 ino_t inum) 2580 { 2581 static u_long masterkey; 2582 uint64_t key; 2583 2584 if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) 2585 return (SINGLETON_KEY); 2586 do { 2587 key = atomic_fetchadd_long(&masterkey, 1); 2588 } while (key < FIRST_VALID_KEY); 2589 (void) trim_lookup(ump, devvp, 0, 0, inum, key, NEW); 2590 return (key); 2591 } 2592 2593 /* 2594 * Deallocate a key that has been used to identify a range of blocks. 2595 */ 2596 void 2597 ffs_blkrelease_finish(struct ufsmount *ump, uint64_t key) 2598 { 2599 struct ffs_blkfree_trim_params *tp; 2600 2601 if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) 2602 return; 2603 /* 2604 * If the vfs.ffs.dotrimcons sysctl option is enabled while 2605 * a file deletion is active, specifically after a call 2606 * to ffs_blkrelease_start() but before the call to 2607 * ffs_blkrelease_finish(), ffs_blkrelease_start() will 2608 * have handed out SINGLETON_KEY rather than starting a 2609 * collection sequence. 
Thus if we get a SINGLETON_KEY 2610 * passed to ffs_blkrelease_finish(), we just return rather 2611 * than trying to finish the nonexistent sequence. 2612 */ 2613 if (key == SINGLETON_KEY) { 2614 #ifdef INVARIANTS 2615 printf("%s: vfs.ffs.dotrimcons enabled on active filesystem\n", 2616 ump->um_mountp->mnt_stat.f_mntonname); 2617 #endif 2618 return; 2619 } 2620 /* 2621 * We are done with sending blocks using this key. Look up the key 2622 * using the DONE alloctype (in tp) to request that it be unhashed 2623 * as we will not be adding to it. If the key has never been used, 2624 * tp->size will be zero, so we can just free tp. Otherwise the call 2625 * to ffs_blkfree_sendtrim(tp) causes the block range described by 2626 * tp to be issued (and then tp to be freed). 2627 */ 2628 tp = trim_lookup(ump, NULL, 0, 0, 0, key, DONE); 2629 if (tp->size == 0) 2630 free(tp, M_TRIM); 2631 else 2632 ffs_blkfree_sendtrim(tp); 2633 } 2634 2635 /* 2636 * Setup to free a block or fragment. 2637 * 2638 * Check for snapshots that might want to claim the block. 2639 * If trims are requested, prepare a trim request. Attempt to 2640 * aggregate consecutive blocks into a single trim request. 2641 */ 2642 void 2643 ffs_blkfree(struct ufsmount *ump, 2644 struct fs *fs, 2645 struct vnode *devvp, 2646 ufs2_daddr_t bno, 2647 long size, 2648 ino_t inum, 2649 __enum_uint8(vtype) vtype, 2650 struct workhead *dephd, 2651 uint64_t key) 2652 { 2653 struct ffs_blkfree_trim_params *tp, *ntp; 2654 struct trim_blkreq *blkelm; 2655 2656 /* 2657 * Check to see if a snapshot wants to claim the block. 2658 * Check that devvp is a normal disk device, not a snapshot, 2659 * it has a snapshot(s) associated with it, and one of the 2660 * snapshots wants to claim the block. 
2661 */ 2662 if (devvp->v_type == VCHR && 2663 (devvp->v_vflag & VV_COPYONWRITE) && 2664 ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { 2665 return; 2666 } 2667 /* 2668 * Nothing to delay if TRIM is not required for this block or TRIM 2669 * is disabled or the operation is performed on a snapshot. 2670 */ 2671 if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) || 2672 devvp->v_type == VREG) { 2673 ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); 2674 return; 2675 } 2676 blkelm = malloc(sizeof(struct trim_blkreq), M_TRIM, M_WAITOK); 2677 blkelm->bno = bno; 2678 blkelm->size = size; 2679 if (dephd == NULL) { 2680 blkelm->pdephd = NULL; 2681 } else { 2682 LIST_INIT(&blkelm->dephd); 2683 LIST_SWAP(dephd, &blkelm->dephd, worklist, wk_list); 2684 blkelm->pdephd = &blkelm->dephd; 2685 } 2686 if (key == SINGLETON_KEY) { 2687 /* 2688 * Just a single non-contiguous piece. Use the SINGLE 2689 * alloctype to return a trim request that will not be 2690 * hashed for future lookup. 2691 */ 2692 tp = trim_lookup(ump, devvp, bno, size, inum, key, SINGLE); 2693 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2694 ffs_blkfree_sendtrim(tp); 2695 return; 2696 } 2697 /* 2698 * The callers of this function are not tracking whether or not 2699 * the blocks are contiguous. They are just saying that they 2700 * are freeing a set of blocks. It is this code that determines 2701 * the pieces of that range that are actually contiguous. 2702 * 2703 * Calling ffs_blkrelease_start() will have created an entry 2704 * that we will use. 2705 */ 2706 tp = trim_lookup(ump, devvp, bno, size, inum, key, OLD); 2707 if (tp->size == 0) { 2708 /* 2709 * First block of a potential range, set block and size 2710 * for the trim block. 
2711 */ 2712 tp->bno = bno; 2713 tp->size = size; 2714 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2715 return; 2716 } 2717 /* 2718 * If this block is a continuation of the range (either 2719 * follows at the end or preceeds in the front) then we 2720 * add it to the front or back of the list and return. 2721 * 2722 * If it is not a continuation of the trim that we were 2723 * building, using the REPLACE alloctype, we request that 2724 * the old trim request (still in tp) be unhashed and a 2725 * new range started (in ntp). The ffs_blkfree_sendtrim(tp) 2726 * call causes the block range described by tp to be issued 2727 * (and then tp to be freed). 2728 */ 2729 if (bno + numfrags(fs, size) == tp->bno) { 2730 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2731 tp->bno = bno; 2732 tp->size += size; 2733 return; 2734 } else if (bno == tp->bno + numfrags(fs, tp->size)) { 2735 TAILQ_INSERT_TAIL(&tp->blklist, blkelm, blkreqlist); 2736 tp->size += size; 2737 return; 2738 } 2739 ntp = trim_lookup(ump, devvp, bno, size, inum, key, REPLACE); 2740 TAILQ_INSERT_HEAD(&ntp->blklist, blkelm, blkreqlist); 2741 ffs_blkfree_sendtrim(tp); 2742 } 2743 2744 #ifdef INVARIANTS 2745 /* 2746 * Verify allocation of a block or fragment. Returns true if block or 2747 * fragment is allocated, false if it is free. 
2748 */ 2749 static int 2750 ffs_checkblk(struct inode *ip, 2751 ufs2_daddr_t bno, 2752 long size) 2753 { 2754 struct fs *fs; 2755 struct cg *cgp; 2756 struct buf *bp; 2757 ufs1_daddr_t cgbno; 2758 int i, error, frags, free; 2759 uint8_t *blksfree; 2760 2761 fs = ITOFS(ip); 2762 if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) { 2763 printf("bsize = %ld, size = %ld, fs = %s\n", 2764 (long)fs->fs_bsize, size, fs->fs_fsmnt); 2765 panic("ffs_checkblk: bad size"); 2766 } 2767 if ((uint64_t)bno >= fs->fs_size) 2768 panic("ffs_checkblk: bad block %jd", (intmax_t)bno); 2769 error = ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), 0, &bp, &cgp); 2770 if (error) 2771 panic("ffs_checkblk: cylinder group read failed"); 2772 blksfree = cg_blksfree(cgp); 2773 cgbno = dtogd(fs, bno); 2774 if (size == fs->fs_bsize) { 2775 free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); 2776 } else { 2777 frags = numfrags(fs, size); 2778 for (free = 0, i = 0; i < frags; i++) 2779 if (isset(blksfree, cgbno + i)) 2780 free++; 2781 if (free != 0 && free != frags) 2782 panic("ffs_checkblk: partially free fragment"); 2783 } 2784 brelse(bp); 2785 return (!free); 2786 } 2787 #endif /* INVARIANTS */ 2788 2789 /* 2790 * Free an inode. 2791 */ 2792 int 2793 ffs_vfree(struct vnode *pvp, 2794 ino_t ino, 2795 int mode) 2796 { 2797 struct ufsmount *ump; 2798 2799 if (DOINGSOFTDEP(pvp)) { 2800 softdep_freefile(pvp, ino, mode); 2801 return (0); 2802 } 2803 ump = VFSTOUFS(pvp->v_mount); 2804 return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL)); 2805 } 2806 2807 /* 2808 * Do the actual free operation. 2809 * The specified inode is placed back in the free map. 
2810 */ 2811 int 2812 ffs_freefile(struct ufsmount *ump, 2813 struct fs *fs, 2814 struct vnode *devvp, 2815 ino_t ino, 2816 int mode, 2817 struct workhead *wkhd) 2818 { 2819 struct cg *cgp; 2820 struct buf *bp; 2821 daddr_t dbn; 2822 int error; 2823 uint64_t cg; 2824 uint8_t *inosused; 2825 struct cdev *dev; 2826 ino_t cgino; 2827 2828 cg = ino_to_cg(fs, ino); 2829 if (devvp->v_type == VREG) { 2830 /* devvp is a snapshot */ 2831 MPASS(devvp->v_mount->mnt_data == ump); 2832 dev = ump->um_devvp->v_rdev; 2833 } else if (devvp->v_type == VCHR) { 2834 /* devvp is a normal disk device */ 2835 dev = devvp->v_rdev; 2836 } else { 2837 bp = NULL; 2838 return (0); 2839 } 2840 if (ino >= fs->fs_ipg * fs->fs_ncg) 2841 panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s", 2842 devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); 2843 if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { 2844 if (!ffs_fsfail_cleanup(ump, error) || 2845 !MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) 2846 return (error); 2847 if (devvp->v_type == VREG) 2848 dbn = fragstoblks(fs, cgtod(fs, cg)); 2849 else 2850 dbn = fsbtodb(fs, cgtod(fs, cg)); 2851 error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); 2852 KASSERT(error == 0, ("getblkx failed")); 2853 softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd); 2854 bp->b_flags |= B_RELBUF | B_NOCACHE; 2855 bp->b_flags &= ~B_CACHE; 2856 bawrite(bp); 2857 return (error); 2858 } 2859 inosused = cg_inosused(cgp); 2860 cgino = ino % fs->fs_ipg; 2861 if (isclr(inosused, cgino)) { 2862 printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev), 2863 (uintmax_t)ino, fs->fs_fsmnt); 2864 if (fs->fs_ronly == 0) 2865 panic("ffs_freefile: freeing free inode"); 2866 } 2867 clrbit(inosused, cgino); 2868 if (cgino < cgp->cg_irotor) 2869 cgp->cg_irotor = cgino; 2870 cgp->cg_cs.cs_nifree++; 2871 UFS_LOCK(ump); 2872 fs->fs_cstotal.cs_nifree++; 2873 fs->fs_cs(fs, cg).cs_nifree++; 2874 if ((mode & IFMT) == IFDIR) { 2875 cgp->cg_cs.cs_ndir--; 
2876 fs->fs_cstotal.cs_ndir--; 2877 fs->fs_cs(fs, cg).cs_ndir--; 2878 } 2879 fs->fs_fmod = 1; 2880 ACTIVECLEAR(fs, cg); 2881 UFS_UNLOCK(ump); 2882 if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR) 2883 softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd); 2884 bdwrite(bp); 2885 return (0); 2886 } 2887 2888 /* 2889 * Check to see if a file is free. 2890 * Used to check for allocated files in snapshots. 2891 */ 2892 int 2893 ffs_checkfreefile(struct fs *fs, 2894 struct vnode *devvp, 2895 ino_t ino) 2896 { 2897 struct cg *cgp; 2898 struct buf *bp; 2899 int ret, error; 2900 uint64_t cg; 2901 uint8_t *inosused; 2902 2903 cg = ino_to_cg(fs, ino); 2904 if ((devvp->v_type != VREG) && (devvp->v_type != VCHR)) 2905 return (1); 2906 if (ino >= fs->fs_ipg * fs->fs_ncg) 2907 return (1); 2908 if ((error = ffs_getcg(fs, devvp, cg, 0, &bp, &cgp)) != 0) 2909 return (1); 2910 inosused = cg_inosused(cgp); 2911 ino %= fs->fs_ipg; 2912 ret = isclr(inosused, ino); 2913 brelse(bp); 2914 return (ret); 2915 } 2916 2917 /* 2918 * Find a block of the specified size in the specified cylinder group. 2919 * 2920 * It is a panic if a request is made to find a block if none are 2921 * available. 
2922 */ 2923 static ufs1_daddr_t 2924 ffs_mapsearch(struct fs *fs, 2925 struct cg *cgp, 2926 ufs2_daddr_t bpref, 2927 int allocsiz) 2928 { 2929 ufs1_daddr_t bno; 2930 int start, len, loc, i; 2931 int blk, field, subfield, pos; 2932 uint8_t *blksfree; 2933 2934 /* 2935 * find the fragment by searching through the free block 2936 * map for an appropriate bit pattern 2937 */ 2938 if (bpref) 2939 start = dtogd(fs, bpref) / NBBY; 2940 else 2941 start = cgp->cg_frotor / NBBY; 2942 blksfree = cg_blksfree(cgp); 2943 len = howmany(fs->fs_fpg, NBBY) - start; 2944 loc = scanc((uint64_t)len, (uint8_t *)&blksfree[start], 2945 fragtbl[fs->fs_frag], 2946 (uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 2947 if (loc == 0) { 2948 len = start + 1; 2949 start = 0; 2950 loc = scanc((uint64_t)len, (uint8_t *)&blksfree[0], 2951 fragtbl[fs->fs_frag], 2952 (uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 2953 if (loc == 0) { 2954 printf("start = %d, len = %d, fs = %s\n", 2955 start, len, fs->fs_fsmnt); 2956 panic("ffs_alloccg: map corrupted"); 2957 /* NOTREACHED */ 2958 } 2959 } 2960 bno = (start + len - loc) * NBBY; 2961 cgp->cg_frotor = bno; 2962 /* 2963 * found the byte in the map 2964 * sift through the bits to find the selected frag 2965 */ 2966 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { 2967 blk = blkmap(fs, blksfree, bno); 2968 blk <<= 1; 2969 field = around[allocsiz]; 2970 subfield = inside[allocsiz]; 2971 for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { 2972 if ((blk & field) == subfield) 2973 return (bno + pos); 2974 field <<= 1; 2975 subfield <<= 1; 2976 } 2977 } 2978 printf("bno = %ju, fs = %s\n", (intmax_t)bno, fs->fs_fsmnt); 2979 panic("ffs_alloccg: block not in map"); 2980 return (-1); 2981 } 2982 2983 static const struct statfs * 2984 ffs_getmntstat(struct vnode *devvp) 2985 { 2986 2987 if (devvp->v_type == VCHR) 2988 return (&devvp->v_rdev->si_mountpt->mnt_stat); 2989 return (ffs_getmntstat(VFSTOUFS(devvp->v_mount)->um_devvp)); 2990 } 2991 2992 
/* 2993 * Fetch and verify a cylinder group. 2994 */ 2995 int 2996 ffs_getcg(struct fs *fs, 2997 struct vnode *devvp, 2998 uint64_t cg, 2999 int flags, 3000 struct buf **bpp, 3001 struct cg **cgpp) 3002 { 3003 struct buf *bp; 3004 struct cg *cgp; 3005 const struct statfs *sfs; 3006 daddr_t blkno; 3007 int error; 3008 3009 *bpp = NULL; 3010 *cgpp = NULL; 3011 if ((fs->fs_metackhash & CK_CYLGRP) != 0) 3012 flags |= GB_CKHASH; 3013 if (devvp->v_type == VREG) 3014 blkno = fragstoblks(fs, cgtod(fs, cg)); 3015 else 3016 blkno = fsbtodb(fs, cgtod(fs, cg)); 3017 error = breadn_flags(devvp, blkno, blkno, (int)fs->fs_cgsize, NULL, 3018 NULL, 0, NOCRED, flags, ffs_ckhash_cg, &bp); 3019 if (error != 0) 3020 return (error); 3021 cgp = (struct cg *)bp->b_data; 3022 if ((fs->fs_metackhash & CK_CYLGRP) != 0 && 3023 (bp->b_flags & B_CKHASH) != 0 && 3024 cgp->cg_ckhash != bp->b_ckhash) { 3025 sfs = ffs_getmntstat(devvp); 3026 printf("UFS %s%s (%s) cylinder checksum failed: cg %ju, cgp: " 3027 "0x%x != bp: 0x%jx\n", 3028 devvp->v_type == VCHR ? "" : "snapshot of ", 3029 sfs->f_mntfromname, sfs->f_mntonname, 3030 (intmax_t)cg, cgp->cg_ckhash, (uintmax_t)bp->b_ckhash); 3031 bp->b_flags &= ~B_CKHASH; 3032 bp->b_flags |= B_INVAL | B_NOCACHE; 3033 brelse(bp); 3034 return (EIO); 3035 } 3036 if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) { 3037 sfs = ffs_getmntstat(devvp); 3038 printf("UFS %s%s (%s)", 3039 devvp->v_type == VCHR ? 
"" : "snapshot of ", 3040 sfs->f_mntfromname, sfs->f_mntonname); 3041 if (!cg_chkmagic(cgp)) 3042 printf(" cg %ju: bad magic number 0x%x should be " 3043 "0x%x\n", (intmax_t)cg, cgp->cg_magic, CG_MAGIC); 3044 else 3045 printf(": wrong cylinder group cg %ju != cgx %u\n", 3046 (intmax_t)cg, cgp->cg_cgx); 3047 bp->b_flags &= ~B_CKHASH; 3048 bp->b_flags |= B_INVAL | B_NOCACHE; 3049 brelse(bp); 3050 return (EIO); 3051 } 3052 bp->b_flags &= ~B_CKHASH; 3053 bp->b_xflags |= BX_BKGRDWRITE; 3054 /* 3055 * If we are using check hashes on the cylinder group then we want 3056 * to limit changing the cylinder group time to when we are actually 3057 * going to write it to disk so that its check hash remains correct 3058 * in memory. If the CK_CYLGRP flag is set the time is updated in 3059 * ffs_bufwrite() as the buffer is queued for writing. Otherwise we 3060 * update the time here as we have done historically. 3061 */ 3062 if ((fs->fs_metackhash & CK_CYLGRP) != 0) 3063 bp->b_xflags |= BX_CYLGRP; 3064 else 3065 cgp->cg_old_time = cgp->cg_time = time_second; 3066 *bpp = bp; 3067 *cgpp = cgp; 3068 return (0); 3069 } 3070 3071 static void 3072 ffs_ckhash_cg(struct buf *bp) 3073 { 3074 uint32_t ckhash; 3075 struct cg *cgp; 3076 3077 cgp = (struct cg *)bp->b_data; 3078 ckhash = cgp->cg_ckhash; 3079 cgp->cg_ckhash = 0; 3080 bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); 3081 cgp->cg_ckhash = ckhash; 3082 } 3083 3084 /* 3085 * Fserr prints the name of a filesystem with an error diagnostic. 
3086 * 3087 * The form of the error message is: 3088 * fs: error message 3089 */ 3090 void 3091 ffs_fserr(struct fs *fs, 3092 ino_t inum, 3093 char *cp) 3094 { 3095 struct thread *td = curthread; /* XXX */ 3096 struct proc *p = td->td_proc; 3097 3098 log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n", 3099 p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum, 3100 fs->fs_fsmnt, cp); 3101 } 3102 3103 /* 3104 * This function provides the capability for the fsck program to 3105 * update an active filesystem. Sixteen operations are provided: 3106 * 3107 * adjrefcnt(inode, amt) - adjusts the reference count on the 3108 * specified inode by the specified amount. Under normal 3109 * operation the count should always go down. Decrementing 3110 * the count to zero will cause the inode to be freed. 3111 * adjblkcnt(inode, amt) - adjust the number of blocks used by the 3112 * inode by the specified amount. 3113 * adjdepth(inode, amt) - adjust the depth of the specified directory 3114 * inode by the specified amount. 3115 * setsize(inode, size) - set the size of the inode to the 3116 * specified size. 3117 * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - 3118 * adjust the superblock summary. 3119 * freedirs(inode, count) - directory inodes [inode..inode + count - 1] 3120 * are marked as free. Inodes should never have to be marked 3121 * as in use. 3122 * freefiles(inode, count) - file inodes [inode..inode + count - 1] 3123 * are marked as free. Inodes should never have to be marked 3124 * as in use. 3125 * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] 3126 * are marked as free. Blocks should never have to be marked 3127 * as in use. 3128 * setflags(flags, set/clear) - the fs_flags field has the specified 3129 * flags set (second parameter +1) or cleared (second parameter -1). 3130 * setcwd(dirinode) - set the current directory to dirinode in the 3131 * filesystem associated with the snapshot. 
3132 * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".." 3133 * in the current directory is oldvalue then change it to newvalue. 3134 * unlink(nameptr, oldvalue) - Verify that the inode number associated 3135 * with nameptr in the current directory is oldvalue then unlink it. 3136 */ 3137 3138 static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); 3139 3140 SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, 3141 CTLFLAG_WR | CTLTYPE_STRUCT | CTLFLAG_NEEDGIANT, 3142 0, 0, sysctl_ffs_fsck, "S,fsck", 3143 "Adjust Inode Reference Count"); 3144 3145 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, 3146 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3147 "Adjust Inode Used Blocks Count"); 3148 3149 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_DEPTH, adjdepth, 3150 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3151 "Adjust Directory Inode Depth"); 3152 3153 static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize, 3154 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3155 "Set the inode size"); 3156 3157 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, 3158 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3159 "Adjust number of directories"); 3160 3161 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, 3162 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3163 "Adjust number of free blocks"); 3164 3165 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, 3166 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3167 "Adjust number of free inodes"); 3168 3169 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, 3170 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3171 "Adjust number of free frags"); 3172 3173 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, 3174 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3175 "Adjust number of free clusters"); 3176 3177 static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, 3178 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3179 "Free Range of Directory Inodes"); 3180 3181 static 
SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Free Range of File Inodes");

static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Free Range of Blocks");

static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Change Filesystem Flags");

static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Set Current Working Directory");

static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Change Value of .. Entry");

static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Unlink a Duplicate Name");

#ifdef DIAGNOSTIC
/* When non-zero, each fsck request is described with printf() as it runs. */
static int fsckcmds = 0;
SYSCTL_INT(_debug, OID_AUTO, ffs_fsckcmds, CTLFLAG_RW, &fsckcmds, 0,
	"print out fsck_ffs-based filesystem update commands");
#endif /* DIAGNOSTIC */

/*
 * Common handler for the vfs.ffs fsck update nodes.
 *
 * A struct fsck_cmd is copied in from the request and its version is
 * checked.  cmd.handle is resolved to a file whose vnode must be a
 * regular file or a directory on a writable "ufs" mount.  The operation
 * is then selected by oidp->oid_number, with cmd.value and cmd.size
 * carrying its arguments.
 *
 * Returns 0 on success, otherwise an errno value:
 *	EBADRPC		no new value was supplied, or it is larger than
 *			the command structure
 *	ERPCMISMATCH	cmd.version is not FFS_CMD_VERSION
 *	EINVAL		unsuitable vnode type, no mount or a non-"ufs"
 *			mount, or an unknown request number
 *	EROFS		the mount is read-only
 * or whatever error the copy-in, the handle lookup or the selected
 * operation produced.
 */
static int
sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
{
	struct thread *td = curthread;
	struct fsck_cmd cmd;
	struct ufsmount *ump;
	struct vnode *vp, *dvp, *fdvp;
	struct inode *ip, *dp;
	struct mount *mp;
	struct fs *fs;
	struct pwd *pwd;
	ufs2_daddr_t blkno;
	long blkcnt, blksize;
	uint64_t key;
	struct file *fp;
	cap_rights_t rights;
	int filetype, error;

	/* Validate and fetch the command before touching any vnode. */
	if (req->newptr == NULL || req->newlen > sizeof(cmd))
		return (EBADRPC);
	if ((error = SYSCTL_IN(req, &cmd, sizeof(cmd))) != 0)
		return (error);
	if (cmd.version != FFS_CMD_VERSION)
		return (ERPCMISMATCH);
	/* The descriptor in cmd.handle must carry the CAP_FSCK right. */
	if ((error = getvnode(td, cmd.handle,
	    cap_rights_init_one(&rights, CAP_FSCK), &fp)) != 0)
		return (error);
	vp = fp->f_vnode;
	if (vp->v_type != VREG && vp->v_type != VDIR) {
		fdrop(fp, td);
		return (EINVAL);
	}
	/*
	 * From here on every exit has to pair this call with
	 * vn_finished_write() and drop fp.
	 */
	vn_start_write(vp, &mp, V_WAIT);
	if (mp == NULL ||
	    strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) {
		vn_finished_write(mp);
		fdrop(fp, td);
		return (EINVAL);
	}
	ump = VFSTOUFS(mp);
	if (mp->mnt_flag & MNT_RDONLY) {
		vn_finished_write(mp);
		fdrop(fp, td);
		return (EROFS);
	}
	fs = ump->um_fs;
	/* Default for FFS_FILE_FREE; FFS_DIR_FREE overrides it below. */
	filetype = IFREG;

	switch (oidp->oid_number) {
	/* cmd.value holds the flag bits; cmd.size > 0 sets, else clears. */
	case FFS_SET_FLAGS:
#ifdef DIAGNOSTIC
		if (fsckcmds)
			printf("%s: %s flags\n", mp->mnt_stat.f_mntonname,
			    cmd.size > 0 ? "set" : "clear");
#endif /* DIAGNOSTIC */
		if (cmd.size > 0)
			fs->fs_flags |= (long)cmd.value;
		else
			fs->fs_flags &= ~(long)cmd.value;
		break;

	/* cmd.value is the inode number, cmd.size the link count delta. */
	case FFS_ADJ_REFCNT:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust inode %jd link count by %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		/* vp is reused here for the vnode of the target inode. */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
			break;
		ip = VTOI(vp);
		/* Adjust the in-core, on-disk and effective link counts. */
		ip->i_nlink += cmd.size;
		DIP_SET(ip, i_nlink, ip->i_nlink);
		ip->i_effnlink += cmd.size;
		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED);
		error = ffs_update(vp, 1);
		if (DOINGSOFTDEP(vp))
			softdep_change_linkcnt(ip);
		vput(vp);
		break;

	/* cmd.value is the inode number, cmd.size the block count delta. */
	case FFS_ADJ_BLKCNT:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust inode %jd block count by %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
			break;
		ip = VTOI(vp);
		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size);
		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED);
		error = ffs_update(vp, 1);
		vput(vp);
		break;

	/* cmd.value is a directory inode, cmd.size the depth delta. */
	case FFS_ADJ_DEPTH:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust directory inode %jd depth by %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
			break;
		/* Only directories have a depth to adjust. */
		if (vp->v_type != VDIR) {
			vput(vp);
			error = ENOTDIR;
			break;
		}
		ip = VTOI(vp);
		DIP_SET(ip, i_dirdepth, DIP(ip, i_dirdepth) + cmd.size);
		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED);
		error = ffs_update(vp, 1);
		vput(vp);
		break;

	/* cmd.value is the inode number, cmd.size the new size. */
	case FFS_SET_SIZE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: set inode %jd size to %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
			break;
		ip = VTOI(vp);
		DIP_SET(ip, i_size, cmd.size);
		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_MODIFIED);
		error = ffs_update(vp, 1);
		vput(vp);
		break;

	/* Same as FFS_FILE_FREE, but the inodes are freed as directories. */
	case FFS_DIR_FREE:
		filetype = IFDIR;
		/* fall through */

	/* Free cmd.size consecutive inodes starting at cmd.value. */
	case FFS_FILE_FREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			if (cmd.size == 1)
				printf("%s: free %s inode %ju\n",
				    mp->mnt_stat.f_mntonname,
				    filetype == IFDIR ? "directory" : "file",
				    (uintmax_t)cmd.value);
			else
				printf("%s: free %s inodes %ju-%ju\n",
				    mp->mnt_stat.f_mntonname,
				    filetype == IFDIR ? "directory" : "file",
				    (uintmax_t)cmd.value,
				    (uintmax_t)(cmd.value + cmd.size - 1));
		}
#endif /* DIAGNOSTIC */
		/* Stops at the first failure; inodes already freed stay so. */
		while (cmd.size > 0) {
			if ((error = ffs_freefile(ump, fs, ump->um_devvp,
			    cmd.value, filetype, NULL)))
				break;
			cmd.size -= 1;
			cmd.value += 1;
		}
		break;

	/* Free cmd.size fragments starting at fragment cmd.value. */
	case FFS_BLK_FREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			if (cmd.size == 1)
				printf("%s: free block %jd\n",
				    mp->mnt_stat.f_mntonname,
				    (intmax_t)cmd.value);
			else
				printf("%s: free blocks %jd-%jd\n",
				    mp->mnt_stat.f_mntonname,
				    (intmax_t)cmd.value,
				    (intmax_t)cmd.value + cmd.size - 1);
		}
#endif /* DIAGNOSTIC */
		blkno = cmd.value;
		blkcnt = cmd.size;
		/*
		 * The first chunk only runs up to the next multiple of
		 * fs_frag; every later chunk is fs_frag fragments long,
		 * and the final one is trimmed to what remains.
		 */
		blksize = fs->fs_frag - (blkno % fs->fs_frag);
		key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO);
		while (blkcnt > 0) {
			if (blkcnt < blksize)
				blksize = blkcnt;
			ffs_blkfree(ump, fs, ump->um_devvp, blkno,
			    blksize * fs->fs_fsize, UFS_ROOTINO,
			    VDIR, NULL, key);
			blkno += blksize;
			blkcnt -= blksize;
			blksize = fs->fs_frag;
		}
		ffs_blkrelease_finish(ump, key);
		break;

	/*
	 * Adjust superblock summaries.  fsck(8) is expected to
	 * submit deltas when necessary.
	 */
	case FFS_ADJ_NDIR:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of directories by %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_ndir += cmd.value;
		break;

	case FFS_ADJ_NBFREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of free blocks by %+jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_nbfree += cmd.value;
		break;

	case FFS_ADJ_NIFREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of free inodes by %+jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_nifree += cmd.value;
		break;

	case FFS_ADJ_NFFREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of free frags by %+jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_nffree += cmd.value;
		break;

	case FFS_ADJ_NUMCLUSTERS:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of free clusters by %+jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_numclusters += cmd.value;
		break;

	/* cmd.value is the inode that becomes the caller's directory. */
	case FFS_SET_CWD:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: set current directory to inode %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp)))
			break;
		AUDIT_ARG_VNODE1(vp);
		if ((error = change_dir(vp, td)) != 0) {
			vput(vp);
			break;
		}
		/* Unlock only: vp is handed to pwd_chdir() without a vput. */
		VOP_UNLOCK(vp);
		pwd_chdir(td, vp);
		break;

	/* cmd.value is the current ".." inode, cmd.size the new one. */
	case FFS_SET_DOTDOT:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: change .. in cwd from %jd to %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		/*
		 * First we have to get and lock the parent directory
		 * to which ".." points.
		 */
		error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp);
		if (error)
			break;
		/*
		 * Now we get and lock the child directory containing "..".
		 */
		pwd = pwd_hold(td);
		dvp = pwd->pwd_cdir;
		if ((error = vget(dvp, LK_EXCLUSIVE)) != 0) {
			vput(fdvp);
			pwd_drop(pwd);
			break;
		}
		dp = VTOI(dvp);
		SET_I_OFFSET(dp, 12);	/* XXX mastertemplate.dot_reclen */
		error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size,
		    DT_DIR, 0);
		/* Purge both name cache entries whatever the outcome. */
		cache_purge(fdvp);
		cache_purge(dvp);
		vput(dvp);
		vput(fdvp);
		pwd_drop(pwd);
		break;

	/* cmd.value is a user-space name pointer, cmd.size the inode. */
	case FFS_UNLINK:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			char buf[32];

			/* Copy the name in just for the diagnostic line. */
			if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL))
				strncpy(buf, "Name_too_long", 32);
			printf("%s: unlink %s (inode %jd)\n",
			    mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		/*
		 * kern_funlinkat will do its own start/finish writes and
		 * they do not nest, so drop ours here. Setting mp == NULL
		 * indicates that vn_finished_write is not needed down below.
		 */
		vn_finished_write(mp);
		mp = NULL;
		error = kern_funlinkat(td, AT_FDCWD,
		    (char *)(intptr_t)cmd.value, FD_NONE, UIO_USERSPACE,
		    0, (ino_t)cmd.size);
		break;

	default:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("Invalid request %d from fsck\n",
			    oidp->oid_number);
		}
#endif /* DIAGNOSTIC */
		error = EINVAL;
		break;
	}
	/* Common exit; mp is NULL here if FFS_UNLINK already finished. */
	fdrop(fp, td);
	vn_finished_write(mp);
	return (error);
}