1 /*- 2 * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause) 3 * 4 * Copyright (c) 2002 Networks Associates Technology, Inc. 5 * All rights reserved. 6 * 7 * This software was developed for the FreeBSD Project by Marshall 8 * Kirk McKusick and Network Associates Laboratories, the Security 9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 11 * research program 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1982, 1986, 1989, 1993 35 * The Regents of the University of California. All rights reserved. 
36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. Neither the name of the University nor the names of its contributors 46 * may be used to endorse or promote products derived from this software 47 * without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * SUCH DAMAGE. 
60 * 61 * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 62 */ 63 64 #include <sys/cdefs.h> 65 #include "opt_quota.h" 66 67 #include <sys/param.h> 68 #include <sys/systm.h> 69 #include <sys/bio.h> 70 #include <sys/buf.h> 71 #include <sys/capsicum.h> 72 #include <sys/conf.h> 73 #include <sys/fcntl.h> 74 #include <sys/file.h> 75 #include <sys/filedesc.h> 76 #include <sys/gsb_crc32.h> 77 #include <sys/kernel.h> 78 #include <sys/mount.h> 79 #include <sys/priv.h> 80 #include <sys/proc.h> 81 #include <sys/stat.h> 82 #include <sys/syscallsubr.h> 83 #include <sys/sysctl.h> 84 #include <sys/syslog.h> 85 #include <sys/taskqueue.h> 86 #include <sys/vnode.h> 87 88 #include <security/audit/audit.h> 89 90 #include <geom/geom.h> 91 #include <geom/geom_vfs.h> 92 93 #include <ufs/ufs/dir.h> 94 #include <ufs/ufs/extattr.h> 95 #include <ufs/ufs/quota.h> 96 #include <ufs/ufs/inode.h> 97 #include <ufs/ufs/ufs_extern.h> 98 #include <ufs/ufs/ufsmount.h> 99 100 #include <ufs/ffs/fs.h> 101 #include <ufs/ffs/ffs_extern.h> 102 #include <ufs/ffs/softdep.h> 103 104 typedef ufs2_daddr_t allocfcn_t(struct inode *ip, uint64_t cg, 105 ufs2_daddr_t bpref, int size, int rsize); 106 107 static ufs2_daddr_t ffs_alloccg(struct inode *, uint64_t, ufs2_daddr_t, int, 108 int); 109 static ufs2_daddr_t 110 ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); 111 static void ffs_blkfree_cg(struct ufsmount *, struct fs *, 112 struct vnode *, ufs2_daddr_t, long, ino_t, 113 struct workhead *); 114 #ifdef INVARIANTS 115 static int ffs_checkfreeblk(struct inode *, ufs2_daddr_t, long); 116 #endif 117 static void ffs_checkcgintegrity(struct fs *, uint64_t, int); 118 static ufs2_daddr_t ffs_clusteralloc(struct inode *, uint64_t, ufs2_daddr_t, 119 int); 120 static ino_t ffs_dirpref(struct inode *); 121 static ufs2_daddr_t ffs_fragextend(struct inode *, uint64_t, ufs2_daddr_t, 122 int, int); 123 static ufs2_daddr_t ffs_hashalloc(struct inode *, uint64_t, ufs2_daddr_t, 124 int, int, allocfcn_t *); 125 static 
ufs2_daddr_t ffs_nodealloccg(struct inode *, uint64_t, ufs2_daddr_t, int, 126 int); 127 static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); 128 static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); 129 static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); 130 static void ffs_ckhash_cg(struct buf *); 131 132 /* 133 * Allocate a block in the filesystem. 134 * 135 * The size of the requested block is given, which must be some 136 * multiple of fs_fsize and <= fs_bsize. 137 * A preference may be optionally specified. If a preference is given 138 * the following hierarchy is used to allocate a block: 139 * 1) allocate the requested block. 140 * 2) allocate a rotationally optimal block in the same cylinder. 141 * 3) allocate a block in the same cylinder group. 142 * 4) quadratically rehash into other cylinder groups, until an 143 * available block is located. 144 * If no block preference is given the following hierarchy is used 145 * to allocate a block: 146 * 1) allocate a block in the cylinder group that contains the 147 * inode for the file. 148 * 2) quadratically rehash into other cylinder groups, until an 149 * available block is located. 
150 */ 151 int 152 ffs_alloc(struct inode *ip, 153 ufs2_daddr_t lbn, 154 ufs2_daddr_t bpref, 155 int size, 156 int flags, 157 struct ucred *cred, 158 ufs2_daddr_t *bnp) 159 { 160 struct fs *fs; 161 struct ufsmount *ump; 162 ufs2_daddr_t bno; 163 uint64_t cg, reclaimed; 164 int64_t delta; 165 #ifdef QUOTA 166 int error; 167 #endif 168 169 *bnp = 0; 170 ump = ITOUMP(ip); 171 fs = ump->um_fs; 172 mtx_assert(UFS_MTX(ump), MA_OWNED); 173 #ifdef INVARIANTS 174 if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) { 175 printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", 176 devtoname(ump->um_dev), (long)fs->fs_bsize, size, 177 fs->fs_fsmnt); 178 panic("ffs_alloc: bad size"); 179 } 180 if (cred == NOCRED) 181 panic("ffs_alloc: missing credential"); 182 #endif /* INVARIANTS */ 183 reclaimed = 0; 184 retry: 185 #ifdef QUOTA 186 UFS_UNLOCK(ump); 187 error = chkdq(ip, btodb(size), cred, 0); 188 if (error) 189 return (error); 190 UFS_LOCK(ump); 191 #endif 192 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 193 goto nospace; 194 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && 195 freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) 196 goto nospace; 197 if (bpref >= fs->fs_size) 198 bpref = 0; 199 if (bpref == 0) 200 cg = ino_to_cg(fs, ip->i_number); 201 else 202 cg = dtog(fs, bpref); 203 bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); 204 if (bno > 0) { 205 delta = btodb(size); 206 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 207 if (flags & IO_EXT) 208 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 209 else 210 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 211 *bnp = bno; 212 return (0); 213 } 214 nospace: 215 #ifdef QUOTA 216 UFS_UNLOCK(ump); 217 /* 218 * Restore user's disk quota because allocation failed. 
219 */ 220 (void) chkdq(ip, -btodb(size), cred, FORCE); 221 UFS_LOCK(ump); 222 #endif 223 if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 224 reclaimed = 1; 225 softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT); 226 goto retry; 227 } 228 if (ffs_fsfail_cleanup_locked(ump, 0)) { 229 UFS_UNLOCK(ump); 230 return (ENXIO); 231 } 232 if (reclaimed > 0 && 233 ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 234 UFS_UNLOCK(ump); 235 ffs_fserr(fs, ip->i_number, "filesystem full"); 236 uprintf("\n%s: write failed, filesystem is full\n", 237 fs->fs_fsmnt); 238 } else { 239 UFS_UNLOCK(ump); 240 } 241 return (ENOSPC); 242 } 243 244 /* 245 * Reallocate a fragment to a bigger size 246 * 247 * The number and size of the old block is given, and a preference 248 * and new size is also specified. The allocator attempts to extend 249 * the original block. Failing that, the regular block allocator is 250 * invoked to get an appropriate block. 251 */ 252 int 253 ffs_realloccg(struct inode *ip, 254 ufs2_daddr_t lbprev, 255 ufs2_daddr_t bprev, 256 ufs2_daddr_t bpref, 257 int osize, 258 int nsize, 259 int flags, 260 struct ucred *cred, 261 struct buf **bpp) 262 { 263 struct vnode *vp; 264 struct fs *fs; 265 struct buf *bp; 266 struct ufsmount *ump; 267 uint64_t cg, request, reclaimed; 268 int error, gbflags; 269 ufs2_daddr_t bno; 270 int64_t delta; 271 272 vp = ITOV(ip); 273 ump = ITOUMP(ip); 274 fs = ump->um_fs; 275 bp = NULL; 276 gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; 277 #ifdef WITNESS 278 gbflags |= IS_SNAPSHOT(ip) ? 
GB_NOWITNESS : 0; 279 #endif 280 281 mtx_assert(UFS_MTX(ump), MA_OWNED); 282 #ifdef INVARIANTS 283 if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 284 panic("ffs_realloccg: allocation on suspended filesystem"); 285 if ((uint64_t)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || 286 (uint64_t)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { 287 printf( 288 "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", 289 devtoname(ump->um_dev), (long)fs->fs_bsize, osize, 290 nsize, fs->fs_fsmnt); 291 panic("ffs_realloccg: bad size"); 292 } 293 if (cred == NOCRED) 294 panic("ffs_realloccg: missing credential"); 295 #endif /* INVARIANTS */ 296 reclaimed = 0; 297 retry: 298 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && 299 freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) { 300 goto nospace; 301 } 302 if (bprev == 0) { 303 printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", 304 devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev, 305 fs->fs_fsmnt); 306 panic("ffs_realloccg: bad bprev"); 307 } 308 UFS_UNLOCK(ump); 309 /* 310 * Allocate the extra space in the buffer. 311 */ 312 error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); 313 if (error) { 314 return (error); 315 } 316 317 if (bp->b_blkno == bp->b_lblkno) { 318 if (lbprev >= UFS_NDADDR) 319 panic("ffs_realloccg: lbprev out of range"); 320 bp->b_blkno = fsbtodb(fs, bprev); 321 } 322 323 #ifdef QUOTA 324 error = chkdq(ip, btodb(nsize - osize), cred, 0); 325 if (error) { 326 brelse(bp); 327 return (error); 328 } 329 #endif 330 /* 331 * Check for extension in the existing location. 
332 */ 333 *bpp = NULL; 334 cg = dtog(fs, bprev); 335 UFS_LOCK(ump); 336 bno = ffs_fragextend(ip, cg, bprev, osize, nsize); 337 if (bno) { 338 if (bp->b_blkno != fsbtodb(fs, bno)) 339 panic("ffs_realloccg: bad blockno"); 340 delta = btodb(nsize - osize); 341 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 342 if (flags & IO_EXT) 343 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 344 else 345 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 346 allocbuf(bp, nsize); 347 bp->b_flags |= B_DONE; 348 vfs_bio_bzero_buf(bp, osize, nsize - osize); 349 if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 350 vfs_bio_set_valid(bp, osize, nsize - osize); 351 *bpp = bp; 352 return (0); 353 } 354 /* 355 * Allocate a new disk location. 356 */ 357 if (bpref >= fs->fs_size) 358 bpref = 0; 359 switch ((int)fs->fs_optim) { 360 case FS_OPTSPACE: 361 /* 362 * Allocate an exact sized fragment. Although this makes 363 * best use of space, we will waste time relocating it if 364 * the file continues to grow. If the fragmentation is 365 * less than half of the minimum free reserve, we choose 366 * to begin optimizing for time. 367 */ 368 request = nsize; 369 if (fs->fs_minfree <= 5 || 370 fs->fs_cstotal.cs_nffree > 371 (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) 372 break; 373 log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", 374 fs->fs_fsmnt); 375 fs->fs_optim = FS_OPTTIME; 376 break; 377 case FS_OPTTIME: 378 /* 379 * At this point we have discovered a file that is trying to 380 * grow a small fragment to a larger fragment. To save time, 381 * we allocate a full sized block, then free the unused portion. 382 * If the file continues to grow, the `ffs_fragextend' call 383 * above will be able to grow it in place without further 384 * copying. If aberrant programs cause disk fragmentation to 385 * grow within 2% of the free reserve, we choose to begin 386 * optimizing for space. 
387 */ 388 request = fs->fs_bsize; 389 if (fs->fs_cstotal.cs_nffree < 390 (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) 391 break; 392 log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", 393 fs->fs_fsmnt); 394 fs->fs_optim = FS_OPTSPACE; 395 break; 396 default: 397 printf("dev = %s, optim = %ld, fs = %s\n", 398 devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt); 399 panic("ffs_realloccg: bad optim"); 400 /* NOTREACHED */ 401 } 402 bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); 403 if (bno > 0) { 404 bp->b_blkno = fsbtodb(fs, bno); 405 if (!DOINGSOFTDEP(vp)) 406 /* 407 * The usual case is that a smaller fragment that 408 * was just allocated has been replaced with a bigger 409 * fragment or a full-size block. If it is marked as 410 * B_DELWRI, the current contents have not been written 411 * to disk. It is possible that the block was written 412 * earlier, but very uncommon. If the block has never 413 * been written, there is no need to send a BIO_DELETE 414 * for it when it is freed. The gain from avoiding the 415 * TRIMs for the common case of unwritten blocks far 416 * exceeds the cost of the write amplification for the 417 * uncommon case of failing to send a TRIM for a block 418 * that had been written. 419 */ 420 ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, 421 ip->i_number, vp->v_type, NULL, 422 (bp->b_flags & B_DELWRI) != 0 ? 423 NOTRIM_KEY : SINGLETON_KEY); 424 delta = btodb(nsize - osize); 425 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 426 if (flags & IO_EXT) 427 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 428 else 429 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 430 allocbuf(bp, nsize); 431 bp->b_flags |= B_DONE; 432 vfs_bio_bzero_buf(bp, osize, nsize - osize); 433 if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 434 vfs_bio_set_valid(bp, osize, nsize - osize); 435 *bpp = bp; 436 return (0); 437 } 438 #ifdef QUOTA 439 UFS_UNLOCK(ump); 440 /* 441 * Restore user's disk quota because allocation failed. 
442 */ 443 (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); 444 UFS_LOCK(ump); 445 #endif 446 nospace: 447 /* 448 * no space available 449 */ 450 if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 451 reclaimed = 1; 452 UFS_UNLOCK(ump); 453 if (bp) { 454 brelse(bp); 455 bp = NULL; 456 } 457 UFS_LOCK(ump); 458 softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); 459 goto retry; 460 } 461 if (bp) 462 brelse(bp); 463 if (ffs_fsfail_cleanup_locked(ump, 0)) { 464 UFS_UNLOCK(ump); 465 return (ENXIO); 466 } 467 if (reclaimed > 0 && 468 ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 469 UFS_UNLOCK(ump); 470 ffs_fserr(fs, ip->i_number, "filesystem full"); 471 uprintf("\n%s: write failed, filesystem is full\n", 472 fs->fs_fsmnt); 473 } else { 474 UFS_UNLOCK(ump); 475 } 476 return (ENOSPC); 477 } 478 479 /* 480 * Reallocate a sequence of blocks into a contiguous sequence of blocks. 481 * 482 * The vnode and an array of buffer pointers for a range of sequential 483 * logical blocks to be made contiguous is given. The allocator attempts 484 * to find a range of sequential blocks starting as close as possible 485 * from the end of the allocation for the logical block immediately 486 * preceding the current range. If successful, the physical block numbers 487 * in the buffer pointers and in the inode are changed to reflect the new 488 * allocation. If unsuccessful, the allocation is left unchanged. The 489 * success in doing the reallocation is returned. Note that the error 490 * return is not reflected back to the user. Rather the previous block 491 * allocation will be used. 
492 */ 493 494 SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 495 "FFS filesystem"); 496 497 static int doasyncfree = 1; 498 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, 499 "do not force synchronous writes when blocks are reallocated"); 500 501 static int doreallocblks = 1; 502 SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, 503 "enable block reallocation"); 504 505 static int dotrimcons = 1; 506 SYSCTL_INT(_vfs_ffs, OID_AUTO, dotrimcons, CTLFLAG_RWTUN, &dotrimcons, 0, 507 "enable BIO_DELETE / TRIM consolidation"); 508 509 static int maxclustersearch = 10; 510 SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch, 511 0, "max number of cylinder group to search for contigous blocks"); 512 513 #ifdef DIAGNOSTIC 514 static int prtrealloc = 0; 515 SYSCTL_INT(_debug, OID_AUTO, ffs_prtrealloc, CTLFLAG_RW, &prtrealloc, 0, 516 "print out FFS filesystem block reallocation operations"); 517 #endif 518 519 int 520 ffs_reallocblks( 521 struct vop_reallocblks_args /* { 522 struct vnode *a_vp; 523 struct cluster_save *a_buflist; 524 } */ *ap) 525 { 526 struct ufsmount *ump; 527 int error; 528 529 /* 530 * We used to skip reallocating the blocks of a file into a 531 * contiguous sequence if the underlying flash device requested 532 * BIO_DELETE notifications, because devices that benefit from 533 * BIO_DELETE also benefit from not moving the data. However, 534 * the destination for the data is usually moved before the data 535 * is written to the initially allocated location, so we rarely 536 * suffer the penalty of extra writes. With the addition of the 537 * consolidation of contiguous blocks into single BIO_DELETE 538 * operations, having fewer but larger contiguous blocks reduces 539 * the number of (slow and expensive) BIO_DELETE operations. So 540 * when doing BIO_DELETE consolidation, we do block reallocation. 541 * 542 * Skip if reallocblks has been disabled globally. 
543 */ 544 ump = ap->a_vp->v_mount->mnt_data; 545 if ((((ump->um_flags) & UM_CANDELETE) != 0 && dotrimcons == 0) || 546 doreallocblks == 0) 547 return (ENOSPC); 548 549 /* 550 * We can't wait in softdep prealloc as it may fsync and recurse 551 * here. Instead we simply fail to reallocate blocks if this 552 * rare condition arises. 553 */ 554 if (DOINGSUJ(ap->a_vp)) 555 if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) 556 return (ENOSPC); 557 vn_seqc_write_begin(ap->a_vp); 558 error = ump->um_fstype == UFS1 ? ffs_reallocblks_ufs1(ap) : 559 ffs_reallocblks_ufs2(ap); 560 vn_seqc_write_end(ap->a_vp); 561 return (error); 562 } 563 564 static int 565 ffs_reallocblks_ufs1( 566 struct vop_reallocblks_args /* { 567 struct vnode *a_vp; 568 struct cluster_save *a_buflist; 569 } */ *ap) 570 { 571 struct fs *fs; 572 struct inode *ip; 573 struct vnode *vp; 574 struct buf *sbp, *ebp, *bp; 575 ufs1_daddr_t *bap, *sbap, *ebap; 576 struct cluster_save *buflist; 577 struct ufsmount *ump; 578 ufs_lbn_t start_lbn, end_lbn; 579 ufs1_daddr_t soff, newblk, blkno; 580 ufs2_daddr_t pref; 581 struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; 582 int i, cg, len, start_lvl, end_lvl, ssize; 583 584 vp = ap->a_vp; 585 ip = VTOI(vp); 586 ump = ITOUMP(ip); 587 fs = ump->um_fs; 588 /* 589 * If we are not tracking block clusters or if we have less than 4% 590 * free blocks left, then do not attempt to cluster. Running with 591 * less than 5% free block reserve is not recommended and those that 592 * choose to do so do not expect to have good file layout. 
593 */ 594 if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 595 return (ENOSPC); 596 buflist = ap->a_buflist; 597 len = buflist->bs_nchildren; 598 start_lbn = buflist->bs_children[0]->b_lblkno; 599 end_lbn = start_lbn + len - 1; 600 #ifdef INVARIANTS 601 for (i = 0; i < len; i++) 602 if (!ffs_checkfreeblk(ip, 603 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 604 panic("ffs_reallocblks: unallocated block 1"); 605 for (i = 1; i < len; i++) 606 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 607 panic("ffs_reallocblks: non-logical cluster"); 608 blkno = buflist->bs_children[0]->b_blkno; 609 ssize = fsbtodb(fs, fs->fs_frag); 610 for (i = 1; i < len - 1; i++) 611 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 612 panic("ffs_reallocblks: non-physical cluster %d", i); 613 #endif 614 /* 615 * If the cluster crosses the boundary for the first indirect 616 * block, leave space for the indirect block. Indirect blocks 617 * are initially laid out in a position after the last direct 618 * block. Block reallocation would usually destroy locality by 619 * moving the indirect block out of the way to make room for 620 * data blocks if we didn't compensate here. We should also do 621 * this for other indirect block boundaries, but it is only 622 * important for the first one. 623 */ 624 if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) 625 return (ENOSPC); 626 /* 627 * If the latest allocation is in a new cylinder group, assume that 628 * the filesystem has decided to move and do not force it back to 629 * the previous cylinder group. 630 */ 631 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 632 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 633 return (ENOSPC); 634 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 635 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 636 return (ENOSPC); 637 /* 638 * Get the starting offset and block map for the first block. 
639 */ 640 if (start_lvl == 0) { 641 sbap = &ip->i_din1->di_db[0]; 642 soff = start_lbn; 643 } else { 644 idp = &start_ap[start_lvl - 1]; 645 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 646 brelse(sbp); 647 return (ENOSPC); 648 } 649 sbap = (ufs1_daddr_t *)sbp->b_data; 650 soff = idp->in_off; 651 } 652 /* 653 * If the block range spans two block maps, get the second map. 654 */ 655 ebap = NULL; 656 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 657 ssize = len; 658 } else { 659 #ifdef INVARIANTS 660 if (start_lvl > 0 && 661 start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 662 panic("ffs_reallocblk: start == end"); 663 #endif 664 ssize = len - (idp->in_off + 1); 665 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 666 goto fail; 667 ebap = (ufs1_daddr_t *)ebp->b_data; 668 } 669 /* 670 * Find the preferred location for the cluster. If we have not 671 * previously failed at this endeavor, then follow our standard 672 * preference calculation. If we have failed at it, then pick up 673 * where we last ended our search. 674 */ 675 UFS_LOCK(ump); 676 if (ip->i_nextclustercg == -1) 677 pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); 678 else 679 pref = cgdata(fs, ip->i_nextclustercg); 680 /* 681 * Search the block map looking for an allocation of the desired size. 682 * To avoid wasting too much time, we limit the number of cylinder 683 * groups that we will search. 684 */ 685 cg = dtog(fs, pref); 686 for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { 687 if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) 688 break; 689 cg += 1; 690 if (cg >= fs->fs_ncg) 691 cg = 0; 692 } 693 /* 694 * If we have failed in our search, record where we gave up for 695 * next time. Otherwise, fall back to our usual search citerion. 696 */ 697 if (newblk == 0) { 698 ip->i_nextclustercg = cg; 699 UFS_UNLOCK(ump); 700 goto fail; 701 } 702 ip->i_nextclustercg = -1; 703 /* 704 * We have found a new contiguous block. 
705 * 706 * First we have to replace the old block pointers with the new 707 * block pointers in the inode and indirect blocks associated 708 * with the file. 709 */ 710 #ifdef DIAGNOSTIC 711 if (prtrealloc) 712 printf("realloc: ino %ju, lbns %jd-%jd\n\told:", 713 (uintmax_t)ip->i_number, 714 (intmax_t)start_lbn, (intmax_t)end_lbn); 715 #endif 716 blkno = newblk; 717 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 718 if (i == ssize) { 719 bap = ebap; 720 soff = -i; 721 } 722 #ifdef INVARIANTS 723 if (!ffs_checkfreeblk(ip, 724 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 725 panic("ffs_reallocblks: unallocated block 2"); 726 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 727 panic("ffs_reallocblks: alloc mismatch"); 728 #endif 729 #ifdef DIAGNOSTIC 730 if (prtrealloc) 731 printf(" %d,", *bap); 732 #endif 733 if (DOINGSOFTDEP(vp)) { 734 if (sbap == &ip->i_din1->di_db[0] && i < ssize) 735 softdep_setup_allocdirect(ip, start_lbn + i, 736 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 737 buflist->bs_children[i]); 738 else 739 softdep_setup_allocindir_page(ip, start_lbn + i, 740 i < ssize ? sbp : ebp, soff + i, blkno, 741 *bap, buflist->bs_children[i]); 742 } 743 *bap++ = blkno; 744 } 745 /* 746 * Next we must write out the modified inode and indirect blocks. 747 * For strict correctness, the writes should be synchronous since 748 * the old block values may have been written to disk. In practise 749 * they are almost never written, but if we are concerned about 750 * strict correctness, the `doasyncfree' flag should be set to zero. 751 * 752 * The test on `doasyncfree' should be changed to test a flag 753 * that shows whether the associated buffers and inodes have 754 * been written. The flag should be set when the cluster is 755 * started and cleared whenever the buffer or inode is flushed. 756 * We can then check below to see if it is set, and do the 757 * synchronous write only when it has been cleared. 
758 */ 759 if (sbap != &ip->i_din1->di_db[0]) { 760 if (doasyncfree) 761 bdwrite(sbp); 762 else 763 bwrite(sbp); 764 } else { 765 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 766 if (!doasyncfree) 767 ffs_update(vp, 1); 768 } 769 if (ssize < len) { 770 if (doasyncfree) 771 bdwrite(ebp); 772 else 773 bwrite(ebp); 774 } 775 /* 776 * Last, free the old blocks and assign the new blocks to the buffers. 777 */ 778 #ifdef DIAGNOSTIC 779 if (prtrealloc) 780 printf("\n\tnew:"); 781 #endif 782 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 783 bp = buflist->bs_children[i]; 784 if (!DOINGSOFTDEP(vp)) 785 /* 786 * The usual case is that a set of N-contiguous blocks 787 * that was just allocated has been replaced with a 788 * set of N+1-contiguous blocks. If they are marked as 789 * B_DELWRI, the current contents have not been written 790 * to disk. It is possible that the blocks were written 791 * earlier, but very uncommon. If the blocks have never 792 * been written, there is no need to send a BIO_DELETE 793 * for them when they are freed. The gain from avoiding 794 * the TRIMs for the common case of unwritten blocks 795 * far exceeds the cost of the write amplification for 796 * the uncommon case of failing to send a TRIM for the 797 * blocks that had been written. 798 */ 799 ffs_blkfree(ump, fs, ump->um_devvp, 800 dbtofsb(fs, bp->b_blkno), 801 fs->fs_bsize, ip->i_number, vp->v_type, NULL, 802 (bp->b_flags & B_DELWRI) != 0 ? 
803 NOTRIM_KEY : SINGLETON_KEY); 804 bp->b_blkno = fsbtodb(fs, blkno); 805 #ifdef INVARIANTS 806 if (!ffs_checkfreeblk(ip, dbtofsb(fs, bp->b_blkno), 807 fs->fs_bsize)) 808 panic("ffs_reallocblks: unallocated block 3"); 809 #endif 810 #ifdef DIAGNOSTIC 811 if (prtrealloc) 812 printf(" %d,", blkno); 813 #endif 814 } 815 #ifdef DIAGNOSTIC 816 if (prtrealloc) { 817 prtrealloc--; 818 printf("\n"); 819 } 820 #endif 821 return (0); 822 823 fail: 824 if (ssize < len) 825 brelse(ebp); 826 if (sbap != &ip->i_din1->di_db[0]) 827 brelse(sbp); 828 return (ENOSPC); 829 } 830 831 static int 832 ffs_reallocblks_ufs2( 833 struct vop_reallocblks_args /* { 834 struct vnode *a_vp; 835 struct cluster_save *a_buflist; 836 } */ *ap) 837 { 838 struct fs *fs; 839 struct inode *ip; 840 struct vnode *vp; 841 struct buf *sbp, *ebp, *bp; 842 ufs2_daddr_t *bap, *sbap, *ebap; 843 struct cluster_save *buflist; 844 struct ufsmount *ump; 845 ufs_lbn_t start_lbn, end_lbn; 846 ufs2_daddr_t soff, newblk, blkno, pref; 847 struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; 848 int i, cg, len, start_lvl, end_lvl, ssize; 849 850 vp = ap->a_vp; 851 ip = VTOI(vp); 852 ump = ITOUMP(ip); 853 fs = ump->um_fs; 854 /* 855 * If we are not tracking block clusters or if we have less than 4% 856 * free blocks left, then do not attempt to cluster. Running with 857 * less than 5% free block reserve is not recommended and those that 858 * choose to do so do not expect to have good file layout. 
859 */ 860 if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 861 return (ENOSPC); 862 buflist = ap->a_buflist; 863 len = buflist->bs_nchildren; 864 start_lbn = buflist->bs_children[0]->b_lblkno; 865 end_lbn = start_lbn + len - 1; 866 #ifdef INVARIANTS 867 for (i = 0; i < len; i++) 868 if (!ffs_checkfreeblk(ip, 869 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 870 panic("ffs_reallocblks: unallocated block 1"); 871 for (i = 1; i < len; i++) 872 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 873 panic("ffs_reallocblks: non-logical cluster"); 874 blkno = buflist->bs_children[0]->b_blkno; 875 ssize = fsbtodb(fs, fs->fs_frag); 876 for (i = 1; i < len - 1; i++) 877 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 878 panic("ffs_reallocblks: non-physical cluster %d", i); 879 #endif 880 /* 881 * If the cluster crosses the boundary for the first indirect 882 * block, do not move anything in it. Indirect blocks are 883 * usually initially laid out in a position between the data 884 * blocks. Block reallocation would usually destroy locality by 885 * moving the indirect block out of the way to make room for 886 * data blocks if we didn't compensate here. We should also do 887 * this for other indirect block boundaries, but it is only 888 * important for the first one. 889 */ 890 if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) 891 return (ENOSPC); 892 /* 893 * If the latest allocation is in a new cylinder group, assume that 894 * the filesystem has decided to move and do not force it back to 895 * the previous cylinder group. 896 */ 897 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 898 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 899 return (ENOSPC); 900 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 901 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 902 return (ENOSPC); 903 /* 904 * Get the starting offset and block map for the first block. 
905 */ 906 if (start_lvl == 0) { 907 sbap = &ip->i_din2->di_db[0]; 908 soff = start_lbn; 909 } else { 910 idp = &start_ap[start_lvl - 1]; 911 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 912 brelse(sbp); 913 return (ENOSPC); 914 } 915 sbap = (ufs2_daddr_t *)sbp->b_data; 916 soff = idp->in_off; 917 } 918 /* 919 * If the block range spans two block maps, get the second map. 920 */ 921 ebap = NULL; 922 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 923 ssize = len; 924 } else { 925 #ifdef INVARIANTS 926 if (start_lvl > 0 && 927 start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 928 panic("ffs_reallocblk: start == end"); 929 #endif 930 ssize = len - (idp->in_off + 1); 931 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 932 goto fail; 933 ebap = (ufs2_daddr_t *)ebp->b_data; 934 } 935 /* 936 * Find the preferred location for the cluster. If we have not 937 * previously failed at this endeavor, then follow our standard 938 * preference calculation. If we have failed at it, then pick up 939 * where we last ended our search. 940 */ 941 UFS_LOCK(ump); 942 if (ip->i_nextclustercg == -1) 943 pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); 944 else 945 pref = cgdata(fs, ip->i_nextclustercg); 946 /* 947 * Search the block map looking for an allocation of the desired size. 948 * To avoid wasting too much time, we limit the number of cylinder 949 * groups that we will search. 950 */ 951 cg = dtog(fs, pref); 952 for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { 953 if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) 954 break; 955 cg += 1; 956 if (cg >= fs->fs_ncg) 957 cg = 0; 958 } 959 /* 960 * If we have failed in our search, record where we gave up for 961 * next time. Otherwise, fall back to our usual search citerion. 962 */ 963 if (newblk == 0) { 964 ip->i_nextclustercg = cg; 965 UFS_UNLOCK(ump); 966 goto fail; 967 } 968 ip->i_nextclustercg = -1; 969 /* 970 * We have found a new contiguous block. 
971 * 972 * First we have to replace the old block pointers with the new 973 * block pointers in the inode and indirect blocks associated 974 * with the file. 975 */ 976 #ifdef DIAGNOSTIC 977 if (prtrealloc) 978 printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, 979 (intmax_t)start_lbn, (intmax_t)end_lbn); 980 #endif 981 blkno = newblk; 982 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 983 if (i == ssize) { 984 bap = ebap; 985 soff = -i; 986 } 987 #ifdef INVARIANTS 988 if (!ffs_checkfreeblk(ip, 989 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 990 panic("ffs_reallocblks: unallocated block 2"); 991 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 992 panic("ffs_reallocblks: alloc mismatch"); 993 #endif 994 #ifdef DIAGNOSTIC 995 if (prtrealloc) 996 printf(" %jd,", (intmax_t)*bap); 997 #endif 998 if (DOINGSOFTDEP(vp)) { 999 if (sbap == &ip->i_din2->di_db[0] && i < ssize) 1000 softdep_setup_allocdirect(ip, start_lbn + i, 1001 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 1002 buflist->bs_children[i]); 1003 else 1004 softdep_setup_allocindir_page(ip, start_lbn + i, 1005 i < ssize ? sbp : ebp, soff + i, blkno, 1006 *bap, buflist->bs_children[i]); 1007 } 1008 *bap++ = blkno; 1009 } 1010 /* 1011 * Next we must write out the modified inode and indirect blocks. 1012 * For strict correctness, the writes should be synchronous since 1013 * the old block values may have been written to disk. In practise 1014 * they are almost never written, but if we are concerned about 1015 * strict correctness, the `doasyncfree' flag should be set to zero. 1016 * 1017 * The test on `doasyncfree' should be changed to test a flag 1018 * that shows whether the associated buffers and inodes have 1019 * been written. The flag should be set when the cluster is 1020 * started and cleared whenever the buffer or inode is flushed. 
1021 * We can then check below to see if it is set, and do the 1022 * synchronous write only when it has been cleared. 1023 */ 1024 if (sbap != &ip->i_din2->di_db[0]) { 1025 if (doasyncfree) 1026 bdwrite(sbp); 1027 else 1028 bwrite(sbp); 1029 } else { 1030 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 1031 if (!doasyncfree) 1032 ffs_update(vp, 1); 1033 } 1034 if (ssize < len) { 1035 if (doasyncfree) 1036 bdwrite(ebp); 1037 else 1038 bwrite(ebp); 1039 } 1040 /* 1041 * Last, free the old blocks and assign the new blocks to the buffers. 1042 */ 1043 #ifdef DIAGNOSTIC 1044 if (prtrealloc) 1045 printf("\n\tnew:"); 1046 #endif 1047 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 1048 bp = buflist->bs_children[i]; 1049 if (!DOINGSOFTDEP(vp)) 1050 /* 1051 * The usual case is that a set of N-contiguous blocks 1052 * that was just allocated has been replaced with a 1053 * set of N+1-contiguous blocks. If they are marked as 1054 * B_DELWRI, the current contents have not been written 1055 * to disk. It is possible that the blocks were written 1056 * earlier, but very uncommon. If the blocks have never 1057 * been written, there is no need to send a BIO_DELETE 1058 * for them when they are freed. The gain from avoiding 1059 * the TRIMs for the common case of unwritten blocks 1060 * far exceeds the cost of the write amplification for 1061 * the uncommon case of failing to send a TRIM for the 1062 * blocks that had been written. 1063 */ 1064 ffs_blkfree(ump, fs, ump->um_devvp, 1065 dbtofsb(fs, bp->b_blkno), 1066 fs->fs_bsize, ip->i_number, vp->v_type, NULL, 1067 (bp->b_flags & B_DELWRI) != 0 ? 
1068 NOTRIM_KEY : SINGLETON_KEY); 1069 bp->b_blkno = fsbtodb(fs, blkno); 1070 #ifdef INVARIANTS 1071 if (!ffs_checkfreeblk(ip, dbtofsb(fs, bp->b_blkno), 1072 fs->fs_bsize)) 1073 panic("ffs_reallocblks: unallocated block 3"); 1074 #endif 1075 #ifdef DIAGNOSTIC 1076 if (prtrealloc) 1077 printf(" %jd,", (intmax_t)blkno); 1078 #endif 1079 } 1080 #ifdef DIAGNOSTIC 1081 if (prtrealloc) { 1082 prtrealloc--; 1083 printf("\n"); 1084 } 1085 #endif 1086 return (0); 1087 1088 fail: 1089 if (ssize < len) 1090 brelse(ebp); 1091 if (sbap != &ip->i_din2->di_db[0]) 1092 brelse(sbp); 1093 return (ENOSPC); 1094 } 1095 1096 /* 1097 * Allocate an inode in the filesystem. 1098 * 1099 * If allocating a directory, use ffs_dirpref to select the inode. 1100 * If allocating in a directory, the following hierarchy is followed: 1101 * 1) allocate the preferred inode. 1102 * 2) allocate an inode in the same cylinder group. 1103 * 3) quadratically rehash into other cylinder groups, until an 1104 * available inode is located. 1105 * If no inode preference is given the following hierarchy is used 1106 * to allocate an inode: 1107 * 1) allocate an inode in cylinder group 0. 1108 * 2) quadratically rehash into other cylinder groups, until an 1109 * available inode is located. 
1110 */ 1111 int 1112 ffs_valloc(struct vnode *pvp, 1113 int mode, 1114 struct ucred *cred, 1115 struct vnode **vpp) 1116 { 1117 struct inode *pip; 1118 struct fs *fs; 1119 struct inode *ip; 1120 struct timespec ts; 1121 struct ufsmount *ump; 1122 ino_t ino, ipref; 1123 uint64_t cg; 1124 int error, reclaimed; 1125 1126 *vpp = NULL; 1127 pip = VTOI(pvp); 1128 ump = ITOUMP(pip); 1129 fs = ump->um_fs; 1130 1131 UFS_LOCK(ump); 1132 reclaimed = 0; 1133 retry: 1134 if (fs->fs_cstotal.cs_nifree == 0) 1135 goto noinodes; 1136 1137 if ((mode & IFMT) == IFDIR) 1138 ipref = ffs_dirpref(pip); 1139 else 1140 ipref = pip->i_number; 1141 if (ipref >= fs->fs_ncg * fs->fs_ipg) 1142 ipref = 0; 1143 cg = ino_to_cg(fs, ipref); 1144 /* 1145 * Track number of dirs created one after another 1146 * in a same cg without intervening by files. 1147 */ 1148 if ((mode & IFMT) == IFDIR) { 1149 if (fs->fs_contigdirs[cg] < 255) 1150 fs->fs_contigdirs[cg]++; 1151 } else { 1152 if (fs->fs_contigdirs[cg] > 0) 1153 fs->fs_contigdirs[cg]--; 1154 } 1155 ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 1156 (allocfcn_t *)ffs_nodealloccg); 1157 if (ino == 0) 1158 goto noinodes; 1159 /* 1160 * Get rid of the cached old vnode, force allocation of a new vnode 1161 * for this inode. If this fails, release the allocated ino and 1162 * return the error. 1163 */ 1164 if ((error = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp, 1165 FFSV_FORCEINSMQ | FFSV_REPLACE | FFSV_NEWINODE)) != 0) { 1166 ffs_vfree(pvp, ino, mode); 1167 return (error); 1168 } 1169 /* 1170 * We got an inode, so check mode and panic if it is already allocated. 
1171 */ 1172 ip = VTOI(*vpp); 1173 if (ip->i_mode) { 1174 printf("mode = 0%o, inum = %ju, fs = %s\n", 1175 ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt); 1176 panic("ffs_valloc: dup alloc"); 1177 } 1178 if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ 1179 printf("free inode %s/%ju had %ld blocks\n", 1180 fs->fs_fsmnt, (intmax_t)ino, (long)DIP(ip, i_blocks)); 1181 DIP_SET(ip, i_blocks, 0); 1182 } 1183 ip->i_flags = 0; 1184 DIP_SET(ip, i_flags, 0); 1185 if ((mode & IFMT) == IFDIR) 1186 DIP_SET(ip, i_dirdepth, DIP(pip, i_dirdepth) + 1); 1187 /* 1188 * Set up a new generation number for this inode. 1189 */ 1190 while (ip->i_gen == 0 || ++ip->i_gen == 0) 1191 ip->i_gen = arc4random(); 1192 DIP_SET(ip, i_gen, ip->i_gen); 1193 if (fs->fs_magic == FS_UFS2_MAGIC) { 1194 vfs_timestamp(&ts); 1195 ip->i_din2->di_birthtime = ts.tv_sec; 1196 ip->i_din2->di_birthnsec = ts.tv_nsec; 1197 } 1198 ip->i_flag = 0; 1199 (*vpp)->v_vflag = 0; 1200 (*vpp)->v_type = VNON; 1201 if (fs->fs_magic == FS_UFS2_MAGIC) { 1202 (*vpp)->v_op = &ffs_vnodeops2; 1203 UFS_INODE_SET_FLAG(ip, IN_UFS2); 1204 } else { 1205 (*vpp)->v_op = &ffs_vnodeops1; 1206 } 1207 return (0); 1208 noinodes: 1209 if (reclaimed == 0) { 1210 reclaimed = 1; 1211 softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT); 1212 goto retry; 1213 } 1214 if (ffs_fsfail_cleanup_locked(ump, 0)) { 1215 UFS_UNLOCK(ump); 1216 return (ENXIO); 1217 } 1218 if (ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 1219 UFS_UNLOCK(ump); 1220 ffs_fserr(fs, pip->i_number, "out of inodes"); 1221 uprintf("\n%s: create/symlink failed, no inodes free\n", 1222 fs->fs_fsmnt); 1223 } else { 1224 UFS_UNLOCK(ump); 1225 } 1226 return (ENOSPC); 1227 } 1228 1229 /* 1230 * Find a cylinder group to place a directory. 
1231 * 1232 * The policy implemented by this algorithm is to allocate a 1233 * directory inode in the same cylinder group as its parent 1234 * directory, but also to reserve space for its files inodes 1235 * and data. Restrict the number of directories which may be 1236 * allocated one after another in the same cylinder group 1237 * without intervening allocation of files. 1238 * 1239 * If we allocate a first level directory then force allocation 1240 * in another cylinder group. 1241 */ 1242 static ino_t 1243 ffs_dirpref(struct inode *pip) 1244 { 1245 struct fs *fs; 1246 int cg, prefcg, curcg, dirsize, cgsize; 1247 int depth, range, start, end, numdirs, power, numerator, denominator; 1248 uint64_t avgifree, avgbfree, avgndir, curdirsize; 1249 uint64_t minifree, minbfree, maxndir; 1250 uint64_t maxcontigdirs; 1251 1252 mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED); 1253 fs = ITOFS(pip); 1254 1255 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 1256 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1257 avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; 1258 1259 /* 1260 * Select a preferred cylinder group to place a new directory. 1261 * If we are near the root of the filesystem we aim to spread 1262 * them out as much as possible. As we descend deeper from the 1263 * root we cluster them closer together around their parent as 1264 * we expect them to be more closely interactive. Higher-level 1265 * directories like usr/src/sys and usr/src/bin should be 1266 * separated while the directories in these areas are more 1267 * likely to be accessed together so should be closer. 1268 * 1269 * We pick a range of cylinder groups around the cylinder group 1270 * of the directory in which we are being created. The size of 1271 * the range for our search is based on our depth from the root 1272 * of our filesystem. We then probe that range based on how many 1273 * directories are already present. 
The first new directory is at 1274 * 1/2 (middle) of the range; the second is in the first 1/4 of the 1275 * range, then at 3/4, 1/8, 3/8, 5/8, 7/8, 1/16, 3/16, 5/16, etc. 1276 */ 1277 depth = DIP(pip, i_dirdepth); 1278 range = fs->fs_ncg / (1 << depth); 1279 curcg = ino_to_cg(fs, pip->i_number); 1280 start = curcg - (range / 2); 1281 if (start < 0) 1282 start += fs->fs_ncg; 1283 end = curcg + (range / 2); 1284 if (end >= fs->fs_ncg) 1285 end -= fs->fs_ncg; 1286 numdirs = pip->i_effnlink - 1; 1287 power = fls(numdirs); 1288 numerator = (numdirs & ~(1 << (power - 1))) * 2 + 1; 1289 denominator = 1 << power; 1290 prefcg = (curcg - (range / 2) + (range * numerator / denominator)); 1291 if (prefcg < 0) 1292 prefcg += fs->fs_ncg; 1293 if (prefcg >= fs->fs_ncg) 1294 prefcg -= fs->fs_ncg; 1295 /* 1296 * If this filesystem is not tracking directory depths, 1297 * revert to the old algorithm. 1298 */ 1299 if (depth == 0 && pip->i_number != UFS_ROOTINO) 1300 prefcg = curcg; 1301 1302 /* 1303 * Count various limits which used for 1304 * optimal allocation of a directory inode. 1305 */ 1306 maxndir = min(avgndir + (1 << depth), fs->fs_ipg); 1307 minifree = avgifree - avgifree / 4; 1308 if (minifree < 1) 1309 minifree = 1; 1310 minbfree = avgbfree - avgbfree / 4; 1311 if (minbfree < 1) 1312 minbfree = 1; 1313 cgsize = fs->fs_fsize * fs->fs_fpg; 1314 dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; 1315 curdirsize = avgndir ? 
(cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; 1316 if (dirsize < curdirsize) 1317 dirsize = curdirsize; 1318 if (dirsize <= 0) 1319 maxcontigdirs = 0; /* dirsize overflowed */ 1320 else 1321 maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); 1322 if (fs->fs_avgfpdir > 0) 1323 maxcontigdirs = min(maxcontigdirs, 1324 fs->fs_ipg / fs->fs_avgfpdir); 1325 if (maxcontigdirs == 0) 1326 maxcontigdirs = 1; 1327 1328 /* 1329 * Limit number of dirs in one cg and reserve space for 1330 * regular files, but only if we have no deficit in 1331 * inodes or space. 1332 * 1333 * We are trying to find a suitable cylinder group nearby 1334 * our preferred cylinder group to place a new directory. 1335 * We scan from our preferred cylinder group forward looking 1336 * for a cylinder group that meets our criterion. If we get 1337 * to the final cylinder group and do not find anything, 1338 * we start scanning forwards from the beginning of the 1339 * filesystem. While it might seem sensible to start scanning 1340 * backwards or even to alternate looking forward and backward, 1341 * this approach fails badly when the filesystem is nearly full. 1342 * Specifically, we first search all the areas that have no space 1343 * and finally try the one preceding that. We repeat this on 1344 * every request and in the case of the final block end up 1345 * searching the entire filesystem. By jumping to the front 1346 * of the filesystem, our future forward searches always look 1347 * in new cylinder groups so finds every possible block after 1348 * one pass over the filesystem. 
1349 */ 1350 for (cg = prefcg; cg < fs->fs_ncg; cg++) 1351 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 1352 fs->fs_cs(fs, cg).cs_nifree >= minifree && 1353 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 1354 if (fs->fs_contigdirs[cg] < maxcontigdirs) 1355 return ((ino_t)(fs->fs_ipg * cg)); 1356 } 1357 for (cg = 0; cg < prefcg; cg++) 1358 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 1359 fs->fs_cs(fs, cg).cs_nifree >= minifree && 1360 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 1361 if (fs->fs_contigdirs[cg] < maxcontigdirs) 1362 return ((ino_t)(fs->fs_ipg * cg)); 1363 } 1364 /* 1365 * This is a backstop when we have deficit in space. 1366 */ 1367 for (cg = prefcg; cg < fs->fs_ncg; cg++) 1368 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 1369 return ((ino_t)(fs->fs_ipg * cg)); 1370 for (cg = 0; cg < prefcg; cg++) 1371 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 1372 break; 1373 return ((ino_t)(fs->fs_ipg * cg)); 1374 } 1375 1376 /* 1377 * Select the desired position for the next block in a file. The file is 1378 * logically divided into sections. The first section is composed of the 1379 * direct blocks and the next fs_maxbpg blocks. Each additional section 1380 * contains fs_maxbpg blocks. 1381 * 1382 * If no blocks have been allocated in the first section, the policy is to 1383 * request a block in the same cylinder group as the inode that describes 1384 * the file. The first indirect is allocated immediately following the last 1385 * direct block and the data blocks for the first indirect immediately 1386 * follow it. 1387 * 1388 * If no blocks have been allocated in any other section, the indirect 1389 * block(s) are allocated in the same cylinder group as its inode in an 1390 * area reserved immediately following the inode blocks. The policy for 1391 * the data blocks is to place them in a cylinder group with a greater than 1392 * average number of free blocks. An appropriate cylinder group is found 1393 * by using a rotor that sweeps the cylinder groups. 
When a new group of 1394 * blocks is needed, the sweep begins in the cylinder group following the 1395 * cylinder group from which the previous allocation was made. The sweep 1396 * continues until a cylinder group with greater than the average number 1397 * of free blocks is found. If the allocation is for the first block in an 1398 * indirect block or the previous block is a hole, then the information on 1399 * the previous allocation is unavailable; here a best guess is made based 1400 * on the logical block number being allocated. 1401 * 1402 * If a section is already partially allocated, the policy is to 1403 * allocate blocks contiguously within the section if possible. 1404 */ 1405 ufs2_daddr_t 1406 ffs_blkpref_ufs1(struct inode *ip, 1407 ufs_lbn_t lbn, 1408 int indx, 1409 ufs1_daddr_t *bap) 1410 { 1411 struct fs *fs; 1412 uint64_t cg, inocg; 1413 uint64_t avgbfree, startcg; 1414 ufs2_daddr_t pref, prevbn; 1415 1416 KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1417 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1418 fs = ITOFS(ip); 1419 /* 1420 * Allocation of indirect blocks is indicated by passing negative 1421 * values in indx: -1 for single indirect, -2 for double indirect, 1422 * -3 for triple indirect. As noted below, we attempt to allocate 1423 * the first indirect inline with the file data. For all later 1424 * indirect blocks, the data is often allocated in other cylinder 1425 * groups. However to speed random file access and to speed up 1426 * fsck, the filesystem reserves the first fs_metaspace blocks 1427 * (typically half of fs_minfree) of the data area of each cylinder 1428 * group to hold these later indirect blocks. 1429 */ 1430 inocg = ino_to_cg(fs, ip->i_number); 1431 if (indx < 0) { 1432 /* 1433 * Our preference for indirect blocks is the zone at the 1434 * beginning of the inode's cylinder group data area that 1435 * we try to reserve for indirect blocks. 
1436 */ 1437 pref = cgmeta(fs, inocg); 1438 /* 1439 * If we are allocating the first indirect block, try to 1440 * place it immediately following the last direct block. 1441 */ 1442 if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && 1443 ip->i_din1->di_db[UFS_NDADDR - 1] != 0) 1444 pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag; 1445 return (pref); 1446 } 1447 /* 1448 * If we are allocating the first data block in the first indirect 1449 * block and the indirect has been allocated in the data block area, 1450 * try to place it immediately following the indirect block. 1451 */ 1452 if (lbn == UFS_NDADDR) { 1453 pref = ip->i_din1->di_ib[0]; 1454 if (pref != 0 && pref >= cgdata(fs, inocg) && 1455 pref < cgbase(fs, inocg + 1)) 1456 return (pref + fs->fs_frag); 1457 } 1458 /* 1459 * If we are at the beginning of a file, or we have already allocated 1460 * the maximum number of blocks per cylinder group, or we do not 1461 * have a block allocated immediately preceding us, then we need 1462 * to decide where to start allocating new blocks. 1463 */ 1464 if (indx == 0) { 1465 prevbn = 0; 1466 } else { 1467 prevbn = bap[indx - 1]; 1468 if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, 1469 fs->fs_bsize) != 0) 1470 prevbn = 0; 1471 } 1472 if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { 1473 /* 1474 * If we are allocating a directory data block, we want 1475 * to place it in the metadata area. 1476 */ 1477 if ((ip->i_mode & IFMT) == IFDIR) 1478 return (cgmeta(fs, inocg)); 1479 /* 1480 * Until we fill all the direct and all the first indirect's 1481 * blocks, we try to allocate in the data area of the inode's 1482 * cylinder group. 1483 */ 1484 if (lbn < UFS_NDADDR + NINDIR(fs)) 1485 return (cgdata(fs, inocg)); 1486 /* 1487 * Find a cylinder with greater than average number of 1488 * unused data blocks. 
1489 */ 1490 if (indx == 0 || prevbn == 0) 1491 startcg = inocg + lbn / fs->fs_maxbpg; 1492 else 1493 startcg = dtog(fs, prevbn) + 1; 1494 startcg %= fs->fs_ncg; 1495 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1496 for (cg = startcg; cg < fs->fs_ncg; cg++) 1497 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1498 fs->fs_cgrotor = cg; 1499 return (cgdata(fs, cg)); 1500 } 1501 for (cg = 0; cg <= startcg; cg++) 1502 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1503 fs->fs_cgrotor = cg; 1504 return (cgdata(fs, cg)); 1505 } 1506 return (0); 1507 } 1508 /* 1509 * Otherwise, we just always try to lay things out contiguously. 1510 */ 1511 return (prevbn + fs->fs_frag); 1512 } 1513 1514 /* 1515 * Same as above, but for UFS2 1516 */ 1517 ufs2_daddr_t 1518 ffs_blkpref_ufs2(struct inode *ip, 1519 ufs_lbn_t lbn, 1520 int indx, 1521 ufs2_daddr_t *bap) 1522 { 1523 struct fs *fs; 1524 uint64_t cg, inocg; 1525 uint64_t avgbfree, startcg; 1526 ufs2_daddr_t pref, prevbn; 1527 1528 KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1529 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1530 fs = ITOFS(ip); 1531 /* 1532 * Allocation of indirect blocks is indicated by passing negative 1533 * values in indx: -1 for single indirect, -2 for double indirect, 1534 * -3 for triple indirect. As noted below, we attempt to allocate 1535 * the first indirect inline with the file data. For all later 1536 * indirect blocks, the data is often allocated in other cylinder 1537 * groups. However to speed random file access and to speed up 1538 * fsck, the filesystem reserves the first fs_metaspace blocks 1539 * (typically half of fs_minfree) of the data area of each cylinder 1540 * group to hold these later indirect blocks. 1541 */ 1542 inocg = ino_to_cg(fs, ip->i_number); 1543 if (indx < 0) { 1544 /* 1545 * Our preference for indirect blocks is the zone at the 1546 * beginning of the inode's cylinder group data area that 1547 * we try to reserve for indirect blocks. 
1548 */ 1549 pref = cgmeta(fs, inocg); 1550 /* 1551 * If we are allocating the first indirect block, try to 1552 * place it immediately following the last direct block. 1553 */ 1554 if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && 1555 ip->i_din2->di_db[UFS_NDADDR - 1] != 0) 1556 pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag; 1557 return (pref); 1558 } 1559 /* 1560 * If we are allocating the first data block in the first indirect 1561 * block and the indirect has been allocated in the data block area, 1562 * try to place it immediately following the indirect block. 1563 */ 1564 if (lbn == UFS_NDADDR) { 1565 pref = ip->i_din2->di_ib[0]; 1566 if (pref != 0 && pref >= cgdata(fs, inocg) && 1567 pref < cgbase(fs, inocg + 1)) 1568 return (pref + fs->fs_frag); 1569 } 1570 /* 1571 * If we are at the beginning of a file, or we have already allocated 1572 * the maximum number of blocks per cylinder group, or we do not 1573 * have a block allocated immediately preceding us, then we need 1574 * to decide where to start allocating new blocks. 1575 */ 1576 if (indx == 0) { 1577 prevbn = 0; 1578 } else { 1579 prevbn = bap[indx - 1]; 1580 if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, 1581 fs->fs_bsize) != 0) 1582 prevbn = 0; 1583 } 1584 if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { 1585 /* 1586 * If we are allocating a directory data block, we want 1587 * to place it in the metadata area. 1588 */ 1589 if ((ip->i_mode & IFMT) == IFDIR) 1590 return (cgmeta(fs, inocg)); 1591 /* 1592 * Until we fill all the direct and all the first indirect's 1593 * blocks, we try to allocate in the data area of the inode's 1594 * cylinder group. 1595 */ 1596 if (lbn < UFS_NDADDR + NINDIR(fs)) 1597 return (cgdata(fs, inocg)); 1598 /* 1599 * Find a cylinder with greater than average number of 1600 * unused data blocks. 
1601 */ 1602 if (indx == 0 || prevbn == 0) 1603 startcg = inocg + lbn / fs->fs_maxbpg; 1604 else 1605 startcg = dtog(fs, prevbn) + 1; 1606 startcg %= fs->fs_ncg; 1607 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1608 for (cg = startcg; cg < fs->fs_ncg; cg++) 1609 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1610 fs->fs_cgrotor = cg; 1611 return (cgdata(fs, cg)); 1612 } 1613 for (cg = 0; cg <= startcg; cg++) 1614 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1615 fs->fs_cgrotor = cg; 1616 return (cgdata(fs, cg)); 1617 } 1618 return (0); 1619 } 1620 /* 1621 * Otherwise, we just always try to lay things out contiguously. 1622 */ 1623 return (prevbn + fs->fs_frag); 1624 } 1625 1626 /* 1627 * Implement the cylinder overflow algorithm. 1628 * 1629 * The policy implemented by this algorithm is: 1630 * 1) allocate the block in its requested cylinder group. 1631 * 2) quadratically rehash on the cylinder group number. 1632 * 3) brute force search for a free block. 1633 * 1634 * Must be called with the UFS lock held. Will release the lock on success 1635 * and return with it held on failure. 1636 */ 1637 /*VARARGS5*/ 1638 static ufs2_daddr_t 1639 ffs_hashalloc(struct inode *ip, 1640 uint64_t cg, 1641 ufs2_daddr_t pref, 1642 int size, /* Search size for data blocks, mode for inodes */ 1643 int rsize, /* Real allocated size. 
*/ 1644 allocfcn_t *allocator) 1645 { 1646 struct fs *fs; 1647 ufs2_daddr_t result; 1648 uint64_t i, icg = cg; 1649 1650 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1651 #ifdef INVARIANTS 1652 if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 1653 panic("ffs_hashalloc: allocation on suspended filesystem"); 1654 #endif 1655 fs = ITOFS(ip); 1656 /* 1657 * 1: preferred cylinder group 1658 */ 1659 result = (*allocator)(ip, cg, pref, size, rsize); 1660 if (result) 1661 return (result); 1662 /* 1663 * 2: quadratic rehash 1664 */ 1665 for (i = 1; i < fs->fs_ncg; i *= 2) { 1666 cg += i; 1667 if (cg >= fs->fs_ncg) 1668 cg -= fs->fs_ncg; 1669 result = (*allocator)(ip, cg, 0, size, rsize); 1670 if (result) 1671 return (result); 1672 } 1673 /* 1674 * 3: brute force search 1675 * Note that we start at i == 2, since 0 was checked initially, 1676 * and 1 is always checked in the quadratic rehash. 1677 */ 1678 cg = (icg + 2) % fs->fs_ncg; 1679 for (i = 2; i < fs->fs_ncg; i++) { 1680 result = (*allocator)(ip, cg, 0, size, rsize); 1681 if (result) 1682 return (result); 1683 cg++; 1684 if (cg == fs->fs_ncg) 1685 cg = 0; 1686 } 1687 return (0); 1688 } 1689 1690 /* 1691 * Determine whether a fragment can be extended. 1692 * 1693 * Check to see if the necessary fragments are available, and 1694 * if they are, allocate them. 
1695 */ 1696 static ufs2_daddr_t 1697 ffs_fragextend(struct inode *ip, 1698 uint64_t cg, 1699 ufs2_daddr_t bprev, 1700 int osize, 1701 int nsize) 1702 { 1703 struct fs *fs; 1704 struct cg *cgp; 1705 struct buf *bp; 1706 struct ufsmount *ump; 1707 int nffree; 1708 long bno; 1709 int frags, bbase; 1710 int i, error; 1711 uint8_t *blksfree; 1712 1713 ump = ITOUMP(ip); 1714 fs = ump->um_fs; 1715 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) 1716 return (0); 1717 frags = numfrags(fs, nsize); 1718 bbase = fragnum(fs, bprev); 1719 if (bbase > fragnum(fs, (bprev + frags - 1))) { 1720 /* cannot extend across a block boundary */ 1721 return (0); 1722 } 1723 UFS_UNLOCK(ump); 1724 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { 1725 ffs_checkcgintegrity(fs, cg, error); 1726 goto fail; 1727 } 1728 bno = dtogd(fs, bprev); 1729 blksfree = cg_blksfree(cgp); 1730 for (i = numfrags(fs, osize); i < frags; i++) 1731 if (isclr(blksfree, bno + i)) 1732 goto fail; 1733 /* 1734 * the current fragment can be extended 1735 * deduct the count on fragment being extended into 1736 * increase the count on the remaining fragment (if any) 1737 * allocate the extended piece 1738 */ 1739 for (i = frags; i < fs->fs_frag - bbase; i++) 1740 if (isclr(blksfree, bno + i)) 1741 break; 1742 cgp->cg_frsum[i - numfrags(fs, osize)]--; 1743 if (i != frags) 1744 cgp->cg_frsum[i - frags]++; 1745 for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) { 1746 clrbit(blksfree, bno + i); 1747 cgp->cg_cs.cs_nffree--; 1748 nffree++; 1749 } 1750 UFS_LOCK(ump); 1751 fs->fs_cstotal.cs_nffree -= nffree; 1752 fs->fs_cs(fs, cg).cs_nffree -= nffree; 1753 fs->fs_fmod = 1; 1754 ACTIVECLEAR(fs, cg); 1755 UFS_UNLOCK(ump); 1756 if (DOINGSOFTDEP(ITOV(ip))) 1757 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, 1758 frags, numfrags(fs, osize)); 1759 bdwrite(bp); 1760 return (bprev); 1761 1762 fail: 1763 brelse(bp); 1764 UFS_LOCK(ump); 1765 return (0); 1766 1767 } 1768 1769 /* 1770 * 
Determine whether a block can be allocated. 1771 * 1772 * Check to see if a block of the appropriate size is available, 1773 * and if it is, allocate it. 1774 */ 1775 static ufs2_daddr_t 1776 ffs_alloccg(struct inode *ip, 1777 uint64_t cg, 1778 ufs2_daddr_t bpref, 1779 int size, 1780 int rsize) 1781 { 1782 struct fs *fs; 1783 struct cg *cgp; 1784 struct buf *bp; 1785 struct ufsmount *ump; 1786 ufs1_daddr_t bno; 1787 ufs2_daddr_t blkno; 1788 int i, allocsiz, error, frags; 1789 uint8_t *blksfree; 1790 1791 ump = ITOUMP(ip); 1792 fs = ump->um_fs; 1793 if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) 1794 return (0); 1795 UFS_UNLOCK(ump); 1796 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0 || 1797 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { 1798 ffs_checkcgintegrity(fs, cg, error); 1799 goto fail; 1800 } 1801 if (size == fs->fs_bsize) { 1802 UFS_LOCK(ump); 1803 blkno = ffs_alloccgblk(ip, bp, bpref, rsize); 1804 ACTIVECLEAR(fs, cg); 1805 UFS_UNLOCK(ump); 1806 bdwrite(bp); 1807 return (blkno); 1808 } 1809 /* 1810 * check to see if any fragments are already available 1811 * allocsiz is the size which will be allocated, hacking 1812 * it down to a smaller size if necessary 1813 */ 1814 blksfree = cg_blksfree(cgp); 1815 frags = numfrags(fs, size); 1816 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) 1817 if (cgp->cg_frsum[allocsiz] != 0) 1818 break; 1819 if (allocsiz == fs->fs_frag) { 1820 /* 1821 * no fragments were available, so a block will be 1822 * allocated, and hacked up 1823 */ 1824 if (cgp->cg_cs.cs_nbfree == 0) 1825 goto fail; 1826 UFS_LOCK(ump); 1827 blkno = ffs_alloccgblk(ip, bp, bpref, rsize); 1828 ACTIVECLEAR(fs, cg); 1829 UFS_UNLOCK(ump); 1830 bdwrite(bp); 1831 return (blkno); 1832 } 1833 KASSERT(size == rsize, 1834 ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); 1835 bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); 1836 if (bno < 0) 1837 goto fail; 1838 for (i = 0; i < frags; i++) 1839 
clrbit(blksfree, bno + i); 1840 cgp->cg_cs.cs_nffree -= frags; 1841 cgp->cg_frsum[allocsiz]--; 1842 if (frags != allocsiz) 1843 cgp->cg_frsum[allocsiz - frags]++; 1844 UFS_LOCK(ump); 1845 fs->fs_cstotal.cs_nffree -= frags; 1846 fs->fs_cs(fs, cg).cs_nffree -= frags; 1847 fs->fs_fmod = 1; 1848 blkno = cgbase(fs, cg) + bno; 1849 ACTIVECLEAR(fs, cg); 1850 UFS_UNLOCK(ump); 1851 if (DOINGSOFTDEP(ITOV(ip))) 1852 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); 1853 bdwrite(bp); 1854 return (blkno); 1855 1856 fail: 1857 brelse(bp); 1858 UFS_LOCK(ump); 1859 return (0); 1860 } 1861 1862 /* 1863 * Allocate a block in a cylinder group. 1864 * 1865 * This algorithm implements the following policy: 1866 * 1) allocate the requested block. 1867 * 2) allocate a rotationally optimal block in the same cylinder. 1868 * 3) allocate the next available block on the block rotor for the 1869 * specified cylinder group. 1870 * Note that this routine only allocates fs_bsize blocks; these 1871 * blocks may be fragmented by the routine that allocates them. 
1872 */ 1873 static ufs2_daddr_t 1874 ffs_alloccgblk(struct inode *ip, 1875 struct buf *bp, 1876 ufs2_daddr_t bpref, 1877 int size) 1878 { 1879 struct fs *fs; 1880 struct cg *cgp; 1881 struct ufsmount *ump; 1882 ufs1_daddr_t bno; 1883 ufs2_daddr_t blkno; 1884 uint8_t *blksfree; 1885 int i, cgbpref; 1886 1887 ump = ITOUMP(ip); 1888 fs = ump->um_fs; 1889 mtx_assert(UFS_MTX(ump), MA_OWNED); 1890 cgp = (struct cg *)bp->b_data; 1891 blksfree = cg_blksfree(cgp); 1892 if (bpref == 0) { 1893 bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag; 1894 } else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) { 1895 /* map bpref to correct zone in this cg */ 1896 if (bpref < cgdata(fs, cgbpref)) 1897 bpref = cgmeta(fs, cgp->cg_cgx); 1898 else 1899 bpref = cgdata(fs, cgp->cg_cgx); 1900 } 1901 /* 1902 * if the requested block is available, use it 1903 */ 1904 bno = dtogd(fs, blknum(fs, bpref)); 1905 if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) 1906 goto gotit; 1907 /* 1908 * Take the next available block in this cylinder group. 1909 */ 1910 bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); 1911 if (bno < 0) 1912 return (0); 1913 /* Update cg_rotor only if allocated from the data zone */ 1914 if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx))) 1915 cgp->cg_rotor = bno; 1916 gotit: 1917 blkno = fragstoblks(fs, bno); 1918 ffs_clrblock(fs, blksfree, (long)blkno); 1919 ffs_clusteracct(fs, cgp, blkno, -1); 1920 cgp->cg_cs.cs_nbfree--; 1921 fs->fs_cstotal.cs_nbfree--; 1922 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; 1923 fs->fs_fmod = 1; 1924 blkno = cgbase(fs, cgp->cg_cgx) + bno; 1925 /* 1926 * If the caller didn't want the whole block free the frags here. 
1927 */ 1928 size = numfrags(fs, size); 1929 if (size != fs->fs_frag) { 1930 bno = dtogd(fs, blkno); 1931 for (i = size; i < fs->fs_frag; i++) 1932 setbit(blksfree, bno + i); 1933 i = fs->fs_frag - size; 1934 cgp->cg_cs.cs_nffree += i; 1935 fs->fs_cstotal.cs_nffree += i; 1936 fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; 1937 fs->fs_fmod = 1; 1938 cgp->cg_frsum[i]++; 1939 } 1940 /* XXX Fixme. */ 1941 UFS_UNLOCK(ump); 1942 if (DOINGSOFTDEP(ITOV(ip))) 1943 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0); 1944 UFS_LOCK(ump); 1945 return (blkno); 1946 } 1947 1948 /* 1949 * Determine whether a cluster can be allocated. 1950 * 1951 * We do not currently check for optimal rotational layout if there 1952 * are multiple choices in the same cylinder group. Instead we just 1953 * take the first one that we find following bpref. 1954 */ 1955 static ufs2_daddr_t 1956 ffs_clusteralloc(struct inode *ip, 1957 uint64_t cg, 1958 ufs2_daddr_t bpref, 1959 int len) 1960 { 1961 struct fs *fs; 1962 struct cg *cgp; 1963 struct buf *bp; 1964 struct ufsmount *ump; 1965 int i, run, bit, map, got, error; 1966 ufs2_daddr_t bno; 1967 uint8_t *mapp; 1968 int32_t *lp; 1969 uint8_t *blksfree; 1970 1971 ump = ITOUMP(ip); 1972 fs = ump->um_fs; 1973 if (fs->fs_maxcluster[cg] < len) 1974 return (0); 1975 UFS_UNLOCK(ump); 1976 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { 1977 ffs_checkcgintegrity(fs, cg, error); 1978 UFS_LOCK(ump); 1979 return (0); 1980 } 1981 /* 1982 * Check to see if a cluster of the needed size (or bigger) is 1983 * available in this cylinder group. 1984 */ 1985 lp = &cg_clustersum(cgp)[len]; 1986 for (i = len; i <= fs->fs_contigsumsize; i++) 1987 if (*lp++ > 0) 1988 break; 1989 if (i > fs->fs_contigsumsize) { 1990 /* 1991 * This is the first time looking for a cluster in this 1992 * cylinder group. 
Update the cluster summary information 1993 * to reflect the true maximum sized cluster so that 1994 * future cluster allocation requests can avoid reading 1995 * the cylinder group map only to find no clusters. 1996 */ 1997 lp = &cg_clustersum(cgp)[len - 1]; 1998 for (i = len - 1; i > 0; i--) 1999 if (*lp-- > 0) 2000 break; 2001 UFS_LOCK(ump); 2002 fs->fs_maxcluster[cg] = i; 2003 brelse(bp); 2004 return (0); 2005 } 2006 /* 2007 * Search the cluster map to find a big enough cluster. 2008 * We take the first one that we find, even if it is larger 2009 * than we need as we prefer to get one close to the previous 2010 * block allocation. We do not search before the current 2011 * preference point as we do not want to allocate a block 2012 * that is allocated before the previous one (as we will 2013 * then have to wait for another pass of the elevator 2014 * algorithm before it will be read). We prefer to fail and 2015 * be recalled to try an allocation in the next cylinder group. 2016 */ 2017 if (dtog(fs, bpref) != cg) 2018 bpref = cgdata(fs, cg); 2019 else 2020 bpref = blknum(fs, bpref); 2021 bpref = fragstoblks(fs, dtogd(fs, bpref)); 2022 mapp = &cg_clustersfree(cgp)[bpref / NBBY]; 2023 map = *mapp++; 2024 bit = 1 << (bpref % NBBY); 2025 for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { 2026 if ((map & bit) == 0) { 2027 run = 0; 2028 } else { 2029 run++; 2030 if (run == len) 2031 break; 2032 } 2033 if ((got & (NBBY - 1)) != (NBBY - 1)) { 2034 bit <<= 1; 2035 } else { 2036 map = *mapp++; 2037 bit = 1; 2038 } 2039 } 2040 if (got >= cgp->cg_nclusterblks) { 2041 UFS_LOCK(ump); 2042 brelse(bp); 2043 return (0); 2044 } 2045 /* 2046 * Allocate the cluster that we have found. 
2047 */ 2048 blksfree = cg_blksfree(cgp); 2049 for (i = 1; i <= len; i++) 2050 if (!ffs_isblock(fs, blksfree, got - run + i)) 2051 panic("ffs_clusteralloc: map mismatch"); 2052 bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1); 2053 if (dtog(fs, bno) != cg) 2054 panic("ffs_clusteralloc: allocated out of group"); 2055 len = blkstofrags(fs, len); 2056 UFS_LOCK(ump); 2057 for (i = 0; i < len; i += fs->fs_frag) 2058 if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) 2059 panic("ffs_clusteralloc: lost block"); 2060 ACTIVECLEAR(fs, cg); 2061 UFS_UNLOCK(ump); 2062 bdwrite(bp); 2063 return (bno); 2064 } 2065 2066 static inline struct buf * 2067 getinobuf(struct inode *ip, 2068 uint64_t cg, 2069 uint32_t cginoblk, 2070 int gbflags) 2071 { 2072 struct fs *fs; 2073 2074 fs = ITOFS(ip); 2075 return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, 2076 cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0, 2077 gbflags)); 2078 } 2079 2080 /* 2081 * Synchronous inode initialization is needed only when barrier writes do not 2082 * work as advertised, and will impose a heavy cost on file creation in a newly 2083 * created filesystem. 2084 */ 2085 static int doasyncinodeinit = 1; 2086 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN, 2087 &doasyncinodeinit, 0, 2088 "Perform inode block initialization using asynchronous writes"); 2089 2090 /* 2091 * Determine whether an inode can be allocated. 2092 * 2093 * Check to see if an inode is available, and if it is, 2094 * allocate it using the following policy: 2095 * 1) allocate the requested inode. 2096 * 2) allocate the next available inode after the requested 2097 * inode in the specified cylinder group. 
2098 */ 2099 static ufs2_daddr_t 2100 ffs_nodealloccg(struct inode *ip, 2101 uint64_t cg, 2102 ufs2_daddr_t ipref, 2103 int mode, 2104 int unused) 2105 { 2106 struct fs *fs; 2107 struct cg *cgp; 2108 struct buf *bp, *ibp; 2109 struct ufsmount *ump; 2110 uint8_t *inosused, *loc; 2111 struct ufs2_dinode *dp2; 2112 int error, start, len, i; 2113 uint32_t old_initediblk; 2114 2115 ump = ITOUMP(ip); 2116 fs = ump->um_fs; 2117 check_nifree: 2118 if (fs->fs_cs(fs, cg).cs_nifree == 0) 2119 return (0); 2120 UFS_UNLOCK(ump); 2121 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { 2122 ffs_checkcgintegrity(fs, cg, error); 2123 UFS_LOCK(ump); 2124 return (0); 2125 } 2126 restart: 2127 if (cgp->cg_cs.cs_nifree == 0) { 2128 brelse(bp); 2129 UFS_LOCK(ump); 2130 return (0); 2131 } 2132 inosused = cg_inosused(cgp); 2133 if (ipref) { 2134 ipref %= fs->fs_ipg; 2135 if (isclr(inosused, ipref)) 2136 goto gotit; 2137 } 2138 start = cgp->cg_irotor / NBBY; 2139 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); 2140 loc = memcchr(&inosused[start], 0xff, len); 2141 if (loc == NULL) { 2142 len = start + 1; 2143 start = 0; 2144 loc = memcchr(&inosused[start], 0xff, len); 2145 if (loc == NULL) { 2146 printf("cg = %ju, irotor = %ld, fs = %s\n", 2147 (intmax_t)cg, (long)cgp->cg_irotor, fs->fs_fsmnt); 2148 panic("ffs_nodealloccg: map corrupted"); 2149 /* NOTREACHED */ 2150 } 2151 } 2152 ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1; 2153 gotit: 2154 /* 2155 * Check to see if we need to initialize more inodes. 2156 */ 2157 if (fs->fs_magic == FS_UFS2_MAGIC && 2158 ipref + INOPB(fs) > cgp->cg_initediblk && 2159 cgp->cg_initediblk < cgp->cg_niblk) { 2160 old_initediblk = cgp->cg_initediblk; 2161 2162 /* 2163 * Free the cylinder group lock before writing the 2164 * initialized inode block. Entering the 2165 * babarrierwrite() with the cylinder group lock 2166 * causes lock order violation between the lock and 2167 * snaplk. 
2168 * 2169 * Another thread can decide to initialize the same 2170 * inode block, but whichever thread first gets the 2171 * cylinder group lock after writing the newly 2172 * allocated inode block will update it and the other 2173 * will realize that it has lost and leave the 2174 * cylinder group unchanged. 2175 */ 2176 ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); 2177 brelse(bp); 2178 if (ibp == NULL) { 2179 /* 2180 * The inode block buffer is already owned by 2181 * another thread, which must initialize it. 2182 * Wait on the buffer to allow another thread 2183 * to finish the updates, with dropped cg 2184 * buffer lock, then retry. 2185 */ 2186 ibp = getinobuf(ip, cg, old_initediblk, 0); 2187 brelse(ibp); 2188 UFS_LOCK(ump); 2189 goto check_nifree; 2190 } 2191 bzero(ibp->b_data, (int)fs->fs_bsize); 2192 dp2 = (struct ufs2_dinode *)(ibp->b_data); 2193 for (i = 0; i < INOPB(fs); i++) { 2194 while (dp2->di_gen == 0) 2195 dp2->di_gen = arc4random(); 2196 dp2++; 2197 } 2198 2199 /* 2200 * Rather than adding a soft updates dependency to ensure 2201 * that the new inode block is written before it is claimed 2202 * by the cylinder group map, we just do a barrier write 2203 * here. The barrier write will ensure that the inode block 2204 * gets written before the updated cylinder group map can be 2205 * written. The barrier write should only slow down bulk 2206 * loading of newly created filesystems. 2207 */ 2208 if (doasyncinodeinit) 2209 babarrierwrite(ibp); 2210 else 2211 bwrite(ibp); 2212 2213 /* 2214 * After the inode block is written, try to update the 2215 * cg initediblk pointer. If another thread beat us 2216 * to it, then leave it unchanged as the other thread 2217 * has already set it correctly. 
2218 */ 2219 error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp); 2220 UFS_LOCK(ump); 2221 ACTIVECLEAR(fs, cg); 2222 UFS_UNLOCK(ump); 2223 if (error != 0) 2224 return (error); 2225 if (cgp->cg_initediblk == old_initediblk) 2226 cgp->cg_initediblk += INOPB(fs); 2227 goto restart; 2228 } 2229 cgp->cg_irotor = ipref; 2230 UFS_LOCK(ump); 2231 ACTIVECLEAR(fs, cg); 2232 setbit(inosused, ipref); 2233 cgp->cg_cs.cs_nifree--; 2234 fs->fs_cstotal.cs_nifree--; 2235 fs->fs_cs(fs, cg).cs_nifree--; 2236 fs->fs_fmod = 1; 2237 if ((mode & IFMT) == IFDIR) { 2238 cgp->cg_cs.cs_ndir++; 2239 fs->fs_cstotal.cs_ndir++; 2240 fs->fs_cs(fs, cg).cs_ndir++; 2241 } 2242 UFS_UNLOCK(ump); 2243 if (DOINGSOFTDEP(ITOV(ip))) 2244 softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode); 2245 bdwrite(bp); 2246 return ((ino_t)(cg * fs->fs_ipg + ipref)); 2247 } 2248 2249 /* 2250 * Free a block or fragment. 2251 * 2252 * The specified block or fragment is placed back in the 2253 * free map. If a fragment is deallocated, a possible 2254 * block reassembly is checked. 2255 */ 2256 static void 2257 ffs_blkfree_cg(struct ufsmount *ump, 2258 struct fs *fs, 2259 struct vnode *devvp, 2260 ufs2_daddr_t bno, 2261 long size, 2262 ino_t inum, 2263 struct workhead *dephd) 2264 { 2265 struct mount *mp; 2266 struct cg *cgp; 2267 struct buf *bp; 2268 daddr_t dbn; 2269 ufs1_daddr_t fragno, cgbno; 2270 int i, blk, frags, bbase, error; 2271 uint64_t cg; 2272 uint8_t *blksfree; 2273 struct cdev *dev; 2274 2275 cg = dtog(fs, bno); 2276 if (devvp->v_type == VREG) { 2277 /* devvp is a snapshot */ 2278 MPASS(devvp->v_mount->mnt_data == ump); 2279 dev = ump->um_devvp->v_rdev; 2280 } else if (devvp->v_type == VCHR) { 2281 /* 2282 * devvp is a normal disk device 2283 * XXXKIB: devvp is not locked there, v_rdev access depends on 2284 * busy mount, which prevents mntfs devvp from reclamation. 
2285 */ 2286 dev = devvp->v_rdev; 2287 } else 2288 return; 2289 #ifdef INVARIANTS 2290 if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0 || 2291 fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { 2292 printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n", 2293 devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize, 2294 size, fs->fs_fsmnt); 2295 panic("ffs_blkfree_cg: invalid size"); 2296 } 2297 #endif 2298 if ((uint64_t)bno >= fs->fs_size) { 2299 printf("bad block %jd, ino %ju\n", (intmax_t)bno, 2300 (intmax_t)inum); 2301 ffs_fserr(fs, inum, "bad block"); 2302 return; 2303 } 2304 if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { 2305 if (!MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) 2306 return; 2307 /* 2308 * Would like to just downgrade to read-only. Until that 2309 * capability is available, just toss the cylinder group 2310 * update and mark the filesystem as needing to run fsck. 2311 */ 2312 fs->fs_flags |= FS_NEEDSFSCK; 2313 if (devvp->v_type == VREG) 2314 dbn = fragstoblks(fs, cgtod(fs, cg)); 2315 else 2316 dbn = fsbtodb(fs, cgtod(fs, cg)); 2317 error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); 2318 KASSERT(error == 0, ("getblkx failed")); 2319 softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, 2320 numfrags(fs, size), dephd, true); 2321 bp->b_flags |= B_RELBUF | B_NOCACHE; 2322 bp->b_flags &= ~B_CACHE; 2323 bawrite(bp); 2324 return; 2325 } 2326 cgbno = dtogd(fs, bno); 2327 blksfree = cg_blksfree(cgp); 2328 UFS_LOCK(ump); 2329 if (size == fs->fs_bsize) { 2330 fragno = fragstoblks(fs, cgbno); 2331 if (!ffs_isfreeblock(fs, blksfree, fragno)) { 2332 if (devvp->v_type == VREG) { 2333 UFS_UNLOCK(ump); 2334 /* devvp is a snapshot */ 2335 brelse(bp); 2336 return; 2337 } 2338 printf("dev = %s, block = %jd, fs = %s\n", 2339 devtoname(dev), (intmax_t)bno, fs->fs_fsmnt); 2340 panic("ffs_blkfree_cg: freeing free block"); 2341 } 2342 ffs_setblock(fs, blksfree, fragno); 2343 ffs_clusteracct(fs, cgp, fragno, 1); 
2344 cgp->cg_cs.cs_nbfree++; 2345 fs->fs_cstotal.cs_nbfree++; 2346 fs->fs_cs(fs, cg).cs_nbfree++; 2347 } else { 2348 bbase = cgbno - fragnum(fs, cgbno); 2349 /* 2350 * decrement the counts associated with the old frags 2351 */ 2352 blk = blkmap(fs, blksfree, bbase); 2353 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 2354 /* 2355 * deallocate the fragment 2356 */ 2357 frags = numfrags(fs, size); 2358 for (i = 0; i < frags; i++) { 2359 if (isset(blksfree, cgbno + i)) { 2360 printf("dev = %s, block = %jd, fs = %s\n", 2361 devtoname(dev), (intmax_t)(bno + i), 2362 fs->fs_fsmnt); 2363 panic("ffs_blkfree_cg: freeing free frag"); 2364 } 2365 setbit(blksfree, cgbno + i); 2366 } 2367 cgp->cg_cs.cs_nffree += i; 2368 fs->fs_cstotal.cs_nffree += i; 2369 fs->fs_cs(fs, cg).cs_nffree += i; 2370 /* 2371 * add back in counts associated with the new frags 2372 */ 2373 blk = blkmap(fs, blksfree, bbase); 2374 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 2375 /* 2376 * if a complete block has been reassembled, account for it 2377 */ 2378 fragno = fragstoblks(fs, bbase); 2379 if (ffs_isblock(fs, blksfree, fragno)) { 2380 cgp->cg_cs.cs_nffree -= fs->fs_frag; 2381 fs->fs_cstotal.cs_nffree -= fs->fs_frag; 2382 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; 2383 ffs_clusteracct(fs, cgp, fragno, 1); 2384 cgp->cg_cs.cs_nbfree++; 2385 fs->fs_cstotal.cs_nbfree++; 2386 fs->fs_cs(fs, cg).cs_nbfree++; 2387 } 2388 } 2389 fs->fs_fmod = 1; 2390 ACTIVECLEAR(fs, cg); 2391 UFS_UNLOCK(ump); 2392 mp = UFSTOVFS(ump); 2393 if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR) 2394 softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, 2395 numfrags(fs, size), dephd, false); 2396 bdwrite(bp); 2397 } 2398 2399 /* 2400 * Structures and routines associated with trim management. 2401 * 2402 * The following requests are passed to trim_lookup to indicate 2403 * the actions that should be taken. 
*/
#define	NEW	1	/* if found, error else allocate and hash it */
#define	OLD	2	/* if not found, error, else return it */
#define	REPLACE	3	/* if not found, error else unhash and reallocate it */
#define	DONE	4	/* if not found, error else unhash and return it */
#define	SINGLE	5	/* don't look up, just allocate it and don't hash it */

MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures");

/*
 * Select the hash chain for a key.
 * NOTE(review): the key is ANDed with um_trimlisthashsize, so that field
 * presumably holds a hash mask rather than a bucket count -- confirm.
 */
#define	TRIMLIST_HASH(ump, key) \
	(&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize])

/*
 * These structures describe each of the block free requests aggregated
 * together to make up a trim request.
 */
struct trim_blkreq {
	TAILQ_ENTRY(trim_blkreq) blkreqlist;	/* link on the request's blklist */
	ufs2_daddr_t bno;		/* first fragment to free */
	long size;			/* bytes to free */
	struct workhead *pdephd;	/* NULL or pointer to dephd below */
	struct workhead dephd;		/* work list taken over from the caller */
};

/*
 * Description of a trim request.
 */
struct ffs_blkfree_trim_params {
	TAILQ_HEAD(, trim_blkreq) blklist;	/* block frees covered by this trim */
	LIST_ENTRY(ffs_blkfree_trim_params) hashlist;	/* hash chain linkage */
	struct task task;		/* task queued when the trim completes */
	struct ufsmount *ump;		/* mount the request belongs to */
	struct vnode *devvp;		/* device vnode passed to ffs_blkfree_cg() */
	ino_t inum;			/* inode the blocks belonged to */
	ufs2_daddr_t bno;		/* first fragment of the range to trim */
	long size;			/* bytes in the range; 0 if nothing added yet */
	long key;			/* key the request is looked up by */
};

static void	ffs_blkfree_trim_completed(struct buf *);
static void	ffs_blkfree_trim_task(void *ctx, int pending __unused);
static struct	ffs_blkfree_trim_params *trim_lookup(struct ufsmount *,
		    struct vnode *, ufs2_daddr_t, long, ino_t, uint64_t, int);
static void	ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *);

/*
 * Called on trim completion to start a task to free the associated block(s).
2451 */ 2452 static void 2453 ffs_blkfree_trim_completed(struct buf *bp) 2454 { 2455 struct ffs_blkfree_trim_params *tp; 2456 2457 tp = bp->b_fsprivate1; 2458 free(bp, M_TRIM); 2459 TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); 2460 taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); 2461 } 2462 2463 /* 2464 * Trim completion task that free associated block(s). 2465 */ 2466 static void 2467 ffs_blkfree_trim_task(void *ctx, int pending) 2468 { 2469 struct ffs_blkfree_trim_params *tp; 2470 struct trim_blkreq *blkelm; 2471 struct ufsmount *ump; 2472 2473 tp = ctx; 2474 ump = tp->ump; 2475 while ((blkelm = TAILQ_FIRST(&tp->blklist)) != NULL) { 2476 ffs_blkfree_cg(ump, ump->um_fs, tp->devvp, blkelm->bno, 2477 blkelm->size, tp->inum, blkelm->pdephd); 2478 TAILQ_REMOVE(&tp->blklist, blkelm, blkreqlist); 2479 free(blkelm, M_TRIM); 2480 } 2481 vn_finished_secondary_write(UFSTOVFS(ump)); 2482 UFS_LOCK(ump); 2483 ump->um_trim_inflight -= 1; 2484 ump->um_trim_inflight_blks -= numfrags(ump->um_fs, tp->size); 2485 UFS_UNLOCK(ump); 2486 free(tp, M_TRIM); 2487 } 2488 2489 /* 2490 * Lookup a trim request by inode number. 2491 * Allocate if requested (NEW, REPLACE, SINGLE). 
2492 */ 2493 static struct ffs_blkfree_trim_params * 2494 trim_lookup(struct ufsmount *ump, 2495 struct vnode *devvp, 2496 ufs2_daddr_t bno, 2497 long size, 2498 ino_t inum, 2499 uint64_t key, 2500 int alloctype) 2501 { 2502 struct trimlist_hashhead *tphashhead; 2503 struct ffs_blkfree_trim_params *tp, *ntp; 2504 2505 ntp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK); 2506 if (alloctype != SINGLE) { 2507 KASSERT(key >= FIRST_VALID_KEY, ("trim_lookup: invalid key")); 2508 UFS_LOCK(ump); 2509 tphashhead = TRIMLIST_HASH(ump, key); 2510 LIST_FOREACH(tp, tphashhead, hashlist) 2511 if (key == tp->key) 2512 break; 2513 } 2514 switch (alloctype) { 2515 case NEW: 2516 KASSERT(tp == NULL, ("trim_lookup: found trim")); 2517 break; 2518 case OLD: 2519 KASSERT(tp != NULL, 2520 ("trim_lookup: missing call to ffs_blkrelease_start()")); 2521 UFS_UNLOCK(ump); 2522 free(ntp, M_TRIM); 2523 return (tp); 2524 case REPLACE: 2525 KASSERT(tp != NULL, ("trim_lookup: missing REPLACE trim")); 2526 LIST_REMOVE(tp, hashlist); 2527 /* tp will be freed by caller */ 2528 break; 2529 case DONE: 2530 KASSERT(tp != NULL, ("trim_lookup: missing DONE trim")); 2531 LIST_REMOVE(tp, hashlist); 2532 UFS_UNLOCK(ump); 2533 free(ntp, M_TRIM); 2534 return (tp); 2535 } 2536 TAILQ_INIT(&ntp->blklist); 2537 ntp->ump = ump; 2538 ntp->devvp = devvp; 2539 ntp->bno = bno; 2540 ntp->size = size; 2541 ntp->inum = inum; 2542 ntp->key = key; 2543 if (alloctype != SINGLE) { 2544 LIST_INSERT_HEAD(tphashhead, ntp, hashlist); 2545 UFS_UNLOCK(ump); 2546 } 2547 return (ntp); 2548 } 2549 2550 /* 2551 * Dispatch a trim request. 2552 */ 2553 static void 2554 ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *tp) 2555 { 2556 struct ufsmount *ump; 2557 struct mount *mp; 2558 struct buf *bp; 2559 2560 /* 2561 * Postpone the set of the free bit in the cg bitmap until the 2562 * BIO_DELETE is completed. 
Otherwise, due to disk queue 2563 * reordering, TRIM might be issued after we reuse the block 2564 * and write some new data into it. 2565 */ 2566 ump = tp->ump; 2567 bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO); 2568 bp->b_iocmd = BIO_DELETE; 2569 bp->b_iooffset = dbtob(fsbtodb(ump->um_fs, tp->bno)); 2570 bp->b_iodone = ffs_blkfree_trim_completed; 2571 bp->b_bcount = tp->size; 2572 bp->b_fsprivate1 = tp; 2573 UFS_LOCK(ump); 2574 ump->um_trim_total += 1; 2575 ump->um_trim_inflight += 1; 2576 ump->um_trim_inflight_blks += numfrags(ump->um_fs, tp->size); 2577 ump->um_trim_total_blks += numfrags(ump->um_fs, tp->size); 2578 UFS_UNLOCK(ump); 2579 2580 mp = UFSTOVFS(ump); 2581 vn_start_secondary_write(NULL, &mp, 0); 2582 g_vfs_strategy(ump->um_bo, bp); 2583 } 2584 2585 /* 2586 * Allocate a new key to use to identify a range of blocks. 2587 */ 2588 uint64_t 2589 ffs_blkrelease_start(struct ufsmount *ump, 2590 struct vnode *devvp, 2591 ino_t inum) 2592 { 2593 static u_long masterkey; 2594 uint64_t key; 2595 2596 if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) 2597 return (SINGLETON_KEY); 2598 do { 2599 key = atomic_fetchadd_long(&masterkey, 1); 2600 } while (key < FIRST_VALID_KEY); 2601 (void) trim_lookup(ump, devvp, 0, 0, inum, key, NEW); 2602 return (key); 2603 } 2604 2605 /* 2606 * Deallocate a key that has been used to identify a range of blocks. 2607 */ 2608 void 2609 ffs_blkrelease_finish(struct ufsmount *ump, uint64_t key) 2610 { 2611 struct ffs_blkfree_trim_params *tp; 2612 2613 if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) 2614 return; 2615 /* 2616 * If the vfs.ffs.dotrimcons sysctl option is enabled while 2617 * a file deletion is active, specifically after a call 2618 * to ffs_blkrelease_start() but before the call to 2619 * ffs_blkrelease_finish(), ffs_blkrelease_start() will 2620 * have handed out SINGLETON_KEY rather than starting a 2621 * collection sequence. 
Thus if we get a SINGLETON_KEY 2622 * passed to ffs_blkrelease_finish(), we just return rather 2623 * than trying to finish the nonexistent sequence. 2624 */ 2625 if (key == SINGLETON_KEY) { 2626 #ifdef INVARIANTS 2627 printf("%s: vfs.ffs.dotrimcons enabled on active filesystem\n", 2628 ump->um_mountp->mnt_stat.f_mntonname); 2629 #endif 2630 return; 2631 } 2632 /* 2633 * We are done with sending blocks using this key. Look up the key 2634 * using the DONE alloctype (in tp) to request that it be unhashed 2635 * as we will not be adding to it. If the key has never been used, 2636 * tp->size will be zero, so we can just free tp. Otherwise the call 2637 * to ffs_blkfree_sendtrim(tp) causes the block range described by 2638 * tp to be issued (and then tp to be freed). 2639 */ 2640 tp = trim_lookup(ump, NULL, 0, 0, 0, key, DONE); 2641 if (tp->size == 0) 2642 free(tp, M_TRIM); 2643 else 2644 ffs_blkfree_sendtrim(tp); 2645 } 2646 2647 /* 2648 * Setup to free a block or fragment. 2649 * 2650 * Check for snapshots that might want to claim the block. 2651 * If trims are requested, prepare a trim request. Attempt to 2652 * aggregate consecutive blocks into a single trim request. 2653 */ 2654 void 2655 ffs_blkfree(struct ufsmount *ump, 2656 struct fs *fs, 2657 struct vnode *devvp, 2658 ufs2_daddr_t bno, 2659 long size, 2660 ino_t inum, 2661 __enum_uint8(vtype) vtype, 2662 struct workhead *dephd, 2663 uint64_t key) 2664 { 2665 struct ffs_blkfree_trim_params *tp, *ntp; 2666 struct trim_blkreq *blkelm; 2667 2668 /* 2669 * Check to see if a snapshot wants to claim the block. 2670 * Check that devvp is a normal disk device, not a snapshot, 2671 * it has a snapshot(s) associated with it, and one of the 2672 * snapshots wants to claim the block. 
2673 */ 2674 if (devvp->v_type == VCHR && 2675 (devvp->v_vflag & VV_COPYONWRITE) && 2676 ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { 2677 return; 2678 } 2679 /* 2680 * Nothing to delay if TRIM is not required for this block or TRIM 2681 * is disabled or the operation is performed on a snapshot. 2682 */ 2683 if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) || 2684 devvp->v_type == VREG) { 2685 ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); 2686 return; 2687 } 2688 blkelm = malloc(sizeof(struct trim_blkreq), M_TRIM, M_WAITOK); 2689 blkelm->bno = bno; 2690 blkelm->size = size; 2691 if (dephd == NULL) { 2692 blkelm->pdephd = NULL; 2693 } else { 2694 LIST_INIT(&blkelm->dephd); 2695 LIST_SWAP(dephd, &blkelm->dephd, worklist, wk_list); 2696 blkelm->pdephd = &blkelm->dephd; 2697 } 2698 if (key == SINGLETON_KEY) { 2699 /* 2700 * Just a single non-contiguous piece. Use the SINGLE 2701 * alloctype to return a trim request that will not be 2702 * hashed for future lookup. 2703 */ 2704 tp = trim_lookup(ump, devvp, bno, size, inum, key, SINGLE); 2705 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2706 ffs_blkfree_sendtrim(tp); 2707 return; 2708 } 2709 /* 2710 * The callers of this function are not tracking whether or not 2711 * the blocks are contiguous. They are just saying that they 2712 * are freeing a set of blocks. It is this code that determines 2713 * the pieces of that range that are actually contiguous. 2714 * 2715 * Calling ffs_blkrelease_start() will have created an entry 2716 * that we will use. 2717 */ 2718 tp = trim_lookup(ump, devvp, bno, size, inum, key, OLD); 2719 if (tp->size == 0) { 2720 /* 2721 * First block of a potential range, set block and size 2722 * for the trim block. 
2723 */ 2724 tp->bno = bno; 2725 tp->size = size; 2726 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2727 return; 2728 } 2729 /* 2730 * If this block is a continuation of the range (either 2731 * follows at the end or preceeds in the front) then we 2732 * add it to the front or back of the list and return. 2733 * 2734 * If it is not a continuation of the trim that we were 2735 * building, using the REPLACE alloctype, we request that 2736 * the old trim request (still in tp) be unhashed and a 2737 * new range started (in ntp). The ffs_blkfree_sendtrim(tp) 2738 * call causes the block range described by tp to be issued 2739 * (and then tp to be freed). 2740 */ 2741 if (bno + numfrags(fs, size) == tp->bno) { 2742 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2743 tp->bno = bno; 2744 tp->size += size; 2745 return; 2746 } else if (bno == tp->bno + numfrags(fs, tp->size)) { 2747 TAILQ_INSERT_TAIL(&tp->blklist, blkelm, blkreqlist); 2748 tp->size += size; 2749 return; 2750 } 2751 ntp = trim_lookup(ump, devvp, bno, size, inum, key, REPLACE); 2752 TAILQ_INSERT_HEAD(&ntp->blklist, blkelm, blkreqlist); 2753 ffs_blkfree_sendtrim(tp); 2754 } 2755 2756 #ifdef INVARIANTS 2757 /* 2758 * Verify allocation of a block or fragment. 2759 * Return 1 if block or fragment is free. 
2760 */ 2761 static int 2762 ffs_checkfreeblk(struct inode *ip, 2763 ufs2_daddr_t bno, 2764 long size) 2765 { 2766 struct fs *fs; 2767 struct cg *cgp; 2768 struct buf *bp; 2769 ufs1_daddr_t cgbno; 2770 int i, frags, blkalloced; 2771 uint8_t *blksfree; 2772 2773 fs = ITOFS(ip); 2774 if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) { 2775 printf("bsize = %ld, size = %ld, fs = %s\n", 2776 (long)fs->fs_bsize, size, fs->fs_fsmnt); 2777 panic("ffs_checkfreeblk: bad size"); 2778 } 2779 if ((uint64_t)bno >= fs->fs_size) 2780 panic("ffs_checkfreeblk: too big block %jd", (intmax_t)bno); 2781 if (ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), 0, &bp, &cgp) != 0) 2782 return (0); 2783 blksfree = cg_blksfree(cgp); 2784 cgbno = dtogd(fs, bno); 2785 if (size == fs->fs_bsize) { 2786 blkalloced = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); 2787 } else { 2788 frags = numfrags(fs, size); 2789 for (blkalloced = 0, i = 0; i < frags; i++) 2790 if (isset(blksfree, cgbno + i)) 2791 blkalloced++; 2792 if (blkalloced != 0 && blkalloced != frags) 2793 panic("ffs_checkfreeblk: partially free fragment"); 2794 } 2795 brelse(bp); 2796 return (blkalloced == 0); 2797 } 2798 #endif /* INVARIANTS */ 2799 2800 /* 2801 * Free an inode. 2802 */ 2803 int 2804 ffs_vfree(struct vnode *pvp, 2805 ino_t ino, 2806 int mode) 2807 { 2808 struct ufsmount *ump; 2809 2810 if (DOINGSOFTDEP(pvp)) { 2811 softdep_freefile(pvp, ino, mode); 2812 return (0); 2813 } 2814 ump = VFSTOUFS(pvp->v_mount); 2815 return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL)); 2816 } 2817 2818 /* 2819 * Do the actual free operation. 2820 * The specified inode is placed back in the free map. 
2821 */ 2822 int 2823 ffs_freefile(struct ufsmount *ump, 2824 struct fs *fs, 2825 struct vnode *devvp, 2826 ino_t ino, 2827 int mode, 2828 struct workhead *wkhd) 2829 { 2830 struct cg *cgp; 2831 struct buf *bp; 2832 daddr_t dbn; 2833 int error; 2834 uint64_t cg; 2835 uint8_t *inosused; 2836 struct cdev *dev; 2837 ino_t cgino; 2838 2839 cg = ino_to_cg(fs, ino); 2840 if (devvp->v_type == VREG) { 2841 /* devvp is a snapshot */ 2842 MPASS(devvp->v_mount->mnt_data == ump); 2843 dev = ump->um_devvp->v_rdev; 2844 } else if (devvp->v_type == VCHR) { 2845 /* devvp is a normal disk device */ 2846 dev = devvp->v_rdev; 2847 } else { 2848 bp = NULL; 2849 return (0); 2850 } 2851 if (ino >= fs->fs_ipg * fs->fs_ncg) 2852 panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s", 2853 devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); 2854 if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { 2855 if (!MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) 2856 return (error); 2857 /* 2858 * Would like to just downgrade to read-only. Until that 2859 * capability is available, just toss the cylinder group 2860 * update and mark the filesystem as needing to run fsck. 
2861 */ 2862 fs->fs_flags |= FS_NEEDSFSCK; 2863 if (devvp->v_type == VREG) 2864 dbn = fragstoblks(fs, cgtod(fs, cg)); 2865 else 2866 dbn = fsbtodb(fs, cgtod(fs, cg)); 2867 error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); 2868 KASSERT(error == 0, ("getblkx failed")); 2869 softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd, true); 2870 bp->b_flags |= B_RELBUF | B_NOCACHE; 2871 bp->b_flags &= ~B_CACHE; 2872 bawrite(bp); 2873 return (error); 2874 } 2875 inosused = cg_inosused(cgp); 2876 cgino = ino % fs->fs_ipg; 2877 if (isclr(inosused, cgino)) { 2878 printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev), 2879 (uintmax_t)ino, fs->fs_fsmnt); 2880 if (fs->fs_ronly == 0) 2881 panic("ffs_freefile: freeing free inode"); 2882 } 2883 clrbit(inosused, cgino); 2884 if (cgino < cgp->cg_irotor) 2885 cgp->cg_irotor = cgino; 2886 cgp->cg_cs.cs_nifree++; 2887 UFS_LOCK(ump); 2888 fs->fs_cstotal.cs_nifree++; 2889 fs->fs_cs(fs, cg).cs_nifree++; 2890 if ((mode & IFMT) == IFDIR) { 2891 cgp->cg_cs.cs_ndir--; 2892 fs->fs_cstotal.cs_ndir--; 2893 fs->fs_cs(fs, cg).cs_ndir--; 2894 } 2895 fs->fs_fmod = 1; 2896 ACTIVECLEAR(fs, cg); 2897 UFS_UNLOCK(ump); 2898 if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR) 2899 softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd, false); 2900 bdwrite(bp); 2901 return (0); 2902 } 2903 2904 /* 2905 * Check to see if a file is free. 2906 * Used to check for allocated files in snapshots. 2907 * Return 1 if file is free. 
2908 */ 2909 int 2910 ffs_checkfreefile(struct fs *fs, 2911 struct vnode *devvp, 2912 ino_t ino) 2913 { 2914 struct cg *cgp; 2915 struct buf *bp; 2916 int ret, error; 2917 uint64_t cg; 2918 uint8_t *inosused; 2919 2920 cg = ino_to_cg(fs, ino); 2921 if ((devvp->v_type != VREG) && (devvp->v_type != VCHR)) 2922 return (1); 2923 if (ino >= fs->fs_ipg * fs->fs_ncg) 2924 return (1); 2925 if ((error = ffs_getcg(fs, devvp, cg, 0, &bp, &cgp)) != 0) 2926 return (1); 2927 inosused = cg_inosused(cgp); 2928 ino %= fs->fs_ipg; 2929 ret = isclr(inosused, ino); 2930 brelse(bp); 2931 return (ret); 2932 } 2933 2934 /* 2935 * Find a block of the specified size in the specified cylinder group. 2936 * 2937 * It is a panic if a request is made to find a block if none are 2938 * available. 2939 */ 2940 static ufs1_daddr_t 2941 ffs_mapsearch(struct fs *fs, 2942 struct cg *cgp, 2943 ufs2_daddr_t bpref, 2944 int allocsiz) 2945 { 2946 ufs1_daddr_t bno; 2947 int start, len, loc, i; 2948 int blk, field, subfield, pos; 2949 uint8_t *blksfree; 2950 2951 /* 2952 * find the fragment by searching through the free block 2953 * map for an appropriate bit pattern 2954 */ 2955 if (bpref) 2956 start = dtogd(fs, bpref) / NBBY; 2957 else 2958 start = cgp->cg_frotor / NBBY; 2959 blksfree = cg_blksfree(cgp); 2960 len = howmany(fs->fs_fpg, NBBY) - start; 2961 loc = scanc((uint64_t)len, (uint8_t *)&blksfree[start], 2962 fragtbl[fs->fs_frag], 2963 (uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 2964 if (loc == 0) { 2965 len = start + 1; 2966 start = 0; 2967 loc = scanc((uint64_t)len, (uint8_t *)&blksfree[0], 2968 fragtbl[fs->fs_frag], 2969 (uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 2970 if (loc == 0) { 2971 printf("start = %d, len = %d, fs = %s\n", 2972 start, len, fs->fs_fsmnt); 2973 panic("ffs_alloccg: map corrupted"); 2974 /* NOTREACHED */ 2975 } 2976 } 2977 bno = (start + len - loc) * NBBY; 2978 cgp->cg_frotor = bno; 2979 /* 2980 * found the byte in the map 2981 * sift through the bits 
to find the selected frag 2982 */ 2983 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { 2984 blk = blkmap(fs, blksfree, bno); 2985 blk <<= 1; 2986 field = around[allocsiz]; 2987 subfield = inside[allocsiz]; 2988 for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { 2989 if ((blk & field) == subfield) 2990 return (bno + pos); 2991 field <<= 1; 2992 subfield <<= 1; 2993 } 2994 } 2995 printf("bno = %ju, fs = %s\n", (intmax_t)bno, fs->fs_fsmnt); 2996 panic("ffs_alloccg: block not in map"); 2997 return (-1); 2998 } 2999 3000 /* 3001 * Fetch and verify a cylinder group. 3002 */ 3003 int 3004 ffs_getcg(struct fs *fs, 3005 struct vnode *devvp, 3006 uint64_t cg, 3007 int flags, 3008 struct buf **bpp, 3009 struct cg **cgpp) 3010 { 3011 struct buf *bp; 3012 struct cg *cgp; 3013 struct mount *mp; 3014 const struct statfs *sfs; 3015 daddr_t blkno; 3016 int error; 3017 3018 *bpp = NULL; 3019 *cgpp = NULL; 3020 if ((fs->fs_metackhash & CK_CYLGRP) != 0) 3021 flags |= GB_CKHASH; 3022 if (devvp->v_type == VCHR) { 3023 blkno = fsbtodb(fs, cgtod(fs, cg)); 3024 mp = devvp->v_rdev->si_mountpt; 3025 } else { 3026 blkno = fragstoblks(fs, cgtod(fs, cg)); 3027 mp = devvp->v_mount; 3028 } 3029 error = breadn_flags(devvp, blkno, blkno, (int)fs->fs_cgsize, NULL, 3030 NULL, 0, NOCRED, flags, ffs_ckhash_cg, &bp); 3031 if (error != 0) 3032 return (error); 3033 cgp = (struct cg *)bp->b_data; 3034 if ((fs->fs_metackhash & CK_CYLGRP) != 0 && 3035 (bp->b_flags & B_CKHASH) != 0 && 3036 cgp->cg_ckhash != bp->b_ckhash) { 3037 if (ppsratecheck(&VFSTOUFS(mp)->um_last_integritymsg, 3038 &VFSTOUFS(mp)->um_secs_integritymsg, 1)) { 3039 sfs = &mp->mnt_stat; 3040 printf("UFS %s%s (%s) cylinder checkhash failed: " 3041 "cg %ju, cgp: 0x%x != bp: 0x%jx\n", 3042 devvp->v_type == VCHR ? 
"" : "snapshot of ", 3043 sfs->f_mntfromname, sfs->f_mntonname, (intmax_t)cg, 3044 cgp->cg_ckhash, (uintmax_t)bp->b_ckhash); 3045 } 3046 bp->b_flags &= ~B_CKHASH; 3047 bp->b_flags |= B_INVAL | B_NOCACHE; 3048 brelse(bp); 3049 return (EINTEGRITY); 3050 } 3051 if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) { 3052 if (ppsratecheck(&VFSTOUFS(mp)->um_last_integritymsg, 3053 &VFSTOUFS(mp)->um_secs_integritymsg, 1)) { 3054 sfs = &mp->mnt_stat; 3055 printf("UFS %s%s (%s)", 3056 devvp->v_type == VCHR ? "" : "snapshot of ", 3057 sfs->f_mntfromname, sfs->f_mntonname); 3058 if (!cg_chkmagic(cgp)) 3059 printf(" cg %ju: bad magic number 0x%x should " 3060 "be 0x%x\n", (intmax_t)cg, cgp->cg_magic, 3061 CG_MAGIC); 3062 else 3063 printf(": wrong cylinder group cg %ju != " 3064 "cgx %u\n", (intmax_t)cg, cgp->cg_cgx); 3065 } 3066 bp->b_flags &= ~B_CKHASH; 3067 bp->b_flags |= B_INVAL | B_NOCACHE; 3068 brelse(bp); 3069 return (EINTEGRITY); 3070 } 3071 bp->b_flags &= ~B_CKHASH; 3072 bp->b_xflags |= BX_BKGRDWRITE; 3073 /* 3074 * If we are using check hashes on the cylinder group then we want 3075 * to limit changing the cylinder group time to when we are actually 3076 * going to write it to disk so that its check hash remains correct 3077 * in memory. If the CK_CYLGRP flag is set the time is updated in 3078 * ffs_bufwrite() as the buffer is queued for writing. Otherwise we 3079 * update the time here as we have done historically. 
3080 */ 3081 if ((fs->fs_metackhash & CK_CYLGRP) != 0) 3082 bp->b_xflags |= BX_CYLGRP; 3083 else 3084 cgp->cg_old_time = cgp->cg_time = time_second; 3085 *bpp = bp; 3086 *cgpp = cgp; 3087 return (0); 3088 } 3089 3090 static void 3091 ffs_ckhash_cg(struct buf *bp) 3092 { 3093 uint32_t ckhash; 3094 struct cg *cgp; 3095 3096 cgp = (struct cg *)bp->b_data; 3097 ckhash = cgp->cg_ckhash; 3098 cgp->cg_ckhash = 0; 3099 bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); 3100 cgp->cg_ckhash = ckhash; 3101 } 3102 3103 /* 3104 * Called when a cylinder group read has failed. If an integrity check 3105 * is the cause of failure then the cylinder group will not be usable 3106 * until the filesystem has been unmounted and fsck has been run to 3107 * repair it. To avoid future attempts to allocate resources from the 3108 * cylinder group, its available resources are set to zero in the 3109 * superblock summary information. Since it will appear to have no 3110 * resources available, no further calls will be made to allocate 3111 * resources from it. When resources are freed to the cylinder group 3112 * the resource free routines will find the cylinder group unusable so 3113 * the resource will simply be discarded and thus will not show up in 3114 * the superblock summary information until they are recovered by fsck. 3115 */ 3116 static void 3117 ffs_checkcgintegrity(struct fs *fs, 3118 uint64_t cg, 3119 int error) 3120 { 3121 3122 if (error != EINTEGRITY) 3123 return; 3124 fs->fs_cstotal.cs_nffree -= fs->fs_cs(fs, cg).cs_nffree; 3125 fs->fs_cs(fs, cg).cs_nffree = 0; 3126 fs->fs_cstotal.cs_nbfree -= fs->fs_cs(fs, cg).cs_nbfree; 3127 fs->fs_cs(fs, cg).cs_nbfree = 0; 3128 fs->fs_cstotal.cs_nifree -= fs->fs_cs(fs, cg).cs_nifree; 3129 fs->fs_cs(fs, cg).cs_nifree = 0; 3130 fs->fs_maxcluster[cg] = 0; 3131 fs->fs_flags |= FS_NEEDSFSCK; 3132 fs->fs_fmod = 1; 3133 } 3134 3135 /* 3136 * Fserr prints the name of a filesystem with an error diagnostic. 
3137 * 3138 * The form of the error message is: 3139 * fs: error message 3140 */ 3141 void 3142 ffs_fserr(struct fs *fs, 3143 ino_t inum, 3144 char *cp) 3145 { 3146 struct thread *td = curthread; /* XXX */ 3147 struct proc *p = td->td_proc; 3148 3149 log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n", 3150 p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum, 3151 fs->fs_fsmnt, cp); 3152 } 3153 3154 /* 3155 * This function provides the capability for the fsck program to 3156 * update an active filesystem. Sixteen operations are provided: 3157 * 3158 * adjrefcnt(inode, amt) - adjusts the reference count on the 3159 * specified inode by the specified amount. Under normal 3160 * operation the count should always go down. Decrementing 3161 * the count to zero will cause the inode to be freed. 3162 * adjblkcnt(inode, amt) - adjust the number of blocks used by the 3163 * inode by the specified amount. 3164 * adjdepth(inode, amt) - adjust the depth of the specified directory 3165 * inode by the specified amount. 3166 * setsize(inode, size) - set the size of the inode to the 3167 * specified size. 3168 * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - 3169 * adjust the superblock summary. 3170 * freedirs(inode, count) - directory inodes [inode..inode + count - 1] 3171 * are marked as free. Inodes should never have to be marked 3172 * as in use. 3173 * freefiles(inode, count) - file inodes [inode..inode + count - 1] 3174 * are marked as free. Inodes should never have to be marked 3175 * as in use. 3176 * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] 3177 * are marked as free. Blocks should never have to be marked 3178 * as in use. 3179 * setflags(flags, set/clear) - the fs_flags field has the specified 3180 * flags set (second parameter +1) or cleared (second parameter -1). 3181 * setcwd(dirinode) - set the current directory to dirinode in the 3182 * filesystem associated with the snapshot. 
* setdotdot(oldvalue, newvalue) - Verify that the inode number for ".."
 *	in the current directory is oldvalue then change it to newvalue.
 * unlink(nameptr, oldvalue) - Verify that the inode number associated
 *	with nameptr in the current directory is oldvalue then unlink it.
 */

static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS);

/*
 * One write-only sysctl is declared under _vfs_ffs for each operation.
 * All of them are routed to sysctl_ffs_fsck(), which tells them apart by
 * the OID number (the FFS_* constant given as second argument).
 */
SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt,
    CTLFLAG_WR | CTLTYPE_STRUCT | CTLFLAG_NEEDGIANT,
    0, 0, sysctl_ffs_fsck, "S,fsck",
    "Adjust Inode Reference Count");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust Inode Used Blocks Count");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_DEPTH, adjdepth,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust Directory Inode Depth");

static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Set the inode size");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust number of directories");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust number of free blocks");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust number of free inodes");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust number of free frags");

static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Adjust number of free clusters");

static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Free Range of Directory Inodes");

static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Free Range of File Inodes");

static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Free Range of Blocks");

static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Change Filesystem Flags");

static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Set Current Working Directory");

static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Change Value of .. Entry");

static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink,
    CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck,
    "Unlink a Duplicate Name");

#ifdef DIAGNOSTIC
/* When non-zero, every fsck command handled below is echoed with printf(). */
static int fsckcmds = 0;
SYSCTL_INT(_debug, OID_AUTO, ffs_fsckcmds, CTLFLAG_RW, &fsckcmds, 0,
	"print out fsck_ffs-based filesystem update commands");
#endif /* DIAGNOSTIC */

/*
 * Common handler behind all of the fsck sysctls declared above.
 *
 * The request must supply a new value no larger than struct fsck_cmd
 * (EBADRPC otherwise) whose version field equals FFS_CMD_VERSION
 * (ERPCMISMATCH otherwise).  cmd.handle is resolved with getvnode()
 * under the CAP_FSCK right; the resulting vnode has to be a regular
 * file or a directory (EINVAL) living on a "ufs" mount (EINVAL) that is
 * not mounted read-only (EROFS).  The operation is selected by
 * oidp->oid_number and takes its two arguments from cmd.value and
 * cmd.size, as described in the comment preceding the declarations.
 *
 * Returns 0 on success or the error of the first step that failed;
 * an unknown operation yields EINVAL.
 */
static int
sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
{
	struct thread *td = curthread;
	struct fsck_cmd cmd;
	struct ufsmount *ump;
	struct vnode *vp, *dvp, *fdvp;
	struct inode *ip, *dp;
	struct mount *mp;
	struct fs *fs;
	struct pwd *pwd;
	ufs2_daddr_t blkno;
	long blkcnt, blksize;
	uint64_t key;
	struct file *fp;
	cap_rights_t rights;
	int filetype, error;

	/* Validate and copy in the command structure. */
	if (req->newptr == NULL || req->newlen > sizeof(cmd))
		return (EBADRPC);
	if ((error = SYSCTL_IN(req, &cmd, sizeof(cmd))) != 0)
		return (error);
	if (cmd.version != FFS_CMD_VERSION)
		return (ERPCMISMATCH);
	/*
	 * Resolve the descriptor named by the command.  From here on
	 * every exit path has to fdrop() fp.
	 */
	if ((error = getvnode(td, cmd.handle,
	    cap_rights_init_one(&rights, CAP_FSCK), &fp)) != 0)
		return (error);
	vp = fp->f_vnode;
	if (vp->v_type != VREG && vp->v_type != VDIR) {
		fdrop(fp, td);
		return (EINVAL);
	}
	/*
	 * Start a write on the vnode's filesystem; mp is used below as
	 * the mount the command applies to.  Every exit path from here
	 * on is paired with vn_finished_write().
	 */
	vn_start_write(vp, &mp, V_WAIT);
	if (mp == NULL ||
	    strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) {
		vn_finished_write(mp);
		fdrop(fp, td);
		return (EINVAL);
	}
	ump = VFSTOUFS(mp);
	if (mp->mnt_flag & MNT_RDONLY) {
		vn_finished_write(mp);
		fdrop(fp, td);
		return (EROFS);
	}
	fs = ump->um_fs;
	/* Default for FFS_FILE_FREE; FFS_DIR_FREE overrides it. */
	filetype = IFREG;

	switch (oidp->oid_number) {
	case FFS_SET_FLAGS:
#ifdef DIAGNOSTIC
		if (fsckcmds)
			printf("%s: %s flags\n", mp->mnt_stat.f_mntonname,
			    cmd.size > 0 ? "set" : "clear");
#endif /* DIAGNOSTIC */
		/* A positive size sets the flags, anything else clears them. */
		if (cmd.size > 0)
			fs->fs_flags |= (long)cmd.value;
		else
			fs->fs_flags &= ~(long)cmd.value;
		break;

	case FFS_ADJ_REFCNT:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust inode %jd link count by %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		/* From here on vp names the inode being adjusted, not fp's vnode. */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
			break;
		ip = VTOI(vp);
		/* Both the link count and the effective link count move. */
		ip->i_nlink += cmd.size;
		DIP_SET(ip, i_nlink, ip->i_nlink);
		ip->i_effnlink += cmd.size;
		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED);
		error = ffs_update(vp, 1);
		if (DOINGSOFTDEP(vp))
			softdep_change_linkcnt(ip);
		vput(vp);
		break;

	case FFS_ADJ_BLKCNT:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust inode %jd block count by %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
			break;
		ip = VTOI(vp);
		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size);
		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED);
		error = ffs_update(vp, 1);
		vput(vp);
		break;

	case FFS_ADJ_DEPTH:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust directory inode %jd depth by %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
			break;
		/* The depth is only adjusted on directories. */
		if (vp->v_type != VDIR) {
			vput(vp);
			error = ENOTDIR;
			break;
		}
		ip = VTOI(vp);
		DIP_SET(ip, i_dirdepth, DIP(ip, i_dirdepth) + cmd.size);
		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED);
		error = ffs_update(vp, 1);
		vput(vp);
		break;

	case FFS_SET_SIZE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: set inode %jd size to %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
			break;
		ip = VTOI(vp);
		DIP_SET(ip, i_size, cmd.size);
		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_MODIFIED);
		error = ffs_update(vp, 1);
		vput(vp);
		break;

	case FFS_DIR_FREE:
		filetype = IFDIR;
		/* fall through */

	case FFS_FILE_FREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			if (cmd.size == 1)
				printf("%s: free %s inode %ju\n",
				    mp->mnt_stat.f_mntonname,
				    filetype == IFDIR ? "directory" : "file",
				    (uintmax_t)cmd.value);
			else
				printf("%s: free %s inodes %ju-%ju\n",
				    mp->mnt_stat.f_mntonname,
				    filetype == IFDIR ? "directory" : "file",
				    (uintmax_t)cmd.value,
				    (uintmax_t)(cmd.value + cmd.size - 1));
		}
#endif /* DIAGNOSTIC */
		/*
		 * Free cmd.size consecutive inodes starting at cmd.value,
		 * stopping at the first failure (its error is returned).
		 */
		while (cmd.size > 0) {
			if ((error = ffs_freefile(ump, fs, ump->um_devvp,
			    cmd.value, filetype, NULL)))
				break;
			cmd.size -= 1;
			cmd.value += 1;
		}
		break;

	case FFS_BLK_FREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			if (cmd.size == 1)
				printf("%s: free block %jd\n",
				    mp->mnt_stat.f_mntonname,
				    (intmax_t)cmd.value);
			else
				printf("%s: free blocks %jd-%jd\n",
				    mp->mnt_stat.f_mntonname,
				    (intmax_t)cmd.value,
				    (intmax_t)cmd.value + cmd.size - 1);
		}
#endif /* DIAGNOSTIC */
		/*
		 * Release the range in chunks that never cross a block
		 * boundary: the first chunk runs from blkno to the end of
		 * the block containing it, later chunks are fs_frag
		 * fragments each, and any chunk is trimmed to what is left.
		 * The frees are issued on behalf of UFS_ROOTINO.
		 */
		blkno = cmd.value;
		blkcnt = cmd.size;
		blksize = fs->fs_frag - (blkno % fs->fs_frag);
		key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO);
		while (blkcnt > 0) {
			if (blkcnt < blksize)
				blksize = blkcnt;
			ffs_blkfree(ump, fs, ump->um_devvp, blkno,
			    blksize * fs->fs_fsize, UFS_ROOTINO,
			    VDIR, NULL, key);
			blkno += blksize;
			blkcnt -= blksize;
			blksize = fs->fs_frag;
		}
		ffs_blkrelease_finish(ump, key);
		break;

	/*
	 * Adjust superblock summaries.  fsck(8) is expected to
	 * submit deltas when necessary.
	 */
	case FFS_ADJ_NDIR:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of directories by %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_ndir += cmd.value;
		break;

	case FFS_ADJ_NBFREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of free blocks by %+jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_nbfree += cmd.value;
		break;

	case FFS_ADJ_NIFREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of free inodes by %+jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_nifree += cmd.value;
		break;

	case FFS_ADJ_NFFREE:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of free frags by %+jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_nffree += cmd.value;
		break;

	case FFS_ADJ_NUMCLUSTERS:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: adjust number of free clusters by %+jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		fs->fs_cstotal.cs_numclusters += cmd.value;
		break;

	case FFS_SET_CWD:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: set current directory to inode %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
		}
#endif /* DIAGNOSTIC */
		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp)))
			break;
		AUDIT_ARG_VNODE1(vp);
		if ((error = change_dir(vp, td)) != 0) {
			vput(vp);
			break;
		}
		/*
		 * On success the vnode is only unlocked, not released,
		 * before it is handed to pwd_chdir().
		 */
		VOP_UNLOCK(vp);
		pwd_chdir(td, vp);
		break;

	case FFS_SET_DOTDOT:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("%s: change .. in cwd from %jd to %jd\n",
			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
			    (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		/*
		 * First we have to get and lock the parent directory
		 * to which ".." points.
		 */
		error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp);
		if (error)
			break;
		/*
		 * Now we get and lock the child directory containing "..".
		 *
		 * NOTE(review): dvp is the caller's current directory and is
		 * passed to VTOI() without a visible check that it belongs
		 * to mp; presumably fsck always issues setcwd on this
		 * filesystem first -- confirm.
		 */
		pwd = pwd_hold(td);
		dvp = pwd->pwd_cdir;
		if ((error = vget(dvp, LK_EXCLUSIVE)) != 0) {
			vput(fdvp);
			pwd_drop(pwd);
			break;
		}
		dp = VTOI(dvp);
		SET_I_OFFSET(dp, 12);	/* XXX mastertemplate.dot_reclen */
		error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size,
		    DT_DIR, 0);
		/* Purge the name cache for both directories, then release them. */
		cache_purge(fdvp);
		cache_purge(dvp);
		vput(dvp);
		vput(fdvp);
		pwd_drop(pwd);
		break;

	case FFS_UNLINK:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			char buf[32];

			/* cmd.value is a user-space pointer to the name. */
			if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL))
				strncpy(buf, "Name_too_long", 32);
			printf("%s: unlink %s (inode %jd)\n",
			    mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size);
		}
#endif /* DIAGNOSTIC */
		/*
		 * kern_funlinkat will do its own start/finish writes and
		 * they do not nest, so drop ours here. Setting mp == NULL
		 * indicates that vn_finished_write is not needed down below.
		 */
		vn_finished_write(mp);
		mp = NULL;
		error = kern_funlinkat(td, AT_FDCWD,
		    (char *)(intptr_t)cmd.value, FD_NONE, UIO_USERSPACE,
		    0, (ino_t)cmd.size);
		break;

	default:
#ifdef DIAGNOSTIC
		if (fsckcmds) {
			printf("Invalid request %d from fsck\n",
			    oidp->oid_number);
		}
#endif /* DIAGNOSTIC */
		error = EINVAL;
		break;
	}
	/*
	 * Common exit: drop the file reference taken by getvnode() and
	 * finish the write started above (mp is NULL after FFS_UNLINK).
	 */
	fdrop(fp, td);
	vn_finished_write(mp);
	return (error);
}