1 /*- 2 * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause) 3 * 4 * Copyright (c) 2002 Networks Associates Technology, Inc. 5 * All rights reserved. 6 * 7 * This software was developed for the FreeBSD Project by Marshall 8 * Kirk McKusick and Network Associates Laboratories, the Security 9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 11 * research program 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1982, 1986, 1989, 1993 35 * The Regents of the University of California. All rights reserved. 
36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. Neither the name of the University nor the names of its contributors 46 * may be used to endorse or promote products derived from this software 47 * without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * SUCH DAMAGE. 
60 */ 61 62 #include <sys/cdefs.h> 63 #include "opt_quota.h" 64 65 #include <sys/param.h> 66 #include <sys/systm.h> 67 #include <sys/bio.h> 68 #include <sys/buf.h> 69 #include <sys/capsicum.h> 70 #include <sys/conf.h> 71 #include <sys/fcntl.h> 72 #include <sys/file.h> 73 #include <sys/filedesc.h> 74 #include <sys/gsb_crc32.h> 75 #include <sys/kernel.h> 76 #include <sys/mount.h> 77 #include <sys/priv.h> 78 #include <sys/proc.h> 79 #include <sys/stat.h> 80 #include <sys/syscallsubr.h> 81 #include <sys/sysctl.h> 82 #include <sys/syslog.h> 83 #include <sys/taskqueue.h> 84 #include <sys/vnode.h> 85 86 #include <security/audit/audit.h> 87 88 #include <geom/geom.h> 89 #include <geom/geom_vfs.h> 90 91 #include <ufs/ufs/dir.h> 92 #include <ufs/ufs/extattr.h> 93 #include <ufs/ufs/quota.h> 94 #include <ufs/ufs/inode.h> 95 #include <ufs/ufs/ufs_extern.h> 96 #include <ufs/ufs/ufsmount.h> 97 98 #include <ufs/ffs/fs.h> 99 #include <ufs/ffs/ffs_extern.h> 100 #include <ufs/ffs/softdep.h> 101 102 typedef ufs2_daddr_t allocfcn_t(struct inode *ip, uint64_t cg, 103 ufs2_daddr_t bpref, int size, int rsize); 104 105 static ufs2_daddr_t ffs_alloccg(struct inode *, uint64_t, ufs2_daddr_t, int, 106 int); 107 static ufs2_daddr_t 108 ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); 109 static void ffs_blkfree_cg(struct ufsmount *, struct fs *, 110 struct vnode *, ufs2_daddr_t, long, ino_t, 111 struct workhead *); 112 #ifdef INVARIANTS 113 static int ffs_checkfreeblk(struct inode *, ufs2_daddr_t, long); 114 #endif 115 static void ffs_checkcgintegrity(struct fs *, uint64_t, int); 116 static ufs2_daddr_t ffs_clusteralloc(struct inode *, uint64_t, ufs2_daddr_t, 117 int); 118 static ino_t ffs_dirpref(struct inode *); 119 static ufs2_daddr_t ffs_fragextend(struct inode *, uint64_t, ufs2_daddr_t, 120 int, int); 121 static ufs2_daddr_t ffs_hashalloc(struct inode *, uint64_t, ufs2_daddr_t, 122 int, int, allocfcn_t *); 123 static ufs2_daddr_t ffs_nodealloccg(struct inode *, uint64_t, 
ufs2_daddr_t, int, 124 int); 125 static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); 126 static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); 127 static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); 128 static void ffs_ckhash_cg(struct buf *); 129 130 /* 131 * Allocate a block in the filesystem. 132 * 133 * The size of the requested block is given, which must be some 134 * multiple of fs_fsize and <= fs_bsize. 135 * A preference may be optionally specified. If a preference is given 136 * the following hierarchy is used to allocate a block: 137 * 1) allocate the requested block. 138 * 2) allocate a rotationally optimal block in the same cylinder. 139 * 3) allocate a block in the same cylinder group. 140 * 4) quadratically rehash into other cylinder groups, until an 141 * available block is located. 142 * If no block preference is given the following hierarchy is used 143 * to allocate a block: 144 * 1) allocate a block in the cylinder group that contains the 145 * inode for the file. 146 * 2) quadratically rehash into other cylinder groups, until an 147 * available block is located. 
 */
/*
 * Contract of ffs_alloc() as visible in its body:
 *  - The UFS mutex must be held on entry (checked with mtx_assert()).
 *  - Every failure return below is preceded by UFS_UNLOCK().  The success
 *    return follows ffs_hashalloc() directly with no unlock in this
 *    function; presumably the allocator drops the mutex when it succeeds
 *    (NOTE(review): confirm against ffs_alloccg()).
 *  - *bnp is zeroed first and set to the new block number on success.
 *  - lbn is not referenced by this function.
 *  - Returns 0 on success; the chkdq() error when QUOTA is configured and
 *    the quota charge fails; ENXIO when ffs_fsfail_cleanup_locked()
 *    returns nonzero; ENOSPC otherwise.
 */
int
ffs_alloc(struct inode *ip,
	ufs2_daddr_t lbn,
	ufs2_daddr_t bpref,
	int size,
	int flags,
	struct ucred *cred,
	ufs2_daddr_t *bnp)
{
	struct fs *fs;
	struct ufsmount *ump;
	ufs2_daddr_t bno;
	uint64_t cg, reclaimed;
	int64_t delta;
#ifdef QUOTA
	int error;
#endif

	*bnp = 0;
	ump = ITOUMP(ip);
	fs = ump->um_fs;
	mtx_assert(UFS_MTX(ump), MA_OWNED);
#ifdef INVARIANTS
	/* size must not exceed one block and must be a whole number of frags */
	if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("dev = %s, bsize = %ld, size = %d, fs = %s\n",
		    devtoname(ump->um_dev), (long)fs->fs_bsize, size,
		    fs->fs_fsmnt);
		panic("ffs_alloc: bad size");
	}
	if (cred == NOCRED)
		panic("ffs_alloc: missing credential");
#endif /* INVARIANTS */
	/* reclaimed records whether the one-shot cleanup below has run */
	reclaimed = 0;
retry:
#ifdef QUOTA
	/*
	 * The quota charge is made with the UFS mutex dropped.  On error we
	 * return without retaking it.
	 */
	UFS_UNLOCK(ump);
	error = chkdq(ip, btodb(size), cred, 0);
	if (error)
		return (error);
	UFS_LOCK(ump);
#endif
	/* A full-block request cannot succeed when no whole block is free. */
	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
		goto nospace;
	/*
	 * When priv_check_cred() returns nonzero for PRIV_VFS_BLOCKRESERVE,
	 * refuse a request that would take freespace() at fs_minfree
	 * below zero.
	 */
	if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) &&
	    freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0)
		goto nospace;
	/* An out-of-range preference is treated as "no preference". */
	if (bpref >= fs->fs_size)
		bpref = 0;
	/*
	 * Without a preference start in the inode's cylinder group,
	 * otherwise in the group that contains the preferred block.
	 */
	if (bpref == 0)
		cg = ino_to_cg(fs, ip->i_number);
	else
		cg = dtog(fs, bpref);
	bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg);
	if (bno > 0) {
		/* Account the new space in the inode's i_blocks. */
		delta = btodb(size);
		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
		/* With IO_EXT only IN_CHANGE is set, not IN_UPDATE. */
		if (flags & IO_EXT)
			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
		else
			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
		*bnp = bno;
		return (0);
	}
nospace:
#ifdef QUOTA
	UFS_UNLOCK(ump);
	/*
	 * Restore user's disk quota because allocation failed.
	 */
	(void) chkdq(ip, -btodb(size), cred, FORCE);
	UFS_LOCK(ump);
#endif
	/*
	 * On the first failure, and only when IO_BUFLOCKED is not set, call
	 * softdep_request_cleanup() once and try the allocation again.
	 */
	if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
		reclaimed = 1;
		softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT);
		goto retry;
	}
	if (ffs_fsfail_cleanup_locked(ump, 0)) {
		UFS_UNLOCK(ump);
		return (ENXIO);
	}
	/*
	 * Report "filesystem full" only after a cleanup attempt was made and
	 * only when ppsratecheck() allows it.  The mutex is released on both
	 * branches, and before the messages are emitted.
	 */
	if (reclaimed > 0 &&
	    ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) {
		UFS_UNLOCK(ump);
		ffs_fserr(fs, ip->i_number, "filesystem full");
		uprintf("\n%s: write failed, filesystem is full\n",
		    fs->fs_fsmnt);
	} else {
		UFS_UNLOCK(ump);
	}
	return (ENOSPC);
}

/*
 * Reallocate a fragment to a bigger size
 *
 * The number and size of the old block is given, and a preference
 * and new size is also specified. The allocator attempts to extend
 * the original block. Failing that, the regular block allocator is
 * invoked to get an appropriate block.
 *
 * Contract as visible in the body:
 *  - The UFS mutex must be held on entry (checked with mtx_assert()).
 *  - lbprev/bprev are the logical and physical numbers of the existing
 *    fragment; bprev == 0 panics unconditionally.
 *  - osize/nsize are the old and new sizes in bytes.
 *  - On success 0 is returned and *bpp is the buffer for lbprev, resized
 *    to nsize with the bytes from osize to nsize zeroed.
 *  - *bpp is first written (to NULL) only after bread_gb() and, under
 *    QUOTA, chkdq() have succeeded; the earlier error returns and an
 *    early jump to nospace leave it untouched.
 *  - Returns the bread_gb() or chkdq() error, ENXIO when
 *    ffs_fsfail_cleanup_locked() returns nonzero, or ENOSPC.
 *  - The failure returns happen with the UFS mutex released.  The
 *    success returns follow ffs_fragextend()/ffs_hashalloc() with no
 *    unlock in this function (NOTE(review): presumably those drop the
 *    mutex on success -- confirm).
 */
int
ffs_realloccg(struct inode *ip,
	ufs2_daddr_t lbprev,
	ufs2_daddr_t bprev,
	ufs2_daddr_t bpref,
	int osize,
	int nsize,
	int flags,
	struct ucred *cred,
	struct buf **bpp)
{
	struct vnode *vp;
	struct fs *fs;
	struct buf *bp;
	struct ufsmount *ump;
	uint64_t cg, request, reclaimed;
	int error, gbflags;
	ufs2_daddr_t bno;
	int64_t delta;

	vp = ITOV(ip);
	ump = ITOUMP(ip);
	fs = ump->um_fs;
	bp = NULL;
	/* Translate the caller's BA_UNMAPPED into the getblk-style flag. */
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
#ifdef WITNESS
	gbflags |= IS_SNAPSHOT(ip) ? GB_NOWITNESS : 0;
#endif

	mtx_assert(UFS_MTX(ump), MA_OWNED);
#ifdef INVARIANTS
	if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
		panic("ffs_realloccg: allocation on suspended filesystem");
	/* Both sizes must fit in a block and be whole numbers of frags. */
	if ((uint64_t)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
	    (uint64_t)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
		printf(
		"dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n",
		    devtoname(ump->um_dev), (long)fs->fs_bsize, osize,
		    nsize, fs->fs_fsmnt);
		panic("ffs_realloccg: bad size");
	}
	if (cred == NOCRED)
		panic("ffs_realloccg: missing credential");
#endif /* INVARIANTS */
	/* reclaimed records whether the one-shot cleanup below has run */
	reclaimed = 0;
retry:
	/*
	 * Same reserve check as ffs_alloc(), applied to the growth
	 * (nsize - osize) only.  bp is NULL whenever this jump is taken.
	 */
	if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) &&
	    freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) {
		goto nospace;
	}
	if (bprev == 0) {
		printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n",
		    devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev,
		    fs->fs_fsmnt);
		panic("ffs_realloccg: bad bprev");
	}
	/* The buffer read and the quota charge run without the UFS mutex. */
	UFS_UNLOCK(ump);
	/*
	 * Allocate the extra space in the buffer.
	 */
	error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp);
	if (error) {
		return (error);
	}

	/*
	 * The buffer's physical block number still equals its logical one;
	 * fill it in from bprev.  This is only accepted for lbprev below
	 * UFS_NDADDR.
	 * NOTE(review): presumably "equal" means "not yet mapped" -- confirm.
	 */
	if (bp->b_blkno == bp->b_lblkno) {
		if (lbprev >= UFS_NDADDR)
			panic("ffs_realloccg: lbprev out of range");
		bp->b_blkno = fsbtodb(fs, bprev);
	}

#ifdef QUOTA
	/* Charge only the growth; on failure give the buffer back. */
	error = chkdq(ip, btodb(nsize - osize), cred, 0);
	if (error) {
		brelse(bp);
		return (error);
	}
#endif
	/*
	 * Check for extension in the existing location.
	 */
	*bpp = NULL;
	cg = dtog(fs, bprev);
	UFS_LOCK(ump);
	bno = ffs_fragextend(ip, cg, bprev, osize, nsize);
	if (bno) {
		/* Extended in place: the block number must not have moved. */
		if (bp->b_blkno != fsbtodb(fs, bno))
			panic("ffs_realloccg: bad blockno");
		delta = btodb(nsize - osize);
		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
		if (flags & IO_EXT)
			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
		else
			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
		/* Grow the buffer and zero the newly added tail. */
		allocbuf(bp, nsize);
		bp->b_flags |= B_DONE;
		vfs_bio_bzero_buf(bp, osize, nsize - osize);
		/* Only when B_VMIO is set and B_MALLOC is clear. */
		if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
			vfs_bio_set_valid(bp, osize, nsize - osize);
		*bpp = bp;
		return (0);
	}
	/*
	 * Allocate a new disk location.
	 */
	if (bpref >= fs->fs_size)
		bpref = 0;
	switch ((int)fs->fs_optim) {
	case FS_OPTSPACE:
		/*
		 * Allocate an exact sized fragment. Although this makes
		 * best use of space, we will waste time relocating it if
		 * the file continues to grow. If the fragmentation is
		 * less than half of the minimum free reserve, we choose
		 * to begin optimizing for time.
		 */
		request = nsize;
		if (fs->fs_minfree <= 5 ||
		    fs->fs_cstotal.cs_nffree >
		    (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100))
			break;
		log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n",
			fs->fs_fsmnt);
		fs->fs_optim = FS_OPTTIME;
		break;
	case FS_OPTTIME:
		/*
		 * At this point we have discovered a file that is trying to
		 * grow a small fragment to a larger fragment. To save time,
		 * we allocate a full sized block, then free the unused portion.
		 * If the file continues to grow, the `ffs_fragextend' call
		 * above will be able to grow it in place without further
		 * copying. If aberrant programs cause disk fragmentation to
		 * grow within 2% of the free reserve, we choose to begin
		 * optimizing for space.
		 */
		request = fs->fs_bsize;
		if (fs->fs_cstotal.cs_nffree <
		    (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100)
			break;
		log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n",
			fs->fs_fsmnt);
		fs->fs_optim = FS_OPTSPACE;
		break;
	default:
		printf("dev = %s, optim = %ld, fs = %s\n",
		    devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt);
		panic("ffs_realloccg: bad optim");
		/* NOTREACHED */
	}
	/* request was chosen by the optimization policy above. */
	bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg);
	if (bno > 0) {
		/* The data now lives at the new location. */
		bp->b_blkno = fsbtodb(fs, bno);
		if (!DOINGSOFTDEP(vp))
			/*
			 * The usual case is that a smaller fragment that
			 * was just allocated has been replaced with a bigger
			 * fragment or a full-size block. If it is marked as
			 * B_DELWRI, the current contents have not been written
			 * to disk. It is possible that the block was written
			 * earlier, but very uncommon. If the block has never
			 * been written, there is no need to send a BIO_DELETE
			 * for it when it is freed. The gain from avoiding the
			 * TRIMs for the common case of unwritten blocks far
			 * exceeds the cost of the write amplification for the
			 * uncommon case of failing to send a TRIM for a block
			 * that had been written.
			 */
			ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
			    ip->i_number, vp->v_type, NULL,
			    (bp->b_flags & B_DELWRI) != 0 ?
			    NOTRIM_KEY : SINGLETON_KEY);
		delta = btodb(nsize - osize);
		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
		if (flags & IO_EXT)
			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
		else
			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
		/* Same buffer fix-up as in the extend-in-place case above. */
		allocbuf(bp, nsize);
		bp->b_flags |= B_DONE;
		vfs_bio_bzero_buf(bp, osize, nsize - osize);
		if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
			vfs_bio_set_valid(bp, osize, nsize - osize);
		*bpp = bp;
		return (0);
	}
#ifdef QUOTA
	/*
	 * This refund sits before the nospace label on purpose: the early
	 * jump to nospace happens before any quota was charged.
	 */
	UFS_UNLOCK(ump);
	/*
	 * Restore user's disk quota because allocation failed.
	 */
	(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
	UFS_LOCK(ump);
#endif
nospace:
	/*
	 * no space available
	 */
	if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
		reclaimed = 1;
		/*
		 * Release the buffer with the mutex dropped, and clear bp so
		 * that the retry re-reads it and the brelse() further down is
		 * not reached with a stale pointer.
		 */
		UFS_UNLOCK(ump);
		if (bp) {
			brelse(bp);
			bp = NULL;
		}
		UFS_LOCK(ump);
		softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT);
		goto retry;
	}
	if (bp)
		brelse(bp);
	if (ffs_fsfail_cleanup_locked(ump, 0)) {
		UFS_UNLOCK(ump);
		return (ENXIO);
	}
	/*
	 * Report "filesystem full" only after a cleanup attempt was made and
	 * only when ppsratecheck() allows it.  The mutex is released on both
	 * branches, and before the messages are emitted.
	 */
	if (reclaimed > 0 &&
	    ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) {
		UFS_UNLOCK(ump);
		ffs_fserr(fs, ip->i_number, "filesystem full");
		uprintf("\n%s: write failed, filesystem is full\n",
		    fs->fs_fsmnt);
	} else {
		UFS_UNLOCK(ump);
	}
	return (ENOSPC);
}

/*
 * Reallocate a sequence of blocks into a contiguous sequence of blocks.
 *
 * The vnode and an array of buffer pointers for a range of sequential
 * logical blocks to be made contiguous is given. The allocator attempts
 * to find a range of sequential blocks starting as close as possible
 * from the end of the allocation for the logical block immediately
 * preceding the current range. If successful, the physical block numbers
 * in the buffer pointers and in the inode are changed to reflect the new
 * allocation. If unsuccessful, the allocation is left unchanged. The
 * success in doing the reallocation is returned. Note that the error
 * return is not reflected back to the user. Rather the previous block
 * allocation will be used.
490 */ 491 492 SYSCTL_DECL(_vfs_ffs); 493 494 static int doasyncfree = 1; 495 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, 496 "do not force synchronous writes when blocks are reallocated"); 497 498 static int doreallocblks = 1; 499 SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, 500 "enable block reallocation"); 501 502 static int dotrimcons = 1; 503 SYSCTL_INT(_vfs_ffs, OID_AUTO, dotrimcons, CTLFLAG_RWTUN, &dotrimcons, 0, 504 "enable BIO_DELETE / TRIM consolidation"); 505 506 static int maxclustersearch = 10; 507 SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch, 508 0, "max number of cylinder group to search for contigous blocks"); 509 510 #ifdef DIAGNOSTIC 511 static int prtrealloc = 0; 512 SYSCTL_INT(_debug, OID_AUTO, ffs_prtrealloc, CTLFLAG_RW, &prtrealloc, 0, 513 "print out FFS filesystem block reallocation operations"); 514 #endif 515 516 int 517 ffs_reallocblks( 518 struct vop_reallocblks_args /* { 519 struct vnode *a_vp; 520 struct cluster_save *a_buflist; 521 } */ *ap) 522 { 523 struct ufsmount *ump; 524 int error; 525 526 /* 527 * We used to skip reallocating the blocks of a file into a 528 * contiguous sequence if the underlying flash device requested 529 * BIO_DELETE notifications, because devices that benefit from 530 * BIO_DELETE also benefit from not moving the data. However, 531 * the destination for the data is usually moved before the data 532 * is written to the initially allocated location, so we rarely 533 * suffer the penalty of extra writes. With the addition of the 534 * consolidation of contiguous blocks into single BIO_DELETE 535 * operations, having fewer but larger contiguous blocks reduces 536 * the number of (slow and expensive) BIO_DELETE operations. So 537 * when doing BIO_DELETE consolidation, we do block reallocation. 538 * 539 * Skip if reallocblks has been disabled globally. 
540 */ 541 ump = ap->a_vp->v_mount->mnt_data; 542 if ((((ump->um_flags) & UM_CANDELETE) != 0 && dotrimcons == 0) || 543 doreallocblks == 0) 544 return (ENOSPC); 545 546 /* 547 * We can't wait in softdep prealloc as it may fsync and recurse 548 * here. Instead we simply fail to reallocate blocks if this 549 * rare condition arises. 550 */ 551 if (DOINGSUJ(ap->a_vp)) 552 if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) 553 return (ENOSPC); 554 vn_seqc_write_begin(ap->a_vp); 555 error = ump->um_fstype == UFS1 ? ffs_reallocblks_ufs1(ap) : 556 ffs_reallocblks_ufs2(ap); 557 vn_seqc_write_end(ap->a_vp); 558 return (error); 559 } 560 561 static int 562 ffs_reallocblks_ufs1( 563 struct vop_reallocblks_args /* { 564 struct vnode *a_vp; 565 struct cluster_save *a_buflist; 566 } */ *ap) 567 { 568 struct fs *fs; 569 struct inode *ip; 570 struct vnode *vp; 571 struct buf *sbp, *ebp, *bp; 572 ufs1_daddr_t *bap, *sbap, *ebap; 573 struct cluster_save *buflist; 574 struct ufsmount *ump; 575 ufs_lbn_t start_lbn, end_lbn; 576 ufs1_daddr_t soff, newblk, blkno; 577 ufs2_daddr_t pref; 578 struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; 579 int i, cg, len, start_lvl, end_lvl, ssize; 580 581 vp = ap->a_vp; 582 ip = VTOI(vp); 583 ump = ITOUMP(ip); 584 fs = ump->um_fs; 585 /* 586 * If we are not tracking block clusters or if we have less than 4% 587 * free blocks left, then do not attempt to cluster. Running with 588 * less than 5% free block reserve is not recommended and those that 589 * choose to do so do not expect to have good file layout. 
590 */ 591 if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 592 return (ENOSPC); 593 buflist = ap->a_buflist; 594 len = buflist->bs_nchildren; 595 start_lbn = buflist->bs_children[0]->b_lblkno; 596 end_lbn = start_lbn + len - 1; 597 #ifdef INVARIANTS 598 for (i = 0; i < len; i++) 599 if (!ffs_checkfreeblk(ip, 600 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 601 panic("ffs_reallocblks: unallocated block 1"); 602 for (i = 1; i < len; i++) 603 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 604 panic("ffs_reallocblks: non-logical cluster"); 605 blkno = buflist->bs_children[0]->b_blkno; 606 ssize = fsbtodb(fs, fs->fs_frag); 607 for (i = 1; i < len - 1; i++) 608 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 609 panic("ffs_reallocblks: non-physical cluster %d", i); 610 #endif 611 /* 612 * If the cluster crosses the boundary for the first indirect 613 * block, leave space for the indirect block. Indirect blocks 614 * are initially laid out in a position after the last direct 615 * block. Block reallocation would usually destroy locality by 616 * moving the indirect block out of the way to make room for 617 * data blocks if we didn't compensate here. We should also do 618 * this for other indirect block boundaries, but it is only 619 * important for the first one. 620 */ 621 if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) 622 return (ENOSPC); 623 /* 624 * If the latest allocation is in a new cylinder group, assume that 625 * the filesystem has decided to move and do not force it back to 626 * the previous cylinder group. 627 */ 628 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 629 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 630 return (ENOSPC); 631 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 632 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 633 return (ENOSPC); 634 /* 635 * Get the starting offset and block map for the first block. 
636 */ 637 if (start_lvl == 0) { 638 sbap = &ip->i_din1->di_db[0]; 639 soff = start_lbn; 640 } else { 641 idp = &start_ap[start_lvl - 1]; 642 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 643 brelse(sbp); 644 return (ENOSPC); 645 } 646 sbap = (ufs1_daddr_t *)sbp->b_data; 647 soff = idp->in_off; 648 } 649 /* 650 * If the block range spans two block maps, get the second map. 651 */ 652 ebap = NULL; 653 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 654 ssize = len; 655 } else { 656 #ifdef INVARIANTS 657 if (start_lvl > 0 && 658 start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 659 panic("ffs_reallocblk: start == end"); 660 #endif 661 ssize = len - (idp->in_off + 1); 662 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 663 goto fail; 664 ebap = (ufs1_daddr_t *)ebp->b_data; 665 } 666 /* 667 * Find the preferred location for the cluster. If we have not 668 * previously failed at this endeavor, then follow our standard 669 * preference calculation. If we have failed at it, then pick up 670 * where we last ended our search. 671 */ 672 UFS_LOCK(ump); 673 if (ip->i_nextclustercg == -1) 674 pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); 675 else 676 pref = cgdata(fs, ip->i_nextclustercg); 677 /* 678 * Search the block map looking for an allocation of the desired size. 679 * To avoid wasting too much time, we limit the number of cylinder 680 * groups that we will search. 681 */ 682 cg = dtog(fs, pref); 683 MPASS(cg < fs->fs_ncg); 684 for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { 685 if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) 686 break; 687 cg += 1; 688 if (cg >= fs->fs_ncg) 689 cg = 0; 690 } 691 /* 692 * If we have failed in our search, record where we gave up for 693 * next time. Otherwise, fall back to our usual search citerion. 
694 */ 695 if (newblk == 0) { 696 ip->i_nextclustercg = cg; 697 UFS_UNLOCK(ump); 698 goto fail; 699 } 700 ip->i_nextclustercg = -1; 701 /* 702 * We have found a new contiguous block. 703 * 704 * First we have to replace the old block pointers with the new 705 * block pointers in the inode and indirect blocks associated 706 * with the file. 707 */ 708 #ifdef DIAGNOSTIC 709 if (prtrealloc) 710 printf("realloc: ino %ju, lbns %jd-%jd\n\told:", 711 (uintmax_t)ip->i_number, 712 (intmax_t)start_lbn, (intmax_t)end_lbn); 713 #endif 714 blkno = newblk; 715 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 716 if (i == ssize) { 717 bap = ebap; 718 soff = -i; 719 } 720 #ifdef INVARIANTS 721 if (!ffs_checkfreeblk(ip, 722 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 723 panic("ffs_reallocblks: unallocated block 2"); 724 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 725 panic("ffs_reallocblks: alloc mismatch"); 726 #endif 727 #ifdef DIAGNOSTIC 728 if (prtrealloc) 729 printf(" %d,", *bap); 730 #endif 731 if (DOINGSOFTDEP(vp)) { 732 if (sbap == &ip->i_din1->di_db[0] && i < ssize) 733 softdep_setup_allocdirect(ip, start_lbn + i, 734 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 735 buflist->bs_children[i]); 736 else 737 softdep_setup_allocindir_page(ip, start_lbn + i, 738 i < ssize ? sbp : ebp, soff + i, blkno, 739 *bap, buflist->bs_children[i]); 740 } 741 *bap++ = blkno; 742 } 743 /* 744 * Next we must write out the modified inode and indirect blocks. 745 * For strict correctness, the writes should be synchronous since 746 * the old block values may have been written to disk. In practise 747 * they are almost never written, but if we are concerned about 748 * strict correctness, the `doasyncfree' flag should be set to zero. 749 * 750 * The test on `doasyncfree' should be changed to test a flag 751 * that shows whether the associated buffers and inodes have 752 * been written. 
The flag should be set when the cluster is 753 * started and cleared whenever the buffer or inode is flushed. 754 * We can then check below to see if it is set, and do the 755 * synchronous write only when it has been cleared. 756 */ 757 if (sbap != &ip->i_din1->di_db[0]) { 758 if (doasyncfree) 759 bdwrite(sbp); 760 else 761 bwrite(sbp); 762 } else { 763 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 764 if (!doasyncfree) 765 ffs_update(vp, 1); 766 } 767 if (ssize < len) { 768 if (doasyncfree) 769 bdwrite(ebp); 770 else 771 bwrite(ebp); 772 } 773 /* 774 * Last, free the old blocks and assign the new blocks to the buffers. 775 */ 776 #ifdef DIAGNOSTIC 777 if (prtrealloc) 778 printf("\n\tnew:"); 779 #endif 780 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 781 bp = buflist->bs_children[i]; 782 if (!DOINGSOFTDEP(vp)) 783 /* 784 * The usual case is that a set of N-contiguous blocks 785 * that was just allocated has been replaced with a 786 * set of N+1-contiguous blocks. If they are marked as 787 * B_DELWRI, the current contents have not been written 788 * to disk. It is possible that the blocks were written 789 * earlier, but very uncommon. If the blocks have never 790 * been written, there is no need to send a BIO_DELETE 791 * for them when they are freed. The gain from avoiding 792 * the TRIMs for the common case of unwritten blocks 793 * far exceeds the cost of the write amplification for 794 * the uncommon case of failing to send a TRIM for the 795 * blocks that had been written. 796 */ 797 ffs_blkfree(ump, fs, ump->um_devvp, 798 dbtofsb(fs, bp->b_blkno), 799 fs->fs_bsize, ip->i_number, vp->v_type, NULL, 800 (bp->b_flags & B_DELWRI) != 0 ? 
801 NOTRIM_KEY : SINGLETON_KEY); 802 bp->b_blkno = fsbtodb(fs, blkno); 803 #ifdef INVARIANTS 804 if (!ffs_checkfreeblk(ip, dbtofsb(fs, bp->b_blkno), 805 fs->fs_bsize)) 806 panic("ffs_reallocblks: unallocated block 3"); 807 #endif 808 #ifdef DIAGNOSTIC 809 if (prtrealloc) 810 printf(" %d,", blkno); 811 #endif 812 } 813 #ifdef DIAGNOSTIC 814 if (prtrealloc) { 815 prtrealloc--; 816 printf("\n"); 817 } 818 #endif 819 return (0); 820 821 fail: 822 if (ssize < len) 823 brelse(ebp); 824 if (sbap != &ip->i_din1->di_db[0]) 825 brelse(sbp); 826 return (ENOSPC); 827 } 828 829 static int 830 ffs_reallocblks_ufs2( 831 struct vop_reallocblks_args /* { 832 struct vnode *a_vp; 833 struct cluster_save *a_buflist; 834 } */ *ap) 835 { 836 struct fs *fs; 837 struct inode *ip; 838 struct vnode *vp; 839 struct buf *sbp, *ebp, *bp; 840 ufs2_daddr_t *bap, *sbap, *ebap; 841 struct cluster_save *buflist; 842 struct ufsmount *ump; 843 ufs_lbn_t start_lbn, end_lbn; 844 ufs2_daddr_t soff, newblk, blkno, pref; 845 struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; 846 int i, cg, len, start_lvl, end_lvl, ssize; 847 848 vp = ap->a_vp; 849 ip = VTOI(vp); 850 ump = ITOUMP(ip); 851 fs = ump->um_fs; 852 /* 853 * If we are not tracking block clusters or if we have less than 4% 854 * free blocks left, then do not attempt to cluster. Running with 855 * less than 5% free block reserve is not recommended and those that 856 * choose to do so do not expect to have good file layout. 
857 */ 858 if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 859 return (ENOSPC); 860 buflist = ap->a_buflist; 861 len = buflist->bs_nchildren; 862 start_lbn = buflist->bs_children[0]->b_lblkno; 863 end_lbn = start_lbn + len - 1; 864 #ifdef INVARIANTS 865 for (i = 0; i < len; i++) 866 if (!ffs_checkfreeblk(ip, 867 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 868 panic("ffs_reallocblks: unallocated block 1"); 869 for (i = 1; i < len; i++) 870 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 871 panic("ffs_reallocblks: non-logical cluster"); 872 blkno = buflist->bs_children[0]->b_blkno; 873 ssize = fsbtodb(fs, fs->fs_frag); 874 for (i = 1; i < len - 1; i++) 875 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 876 panic("ffs_reallocblks: non-physical cluster %d", i); 877 #endif 878 /* 879 * If the cluster crosses the boundary for the first indirect 880 * block, do not move anything in it. Indirect blocks are 881 * usually initially laid out in a position between the data 882 * blocks. Block reallocation would usually destroy locality by 883 * moving the indirect block out of the way to make room for 884 * data blocks if we didn't compensate here. We should also do 885 * this for other indirect block boundaries, but it is only 886 * important for the first one. 887 */ 888 if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) 889 return (ENOSPC); 890 /* 891 * If the latest allocation is in a new cylinder group, assume that 892 * the filesystem has decided to move and do not force it back to 893 * the previous cylinder group. 894 */ 895 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 896 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 897 return (ENOSPC); 898 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 899 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 900 return (ENOSPC); 901 /* 902 * Get the starting offset and block map for the first block. 
903 */ 904 if (start_lvl == 0) { 905 sbap = &ip->i_din2->di_db[0]; 906 soff = start_lbn; 907 } else { 908 idp = &start_ap[start_lvl - 1]; 909 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 910 brelse(sbp); 911 return (ENOSPC); 912 } 913 sbap = (ufs2_daddr_t *)sbp->b_data; 914 soff = idp->in_off; 915 } 916 /* 917 * If the block range spans two block maps, get the second map. 918 */ 919 ebap = NULL; 920 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 921 ssize = len; 922 } else { 923 #ifdef INVARIANTS 924 if (start_lvl > 0 && 925 start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 926 panic("ffs_reallocblk: start == end"); 927 #endif 928 ssize = len - (idp->in_off + 1); 929 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 930 goto fail; 931 ebap = (ufs2_daddr_t *)ebp->b_data; 932 } 933 /* 934 * Find the preferred location for the cluster. If we have not 935 * previously failed at this endeavor, then follow our standard 936 * preference calculation. If we have failed at it, then pick up 937 * where we last ended our search. 938 */ 939 UFS_LOCK(ump); 940 if (ip->i_nextclustercg == -1) 941 pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); 942 else 943 pref = cgdata(fs, ip->i_nextclustercg); 944 /* 945 * Search the block map looking for an allocation of the desired size. 946 * To avoid wasting too much time, we limit the number of cylinder 947 * groups that we will search. 948 */ 949 cg = dtog(fs, pref); 950 MPASS(cg < fs->fs_ncg); 951 for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { 952 if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) 953 break; 954 cg += 1; 955 if (cg >= fs->fs_ncg) 956 cg = 0; 957 } 958 /* 959 * If we have failed in our search, record where we gave up for 960 * next time. Otherwise, fall back to our usual search citerion. 
961 */ 962 if (newblk == 0) { 963 ip->i_nextclustercg = cg; 964 UFS_UNLOCK(ump); 965 goto fail; 966 } 967 ip->i_nextclustercg = -1; 968 /* 969 * We have found a new contiguous block. 970 * 971 * First we have to replace the old block pointers with the new 972 * block pointers in the inode and indirect blocks associated 973 * with the file. 974 */ 975 #ifdef DIAGNOSTIC 976 if (prtrealloc) 977 printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, 978 (intmax_t)start_lbn, (intmax_t)end_lbn); 979 #endif 980 blkno = newblk; 981 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 982 if (i == ssize) { 983 bap = ebap; 984 soff = -i; 985 } 986 #ifdef INVARIANTS 987 if (!ffs_checkfreeblk(ip, 988 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 989 panic("ffs_reallocblks: unallocated block 2"); 990 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 991 panic("ffs_reallocblks: alloc mismatch"); 992 #endif 993 #ifdef DIAGNOSTIC 994 if (prtrealloc) 995 printf(" %jd,", (intmax_t)*bap); 996 #endif 997 if (DOINGSOFTDEP(vp)) { 998 if (sbap == &ip->i_din2->di_db[0] && i < ssize) 999 softdep_setup_allocdirect(ip, start_lbn + i, 1000 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 1001 buflist->bs_children[i]); 1002 else 1003 softdep_setup_allocindir_page(ip, start_lbn + i, 1004 i < ssize ? sbp : ebp, soff + i, blkno, 1005 *bap, buflist->bs_children[i]); 1006 } 1007 *bap++ = blkno; 1008 } 1009 /* 1010 * Next we must write out the modified inode and indirect blocks. 1011 * For strict correctness, the writes should be synchronous since 1012 * the old block values may have been written to disk. In practise 1013 * they are almost never written, but if we are concerned about 1014 * strict correctness, the `doasyncfree' flag should be set to zero. 1015 * 1016 * The test on `doasyncfree' should be changed to test a flag 1017 * that shows whether the associated buffers and inodes have 1018 * been written. 
The flag should be set when the cluster is 1019 * started and cleared whenever the buffer or inode is flushed. 1020 * We can then check below to see if it is set, and do the 1021 * synchronous write only when it has been cleared. 1022 */ 1023 if (sbap != &ip->i_din2->di_db[0]) { 1024 if (doasyncfree) 1025 bdwrite(sbp); 1026 else 1027 bwrite(sbp); 1028 } else { 1029 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 1030 if (!doasyncfree) 1031 ffs_update(vp, 1); 1032 } 1033 if (ssize < len) { 1034 if (doasyncfree) 1035 bdwrite(ebp); 1036 else 1037 bwrite(ebp); 1038 } 1039 /* 1040 * Last, free the old blocks and assign the new blocks to the buffers. 1041 */ 1042 #ifdef DIAGNOSTIC 1043 if (prtrealloc) 1044 printf("\n\tnew:"); 1045 #endif 1046 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 1047 bp = buflist->bs_children[i]; 1048 if (!DOINGSOFTDEP(vp)) 1049 /* 1050 * The usual case is that a set of N-contiguous blocks 1051 * that was just allocated has been replaced with a 1052 * set of N+1-contiguous blocks. If they are marked as 1053 * B_DELWRI, the current contents have not been written 1054 * to disk. It is possible that the blocks were written 1055 * earlier, but very uncommon. If the blocks have never 1056 * been written, there is no need to send a BIO_DELETE 1057 * for them when they are freed. The gain from avoiding 1058 * the TRIMs for the common case of unwritten blocks 1059 * far exceeds the cost of the write amplification for 1060 * the uncommon case of failing to send a TRIM for the 1061 * blocks that had been written. 1062 */ 1063 ffs_blkfree(ump, fs, ump->um_devvp, 1064 dbtofsb(fs, bp->b_blkno), 1065 fs->fs_bsize, ip->i_number, vp->v_type, NULL, 1066 (bp->b_flags & B_DELWRI) != 0 ? 
1067 NOTRIM_KEY : SINGLETON_KEY); 1068 bp->b_blkno = fsbtodb(fs, blkno); 1069 #ifdef INVARIANTS 1070 if (!ffs_checkfreeblk(ip, dbtofsb(fs, bp->b_blkno), 1071 fs->fs_bsize)) 1072 panic("ffs_reallocblks: unallocated block 3"); 1073 #endif 1074 #ifdef DIAGNOSTIC 1075 if (prtrealloc) 1076 printf(" %jd,", (intmax_t)blkno); 1077 #endif 1078 } 1079 #ifdef DIAGNOSTIC 1080 if (prtrealloc) { 1081 prtrealloc--; 1082 printf("\n"); 1083 } 1084 #endif 1085 return (0); 1086 1087 fail: 1088 if (ssize < len) 1089 brelse(ebp); 1090 if (sbap != &ip->i_din2->di_db[0]) 1091 brelse(sbp); 1092 return (ENOSPC); 1093 } 1094 1095 /* 1096 * Allocate an inode in the filesystem. 1097 * 1098 * If allocating a directory, use ffs_dirpref to select the inode. 1099 * If allocating in a directory, the following hierarchy is followed: 1100 * 1) allocate the preferred inode. 1101 * 2) allocate an inode in the same cylinder group. 1102 * 3) quadratically rehash into other cylinder groups, until an 1103 * available inode is located. 1104 * If no inode preference is given the following hierarchy is used 1105 * to allocate an inode: 1106 * 1) allocate an inode in cylinder group 0. 1107 * 2) quadratically rehash into other cylinder groups, until an 1108 * available inode is located. 
1109 */ 1110 int 1111 ffs_valloc(struct vnode *pvp, 1112 int mode, 1113 struct ucred *cred, 1114 struct vnode **vpp) 1115 { 1116 struct inode *pip; 1117 struct fs *fs; 1118 struct inode *ip; 1119 struct timespec ts; 1120 struct ufsmount *ump; 1121 ino_t ino, ipref; 1122 uint64_t cg; 1123 int error, reclaimed; 1124 1125 *vpp = NULL; 1126 pip = VTOI(pvp); 1127 ump = ITOUMP(pip); 1128 fs = ump->um_fs; 1129 1130 UFS_LOCK(ump); 1131 reclaimed = 0; 1132 retry: 1133 if (fs->fs_cstotal.cs_nifree == 0) 1134 goto noinodes; 1135 1136 if ((mode & IFMT) == IFDIR) 1137 ipref = ffs_dirpref(pip); 1138 else 1139 ipref = pip->i_number; 1140 if (ipref >= fs->fs_ncg * fs->fs_ipg) 1141 ipref = 0; 1142 cg = ino_to_cg(fs, ipref); 1143 /* 1144 * Track number of dirs created one after another 1145 * in a same cg without intervening by files. 1146 */ 1147 if ((mode & IFMT) == IFDIR) { 1148 if (fs->fs_contigdirs[cg] < 255) 1149 fs->fs_contigdirs[cg]++; 1150 } else { 1151 if (fs->fs_contigdirs[cg] > 0) 1152 fs->fs_contigdirs[cg]--; 1153 } 1154 ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 1155 (allocfcn_t *)ffs_nodealloccg); 1156 if (ino == 0) 1157 goto noinodes; 1158 /* 1159 * Get rid of the cached old vnode, force allocation of a new vnode 1160 * for this inode. If this fails, release the allocated ino and 1161 * return the error. 1162 */ 1163 if ((error = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp, 1164 FFSV_FORCEINSMQ | FFSV_REPLACE | FFSV_NEWINODE)) != 0) { 1165 ffs_vfree(pvp, ino, mode); 1166 return (error); 1167 } 1168 /* 1169 * We got an inode, so check mode and panic if it is already allocated. 
1170 */ 1171 ip = VTOI(*vpp); 1172 if (ip->i_mode) { 1173 printf("mode = 0%o, inum = %ju, fs = %s\n", 1174 ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt); 1175 panic("ffs_valloc: dup alloc"); 1176 } 1177 if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ 1178 printf("free inode %s/%ju had %ld blocks\n", 1179 fs->fs_fsmnt, (intmax_t)ino, (long)DIP(ip, i_blocks)); 1180 DIP_SET(ip, i_blocks, 0); 1181 } 1182 ip->i_flags = 0; 1183 DIP_SET(ip, i_flags, 0); 1184 if ((mode & IFMT) == IFDIR) 1185 DIP_SET(ip, i_dirdepth, DIP(pip, i_dirdepth) + 1); 1186 /* 1187 * Set up a new generation number for this inode. 1188 */ 1189 while (ip->i_gen == 0 || ++ip->i_gen == 0) 1190 ip->i_gen = arc4random(); 1191 DIP_SET(ip, i_gen, ip->i_gen); 1192 if (fs->fs_magic == FS_UFS2_MAGIC) { 1193 vfs_timestamp(&ts); 1194 ip->i_din2->di_birthtime = ts.tv_sec; 1195 ip->i_din2->di_birthnsec = ts.tv_nsec; 1196 } 1197 ip->i_flag = 0; 1198 (*vpp)->v_vflag = 0; 1199 (*vpp)->v_type = VNON; 1200 if (fs->fs_magic == FS_UFS2_MAGIC) { 1201 (*vpp)->v_op = &ffs_vnodeops2; 1202 UFS_INODE_SET_FLAG(ip, IN_UFS2); 1203 } else { 1204 (*vpp)->v_op = &ffs_vnodeops1; 1205 } 1206 return (0); 1207 noinodes: 1208 if (reclaimed == 0) { 1209 reclaimed = 1; 1210 softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT); 1211 goto retry; 1212 } 1213 if (ffs_fsfail_cleanup_locked(ump, 0)) { 1214 UFS_UNLOCK(ump); 1215 return (ENXIO); 1216 } 1217 if (ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 1218 UFS_UNLOCK(ump); 1219 ffs_fserr(fs, pip->i_number, "out of inodes"); 1220 uprintf("\n%s: create/symlink failed, no inodes free\n", 1221 fs->fs_fsmnt); 1222 } else { 1223 UFS_UNLOCK(ump); 1224 } 1225 return (ENOSPC); 1226 } 1227 1228 /* 1229 * Find a cylinder group to place a directory. 
1230 * 1231 * The policy implemented by this algorithm is to allocate a 1232 * directory inode in the same cylinder group as its parent 1233 * directory, but also to reserve space for its files inodes 1234 * and data. Restrict the number of directories which may be 1235 * allocated one after another in the same cylinder group 1236 * without intervening allocation of files. 1237 * 1238 * If we allocate a first level directory then force allocation 1239 * in another cylinder group. 1240 */ 1241 static ino_t 1242 ffs_dirpref(struct inode *pip) 1243 { 1244 struct fs *fs; 1245 int cg, prefcg, curcg, dirsize, cgsize; 1246 int depth, range, start, end, numdirs, power, numerator, denominator; 1247 uint64_t avgifree, avgbfree, avgndir, curdirsize; 1248 uint64_t minifree, minbfree, maxndir; 1249 uint64_t maxcontigdirs; 1250 1251 mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED); 1252 fs = ITOFS(pip); 1253 1254 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 1255 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1256 avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; 1257 1258 /* 1259 * Select a preferred cylinder group to place a new directory. 1260 * If we are near the root of the filesystem we aim to spread 1261 * them out as much as possible. As we descend deeper from the 1262 * root we cluster them closer together around their parent as 1263 * we expect them to be more closely interactive. Higher-level 1264 * directories like usr/src/sys and usr/src/bin should be 1265 * separated while the directories in these areas are more 1266 * likely to be accessed together so should be closer. 1267 * 1268 * We pick a range of cylinder groups around the cylinder group 1269 * of the directory in which we are being created. The size of 1270 * the range for our search is based on our depth from the root 1271 * of our filesystem. We then probe that range based on how many 1272 * directories are already present. 
The first new directory is at 1273 * 1/2 (middle) of the range; the second is in the first 1/4 of the 1274 * range, then at 3/4, 1/8, 3/8, 5/8, 7/8, 1/16, 3/16, 5/16, etc. 1275 */ 1276 depth = DIP(pip, i_dirdepth); 1277 range = fs->fs_ncg / (1 << depth); 1278 curcg = ino_to_cg(fs, pip->i_number); 1279 start = curcg - (range / 2); 1280 if (start < 0) 1281 start += fs->fs_ncg; 1282 end = curcg + (range / 2); 1283 if (end >= fs->fs_ncg) 1284 end -= fs->fs_ncg; 1285 numdirs = pip->i_effnlink - 1; 1286 power = fls(numdirs); 1287 numerator = (numdirs & ~(1 << (power - 1))) * 2 + 1; 1288 denominator = 1 << power; 1289 prefcg = (curcg - (range / 2) + (range * numerator / denominator)); 1290 if (prefcg < 0) 1291 prefcg += fs->fs_ncg; 1292 if (prefcg >= fs->fs_ncg) 1293 prefcg -= fs->fs_ncg; 1294 /* 1295 * If this filesystem is not tracking directory depths, 1296 * revert to the old algorithm. 1297 */ 1298 if (depth == 0 && pip->i_number != UFS_ROOTINO) 1299 prefcg = curcg; 1300 1301 /* 1302 * Count various limits which used for 1303 * optimal allocation of a directory inode. 1304 */ 1305 maxndir = min(avgndir + (1 << depth), fs->fs_ipg); 1306 minifree = avgifree - avgifree / 4; 1307 if (minifree < 1) 1308 minifree = 1; 1309 minbfree = avgbfree - avgbfree / 4; 1310 if (minbfree < 1) 1311 minbfree = 1; 1312 cgsize = fs->fs_fsize * fs->fs_fpg; 1313 dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; 1314 curdirsize = avgndir ? 
(cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; 1315 if (dirsize < curdirsize) 1316 dirsize = curdirsize; 1317 if (dirsize <= 0) 1318 maxcontigdirs = 0; /* dirsize overflowed */ 1319 else 1320 maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); 1321 if (fs->fs_avgfpdir > 0) 1322 maxcontigdirs = min(maxcontigdirs, 1323 fs->fs_ipg / fs->fs_avgfpdir); 1324 if (maxcontigdirs == 0) 1325 maxcontigdirs = 1; 1326 1327 /* 1328 * Limit number of dirs in one cg and reserve space for 1329 * regular files, but only if we have no deficit in 1330 * inodes or space. 1331 * 1332 * We are trying to find a suitable cylinder group nearby 1333 * our preferred cylinder group to place a new directory. 1334 * We scan from our preferred cylinder group forward looking 1335 * for a cylinder group that meets our criterion. If we get 1336 * to the final cylinder group and do not find anything, 1337 * we start scanning forwards from the beginning of the 1338 * filesystem. While it might seem sensible to start scanning 1339 * backwards or even to alternate looking forward and backward, 1340 * this approach fails badly when the filesystem is nearly full. 1341 * Specifically, we first search all the areas that have no space 1342 * and finally try the one preceding that. We repeat this on 1343 * every request and in the case of the final block end up 1344 * searching the entire filesystem. By jumping to the front 1345 * of the filesystem, our future forward searches always look 1346 * in new cylinder groups so finds every possible block after 1347 * one pass over the filesystem. 
1348 */ 1349 for (cg = prefcg; cg < fs->fs_ncg; cg++) 1350 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 1351 fs->fs_cs(fs, cg).cs_nifree >= minifree && 1352 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 1353 if (fs->fs_contigdirs[cg] < maxcontigdirs) 1354 return ((ino_t)(fs->fs_ipg * cg)); 1355 } 1356 for (cg = 0; cg < prefcg; cg++) 1357 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 1358 fs->fs_cs(fs, cg).cs_nifree >= minifree && 1359 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 1360 if (fs->fs_contigdirs[cg] < maxcontigdirs) 1361 return ((ino_t)(fs->fs_ipg * cg)); 1362 } 1363 /* 1364 * This is a backstop when we have deficit in space. 1365 */ 1366 for (cg = prefcg; cg < fs->fs_ncg; cg++) 1367 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 1368 return ((ino_t)(fs->fs_ipg * cg)); 1369 for (cg = 0; cg < prefcg; cg++) 1370 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 1371 break; 1372 return ((ino_t)(fs->fs_ipg * cg)); 1373 } 1374 1375 /* 1376 * Select the desired position for the next block in a file. The file is 1377 * logically divided into sections. The first section is composed of the 1378 * direct blocks and the next fs_maxbpg blocks. Each additional section 1379 * contains fs_maxbpg blocks. 1380 * 1381 * If no blocks have been allocated in the first section, the policy is to 1382 * request a block in the same cylinder group as the inode that describes 1383 * the file. The first indirect is allocated immediately following the last 1384 * direct block and the data blocks for the first indirect immediately 1385 * follow it. 1386 * 1387 * If no blocks have been allocated in any other section, the indirect 1388 * block(s) are allocated in the same cylinder group as its inode in an 1389 * area reserved immediately following the inode blocks. The policy for 1390 * the data blocks is to place them in a cylinder group with a greater than 1391 * average number of free blocks. An appropriate cylinder group is found 1392 * by using a rotor that sweeps the cylinder groups. 
When a new group of 1393 * blocks is needed, the sweep begins in the cylinder group following the 1394 * cylinder group from which the previous allocation was made. The sweep 1395 * continues until a cylinder group with greater than the average number 1396 * of free blocks is found. If the allocation is for the first block in an 1397 * indirect block or the previous block is a hole, then the information on 1398 * the previous allocation is unavailable; here a best guess is made based 1399 * on the logical block number being allocated. 1400 * 1401 * If a section is already partially allocated, the policy is to 1402 * allocate blocks contiguously within the section if possible. 1403 */ 1404 ufs2_daddr_t 1405 ffs_blkpref_ufs1(struct inode *ip, 1406 ufs_lbn_t lbn, 1407 int indx, 1408 ufs1_daddr_t *bap) 1409 { 1410 struct fs *fs; 1411 uint64_t cg, inocg; 1412 uint64_t avgbfree, startcg; 1413 ufs2_daddr_t pref, prevbn; 1414 1415 KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1416 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1417 fs = ITOFS(ip); 1418 /* 1419 * Allocation of indirect blocks is indicated by passing negative 1420 * values in indx: -1 for single indirect, -2 for double indirect, 1421 * -3 for triple indirect. As noted below, we attempt to allocate 1422 * the first indirect inline with the file data. For all later 1423 * indirect blocks, the data is often allocated in other cylinder 1424 * groups. However to speed random file access and to speed up 1425 * fsck, the filesystem reserves the first fs_metaspace blocks 1426 * (typically half of fs_minfree) of the data area of each cylinder 1427 * group to hold these later indirect blocks. 1428 */ 1429 inocg = ino_to_cg(fs, ip->i_number); 1430 if (indx < 0) { 1431 /* 1432 * Our preference for indirect blocks is the zone at the 1433 * beginning of the inode's cylinder group data area that 1434 * we try to reserve for indirect blocks. 
1435 */ 1436 pref = cgmeta(fs, inocg); 1437 /* 1438 * If we are allocating the first indirect block, try to 1439 * place it immediately following the last direct block. 1440 */ 1441 if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && 1442 ip->i_din1->di_db[UFS_NDADDR - 1] != 0) { 1443 pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag; 1444 if (dtog(fs, pref) >= fs->fs_ncg) 1445 pref = 0; 1446 } 1447 return (pref); 1448 } 1449 /* 1450 * If we are allocating the first data block in the first indirect 1451 * block and the indirect has been allocated in the data block area, 1452 * try to place it immediately following the indirect block. 1453 */ 1454 if (lbn == UFS_NDADDR) { 1455 pref = ip->i_din1->di_ib[0]; 1456 if (pref != 0 && pref >= cgdata(fs, inocg) && 1457 pref < cgbase(fs, inocg + 1)) { 1458 if (dtog(fs, pref + fs->fs_frag) >= fs->fs_ncg) 1459 return (0); 1460 return (pref + fs->fs_frag); 1461 } 1462 } 1463 /* 1464 * If we are at the beginning of a file, or we have already allocated 1465 * the maximum number of blocks per cylinder group, or we do not 1466 * have a block allocated immediately preceding us, then we need 1467 * to decide where to start allocating new blocks. 1468 */ 1469 if (indx == 0) { 1470 prevbn = 0; 1471 } else { 1472 prevbn = bap[indx - 1]; 1473 if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, 1474 fs->fs_bsize) != 0) 1475 prevbn = 0; 1476 } 1477 if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { 1478 /* 1479 * If we are allocating a directory data block, we want 1480 * to place it in the metadata area. 1481 */ 1482 if ((ip->i_mode & IFMT) == IFDIR) 1483 return (cgmeta(fs, inocg)); 1484 /* 1485 * Until we fill all the direct and all the first indirect's 1486 * blocks, we try to allocate in the data area of the inode's 1487 * cylinder group. 1488 */ 1489 if (lbn < UFS_NDADDR + NINDIR(fs)) 1490 return (cgdata(fs, inocg)); 1491 /* 1492 * Find a cylinder with greater than average number of 1493 * unused data blocks. 
1494 */ 1495 if (indx == 0 || prevbn == 0) 1496 startcg = inocg + lbn / fs->fs_maxbpg; 1497 else 1498 startcg = dtog(fs, prevbn) + 1; 1499 startcg %= fs->fs_ncg; 1500 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1501 for (cg = startcg; cg < fs->fs_ncg; cg++) 1502 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1503 fs->fs_cgrotor = cg; 1504 return (cgdata(fs, cg)); 1505 } 1506 for (cg = 0; cg < startcg; cg++) 1507 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1508 fs->fs_cgrotor = cg; 1509 return (cgdata(fs, cg)); 1510 } 1511 return (0); 1512 } 1513 /* 1514 * Otherwise, we just always try to lay things out contiguously. 1515 */ 1516 if (dtog(fs, prevbn + fs->fs_frag) >= fs->fs_ncg) 1517 return (0); 1518 return (prevbn + fs->fs_frag); 1519 } 1520 1521 /* 1522 * Same as above, but for UFS2 1523 */ 1524 ufs2_daddr_t 1525 ffs_blkpref_ufs2(struct inode *ip, 1526 ufs_lbn_t lbn, 1527 int indx, 1528 ufs2_daddr_t *bap) 1529 { 1530 struct fs *fs; 1531 uint64_t cg, inocg; 1532 uint64_t avgbfree, startcg; 1533 ufs2_daddr_t pref, prevbn; 1534 1535 KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1536 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1537 fs = ITOFS(ip); 1538 /* 1539 * Allocation of indirect blocks is indicated by passing negative 1540 * values in indx: -1 for single indirect, -2 for double indirect, 1541 * -3 for triple indirect. As noted below, we attempt to allocate 1542 * the first indirect inline with the file data. For all later 1543 * indirect blocks, the data is often allocated in other cylinder 1544 * groups. However to speed random file access and to speed up 1545 * fsck, the filesystem reserves the first fs_metaspace blocks 1546 * (typically half of fs_minfree) of the data area of each cylinder 1547 * group to hold these later indirect blocks. 
1548 */ 1549 inocg = ino_to_cg(fs, ip->i_number); 1550 if (indx < 0) { 1551 /* 1552 * Our preference for indirect blocks is the zone at the 1553 * beginning of the inode's cylinder group data area that 1554 * we try to reserve for indirect blocks. 1555 */ 1556 pref = cgmeta(fs, inocg); 1557 /* 1558 * If we are allocating the first indirect block, try to 1559 * place it immediately following the last direct block. 1560 */ 1561 if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && 1562 ip->i_din2->di_db[UFS_NDADDR - 1] != 0) { 1563 pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag; 1564 if (dtog(fs, pref) >= fs->fs_ncg) 1565 pref = 0; 1566 } 1567 return (pref); 1568 } 1569 /* 1570 * If we are allocating the first data block in the first indirect 1571 * block and the indirect has been allocated in the data block area, 1572 * try to place it immediately following the indirect block. 1573 */ 1574 if (lbn == UFS_NDADDR) { 1575 pref = ip->i_din2->di_ib[0]; 1576 if (pref != 0 && pref >= cgdata(fs, inocg) && 1577 pref < cgbase(fs, inocg + 1)) { 1578 if (dtog(fs, pref + fs->fs_frag) >= fs->fs_ncg) 1579 return (0); 1580 return (pref + fs->fs_frag); 1581 } 1582 } 1583 /* 1584 * If we are at the beginning of a file, or we have already allocated 1585 * the maximum number of blocks per cylinder group, or we do not 1586 * have a block allocated immediately preceding us, then we need 1587 * to decide where to start allocating new blocks. 1588 */ 1589 if (indx == 0) { 1590 prevbn = 0; 1591 } else { 1592 prevbn = bap[indx - 1]; 1593 if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, 1594 fs->fs_bsize) != 0) 1595 prevbn = 0; 1596 } 1597 if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { 1598 /* 1599 * If we are allocating a directory data block, we want 1600 * to place it in the metadata area. 
1601 */ 1602 if ((ip->i_mode & IFMT) == IFDIR) 1603 return (cgmeta(fs, inocg)); 1604 /* 1605 * Until we fill all the direct and all the first indirect's 1606 * blocks, we try to allocate in the data area of the inode's 1607 * cylinder group. 1608 */ 1609 if (lbn < UFS_NDADDR + NINDIR(fs)) 1610 return (cgdata(fs, inocg)); 1611 /* 1612 * Find a cylinder with greater than average number of 1613 * unused data blocks. 1614 */ 1615 if (indx == 0 || prevbn == 0) 1616 startcg = inocg + lbn / fs->fs_maxbpg; 1617 else 1618 startcg = dtog(fs, prevbn) + 1; 1619 startcg %= fs->fs_ncg; 1620 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1621 for (cg = startcg; cg < fs->fs_ncg; cg++) 1622 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1623 fs->fs_cgrotor = cg; 1624 return (cgdata(fs, cg)); 1625 } 1626 for (cg = 0; cg < startcg; cg++) 1627 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1628 fs->fs_cgrotor = cg; 1629 return (cgdata(fs, cg)); 1630 } 1631 return (0); 1632 } 1633 /* 1634 * Otherwise, we just always try to lay things out contiguously. 1635 */ 1636 if (dtog(fs, prevbn + fs->fs_frag) >= fs->fs_ncg) 1637 return (0); 1638 return (prevbn + fs->fs_frag); 1639 } 1640 1641 /* 1642 * Implement the cylinder overflow algorithm. 1643 * 1644 * The policy implemented by this algorithm is: 1645 * 1) allocate the block in its requested cylinder group. 1646 * 2) quadratically rehash on the cylinder group number. 1647 * 3) brute force search for a free block. 1648 * 1649 * Must be called with the UFS lock held. Will release the lock on success 1650 * and return with it held on failure. 1651 */ 1652 /*VARARGS5*/ 1653 static ufs2_daddr_t 1654 ffs_hashalloc(struct inode *ip, 1655 uint64_t cg, 1656 ufs2_daddr_t pref, 1657 int size, /* Search size for data blocks, mode for inodes */ 1658 int rsize, /* Real allocated size. 
*/ 1659 allocfcn_t *allocator) 1660 { 1661 struct fs *fs; 1662 ufs2_daddr_t result; 1663 uint64_t i, icg = cg; 1664 1665 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1666 #ifdef INVARIANTS 1667 if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 1668 panic("ffs_hashalloc: allocation on suspended filesystem"); 1669 #endif 1670 fs = ITOFS(ip); 1671 /* 1672 * 1: preferred cylinder group 1673 */ 1674 result = (*allocator)(ip, cg, pref, size, rsize); 1675 if (result) 1676 return (result); 1677 /* 1678 * 2: quadratic rehash 1679 */ 1680 for (i = 1; i < fs->fs_ncg; i *= 2) { 1681 cg += i; 1682 if (cg >= fs->fs_ncg) 1683 cg -= fs->fs_ncg; 1684 result = (*allocator)(ip, cg, 0, size, rsize); 1685 if (result) 1686 return (result); 1687 } 1688 /* 1689 * 3: brute force search 1690 * Note that we start at i == 2, since 0 was checked initially, 1691 * and 1 is always checked in the quadratic rehash. 1692 */ 1693 cg = (icg + 2) % fs->fs_ncg; 1694 for (i = 2; i < fs->fs_ncg; i++) { 1695 result = (*allocator)(ip, cg, 0, size, rsize); 1696 if (result) 1697 return (result); 1698 cg++; 1699 if (cg == fs->fs_ncg) 1700 cg = 0; 1701 } 1702 return (0); 1703 } 1704 1705 /* 1706 * Determine whether a fragment can be extended. 1707 * 1708 * Check to see if the necessary fragments are available, and 1709 * if they are, allocate them. 
1710 */ 1711 static ufs2_daddr_t 1712 ffs_fragextend(struct inode *ip, 1713 uint64_t cg, 1714 ufs2_daddr_t bprev, 1715 int osize, 1716 int nsize) 1717 { 1718 struct fs *fs; 1719 struct cg *cgp; 1720 struct buf *bp; 1721 struct ufsmount *ump; 1722 int nffree; 1723 long bno; 1724 int frags, bbase; 1725 int i, error; 1726 uint8_t *blksfree; 1727 1728 ump = ITOUMP(ip); 1729 fs = ump->um_fs; 1730 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) 1731 return (0); 1732 frags = numfrags(fs, nsize); 1733 bbase = fragnum(fs, bprev); 1734 if (bbase > fragnum(fs, (bprev + frags - 1))) { 1735 /* cannot extend across a block boundary */ 1736 return (0); 1737 } 1738 UFS_UNLOCK(ump); 1739 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { 1740 ffs_checkcgintegrity(fs, cg, error); 1741 goto fail; 1742 } 1743 bno = dtogd(fs, bprev); 1744 blksfree = cg_blksfree(cgp); 1745 for (i = numfrags(fs, osize); i < frags; i++) 1746 if (isclr(blksfree, bno + i)) 1747 goto fail; 1748 /* 1749 * the current fragment can be extended 1750 * deduct the count on fragment being extended into 1751 * increase the count on the remaining fragment (if any) 1752 * allocate the extended piece 1753 */ 1754 for (i = frags; i < fs->fs_frag - bbase; i++) 1755 if (isclr(blksfree, bno + i)) 1756 break; 1757 cgp->cg_frsum[i - numfrags(fs, osize)]--; 1758 if (i != frags) 1759 cgp->cg_frsum[i - frags]++; 1760 for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) { 1761 clrbit(blksfree, bno + i); 1762 cgp->cg_cs.cs_nffree--; 1763 nffree++; 1764 } 1765 UFS_LOCK(ump); 1766 fs->fs_cstotal.cs_nffree -= nffree; 1767 fs->fs_cs(fs, cg).cs_nffree -= nffree; 1768 fs->fs_fmod = 1; 1769 ACTIVECLEAR(fs, cg); 1770 UFS_UNLOCK(ump); 1771 if (DOINGSOFTDEP(ITOV(ip))) 1772 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, 1773 frags, numfrags(fs, osize)); 1774 bdwrite(bp); 1775 return (bprev); 1776 1777 fail: 1778 brelse(bp); 1779 UFS_LOCK(ump); 1780 return (0); 1781 1782 } 1783 1784 /* 1785 * 
Determine whether a block can be allocated. 1786 * 1787 * Check to see if a block of the appropriate size is available, 1788 * and if it is, allocate it. 1789 */ 1790 static ufs2_daddr_t 1791 ffs_alloccg(struct inode *ip, 1792 uint64_t cg, 1793 ufs2_daddr_t bpref, 1794 int size, 1795 int rsize) 1796 { 1797 struct fs *fs; 1798 struct cg *cgp; 1799 struct buf *bp; 1800 struct ufsmount *ump; 1801 ufs1_daddr_t bno; 1802 ufs2_daddr_t blkno; 1803 int i, allocsiz, error, frags; 1804 uint8_t *blksfree; 1805 1806 ump = ITOUMP(ip); 1807 fs = ump->um_fs; 1808 if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) 1809 return (0); 1810 UFS_UNLOCK(ump); 1811 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0 || 1812 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { 1813 ffs_checkcgintegrity(fs, cg, error); 1814 goto fail; 1815 } 1816 if (size == fs->fs_bsize) { 1817 UFS_LOCK(ump); 1818 blkno = ffs_alloccgblk(ip, bp, bpref, rsize); 1819 ACTIVECLEAR(fs, cg); 1820 UFS_UNLOCK(ump); 1821 bdwrite(bp); 1822 return (blkno); 1823 } 1824 /* 1825 * check to see if any fragments are already available 1826 * allocsiz is the size which will be allocated, hacking 1827 * it down to a smaller size if necessary 1828 */ 1829 blksfree = cg_blksfree(cgp); 1830 frags = numfrags(fs, size); 1831 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) 1832 if (cgp->cg_frsum[allocsiz] != 0) 1833 break; 1834 if (allocsiz == fs->fs_frag) { 1835 /* 1836 * no fragments were available, so a block will be 1837 * allocated, and hacked up 1838 */ 1839 if (cgp->cg_cs.cs_nbfree == 0) 1840 goto fail; 1841 UFS_LOCK(ump); 1842 blkno = ffs_alloccgblk(ip, bp, bpref, rsize); 1843 ACTIVECLEAR(fs, cg); 1844 UFS_UNLOCK(ump); 1845 bdwrite(bp); 1846 return (blkno); 1847 } 1848 KASSERT(size == rsize, 1849 ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); 1850 bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); 1851 if (bno < 0) 1852 goto fail; 1853 for (i = 0; i < frags; i++) 1854 
clrbit(blksfree, bno + i); 1855 cgp->cg_cs.cs_nffree -= frags; 1856 cgp->cg_frsum[allocsiz]--; 1857 if (frags != allocsiz) 1858 cgp->cg_frsum[allocsiz - frags]++; 1859 UFS_LOCK(ump); 1860 fs->fs_cstotal.cs_nffree -= frags; 1861 fs->fs_cs(fs, cg).cs_nffree -= frags; 1862 fs->fs_fmod = 1; 1863 blkno = cgbase(fs, cg) + bno; 1864 ACTIVECLEAR(fs, cg); 1865 UFS_UNLOCK(ump); 1866 if (DOINGSOFTDEP(ITOV(ip))) 1867 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); 1868 bdwrite(bp); 1869 return (blkno); 1870 1871 fail: 1872 brelse(bp); 1873 UFS_LOCK(ump); 1874 return (0); 1875 } 1876 1877 /* 1878 * Allocate a block in a cylinder group. 1879 * 1880 * This algorithm implements the following policy: 1881 * 1) allocate the requested block. 1882 * 2) allocate a rotationally optimal block in the same cylinder. 1883 * 3) allocate the next available block on the block rotor for the 1884 * specified cylinder group. 1885 * Note that this routine only allocates fs_bsize blocks; these 1886 * blocks may be fragmented by the routine that allocates them. 
1887 */ 1888 static ufs2_daddr_t 1889 ffs_alloccgblk(struct inode *ip, 1890 struct buf *bp, 1891 ufs2_daddr_t bpref, 1892 int size) 1893 { 1894 struct fs *fs; 1895 struct cg *cgp; 1896 struct ufsmount *ump; 1897 ufs1_daddr_t bno; 1898 ufs2_daddr_t blkno; 1899 uint8_t *blksfree; 1900 int i, cgbpref; 1901 1902 ump = ITOUMP(ip); 1903 fs = ump->um_fs; 1904 mtx_assert(UFS_MTX(ump), MA_OWNED); 1905 cgp = (struct cg *)bp->b_data; 1906 blksfree = cg_blksfree(cgp); 1907 if (bpref == 0) { 1908 bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag; 1909 } else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) { 1910 /* map bpref to correct zone in this cg */ 1911 if (bpref < cgdata(fs, cgbpref)) 1912 bpref = cgmeta(fs, cgp->cg_cgx); 1913 else 1914 bpref = cgdata(fs, cgp->cg_cgx); 1915 } 1916 /* 1917 * if the requested block is available, use it 1918 */ 1919 bno = dtogd(fs, blknum(fs, bpref)); 1920 if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) 1921 goto gotit; 1922 /* 1923 * Take the next available block in this cylinder group. 1924 */ 1925 bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); 1926 if (bno < 0) 1927 return (0); 1928 /* Update cg_rotor only if allocated from the data zone */ 1929 if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx))) 1930 cgp->cg_rotor = bno; 1931 gotit: 1932 blkno = fragstoblks(fs, bno); 1933 ffs_clrblock(fs, blksfree, (long)blkno); 1934 ffs_clusteracct(fs, cgp, blkno, -1); 1935 cgp->cg_cs.cs_nbfree--; 1936 fs->fs_cstotal.cs_nbfree--; 1937 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; 1938 fs->fs_fmod = 1; 1939 blkno = cgbase(fs, cgp->cg_cgx) + bno; 1940 /* 1941 * If the caller didn't want the whole block free the frags here. 
1942 */ 1943 size = numfrags(fs, size); 1944 if (size != fs->fs_frag) { 1945 bno = dtogd(fs, blkno); 1946 for (i = size; i < fs->fs_frag; i++) 1947 setbit(blksfree, bno + i); 1948 i = fs->fs_frag - size; 1949 cgp->cg_cs.cs_nffree += i; 1950 fs->fs_cstotal.cs_nffree += i; 1951 fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; 1952 fs->fs_fmod = 1; 1953 cgp->cg_frsum[i]++; 1954 } 1955 /* XXX Fixme. */ 1956 UFS_UNLOCK(ump); 1957 if (DOINGSOFTDEP(ITOV(ip))) 1958 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0); 1959 UFS_LOCK(ump); 1960 return (blkno); 1961 } 1962 1963 /* 1964 * Determine whether a cluster can be allocated. 1965 * 1966 * We do not currently check for optimal rotational layout if there 1967 * are multiple choices in the same cylinder group. Instead we just 1968 * take the first one that we find following bpref. 1969 */ 1970 static ufs2_daddr_t 1971 ffs_clusteralloc(struct inode *ip, 1972 uint64_t cg, 1973 ufs2_daddr_t bpref, 1974 int len) 1975 { 1976 struct fs *fs; 1977 struct cg *cgp; 1978 struct buf *bp; 1979 struct ufsmount *ump; 1980 int i, run, bit, map, got, error; 1981 ufs2_daddr_t bno; 1982 uint8_t *mapp; 1983 int32_t *lp; 1984 uint8_t *blksfree; 1985 1986 ump = ITOUMP(ip); 1987 fs = ump->um_fs; 1988 MPASS(cg < fs->fs_ncg); 1989 if (fs->fs_maxcluster[cg] < len) 1990 return (0); 1991 UFS_UNLOCK(ump); 1992 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { 1993 ffs_checkcgintegrity(fs, cg, error); 1994 UFS_LOCK(ump); 1995 return (0); 1996 } 1997 /* 1998 * Check to see if a cluster of the needed size (or bigger) is 1999 * available in this cylinder group. 2000 */ 2001 lp = &cg_clustersum(cgp)[len]; 2002 for (i = len; i <= fs->fs_contigsumsize; i++) 2003 if (*lp++ > 0) 2004 break; 2005 if (i > fs->fs_contigsumsize) { 2006 /* 2007 * This is the first time looking for a cluster in this 2008 * cylinder group. 
Update the cluster summary information 2009 * to reflect the true maximum sized cluster so that 2010 * future cluster allocation requests can avoid reading 2011 * the cylinder group map only to find no clusters. 2012 */ 2013 lp = &cg_clustersum(cgp)[len - 1]; 2014 for (i = len - 1; i > 0; i--) 2015 if (*lp-- > 0) 2016 break; 2017 UFS_LOCK(ump); 2018 fs->fs_maxcluster[cg] = i; 2019 brelse(bp); 2020 return (0); 2021 } 2022 /* 2023 * Search the cluster map to find a big enough cluster. 2024 * We take the first one that we find, even if it is larger 2025 * than we need as we prefer to get one close to the previous 2026 * block allocation. We do not search before the current 2027 * preference point as we do not want to allocate a block 2028 * that is allocated before the previous one (as we will 2029 * then have to wait for another pass of the elevator 2030 * algorithm before it will be read). We prefer to fail and 2031 * be recalled to try an allocation in the next cylinder group. 2032 */ 2033 if (dtog(fs, bpref) != cg) 2034 bpref = cgdata(fs, cg); 2035 else 2036 bpref = blknum(fs, bpref); 2037 bpref = fragstoblks(fs, dtogd(fs, bpref)); 2038 mapp = &cg_clustersfree(cgp)[bpref / NBBY]; 2039 map = *mapp++; 2040 bit = 1 << (bpref % NBBY); 2041 for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { 2042 if ((map & bit) == 0) { 2043 run = 0; 2044 } else { 2045 run++; 2046 if (run == len) 2047 break; 2048 } 2049 if ((got & (NBBY - 1)) != (NBBY - 1)) { 2050 bit <<= 1; 2051 } else { 2052 map = *mapp++; 2053 bit = 1; 2054 } 2055 } 2056 if (got >= cgp->cg_nclusterblks) { 2057 UFS_LOCK(ump); 2058 brelse(bp); 2059 return (0); 2060 } 2061 /* 2062 * Allocate the cluster that we have found. 
2063 */ 2064 blksfree = cg_blksfree(cgp); 2065 for (i = 1; i <= len; i++) 2066 if (!ffs_isblock(fs, blksfree, got - run + i)) 2067 panic("ffs_clusteralloc: map mismatch"); 2068 bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1); 2069 if (dtog(fs, bno) != cg) 2070 panic("ffs_clusteralloc: allocated out of group"); 2071 len = blkstofrags(fs, len); 2072 UFS_LOCK(ump); 2073 for (i = 0; i < len; i += fs->fs_frag) 2074 if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) 2075 panic("ffs_clusteralloc: lost block"); 2076 ACTIVECLEAR(fs, cg); 2077 UFS_UNLOCK(ump); 2078 bdwrite(bp); 2079 return (bno); 2080 } 2081 2082 static inline struct buf * 2083 getinobuf(struct inode *ip, 2084 uint64_t cg, 2085 uint32_t cginoblk, 2086 int gbflags) 2087 { 2088 struct fs *fs; 2089 2090 fs = ITOFS(ip); 2091 return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, 2092 cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0, 2093 gbflags)); 2094 } 2095 2096 /* 2097 * Synchronous inode initialization is needed only when barrier writes do not 2098 * work as advertised, and will impose a heavy cost on file creation in a newly 2099 * created filesystem. 2100 */ 2101 static int doasyncinodeinit = 1; 2102 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN, 2103 &doasyncinodeinit, 0, 2104 "Perform inode block initialization using asynchronous writes"); 2105 2106 /* 2107 * Determine whether an inode can be allocated. 2108 * 2109 * Check to see if an inode is available, and if it is, 2110 * allocate it using the following policy: 2111 * 1) allocate the requested inode. 2112 * 2) allocate the next available inode after the requested 2113 * inode in the specified cylinder group. 
2114 */ 2115 static ufs2_daddr_t 2116 ffs_nodealloccg(struct inode *ip, 2117 uint64_t cg, 2118 ufs2_daddr_t ipref, 2119 int mode, 2120 int unused) 2121 { 2122 struct fs *fs; 2123 struct cg *cgp; 2124 struct buf *bp, *ibp; 2125 struct ufsmount *ump; 2126 uint8_t *inosused, *loc; 2127 struct ufs2_dinode *dp2; 2128 int error, start, len, i; 2129 uint32_t old_initediblk; 2130 2131 ump = ITOUMP(ip); 2132 fs = ump->um_fs; 2133 check_nifree: 2134 if (fs->fs_cs(fs, cg).cs_nifree == 0) 2135 return (0); 2136 UFS_UNLOCK(ump); 2137 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { 2138 ffs_checkcgintegrity(fs, cg, error); 2139 UFS_LOCK(ump); 2140 return (0); 2141 } 2142 restart: 2143 if (cgp->cg_cs.cs_nifree == 0) { 2144 brelse(bp); 2145 UFS_LOCK(ump); 2146 return (0); 2147 } 2148 inosused = cg_inosused(cgp); 2149 if (ipref) { 2150 ipref %= fs->fs_ipg; 2151 if (isclr(inosused, ipref)) 2152 goto gotit; 2153 } 2154 start = cgp->cg_irotor / NBBY; 2155 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); 2156 loc = memcchr(&inosused[start], 0xff, len); 2157 if (loc == NULL) { 2158 len = start + 1; 2159 start = 0; 2160 loc = memcchr(&inosused[start], 0xff, len); 2161 if (loc == NULL) { 2162 printf("cg = %ju, irotor = %ld, fs = %s\n", 2163 (intmax_t)cg, (long)cgp->cg_irotor, fs->fs_fsmnt); 2164 panic("ffs_nodealloccg: map corrupted"); 2165 /* NOTREACHED */ 2166 } 2167 } 2168 ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1; 2169 gotit: 2170 /* 2171 * Check to see if we need to initialize more inodes. 2172 */ 2173 if (fs->fs_magic == FS_UFS2_MAGIC && 2174 ipref + INOPB(fs) > cgp->cg_initediblk && 2175 cgp->cg_initediblk < cgp->cg_niblk) { 2176 old_initediblk = cgp->cg_initediblk; 2177 2178 /* 2179 * Free the cylinder group lock before writing the 2180 * initialized inode block. Entering the 2181 * babarrierwrite() with the cylinder group lock 2182 * causes lock order violation between the lock and 2183 * snaplk. 
2184 * 2185 * Another thread can decide to initialize the same 2186 * inode block, but whichever thread first gets the 2187 * cylinder group lock after writing the newly 2188 * allocated inode block will update it and the other 2189 * will realize that it has lost and leave the 2190 * cylinder group unchanged. 2191 */ 2192 ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); 2193 brelse(bp); 2194 if (ibp == NULL) { 2195 /* 2196 * The inode block buffer is already owned by 2197 * another thread, which must initialize it. 2198 * Wait on the buffer to allow another thread 2199 * to finish the updates, with dropped cg 2200 * buffer lock, then retry. 2201 */ 2202 ibp = getinobuf(ip, cg, old_initediblk, 0); 2203 brelse(ibp); 2204 UFS_LOCK(ump); 2205 goto check_nifree; 2206 } 2207 bzero(ibp->b_data, (int)fs->fs_bsize); 2208 dp2 = (struct ufs2_dinode *)(ibp->b_data); 2209 for (i = 0; i < INOPB(fs); i++) { 2210 while (dp2->di_gen == 0) 2211 dp2->di_gen = arc4random(); 2212 dp2++; 2213 } 2214 2215 /* 2216 * Rather than adding a soft updates dependency to ensure 2217 * that the new inode block is written before it is claimed 2218 * by the cylinder group map, we just do a barrier write 2219 * here. The barrier write will ensure that the inode block 2220 * gets written before the updated cylinder group map can be 2221 * written. The barrier write should only slow down bulk 2222 * loading of newly created filesystems. 2223 */ 2224 if (doasyncinodeinit) 2225 babarrierwrite(ibp); 2226 else 2227 bwrite(ibp); 2228 2229 /* 2230 * After the inode block is written, try to update the 2231 * cg initediblk pointer. If another thread beat us 2232 * to it, then leave it unchanged as the other thread 2233 * has already set it correctly. 
2234 */ 2235 error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp); 2236 UFS_LOCK(ump); 2237 ACTIVECLEAR(fs, cg); 2238 UFS_UNLOCK(ump); 2239 if (error != 0) 2240 return (error); 2241 if (cgp->cg_initediblk == old_initediblk) 2242 cgp->cg_initediblk += INOPB(fs); 2243 goto restart; 2244 } 2245 cgp->cg_irotor = ipref; 2246 UFS_LOCK(ump); 2247 ACTIVECLEAR(fs, cg); 2248 setbit(inosused, ipref); 2249 cgp->cg_cs.cs_nifree--; 2250 fs->fs_cstotal.cs_nifree--; 2251 fs->fs_cs(fs, cg).cs_nifree--; 2252 fs->fs_fmod = 1; 2253 if ((mode & IFMT) == IFDIR) { 2254 cgp->cg_cs.cs_ndir++; 2255 fs->fs_cstotal.cs_ndir++; 2256 fs->fs_cs(fs, cg).cs_ndir++; 2257 } 2258 UFS_UNLOCK(ump); 2259 if (DOINGSOFTDEP(ITOV(ip))) 2260 softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode); 2261 bdwrite(bp); 2262 return ((ino_t)(cg * fs->fs_ipg + ipref)); 2263 } 2264 2265 /* 2266 * Free a block or fragment. 2267 * 2268 * The specified block or fragment is placed back in the 2269 * free map. If a fragment is deallocated, a possible 2270 * block reassembly is checked. 2271 */ 2272 static void 2273 ffs_blkfree_cg(struct ufsmount *ump, 2274 struct fs *fs, 2275 struct vnode *devvp, 2276 ufs2_daddr_t bno, 2277 long size, 2278 ino_t inum, 2279 struct workhead *dephd) 2280 { 2281 struct mount *mp; 2282 struct cg *cgp; 2283 struct buf *bp; 2284 daddr_t dbn; 2285 ufs1_daddr_t fragno, cgbno; 2286 int i, blk, frags, bbase, error; 2287 uint64_t cg; 2288 uint8_t *blksfree; 2289 struct cdev *dev; 2290 2291 cg = dtog(fs, bno); 2292 if (devvp->v_type == VREG) { 2293 /* devvp is a snapshot */ 2294 MPASS(devvp->v_mount->mnt_data == ump); 2295 dev = ump->um_devvp->v_rdev; 2296 } else if (devvp->v_type == VCHR) { 2297 /* 2298 * devvp is a normal disk device 2299 * XXXKIB: devvp is not locked there, v_rdev access depends on 2300 * busy mount, which prevents mntfs devvp from reclamation. 
2301 */ 2302 dev = devvp->v_rdev; 2303 } else 2304 return; 2305 #ifdef INVARIANTS 2306 if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0 || 2307 fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { 2308 printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n", 2309 devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize, 2310 size, fs->fs_fsmnt); 2311 panic("ffs_blkfree_cg: invalid size"); 2312 } 2313 #endif 2314 if ((uint64_t)bno >= fs->fs_size) { 2315 printf("bad block %jd, ino %ju\n", (intmax_t)bno, 2316 (intmax_t)inum); 2317 ffs_fserr(fs, inum, "bad block"); 2318 return; 2319 } 2320 if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { 2321 if (!MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) 2322 return; 2323 /* 2324 * Would like to just downgrade to read-only. Until that 2325 * capability is available, just toss the cylinder group 2326 * update and mark the filesystem as needing to run fsck. 2327 */ 2328 fs->fs_flags |= FS_NEEDSFSCK; 2329 if (devvp->v_type == VREG) 2330 dbn = fragstoblks(fs, cgtod(fs, cg)); 2331 else 2332 dbn = fsbtodb(fs, cgtod(fs, cg)); 2333 error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); 2334 KASSERT(error == 0, ("getblkx failed")); 2335 softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, 2336 numfrags(fs, size), dephd, true); 2337 bp->b_flags |= B_RELBUF | B_NOCACHE; 2338 bp->b_flags &= ~B_CACHE; 2339 bawrite(bp); 2340 return; 2341 } 2342 cgbno = dtogd(fs, bno); 2343 blksfree = cg_blksfree(cgp); 2344 UFS_LOCK(ump); 2345 if (size == fs->fs_bsize) { 2346 fragno = fragstoblks(fs, cgbno); 2347 if (!ffs_isfreeblock(fs, blksfree, fragno)) { 2348 if (devvp->v_type == VREG) { 2349 UFS_UNLOCK(ump); 2350 /* devvp is a snapshot */ 2351 brelse(bp); 2352 return; 2353 } 2354 printf("dev = %s, block = %jd, fs = %s\n", 2355 devtoname(dev), (intmax_t)bno, fs->fs_fsmnt); 2356 panic("ffs_blkfree_cg: freeing free block"); 2357 } 2358 ffs_setblock(fs, blksfree, fragno); 2359 ffs_clusteracct(fs, cgp, fragno, 1); 
2360 cgp->cg_cs.cs_nbfree++; 2361 fs->fs_cstotal.cs_nbfree++; 2362 fs->fs_cs(fs, cg).cs_nbfree++; 2363 } else { 2364 bbase = cgbno - fragnum(fs, cgbno); 2365 /* 2366 * decrement the counts associated with the old frags 2367 */ 2368 blk = blkmap(fs, blksfree, bbase); 2369 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 2370 /* 2371 * deallocate the fragment 2372 */ 2373 frags = numfrags(fs, size); 2374 for (i = 0; i < frags; i++) { 2375 if (isset(blksfree, cgbno + i)) { 2376 printf("dev = %s, block = %jd, fs = %s\n", 2377 devtoname(dev), (intmax_t)(bno + i), 2378 fs->fs_fsmnt); 2379 panic("ffs_blkfree_cg: freeing free frag"); 2380 } 2381 setbit(blksfree, cgbno + i); 2382 } 2383 cgp->cg_cs.cs_nffree += i; 2384 fs->fs_cstotal.cs_nffree += i; 2385 fs->fs_cs(fs, cg).cs_nffree += i; 2386 /* 2387 * add back in counts associated with the new frags 2388 */ 2389 blk = blkmap(fs, blksfree, bbase); 2390 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 2391 /* 2392 * if a complete block has been reassembled, account for it 2393 */ 2394 fragno = fragstoblks(fs, bbase); 2395 if (ffs_isblock(fs, blksfree, fragno)) { 2396 cgp->cg_cs.cs_nffree -= fs->fs_frag; 2397 fs->fs_cstotal.cs_nffree -= fs->fs_frag; 2398 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; 2399 ffs_clusteracct(fs, cgp, fragno, 1); 2400 cgp->cg_cs.cs_nbfree++; 2401 fs->fs_cstotal.cs_nbfree++; 2402 fs->fs_cs(fs, cg).cs_nbfree++; 2403 } 2404 } 2405 fs->fs_fmod = 1; 2406 ACTIVECLEAR(fs, cg); 2407 UFS_UNLOCK(ump); 2408 mp = UFSTOVFS(ump); 2409 if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR) 2410 softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, 2411 numfrags(fs, size), dephd, false); 2412 bdwrite(bp); 2413 } 2414 2415 /* 2416 * Structures and routines associated with trim management. 2417 * 2418 * The following requests are passed to trim_lookup to indicate 2419 * the actions that should be taken. 
2420 */ 2421 #define NEW 1 /* if found, error else allocate and hash it */ 2422 #define OLD 2 /* if not found, error, else return it */ 2423 #define REPLACE 3 /* if not found, error else unhash and reallocate it */ 2424 #define DONE 4 /* if not found, error else unhash and return it */ 2425 #define SINGLE 5 /* don't look up, just allocate it and don't hash it */ 2426 2427 MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures"); 2428 2429 #define TRIMLIST_HASH(ump, key) \ 2430 (&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize]) 2431 2432 /* 2433 * These structures describe each of the block free requests aggregated 2434 * together to make up a trim request. 2435 */ 2436 struct trim_blkreq { 2437 TAILQ_ENTRY(trim_blkreq) blkreqlist; 2438 ufs2_daddr_t bno; 2439 long size; 2440 struct workhead *pdephd; 2441 struct workhead dephd; 2442 }; 2443 2444 /* 2445 * Description of a trim request. 2446 */ 2447 struct ffs_blkfree_trim_params { 2448 TAILQ_HEAD(, trim_blkreq) blklist; 2449 LIST_ENTRY(ffs_blkfree_trim_params) hashlist; 2450 struct task task; 2451 struct ufsmount *ump; 2452 struct vnode *devvp; 2453 ino_t inum; 2454 ufs2_daddr_t bno; 2455 long size; 2456 long key; 2457 }; 2458 2459 static void ffs_blkfree_trim_completed(struct buf *); 2460 static void ffs_blkfree_trim_task(void *ctx, int pending __unused); 2461 static struct ffs_blkfree_trim_params *trim_lookup(struct ufsmount *, 2462 struct vnode *, ufs2_daddr_t, long, ino_t, uint64_t, int); 2463 static void ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *); 2464 2465 /* 2466 * Called on trim completion to start a task to free the associated block(s). 
2467 */ 2468 static void 2469 ffs_blkfree_trim_completed(struct buf *bp) 2470 { 2471 struct ffs_blkfree_trim_params *tp; 2472 2473 tp = bp->b_fsprivate1; 2474 free(bp, M_TRIM); 2475 TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); 2476 taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); 2477 } 2478 2479 /* 2480 * Trim completion task that free associated block(s). 2481 */ 2482 static void 2483 ffs_blkfree_trim_task(void *ctx, int pending) 2484 { 2485 struct ffs_blkfree_trim_params *tp; 2486 struct trim_blkreq *blkelm; 2487 struct ufsmount *ump; 2488 2489 tp = ctx; 2490 ump = tp->ump; 2491 while ((blkelm = TAILQ_FIRST(&tp->blklist)) != NULL) { 2492 ffs_blkfree_cg(ump, ump->um_fs, tp->devvp, blkelm->bno, 2493 blkelm->size, tp->inum, blkelm->pdephd); 2494 TAILQ_REMOVE(&tp->blklist, blkelm, blkreqlist); 2495 free(blkelm, M_TRIM); 2496 } 2497 vn_finished_secondary_write(UFSTOVFS(ump)); 2498 UFS_LOCK(ump); 2499 ump->um_trim_inflight -= 1; 2500 ump->um_trim_inflight_blks -= numfrags(ump->um_fs, tp->size); 2501 UFS_UNLOCK(ump); 2502 free(tp, M_TRIM); 2503 } 2504 2505 /* 2506 * Lookup a trim request by inode number. 2507 * Allocate if requested (NEW, REPLACE, SINGLE). 
2508 */ 2509 static struct ffs_blkfree_trim_params * 2510 trim_lookup(struct ufsmount *ump, 2511 struct vnode *devvp, 2512 ufs2_daddr_t bno, 2513 long size, 2514 ino_t inum, 2515 uint64_t key, 2516 int alloctype) 2517 { 2518 struct trimlist_hashhead *tphashhead; 2519 struct ffs_blkfree_trim_params *tp, *ntp; 2520 2521 ntp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK); 2522 if (alloctype != SINGLE) { 2523 KASSERT(key >= FIRST_VALID_KEY, ("trim_lookup: invalid key")); 2524 UFS_LOCK(ump); 2525 tphashhead = TRIMLIST_HASH(ump, key); 2526 LIST_FOREACH(tp, tphashhead, hashlist) 2527 if (key == tp->key) 2528 break; 2529 } 2530 switch (alloctype) { 2531 case NEW: 2532 KASSERT(tp == NULL, ("trim_lookup: found trim")); 2533 break; 2534 case OLD: 2535 KASSERT(tp != NULL, 2536 ("trim_lookup: missing call to ffs_blkrelease_start()")); 2537 UFS_UNLOCK(ump); 2538 free(ntp, M_TRIM); 2539 return (tp); 2540 case REPLACE: 2541 KASSERT(tp != NULL, ("trim_lookup: missing REPLACE trim")); 2542 LIST_REMOVE(tp, hashlist); 2543 /* tp will be freed by caller */ 2544 break; 2545 case DONE: 2546 KASSERT(tp != NULL, ("trim_lookup: missing DONE trim")); 2547 LIST_REMOVE(tp, hashlist); 2548 UFS_UNLOCK(ump); 2549 free(ntp, M_TRIM); 2550 return (tp); 2551 } 2552 TAILQ_INIT(&ntp->blklist); 2553 ntp->ump = ump; 2554 ntp->devvp = devvp; 2555 ntp->bno = bno; 2556 ntp->size = size; 2557 ntp->inum = inum; 2558 ntp->key = key; 2559 if (alloctype != SINGLE) { 2560 LIST_INSERT_HEAD(tphashhead, ntp, hashlist); 2561 UFS_UNLOCK(ump); 2562 } 2563 return (ntp); 2564 } 2565 2566 /* 2567 * Dispatch a trim request. 2568 */ 2569 static void 2570 ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *tp) 2571 { 2572 struct ufsmount *ump; 2573 struct mount *mp; 2574 struct buf *bp; 2575 2576 /* 2577 * Postpone the set of the free bit in the cg bitmap until the 2578 * BIO_DELETE is completed. 
Otherwise, due to disk queue 2579 * reordering, TRIM might be issued after we reuse the block 2580 * and write some new data into it. 2581 */ 2582 ump = tp->ump; 2583 bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO); 2584 bp->b_iocmd = BIO_DELETE; 2585 bp->b_iooffset = dbtob(fsbtodb(ump->um_fs, tp->bno)); 2586 bp->b_iodone = ffs_blkfree_trim_completed; 2587 bp->b_bcount = tp->size; 2588 bp->b_fsprivate1 = tp; 2589 UFS_LOCK(ump); 2590 ump->um_trim_total += 1; 2591 ump->um_trim_inflight += 1; 2592 ump->um_trim_inflight_blks += numfrags(ump->um_fs, tp->size); 2593 ump->um_trim_total_blks += numfrags(ump->um_fs, tp->size); 2594 UFS_UNLOCK(ump); 2595 2596 mp = UFSTOVFS(ump); 2597 vn_start_secondary_write(NULL, &mp, 0); 2598 g_vfs_strategy(ump->um_bo, bp); 2599 } 2600 2601 /* 2602 * Allocate a new key to use to identify a range of blocks. 2603 */ 2604 uint64_t 2605 ffs_blkrelease_start(struct ufsmount *ump, 2606 struct vnode *devvp, 2607 ino_t inum) 2608 { 2609 static u_long masterkey; 2610 uint64_t key; 2611 2612 if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) 2613 return (SINGLETON_KEY); 2614 do { 2615 key = atomic_fetchadd_long(&masterkey, 1); 2616 } while (key < FIRST_VALID_KEY); 2617 (void) trim_lookup(ump, devvp, 0, 0, inum, key, NEW); 2618 return (key); 2619 } 2620 2621 /* 2622 * Deallocate a key that has been used to identify a range of blocks. 2623 */ 2624 void 2625 ffs_blkrelease_finish(struct ufsmount *ump, uint64_t key) 2626 { 2627 struct ffs_blkfree_trim_params *tp; 2628 2629 if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) 2630 return; 2631 /* 2632 * If the vfs.ffs.dotrimcons sysctl option is enabled while 2633 * a file deletion is active, specifically after a call 2634 * to ffs_blkrelease_start() but before the call to 2635 * ffs_blkrelease_finish(), ffs_blkrelease_start() will 2636 * have handed out SINGLETON_KEY rather than starting a 2637 * collection sequence. 
Thus if we get a SINGLETON_KEY 2638 * passed to ffs_blkrelease_finish(), we just return rather 2639 * than trying to finish the nonexistent sequence. 2640 */ 2641 if (key == SINGLETON_KEY) { 2642 #ifdef INVARIANTS 2643 printf("%s: vfs.ffs.dotrimcons enabled on active filesystem\n", 2644 ump->um_mountp->mnt_stat.f_mntonname); 2645 #endif 2646 return; 2647 } 2648 /* 2649 * We are done with sending blocks using this key. Look up the key 2650 * using the DONE alloctype (in tp) to request that it be unhashed 2651 * as we will not be adding to it. If the key has never been used, 2652 * tp->size will be zero, so we can just free tp. Otherwise the call 2653 * to ffs_blkfree_sendtrim(tp) causes the block range described by 2654 * tp to be issued (and then tp to be freed). 2655 */ 2656 tp = trim_lookup(ump, NULL, 0, 0, 0, key, DONE); 2657 if (tp->size == 0) 2658 free(tp, M_TRIM); 2659 else 2660 ffs_blkfree_sendtrim(tp); 2661 } 2662 2663 /* 2664 * Setup to free a block or fragment. 2665 * 2666 * Check for snapshots that might want to claim the block. 2667 * If trims are requested, prepare a trim request. Attempt to 2668 * aggregate consecutive blocks into a single trim request. 2669 */ 2670 void 2671 ffs_blkfree(struct ufsmount *ump, 2672 struct fs *fs, 2673 struct vnode *devvp, 2674 ufs2_daddr_t bno, 2675 long size, 2676 ino_t inum, 2677 __enum_uint8(vtype) vtype, 2678 struct workhead *dephd, 2679 uint64_t key) 2680 { 2681 struct ffs_blkfree_trim_params *tp, *ntp; 2682 struct trim_blkreq *blkelm; 2683 2684 /* 2685 * Check to see if a snapshot wants to claim the block. 2686 * Check that devvp is a normal disk device, not a snapshot, 2687 * it has a snapshot(s) associated with it, and one of the 2688 * snapshots wants to claim the block. 
2689 */ 2690 if (devvp->v_type == VCHR && 2691 (devvp->v_vflag & VV_COPYONWRITE) && 2692 ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { 2693 return; 2694 } 2695 /* 2696 * Nothing to delay if TRIM is not required for this block or TRIM 2697 * is disabled or the operation is performed on a snapshot. 2698 */ 2699 if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) || 2700 devvp->v_type == VREG) { 2701 ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); 2702 return; 2703 } 2704 blkelm = malloc(sizeof(struct trim_blkreq), M_TRIM, M_WAITOK); 2705 blkelm->bno = bno; 2706 blkelm->size = size; 2707 if (dephd == NULL) { 2708 blkelm->pdephd = NULL; 2709 } else { 2710 LIST_INIT(&blkelm->dephd); 2711 LIST_SWAP(dephd, &blkelm->dephd, worklist, wk_list); 2712 blkelm->pdephd = &blkelm->dephd; 2713 } 2714 if (key == SINGLETON_KEY) { 2715 /* 2716 * Just a single non-contiguous piece. Use the SINGLE 2717 * alloctype to return a trim request that will not be 2718 * hashed for future lookup. 2719 */ 2720 tp = trim_lookup(ump, devvp, bno, size, inum, key, SINGLE); 2721 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2722 ffs_blkfree_sendtrim(tp); 2723 return; 2724 } 2725 /* 2726 * The callers of this function are not tracking whether or not 2727 * the blocks are contiguous. They are just saying that they 2728 * are freeing a set of blocks. It is this code that determines 2729 * the pieces of that range that are actually contiguous. 2730 * 2731 * Calling ffs_blkrelease_start() will have created an entry 2732 * that we will use. 2733 */ 2734 tp = trim_lookup(ump, devvp, bno, size, inum, key, OLD); 2735 if (tp->size == 0) { 2736 /* 2737 * First block of a potential range, set block and size 2738 * for the trim block. 
2739 */ 2740 tp->bno = bno; 2741 tp->size = size; 2742 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2743 return; 2744 } 2745 /* 2746 * If this block is a continuation of the range (either 2747 * follows at the end or preceeds in the front) then we 2748 * add it to the front or back of the list and return. 2749 * 2750 * If it is not a continuation of the trim that we were 2751 * building, using the REPLACE alloctype, we request that 2752 * the old trim request (still in tp) be unhashed and a 2753 * new range started (in ntp). The ffs_blkfree_sendtrim(tp) 2754 * call causes the block range described by tp to be issued 2755 * (and then tp to be freed). 2756 */ 2757 if (bno + numfrags(fs, size) == tp->bno) { 2758 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2759 tp->bno = bno; 2760 tp->size += size; 2761 return; 2762 } else if (bno == tp->bno + numfrags(fs, tp->size)) { 2763 TAILQ_INSERT_TAIL(&tp->blklist, blkelm, blkreqlist); 2764 tp->size += size; 2765 return; 2766 } 2767 ntp = trim_lookup(ump, devvp, bno, size, inum, key, REPLACE); 2768 TAILQ_INSERT_HEAD(&ntp->blklist, blkelm, blkreqlist); 2769 ffs_blkfree_sendtrim(tp); 2770 } 2771 2772 #ifdef INVARIANTS 2773 /* 2774 * Verify allocation of a block or fragment. 2775 * Return 1 if block or fragment is free. 
2776 */ 2777 static int 2778 ffs_checkfreeblk(struct inode *ip, 2779 ufs2_daddr_t bno, 2780 long size) 2781 { 2782 struct fs *fs; 2783 struct cg *cgp; 2784 struct buf *bp; 2785 ufs1_daddr_t cgbno; 2786 int i, frags, blkalloced; 2787 uint8_t *blksfree; 2788 2789 fs = ITOFS(ip); 2790 if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) { 2791 printf("bsize = %ld, size = %ld, fs = %s\n", 2792 (long)fs->fs_bsize, size, fs->fs_fsmnt); 2793 panic("ffs_checkfreeblk: bad size"); 2794 } 2795 if ((uint64_t)bno >= fs->fs_size) 2796 panic("ffs_checkfreeblk: too big block %jd", (intmax_t)bno); 2797 if (ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), 0, &bp, &cgp) != 0) 2798 return (0); 2799 blksfree = cg_blksfree(cgp); 2800 cgbno = dtogd(fs, bno); 2801 if (size == fs->fs_bsize) { 2802 blkalloced = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); 2803 } else { 2804 frags = numfrags(fs, size); 2805 for (blkalloced = 0, i = 0; i < frags; i++) 2806 if (isset(blksfree, cgbno + i)) 2807 blkalloced++; 2808 if (blkalloced != 0 && blkalloced != frags) 2809 panic("ffs_checkfreeblk: partially free fragment"); 2810 } 2811 brelse(bp); 2812 return (blkalloced == 0); 2813 } 2814 #endif /* INVARIANTS */ 2815 2816 /* 2817 * Free an inode. 2818 */ 2819 int 2820 ffs_vfree(struct vnode *pvp, 2821 ino_t ino, 2822 int mode) 2823 { 2824 struct ufsmount *ump; 2825 2826 if (DOINGSOFTDEP(pvp)) { 2827 softdep_freefile(pvp, ino, mode); 2828 return (0); 2829 } 2830 ump = VFSTOUFS(pvp->v_mount); 2831 return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL)); 2832 } 2833 2834 /* 2835 * Do the actual free operation. 2836 * The specified inode is placed back in the free map. 
2837 */ 2838 int 2839 ffs_freefile(struct ufsmount *ump, 2840 struct fs *fs, 2841 struct vnode *devvp, 2842 ino_t ino, 2843 int mode, 2844 struct workhead *wkhd) 2845 { 2846 struct cg *cgp; 2847 struct buf *bp; 2848 daddr_t dbn; 2849 int error; 2850 uint64_t cg; 2851 uint8_t *inosused; 2852 struct cdev *dev; 2853 ino_t cgino; 2854 2855 cg = ino_to_cg(fs, ino); 2856 if (devvp->v_type == VREG) { 2857 /* devvp is a snapshot */ 2858 MPASS(devvp->v_mount->mnt_data == ump); 2859 dev = ump->um_devvp->v_rdev; 2860 } else if (devvp->v_type == VCHR) { 2861 /* devvp is a normal disk device */ 2862 dev = devvp->v_rdev; 2863 } else { 2864 bp = NULL; 2865 return (0); 2866 } 2867 if (ino >= fs->fs_ipg * fs->fs_ncg) 2868 panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s", 2869 devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); 2870 if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { 2871 if (!MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) 2872 return (error); 2873 /* 2874 * Would like to just downgrade to read-only. Until that 2875 * capability is available, just toss the cylinder group 2876 * update and mark the filesystem as needing to run fsck. 
2877 */ 2878 fs->fs_flags |= FS_NEEDSFSCK; 2879 if (devvp->v_type == VREG) 2880 dbn = fragstoblks(fs, cgtod(fs, cg)); 2881 else 2882 dbn = fsbtodb(fs, cgtod(fs, cg)); 2883 error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); 2884 KASSERT(error == 0, ("getblkx failed")); 2885 softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd, true); 2886 bp->b_flags |= B_RELBUF | B_NOCACHE; 2887 bp->b_flags &= ~B_CACHE; 2888 bawrite(bp); 2889 return (error); 2890 } 2891 inosused = cg_inosused(cgp); 2892 cgino = ino % fs->fs_ipg; 2893 if (isclr(inosused, cgino)) { 2894 printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev), 2895 (uintmax_t)ino, fs->fs_fsmnt); 2896 if (fs->fs_ronly == 0) 2897 panic("ffs_freefile: freeing free inode"); 2898 } 2899 clrbit(inosused, cgino); 2900 if (cgino < cgp->cg_irotor) 2901 cgp->cg_irotor = cgino; 2902 cgp->cg_cs.cs_nifree++; 2903 UFS_LOCK(ump); 2904 fs->fs_cstotal.cs_nifree++; 2905 fs->fs_cs(fs, cg).cs_nifree++; 2906 if ((mode & IFMT) == IFDIR) { 2907 cgp->cg_cs.cs_ndir--; 2908 fs->fs_cstotal.cs_ndir--; 2909 fs->fs_cs(fs, cg).cs_ndir--; 2910 } 2911 fs->fs_fmod = 1; 2912 ACTIVECLEAR(fs, cg); 2913 UFS_UNLOCK(ump); 2914 if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR) 2915 softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd, false); 2916 bdwrite(bp); 2917 return (0); 2918 } 2919 2920 /* 2921 * Check to see if a file is free. 2922 * Used to check for allocated files in snapshots. 2923 * Return 1 if file is free. 
2924 */ 2925 int 2926 ffs_checkfreefile(struct fs *fs, 2927 struct vnode *devvp, 2928 ino_t ino) 2929 { 2930 struct cg *cgp; 2931 struct buf *bp; 2932 int ret, error; 2933 uint64_t cg; 2934 uint8_t *inosused; 2935 2936 cg = ino_to_cg(fs, ino); 2937 if ((devvp->v_type != VREG) && (devvp->v_type != VCHR)) 2938 return (1); 2939 if (ino >= fs->fs_ipg * fs->fs_ncg) 2940 return (1); 2941 if ((error = ffs_getcg(fs, devvp, cg, 0, &bp, &cgp)) != 0) 2942 return (1); 2943 inosused = cg_inosused(cgp); 2944 ino %= fs->fs_ipg; 2945 ret = isclr(inosused, ino); 2946 brelse(bp); 2947 return (ret); 2948 } 2949 2950 /* 2951 * Find a block of the specified size in the specified cylinder group. 2952 * 2953 * It is a panic if a request is made to find a block if none are 2954 * available. 2955 */ 2956 static ufs1_daddr_t 2957 ffs_mapsearch(struct fs *fs, 2958 struct cg *cgp, 2959 ufs2_daddr_t bpref, 2960 int allocsiz) 2961 { 2962 ufs1_daddr_t bno; 2963 int start, len, loc, i; 2964 int blk, field, subfield, pos; 2965 uint8_t *blksfree; 2966 2967 /* 2968 * find the fragment by searching through the free block 2969 * map for an appropriate bit pattern 2970 */ 2971 if (bpref) 2972 start = dtogd(fs, bpref) / NBBY; 2973 else 2974 start = cgp->cg_frotor / NBBY; 2975 blksfree = cg_blksfree(cgp); 2976 len = howmany(fs->fs_fpg, NBBY) - start; 2977 loc = scanc((uint64_t)len, (uint8_t *)&blksfree[start], 2978 fragtbl[fs->fs_frag], 2979 (uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 2980 if (loc == 0) { 2981 len = start + 1; 2982 start = 0; 2983 loc = scanc((uint64_t)len, (uint8_t *)&blksfree[0], 2984 fragtbl[fs->fs_frag], 2985 (uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 2986 if (loc == 0) { 2987 printf("start = %d, len = %d, fs = %s\n", 2988 start, len, fs->fs_fsmnt); 2989 panic("ffs_alloccg: map corrupted"); 2990 /* NOTREACHED */ 2991 } 2992 } 2993 bno = (start + len - loc) * NBBY; 2994 cgp->cg_frotor = bno; 2995 /* 2996 * found the byte in the map 2997 * sift through the bits 
to find the selected frag 2998 */ 2999 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { 3000 blk = blkmap(fs, blksfree, bno); 3001 blk <<= 1; 3002 field = around[allocsiz]; 3003 subfield = inside[allocsiz]; 3004 for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { 3005 if ((blk & field) == subfield) 3006 return (bno + pos); 3007 field <<= 1; 3008 subfield <<= 1; 3009 } 3010 } 3011 printf("bno = %ju, fs = %s\n", (intmax_t)bno, fs->fs_fsmnt); 3012 panic("ffs_alloccg: block not in map"); 3013 return (-1); 3014 } 3015 3016 /* 3017 * Fetch and verify a cylinder group. 3018 */ 3019 int 3020 ffs_getcg(struct fs *fs, 3021 struct vnode *devvp, 3022 uint64_t cg, 3023 int flags, 3024 struct buf **bpp, 3025 struct cg **cgpp) 3026 { 3027 struct buf *bp; 3028 struct cg *cgp; 3029 struct mount *mp; 3030 const struct statfs *sfs; 3031 daddr_t blkno; 3032 int error; 3033 3034 *bpp = NULL; 3035 *cgpp = NULL; 3036 if ((fs->fs_metackhash & CK_CYLGRP) != 0) 3037 flags |= GB_CKHASH; 3038 if (devvp->v_type == VCHR) { 3039 blkno = fsbtodb(fs, cgtod(fs, cg)); 3040 mp = devvp->v_rdev->si_mountpt; 3041 } else { 3042 blkno = fragstoblks(fs, cgtod(fs, cg)); 3043 mp = devvp->v_mount; 3044 } 3045 error = breadn_flags(devvp, blkno, blkno, (int)fs->fs_cgsize, NULL, 3046 NULL, 0, NOCRED, flags, ffs_ckhash_cg, &bp); 3047 if (error != 0) 3048 return (error); 3049 cgp = (struct cg *)bp->b_data; 3050 if ((fs->fs_metackhash & CK_CYLGRP) != 0 && 3051 (bp->b_flags & B_CKHASH) != 0 && 3052 cgp->cg_ckhash != bp->b_ckhash) { 3053 if (ppsratecheck(&VFSTOUFS(mp)->um_last_integritymsg, 3054 &VFSTOUFS(mp)->um_secs_integritymsg, 1)) { 3055 sfs = &mp->mnt_stat; 3056 printf("UFS %s%s (%s) cylinder checkhash failed: " 3057 "cg %ju, cgp: 0x%x != bp: 0x%jx\n", 3058 devvp->v_type == VCHR ? 
"" : "snapshot of ", 3059 sfs->f_mntfromname, sfs->f_mntonname, (intmax_t)cg, 3060 cgp->cg_ckhash, (uintmax_t)bp->b_ckhash); 3061 } 3062 bp->b_flags &= ~B_CKHASH; 3063 bp->b_flags |= B_INVAL | B_NOCACHE; 3064 brelse(bp); 3065 return (EINTEGRITY); 3066 } 3067 if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) { 3068 if (ppsratecheck(&VFSTOUFS(mp)->um_last_integritymsg, 3069 &VFSTOUFS(mp)->um_secs_integritymsg, 1)) { 3070 sfs = &mp->mnt_stat; 3071 printf("UFS %s%s (%s)", 3072 devvp->v_type == VCHR ? "" : "snapshot of ", 3073 sfs->f_mntfromname, sfs->f_mntonname); 3074 if (!cg_chkmagic(cgp)) 3075 printf(" cg %ju: bad magic number 0x%x should " 3076 "be 0x%x\n", (intmax_t)cg, cgp->cg_magic, 3077 CG_MAGIC); 3078 else 3079 printf(": wrong cylinder group cg %ju != " 3080 "cgx %u\n", (intmax_t)cg, cgp->cg_cgx); 3081 } 3082 bp->b_flags &= ~B_CKHASH; 3083 bp->b_flags |= B_INVAL | B_NOCACHE; 3084 brelse(bp); 3085 return (EINTEGRITY); 3086 } 3087 bp->b_flags &= ~B_CKHASH; 3088 bp->b_xflags |= BX_BKGRDWRITE; 3089 /* 3090 * If we are using check hashes on the cylinder group then we want 3091 * to limit changing the cylinder group time to when we are actually 3092 * going to write it to disk so that its check hash remains correct 3093 * in memory. If the CK_CYLGRP flag is set the time is updated in 3094 * ffs_bufwrite() as the buffer is queued for writing. Otherwise we 3095 * update the time here as we have done historically. 
3096 */ 3097 if ((fs->fs_metackhash & CK_CYLGRP) != 0) 3098 bp->b_xflags |= BX_CYLGRP; 3099 else 3100 cgp->cg_old_time = cgp->cg_time = time_second; 3101 *bpp = bp; 3102 *cgpp = cgp; 3103 return (0); 3104 } 3105 3106 static void 3107 ffs_ckhash_cg(struct buf *bp) 3108 { 3109 uint32_t ckhash; 3110 struct cg *cgp; 3111 3112 cgp = (struct cg *)bp->b_data; 3113 ckhash = cgp->cg_ckhash; 3114 cgp->cg_ckhash = 0; 3115 bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); 3116 cgp->cg_ckhash = ckhash; 3117 } 3118 3119 /* 3120 * Called when a cylinder group read has failed. If an integrity check 3121 * is the cause of failure then the cylinder group will not be usable 3122 * until the filesystem has been unmounted and fsck has been run to 3123 * repair it. To avoid future attempts to allocate resources from the 3124 * cylinder group, its available resources are set to zero in the 3125 * superblock summary information. Since it will appear to have no 3126 * resources available, no further calls will be made to allocate 3127 * resources from it. When resources are freed to the cylinder group 3128 * the resource free routines will find the cylinder group unusable so 3129 * the resource will simply be discarded and thus will not show up in 3130 * the superblock summary information until they are recovered by fsck. 3131 */ 3132 static void 3133 ffs_checkcgintegrity(struct fs *fs, 3134 uint64_t cg, 3135 int error) 3136 { 3137 3138 if (error != EINTEGRITY) 3139 return; 3140 fs->fs_cstotal.cs_nffree -= fs->fs_cs(fs, cg).cs_nffree; 3141 fs->fs_cs(fs, cg).cs_nffree = 0; 3142 fs->fs_cstotal.cs_nbfree -= fs->fs_cs(fs, cg).cs_nbfree; 3143 fs->fs_cs(fs, cg).cs_nbfree = 0; 3144 fs->fs_cstotal.cs_nifree -= fs->fs_cs(fs, cg).cs_nifree; 3145 fs->fs_cs(fs, cg).cs_nifree = 0; 3146 fs->fs_maxcluster[cg] = 0; 3147 fs->fs_flags |= FS_NEEDSFSCK; 3148 fs->fs_fmod = 1; 3149 } 3150 3151 /* 3152 * Fserr prints the name of a filesystem with an error diagnostic. 
3153 * 3154 * The form of the error message is: 3155 * fs: error message 3156 */ 3157 void 3158 ffs_fserr(struct fs *fs, 3159 ino_t inum, 3160 char *cp) 3161 { 3162 struct thread *td = curthread; /* XXX */ 3163 struct proc *p = td->td_proc; 3164 3165 log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n", 3166 p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum, 3167 fs->fs_fsmnt, cp); 3168 } 3169 3170 /* 3171 * This function provides the capability for the fsck program to 3172 * update an active filesystem. Sixteen operations are provided: 3173 * 3174 * adjrefcnt(inode, amt) - adjusts the reference count on the 3175 * specified inode by the specified amount. Under normal 3176 * operation the count should always go down. Decrementing 3177 * the count to zero will cause the inode to be freed. 3178 * adjblkcnt(inode, amt) - adjust the number of blocks used by the 3179 * inode by the specified amount. 3180 * adjdepth(inode, amt) - adjust the depth of the specified directory 3181 * inode by the specified amount. 3182 * setsize(inode, size) - set the size of the inode to the 3183 * specified size. 3184 * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - 3185 * adjust the superblock summary. 3186 * freedirs(inode, count) - directory inodes [inode..inode + count - 1] 3187 * are marked as free. Inodes should never have to be marked 3188 * as in use. 3189 * freefiles(inode, count) - file inodes [inode..inode + count - 1] 3190 * are marked as free. Inodes should never have to be marked 3191 * as in use. 3192 * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] 3193 * are marked as free. Blocks should never have to be marked 3194 * as in use. 3195 * setflags(flags, set/clear) - the fs_flags field has the specified 3196 * flags set (second parameter +1) or cleared (second parameter -1). 3197 * setcwd(dirinode) - set the current directory to dirinode in the 3198 * filesystem associated with the snapshot. 
3199 * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".." 3200 * in the current directory is oldvalue then change it to newvalue. 3201 * unlink(nameptr, oldvalue) - Verify that the inode number associated 3202 * with nameptr in the current directory is oldvalue then unlink it. 3203 */ 3204 3205 static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); 3206 3207 SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, 3208 CTLFLAG_WR | CTLTYPE_STRUCT | CTLFLAG_NEEDGIANT, 3209 0, 0, sysctl_ffs_fsck, "S,fsck", 3210 "Adjust Inode Reference Count"); 3211 3212 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, 3213 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3214 "Adjust Inode Used Blocks Count"); 3215 3216 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_DEPTH, adjdepth, 3217 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3218 "Adjust Directory Inode Depth"); 3219 3220 static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize, 3221 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3222 "Set the inode size"); 3223 3224 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, 3225 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3226 "Adjust number of directories"); 3227 3228 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, 3229 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3230 "Adjust number of free blocks"); 3231 3232 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, 3233 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3234 "Adjust number of free inodes"); 3235 3236 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, 3237 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3238 "Adjust number of free frags"); 3239 3240 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, 3241 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3242 "Adjust number of free clusters"); 3243 3244 static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, 3245 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3246 "Free Range of Directory Inodes"); 3247 3248 static 
SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, 3249 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3250 "Free Range of File Inodes"); 3251 3252 static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, 3253 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3254 "Free Range of Blocks"); 3255 3256 static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, 3257 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3258 "Change Filesystem Flags"); 3259 3260 static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, 3261 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3262 "Set Current Working Directory"); 3263 3264 static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, 3265 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3266 "Change Value of .. Entry"); 3267 3268 static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, 3269 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3270 "Unlink a Duplicate Name"); 3271 3272 #ifdef DIAGNOSTIC 3273 static int fsckcmds = 0; 3274 SYSCTL_INT(_debug, OID_AUTO, ffs_fsckcmds, CTLFLAG_RW, &fsckcmds, 0, 3275 "print out fsck_ffs-based filesystem update commands"); 3276 #endif /* DIAGNOSTIC */ 3277 3278 static int 3279 sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) 3280 { 3281 struct thread *td = curthread; 3282 struct fsck_cmd cmd; 3283 struct ufsmount *ump; 3284 struct vnode *vp, *dvp, *fdvp; 3285 struct inode *ip, *dp; 3286 struct mount *mp; 3287 struct fs *fs; 3288 struct pwd *pwd; 3289 ufs2_daddr_t blkno; 3290 long blkcnt, blksize; 3291 uint64_t key; 3292 struct file *fp; 3293 cap_rights_t rights; 3294 int filetype, error; 3295 3296 if (req->newptr == NULL || req->newlen > sizeof(cmd)) 3297 return (EBADRPC); 3298 if ((error = SYSCTL_IN(req, &cmd, sizeof(cmd))) != 0) 3299 return (error); 3300 if (cmd.version != FFS_CMD_VERSION) 3301 return (ERPCMISMATCH); 3302 if ((error = getvnode(td, cmd.handle, 3303 cap_rights_init_one(&rights, CAP_FSCK), &fp)) != 0) 3304 return (error); 3305 vp = fp->f_vnode; 3306 if (vp->v_type != VREG && vp->v_type != VDIR) { 3307 fdrop(fp, td); 
3308 return (EINVAL); 3309 } 3310 vn_start_write(vp, &mp, V_WAIT); 3311 if (mp == NULL || 3312 strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { 3313 vn_finished_write(mp); 3314 fdrop(fp, td); 3315 return (EINVAL); 3316 } 3317 ump = VFSTOUFS(mp); 3318 if (mp->mnt_flag & MNT_RDONLY) { 3319 vn_finished_write(mp); 3320 fdrop(fp, td); 3321 return (EROFS); 3322 } 3323 fs = ump->um_fs; 3324 filetype = IFREG; 3325 3326 switch (oidp->oid_number) { 3327 case FFS_SET_FLAGS: 3328 #ifdef DIAGNOSTIC 3329 if (fsckcmds) 3330 printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, 3331 cmd.size > 0 ? "set" : "clear"); 3332 #endif /* DIAGNOSTIC */ 3333 if (cmd.size > 0) 3334 fs->fs_flags |= (long)cmd.value; 3335 else 3336 fs->fs_flags &= ~(long)cmd.value; 3337 break; 3338 3339 case FFS_ADJ_REFCNT: 3340 #ifdef DIAGNOSTIC 3341 if (fsckcmds) { 3342 printf("%s: adjust inode %jd link count by %jd\n", 3343 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3344 (intmax_t)cmd.size); 3345 } 3346 #endif /* DIAGNOSTIC */ 3347 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3348 break; 3349 ip = VTOI(vp); 3350 ip->i_nlink += cmd.size; 3351 DIP_SET_NLINK(ip, ip->i_nlink); 3352 ip->i_effnlink += cmd.size; 3353 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); 3354 error = ffs_update(vp, 1); 3355 if (DOINGSOFTDEP(vp)) 3356 softdep_change_linkcnt(ip); 3357 vput(vp); 3358 break; 3359 3360 case FFS_ADJ_BLKCNT: 3361 #ifdef DIAGNOSTIC 3362 if (fsckcmds) { 3363 printf("%s: adjust inode %jd block count by %jd\n", 3364 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3365 (intmax_t)cmd.size); 3366 } 3367 #endif /* DIAGNOSTIC */ 3368 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3369 break; 3370 ip = VTOI(vp); 3371 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size); 3372 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); 3373 error = ffs_update(vp, 1); 3374 vput(vp); 3375 break; 3376 3377 case FFS_ADJ_DEPTH: 3378 #ifdef DIAGNOSTIC 3379 if (fsckcmds) { 3380 printf("%s: adjust 
directory inode %jd depth by %jd\n", 3381 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3382 (intmax_t)cmd.size); 3383 } 3384 #endif /* DIAGNOSTIC */ 3385 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3386 break; 3387 if (vp->v_type != VDIR) { 3388 vput(vp); 3389 error = ENOTDIR; 3390 break; 3391 } 3392 ip = VTOI(vp); 3393 DIP_SET(ip, i_dirdepth, DIP(ip, i_dirdepth) + cmd.size); 3394 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); 3395 error = ffs_update(vp, 1); 3396 vput(vp); 3397 break; 3398 3399 case FFS_SET_SIZE: 3400 #ifdef DIAGNOSTIC 3401 if (fsckcmds) { 3402 printf("%s: set inode %jd size to %jd\n", 3403 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3404 (intmax_t)cmd.size); 3405 } 3406 #endif /* DIAGNOSTIC */ 3407 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3408 break; 3409 ip = VTOI(vp); 3410 DIP_SET(ip, i_size, cmd.size); 3411 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_MODIFIED); 3412 error = ffs_update(vp, 1); 3413 vput(vp); 3414 break; 3415 3416 case FFS_DIR_FREE: 3417 filetype = IFDIR; 3418 /* fall through */ 3419 3420 case FFS_FILE_FREE: 3421 #ifdef DIAGNOSTIC 3422 if (fsckcmds) { 3423 if (cmd.size == 1) 3424 printf("%s: free %s inode %ju\n", 3425 mp->mnt_stat.f_mntonname, 3426 filetype == IFDIR ? "directory" : "file", 3427 (uintmax_t)cmd.value); 3428 else 3429 printf("%s: free %s inodes %ju-%ju\n", 3430 mp->mnt_stat.f_mntonname, 3431 filetype == IFDIR ? 
"directory" : "file", 3432 (uintmax_t)cmd.value, 3433 (uintmax_t)(cmd.value + cmd.size - 1)); 3434 } 3435 #endif /* DIAGNOSTIC */ 3436 while (cmd.size > 0) { 3437 if ((error = ffs_freefile(ump, fs, ump->um_devvp, 3438 cmd.value, filetype, NULL))) 3439 break; 3440 cmd.size -= 1; 3441 cmd.value += 1; 3442 } 3443 break; 3444 3445 case FFS_BLK_FREE: 3446 #ifdef DIAGNOSTIC 3447 if (fsckcmds) { 3448 if (cmd.size == 1) 3449 printf("%s: free block %jd\n", 3450 mp->mnt_stat.f_mntonname, 3451 (intmax_t)cmd.value); 3452 else 3453 printf("%s: free blocks %jd-%jd\n", 3454 mp->mnt_stat.f_mntonname, 3455 (intmax_t)cmd.value, 3456 (intmax_t)cmd.value + cmd.size - 1); 3457 } 3458 #endif /* DIAGNOSTIC */ 3459 blkno = cmd.value; 3460 blkcnt = cmd.size; 3461 blksize = fs->fs_frag - (blkno % fs->fs_frag); 3462 key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO); 3463 while (blkcnt > 0) { 3464 if (blkcnt < blksize) 3465 blksize = blkcnt; 3466 ffs_blkfree(ump, fs, ump->um_devvp, blkno, 3467 blksize * fs->fs_fsize, UFS_ROOTINO, 3468 VDIR, NULL, key); 3469 blkno += blksize; 3470 blkcnt -= blksize; 3471 blksize = fs->fs_frag; 3472 } 3473 ffs_blkrelease_finish(ump, key); 3474 break; 3475 3476 /* 3477 * Adjust superblock summaries. fsck(8) is expected to 3478 * submit deltas when necessary. 
3479 */ 3480 case FFS_ADJ_NDIR: 3481 #ifdef DIAGNOSTIC 3482 if (fsckcmds) { 3483 printf("%s: adjust number of directories by %jd\n", 3484 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3485 } 3486 #endif /* DIAGNOSTIC */ 3487 fs->fs_cstotal.cs_ndir += cmd.value; 3488 break; 3489 3490 case FFS_ADJ_NBFREE: 3491 #ifdef DIAGNOSTIC 3492 if (fsckcmds) { 3493 printf("%s: adjust number of free blocks by %+jd\n", 3494 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3495 } 3496 #endif /* DIAGNOSTIC */ 3497 fs->fs_cstotal.cs_nbfree += cmd.value; 3498 break; 3499 3500 case FFS_ADJ_NIFREE: 3501 #ifdef DIAGNOSTIC 3502 if (fsckcmds) { 3503 printf("%s: adjust number of free inodes by %+jd\n", 3504 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3505 } 3506 #endif /* DIAGNOSTIC */ 3507 fs->fs_cstotal.cs_nifree += cmd.value; 3508 break; 3509 3510 case FFS_ADJ_NFFREE: 3511 #ifdef DIAGNOSTIC 3512 if (fsckcmds) { 3513 printf("%s: adjust number of free frags by %+jd\n", 3514 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3515 } 3516 #endif /* DIAGNOSTIC */ 3517 fs->fs_cstotal.cs_nffree += cmd.value; 3518 break; 3519 3520 case FFS_ADJ_NUMCLUSTERS: 3521 #ifdef DIAGNOSTIC 3522 if (fsckcmds) { 3523 printf("%s: adjust number of free clusters by %+jd\n", 3524 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3525 } 3526 #endif /* DIAGNOSTIC */ 3527 fs->fs_cstotal.cs_numclusters += cmd.value; 3528 break; 3529 3530 case FFS_SET_CWD: 3531 #ifdef DIAGNOSTIC 3532 if (fsckcmds) { 3533 printf("%s: set current directory to inode %jd\n", 3534 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3535 } 3536 #endif /* DIAGNOSTIC */ 3537 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp))) 3538 break; 3539 AUDIT_ARG_VNODE1(vp); 3540 if ((error = change_dir(vp, td)) != 0) { 3541 vput(vp); 3542 break; 3543 } 3544 VOP_UNLOCK(vp); 3545 pwd_chdir(td, vp); 3546 break; 3547 3548 case FFS_SET_DOTDOT: 3549 #ifdef DIAGNOSTIC 3550 if (fsckcmds) { 3551 printf("%s: change .. 
in cwd from %jd to %jd\n", 3552 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3553 (intmax_t)cmd.size); 3554 } 3555 #endif /* DIAGNOSTIC */ 3556 /* 3557 * First we have to get and lock the parent directory 3558 * to which ".." points. 3559 */ 3560 error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp); 3561 if (error) 3562 break; 3563 /* 3564 * Now we get and lock the child directory containing "..". 3565 */ 3566 pwd = pwd_hold(td); 3567 dvp = pwd->pwd_cdir; 3568 if ((error = vget(dvp, LK_EXCLUSIVE)) != 0) { 3569 vput(fdvp); 3570 pwd_drop(pwd); 3571 break; 3572 } 3573 dp = VTOI(dvp); 3574 SET_I_OFFSET(dp, 12); /* XXX mastertemplate.dot_reclen */ 3575 error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, 3576 DT_DIR, 0); 3577 cache_purge(fdvp); 3578 cache_purge(dvp); 3579 vput(dvp); 3580 vput(fdvp); 3581 pwd_drop(pwd); 3582 break; 3583 3584 case FFS_UNLINK: 3585 #ifdef DIAGNOSTIC 3586 if (fsckcmds) { 3587 char buf[32]; 3588 3589 if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL)) 3590 strncpy(buf, "Name_too_long", 32); 3591 printf("%s: unlink %s (inode %jd)\n", 3592 mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size); 3593 } 3594 #endif /* DIAGNOSTIC */ 3595 /* 3596 * kern_funlinkat will do its own start/finish writes and 3597 * they do not nest, so drop ours here. Setting mp == NULL 3598 * indicates that vn_finished_write is not needed down below. 3599 */ 3600 vn_finished_write(mp); 3601 mp = NULL; 3602 error = kern_funlinkat(td, AT_FDCWD, 3603 (char *)(intptr_t)cmd.value, FD_NONE, UIO_USERSPACE, 3604 0, (ino_t)cmd.size); 3605 break; 3606 3607 default: 3608 #ifdef DIAGNOSTIC 3609 if (fsckcmds) { 3610 printf("Invalid request %d from fsck\n", 3611 oidp->oid_number); 3612 } 3613 #endif /* DIAGNOSTIC */ 3614 error = EINVAL; 3615 break; 3616 } 3617 fdrop(fp, td); 3618 vn_finished_write(mp); 3619 return (error); 3620 } 3621