1 /*- 2 * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause) 3 * 4 * Copyright (c) 2002 Networks Associates Technology, Inc. 5 * All rights reserved. 6 * 7 * This software was developed for the FreeBSD Project by Marshall 8 * Kirk McKusick and Network Associates Laboratories, the Security 9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR 10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS 11 * research program 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1982, 1986, 1989, 1993 35 * The Regents of the University of California. All rights reserved. 36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. Neither the name of the University nor the names of its contributors 46 * may be used to endorse or promote products derived from this software 47 * without specific prior written permission. 48 * 49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 59 * SUCH DAMAGE. 60 * 61 * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 62 */ 63 64 #include <sys/cdefs.h> 65 __FBSDID("$FreeBSD$"); 66 67 #include "opt_quota.h" 68 69 #include <sys/param.h> 70 #include <sys/systm.h> 71 #include <sys/bio.h> 72 #include <sys/buf.h> 73 #include <sys/capsicum.h> 74 #include <sys/conf.h> 75 #include <sys/fcntl.h> 76 #include <sys/file.h> 77 #include <sys/filedesc.h> 78 #include <sys/gsb_crc32.h> 79 #include <sys/kernel.h> 80 #include <sys/mount.h> 81 #include <sys/priv.h> 82 #include <sys/proc.h> 83 #include <sys/stat.h> 84 #include <sys/syscallsubr.h> 85 #include <sys/sysctl.h> 86 #include <sys/syslog.h> 87 #include <sys/taskqueue.h> 88 #include <sys/vnode.h> 89 90 #include <security/audit/audit.h> 91 92 #include <geom/geom.h> 93 #include <geom/geom_vfs.h> 94 95 #include <ufs/ufs/dir.h> 96 #include <ufs/ufs/extattr.h> 97 #include <ufs/ufs/quota.h> 98 #include <ufs/ufs/inode.h> 99 #include <ufs/ufs/ufs_extern.h> 100 #include <ufs/ufs/ufsmount.h> 101 102 #include <ufs/ffs/fs.h> 103 #include <ufs/ffs/ffs_extern.h> 104 #include <ufs/ffs/softdep.h> 105 106 typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref, 107 int size, int rsize); 108 109 static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int); 110 static ufs2_daddr_t 111 ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); 112 static void ffs_blkfree_cg(struct ufsmount *, struct fs *, 113 struct vnode *, ufs2_daddr_t, long, ino_t, 114 struct workhead *); 115 #ifdef INVARIANTS 116 static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); 117 #endif 118 static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int); 119 static ino_t ffs_dirpref(struct inode *); 120 static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t, 121 int, int); 122 static ufs2_daddr_t ffs_hashalloc 123 (struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *); 124 static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int, 125 int); 126 static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); 127 static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); 128 static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); 129 static void ffs_ckhash_cg(struct buf *); 130 131 /* 132 * Allocate a block in the filesystem. 133 * 134 * The size of the requested block is given, which must be some 135 * multiple of fs_fsize and <= fs_bsize. 136 * A preference may be optionally specified. If a preference is given 137 * the following hierarchy is used to allocate a block: 138 * 1) allocate the requested block. 139 * 2) allocate a rotationally optimal block in the same cylinder. 140 * 3) allocate a block in the same cylinder group. 141 * 4) quadratically rehash into other cylinder groups, until an 142 * available block is located. 143 * If no block preference is given the following hierarchy is used 144 * to allocate a block: 145 * 1) allocate a block in the cylinder group that contains the 146 * inode for the file. 147 * 2) quadratically rehash into other cylinder groups, until an 148 * available block is located. 149 */ 150 int 151 ffs_alloc(struct inode *ip, 152 ufs2_daddr_t lbn, 153 ufs2_daddr_t bpref, 154 int size, 155 int flags, 156 struct ucred *cred, 157 ufs2_daddr_t *bnp) 158 { 159 struct fs *fs; 160 struct ufsmount *ump; 161 ufs2_daddr_t bno; 162 u_int cg, reclaimed; 163 int64_t delta; 164 #ifdef QUOTA 165 int error; 166 #endif 167 168 *bnp = 0; 169 ump = ITOUMP(ip); 170 fs = ump->um_fs; 171 mtx_assert(UFS_MTX(ump), MA_OWNED); 172 #ifdef INVARIANTS 173 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { 174 printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", 175 devtoname(ump->um_dev), (long)fs->fs_bsize, size, 176 fs->fs_fsmnt); 177 panic("ffs_alloc: bad size"); 178 } 179 if (cred == NOCRED) 180 panic("ffs_alloc: missing credential"); 181 #endif /* INVARIANTS */ 182 reclaimed = 0; 183 retry: 184 #ifdef QUOTA 185 UFS_UNLOCK(ump); 186 error = chkdq(ip, btodb(size), cred, 0); 187 if (error) 188 return (error); 189 UFS_LOCK(ump); 190 #endif 191 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 192 goto nospace; 193 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && 194 freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) 195 goto nospace; 196 if (bpref >= fs->fs_size) 197 bpref = 0; 198 if (bpref == 0) 199 cg = ino_to_cg(fs, ip->i_number); 200 else 201 cg = dtog(fs, bpref); 202 bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); 203 if (bno > 0) { 204 delta = btodb(size); 205 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 206 if (flags & IO_EXT) 207 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 208 else 209 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 210 *bnp = bno; 211 return (0); 212 } 213 nospace: 214 #ifdef QUOTA 215 UFS_UNLOCK(ump); 216 /* 217 * Restore user's disk quota because allocation failed. 218 */ 219 (void) chkdq(ip, -btodb(size), cred, FORCE); 220 UFS_LOCK(ump); 221 #endif 222 if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 223 reclaimed = 1; 224 softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT); 225 goto retry; 226 } 227 if (ffs_fsfail_cleanup_locked(ump, 0)) { 228 UFS_UNLOCK(ump); 229 return (ENXIO); 230 } 231 if (reclaimed > 0 && 232 ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 233 UFS_UNLOCK(ump); 234 ffs_fserr(fs, ip->i_number, "filesystem full"); 235 uprintf("\n%s: write failed, filesystem is full\n", 236 fs->fs_fsmnt); 237 } else { 238 UFS_UNLOCK(ump); 239 } 240 return (ENOSPC); 241 } 242 243 /* 244 * Reallocate a fragment to a bigger size 245 * 246 * The number and size of the old block is given, and a preference 247 * and new size is also specified. The allocator attempts to extend 248 * the original block. Failing that, the regular block allocator is 249 * invoked to get an appropriate block. 250 */ 251 int 252 ffs_realloccg(struct inode *ip, 253 ufs2_daddr_t lbprev, 254 ufs2_daddr_t bprev, 255 ufs2_daddr_t bpref, 256 int osize, 257 int nsize, 258 int flags, 259 struct ucred *cred, 260 struct buf **bpp) 261 { 262 struct vnode *vp; 263 struct fs *fs; 264 struct buf *bp; 265 struct ufsmount *ump; 266 u_int cg, request, reclaimed; 267 int error, gbflags; 268 ufs2_daddr_t bno; 269 int64_t delta; 270 271 vp = ITOV(ip); 272 ump = ITOUMP(ip); 273 fs = ump->um_fs; 274 bp = NULL; 275 gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; 276 #ifdef WITNESS 277 gbflags |= IS_SNAPSHOT(ip) ? GB_NOWITNESS : 0; 278 #endif 279 280 mtx_assert(UFS_MTX(ump), MA_OWNED); 281 #ifdef INVARIANTS 282 if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 283 panic("ffs_realloccg: allocation on suspended filesystem"); 284 if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || 285 (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { 286 printf( 287 "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", 288 devtoname(ump->um_dev), (long)fs->fs_bsize, osize, 289 nsize, fs->fs_fsmnt); 290 panic("ffs_realloccg: bad size"); 291 } 292 if (cred == NOCRED) 293 panic("ffs_realloccg: missing credential"); 294 #endif /* INVARIANTS */ 295 reclaimed = 0; 296 retry: 297 if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && 298 freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) { 299 goto nospace; 300 } 301 if (bprev == 0) { 302 printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", 303 devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev, 304 fs->fs_fsmnt); 305 panic("ffs_realloccg: bad bprev"); 306 } 307 UFS_UNLOCK(ump); 308 /* 309 * Allocate the extra space in the buffer. 310 */ 311 error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); 312 if (error) { 313 return (error); 314 } 315 316 if (bp->b_blkno == bp->b_lblkno) { 317 if (lbprev >= UFS_NDADDR) 318 panic("ffs_realloccg: lbprev out of range"); 319 bp->b_blkno = fsbtodb(fs, bprev); 320 } 321 322 #ifdef QUOTA 323 error = chkdq(ip, btodb(nsize - osize), cred, 0); 324 if (error) { 325 brelse(bp); 326 return (error); 327 } 328 #endif 329 /* 330 * Check for extension in the existing location. 331 */ 332 *bpp = NULL; 333 cg = dtog(fs, bprev); 334 UFS_LOCK(ump); 335 bno = ffs_fragextend(ip, cg, bprev, osize, nsize); 336 if (bno) { 337 if (bp->b_blkno != fsbtodb(fs, bno)) 338 panic("ffs_realloccg: bad blockno"); 339 delta = btodb(nsize - osize); 340 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 341 if (flags & IO_EXT) 342 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 343 else 344 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 345 allocbuf(bp, nsize); 346 bp->b_flags |= B_DONE; 347 vfs_bio_bzero_buf(bp, osize, nsize - osize); 348 if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 349 vfs_bio_set_valid(bp, osize, nsize - osize); 350 *bpp = bp; 351 return (0); 352 } 353 /* 354 * Allocate a new disk location. 355 */ 356 if (bpref >= fs->fs_size) 357 bpref = 0; 358 switch ((int)fs->fs_optim) { 359 case FS_OPTSPACE: 360 /* 361 * Allocate an exact sized fragment. Although this makes 362 * best use of space, we will waste time relocating it if 363 * the file continues to grow. If the fragmentation is 364 * less than half of the minimum free reserve, we choose 365 * to begin optimizing for time. 366 */ 367 request = nsize; 368 if (fs->fs_minfree <= 5 || 369 fs->fs_cstotal.cs_nffree > 370 (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) 371 break; 372 log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", 373 fs->fs_fsmnt); 374 fs->fs_optim = FS_OPTTIME; 375 break; 376 case FS_OPTTIME: 377 /* 378 * At this point we have discovered a file that is trying to 379 * grow a small fragment to a larger fragment. To save time, 380 * we allocate a full sized block, then free the unused portion. 381 * If the file continues to grow, the `ffs_fragextend' call 382 * above will be able to grow it in place without further 383 * copying. If aberrant programs cause disk fragmentation to 384 * grow within 2% of the free reserve, we choose to begin 385 * optimizing for space. 386 */ 387 request = fs->fs_bsize; 388 if (fs->fs_cstotal.cs_nffree < 389 (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) 390 break; 391 log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", 392 fs->fs_fsmnt); 393 fs->fs_optim = FS_OPTSPACE; 394 break; 395 default: 396 printf("dev = %s, optim = %ld, fs = %s\n", 397 devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt); 398 panic("ffs_realloccg: bad optim"); 399 /* NOTREACHED */ 400 } 401 bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); 402 if (bno > 0) { 403 bp->b_blkno = fsbtodb(fs, bno); 404 if (!DOINGSOFTDEP(vp)) 405 /* 406 * The usual case is that a smaller fragment that 407 * was just allocated has been replaced with a bigger 408 * fragment or a full-size block. If it is marked as 409 * B_DELWRI, the current contents have not been written 410 * to disk. It is possible that the block was written 411 * earlier, but very uncommon. If the block has never 412 * been written, there is no need to send a BIO_DELETE 413 * for it when it is freed. The gain from avoiding the 414 * TRIMs for the common case of unwritten blocks far 415 * exceeds the cost of the write amplification for the 416 * uncommon case of failing to send a TRIM for a block 417 * that had been written. 418 */ 419 ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, 420 ip->i_number, vp->v_type, NULL, 421 (bp->b_flags & B_DELWRI) != 0 ? 422 NOTRIM_KEY : SINGLETON_KEY); 423 delta = btodb(nsize - osize); 424 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); 425 if (flags & IO_EXT) 426 UFS_INODE_SET_FLAG(ip, IN_CHANGE); 427 else 428 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 429 allocbuf(bp, nsize); 430 bp->b_flags |= B_DONE; 431 vfs_bio_bzero_buf(bp, osize, nsize - osize); 432 if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) 433 vfs_bio_set_valid(bp, osize, nsize - osize); 434 *bpp = bp; 435 return (0); 436 } 437 #ifdef QUOTA 438 UFS_UNLOCK(ump); 439 /* 440 * Restore user's disk quota because allocation failed. 441 */ 442 (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); 443 UFS_LOCK(ump); 444 #endif 445 nospace: 446 /* 447 * no space available 448 */ 449 if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { 450 reclaimed = 1; 451 UFS_UNLOCK(ump); 452 if (bp) { 453 brelse(bp); 454 bp = NULL; 455 } 456 UFS_LOCK(ump); 457 softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); 458 goto retry; 459 } 460 if (bp) 461 brelse(bp); 462 if (ffs_fsfail_cleanup_locked(ump, 0)) { 463 UFS_UNLOCK(ump); 464 return (ENXIO); 465 } 466 if (reclaimed > 0 && 467 ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 468 UFS_UNLOCK(ump); 469 ffs_fserr(fs, ip->i_number, "filesystem full"); 470 uprintf("\n%s: write failed, filesystem is full\n", 471 fs->fs_fsmnt); 472 } else { 473 UFS_UNLOCK(ump); 474 } 475 return (ENOSPC); 476 } 477 478 /* 479 * Reallocate a sequence of blocks into a contiguous sequence of blocks. 480 * 481 * The vnode and an array of buffer pointers for a range of sequential 482 * logical blocks to be made contiguous is given. The allocator attempts 483 * to find a range of sequential blocks starting as close as possible 484 * from the end of the allocation for the logical block immediately 485 * preceding the current range. If successful, the physical block numbers 486 * in the buffer pointers and in the inode are changed to reflect the new 487 * allocation. If unsuccessful, the allocation is left unchanged. The 488 * success in doing the reallocation is returned. Note that the error 489 * return is not reflected back to the user. Rather the previous block 490 * allocation will be used. 491 */ 492 493 SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 494 "FFS filesystem"); 495 496 static int doasyncfree = 1; 497 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, 498 "do not force synchronous writes when blocks are reallocated"); 499 500 static int doreallocblks = 1; 501 SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, 502 "enable block reallocation"); 503 504 static int dotrimcons = 1; 505 SYSCTL_INT(_vfs_ffs, OID_AUTO, dotrimcons, CTLFLAG_RWTUN, &dotrimcons, 0, 506 "enable BIO_DELETE / TRIM consolidation"); 507 508 static int maxclustersearch = 10; 509 SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch, 510 0, "max number of cylinder group to search for contigous blocks"); 511 512 #ifdef DIAGNOSTIC 513 static int prtrealloc = 0; 514 SYSCTL_INT(_debug, OID_AUTO, ffs_prtrealloc, CTLFLAG_RW, &prtrealloc, 0, 515 "print out FFS filesystem block reallocation operations"); 516 #endif 517 518 int 519 ffs_reallocblks( 520 struct vop_reallocblks_args /* { 521 struct vnode *a_vp; 522 struct cluster_save *a_buflist; 523 } */ *ap) 524 { 525 struct ufsmount *ump; 526 int error; 527 528 /* 529 * We used to skip reallocating the blocks of a file into a 530 * contiguous sequence if the underlying flash device requested 531 * BIO_DELETE notifications, because devices that benefit from 532 * BIO_DELETE also benefit from not moving the data. However, 533 * the destination for the data is usually moved before the data 534 * is written to the initially allocated location, so we rarely 535 * suffer the penalty of extra writes. With the addition of the 536 * consolidation of contiguous blocks into single BIO_DELETE 537 * operations, having fewer but larger contiguous blocks reduces 538 * the number of (slow and expensive) BIO_DELETE operations. So 539 * when doing BIO_DELETE consolidation, we do block reallocation. 540 * 541 * Skip if reallocblks has been disabled globally. 542 */ 543 ump = ap->a_vp->v_mount->mnt_data; 544 if ((((ump->um_flags) & UM_CANDELETE) != 0 && dotrimcons == 0) || 545 doreallocblks == 0) 546 return (ENOSPC); 547 548 /* 549 * We can't wait in softdep prealloc as it may fsync and recurse 550 * here. Instead we simply fail to reallocate blocks if this 551 * rare condition arises. 552 */ 553 if (DOINGSUJ(ap->a_vp)) 554 if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) 555 return (ENOSPC); 556 vn_seqc_write_begin(ap->a_vp); 557 error = ump->um_fstype == UFS1 ? ffs_reallocblks_ufs1(ap) : 558 ffs_reallocblks_ufs2(ap); 559 vn_seqc_write_end(ap->a_vp); 560 return (error); 561 } 562 563 static int 564 ffs_reallocblks_ufs1( 565 struct vop_reallocblks_args /* { 566 struct vnode *a_vp; 567 struct cluster_save *a_buflist; 568 } */ *ap) 569 { 570 struct fs *fs; 571 struct inode *ip; 572 struct vnode *vp; 573 struct buf *sbp, *ebp, *bp; 574 ufs1_daddr_t *bap, *sbap, *ebap; 575 struct cluster_save *buflist; 576 struct ufsmount *ump; 577 ufs_lbn_t start_lbn, end_lbn; 578 ufs1_daddr_t soff, newblk, blkno; 579 ufs2_daddr_t pref; 580 struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; 581 int i, cg, len, start_lvl, end_lvl, ssize; 582 583 vp = ap->a_vp; 584 ip = VTOI(vp); 585 ump = ITOUMP(ip); 586 fs = ump->um_fs; 587 /* 588 * If we are not tracking block clusters or if we have less than 4% 589 * free blocks left, then do not attempt to cluster. Running with 590 * less than 5% free block reserve is not recommended and those that 591 * choose to do so do not expect to have good file layout. 592 */ 593 if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 594 return (ENOSPC); 595 buflist = ap->a_buflist; 596 len = buflist->bs_nchildren; 597 start_lbn = buflist->bs_children[0]->b_lblkno; 598 end_lbn = start_lbn + len - 1; 599 #ifdef INVARIANTS 600 for (i = 0; i < len; i++) 601 if (!ffs_checkblk(ip, 602 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 603 panic("ffs_reallocblks: unallocated block 1"); 604 for (i = 1; i < len; i++) 605 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 606 panic("ffs_reallocblks: non-logical cluster"); 607 blkno = buflist->bs_children[0]->b_blkno; 608 ssize = fsbtodb(fs, fs->fs_frag); 609 for (i = 1; i < len - 1; i++) 610 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 611 panic("ffs_reallocblks: non-physical cluster %d", i); 612 #endif 613 /* 614 * If the cluster crosses the boundary for the first indirect 615 * block, leave space for the indirect block. Indirect blocks 616 * are initially laid out in a position after the last direct 617 * block. Block reallocation would usually destroy locality by 618 * moving the indirect block out of the way to make room for 619 * data blocks if we didn't compensate here. We should also do 620 * this for other indirect block boundaries, but it is only 621 * important for the first one. 622 */ 623 if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) 624 return (ENOSPC); 625 /* 626 * If the latest allocation is in a new cylinder group, assume that 627 * the filesystem has decided to move and do not force it back to 628 * the previous cylinder group. 629 */ 630 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 631 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 632 return (ENOSPC); 633 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 634 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 635 return (ENOSPC); 636 /* 637 * Get the starting offset and block map for the first block. 638 */ 639 if (start_lvl == 0) { 640 sbap = &ip->i_din1->di_db[0]; 641 soff = start_lbn; 642 } else { 643 idp = &start_ap[start_lvl - 1]; 644 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 645 brelse(sbp); 646 return (ENOSPC); 647 } 648 sbap = (ufs1_daddr_t *)sbp->b_data; 649 soff = idp->in_off; 650 } 651 /* 652 * If the block range spans two block maps, get the second map. 653 */ 654 ebap = NULL; 655 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 656 ssize = len; 657 } else { 658 #ifdef INVARIANTS 659 if (start_lvl > 0 && 660 start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 661 panic("ffs_reallocblk: start == end"); 662 #endif 663 ssize = len - (idp->in_off + 1); 664 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 665 goto fail; 666 ebap = (ufs1_daddr_t *)ebp->b_data; 667 } 668 /* 669 * Find the preferred location for the cluster. If we have not 670 * previously failed at this endeavor, then follow our standard 671 * preference calculation. If we have failed at it, then pick up 672 * where we last ended our search. 673 */ 674 UFS_LOCK(ump); 675 if (ip->i_nextclustercg == -1) 676 pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); 677 else 678 pref = cgdata(fs, ip->i_nextclustercg); 679 /* 680 * Search the block map looking for an allocation of the desired size. 681 * To avoid wasting too much time, we limit the number of cylinder 682 * groups that we will search. 683 */ 684 cg = dtog(fs, pref); 685 for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { 686 if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) 687 break; 688 cg += 1; 689 if (cg >= fs->fs_ncg) 690 cg = 0; 691 } 692 /* 693 * If we have failed in our search, record where we gave up for 694 * next time. Otherwise, fall back to our usual search citerion. 695 */ 696 if (newblk == 0) { 697 ip->i_nextclustercg = cg; 698 UFS_UNLOCK(ump); 699 goto fail; 700 } 701 ip->i_nextclustercg = -1; 702 /* 703 * We have found a new contiguous block. 704 * 705 * First we have to replace the old block pointers with the new 706 * block pointers in the inode and indirect blocks associated 707 * with the file. 708 */ 709 #ifdef DIAGNOSTIC 710 if (prtrealloc) 711 printf("realloc: ino %ju, lbns %jd-%jd\n\told:", 712 (uintmax_t)ip->i_number, 713 (intmax_t)start_lbn, (intmax_t)end_lbn); 714 #endif 715 blkno = newblk; 716 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 717 if (i == ssize) { 718 bap = ebap; 719 soff = -i; 720 } 721 #ifdef INVARIANTS 722 if (!ffs_checkblk(ip, 723 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 724 panic("ffs_reallocblks: unallocated block 2"); 725 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 726 panic("ffs_reallocblks: alloc mismatch"); 727 #endif 728 #ifdef DIAGNOSTIC 729 if (prtrealloc) 730 printf(" %d,", *bap); 731 #endif 732 if (DOINGSOFTDEP(vp)) { 733 if (sbap == &ip->i_din1->di_db[0] && i < ssize) 734 softdep_setup_allocdirect(ip, start_lbn + i, 735 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 736 buflist->bs_children[i]); 737 else 738 softdep_setup_allocindir_page(ip, start_lbn + i, 739 i < ssize ? sbp : ebp, soff + i, blkno, 740 *bap, buflist->bs_children[i]); 741 } 742 *bap++ = blkno; 743 } 744 /* 745 * Next we must write out the modified inode and indirect blocks. 746 * For strict correctness, the writes should be synchronous since 747 * the old block values may have been written to disk. In practise 748 * they are almost never written, but if we are concerned about 749 * strict correctness, the `doasyncfree' flag should be set to zero. 750 * 751 * The test on `doasyncfree' should be changed to test a flag 752 * that shows whether the associated buffers and inodes have 753 * been written. The flag should be set when the cluster is 754 * started and cleared whenever the buffer or inode is flushed. 755 * We can then check below to see if it is set, and do the 756 * synchronous write only when it has been cleared. 757 */ 758 if (sbap != &ip->i_din1->di_db[0]) { 759 if (doasyncfree) 760 bdwrite(sbp); 761 else 762 bwrite(sbp); 763 } else { 764 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 765 if (!doasyncfree) 766 ffs_update(vp, 1); 767 } 768 if (ssize < len) { 769 if (doasyncfree) 770 bdwrite(ebp); 771 else 772 bwrite(ebp); 773 } 774 /* 775 * Last, free the old blocks and assign the new blocks to the buffers. 776 */ 777 #ifdef DIAGNOSTIC 778 if (prtrealloc) 779 printf("\n\tnew:"); 780 #endif 781 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 782 bp = buflist->bs_children[i]; 783 if (!DOINGSOFTDEP(vp)) 784 /* 785 * The usual case is that a set of N-contiguous blocks 786 * that was just allocated has been replaced with a 787 * set of N+1-contiguous blocks. If they are marked as 788 * B_DELWRI, the current contents have not been written 789 * to disk. It is possible that the blocks were written 790 * earlier, but very uncommon. If the blocks have never 791 * been written, there is no need to send a BIO_DELETE 792 * for them when they are freed. The gain from avoiding 793 * the TRIMs for the common case of unwritten blocks 794 * far exceeds the cost of the write amplification for 795 * the uncommon case of failing to send a TRIM for the 796 * blocks that had been written. 797 */ 798 ffs_blkfree(ump, fs, ump->um_devvp, 799 dbtofsb(fs, bp->b_blkno), 800 fs->fs_bsize, ip->i_number, vp->v_type, NULL, 801 (bp->b_flags & B_DELWRI) != 0 ? 802 NOTRIM_KEY : SINGLETON_KEY); 803 bp->b_blkno = fsbtodb(fs, blkno); 804 #ifdef INVARIANTS 805 if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) 806 panic("ffs_reallocblks: unallocated block 3"); 807 #endif 808 #ifdef DIAGNOSTIC 809 if (prtrealloc) 810 printf(" %d,", blkno); 811 #endif 812 } 813 #ifdef DIAGNOSTIC 814 if (prtrealloc) { 815 prtrealloc--; 816 printf("\n"); 817 } 818 #endif 819 return (0); 820 821 fail: 822 if (ssize < len) 823 brelse(ebp); 824 if (sbap != &ip->i_din1->di_db[0]) 825 brelse(sbp); 826 return (ENOSPC); 827 } 828 829 static int 830 ffs_reallocblks_ufs2( 831 struct vop_reallocblks_args /* { 832 struct vnode *a_vp; 833 struct cluster_save *a_buflist; 834 } */ *ap) 835 { 836 struct fs *fs; 837 struct inode *ip; 838 struct vnode *vp; 839 struct buf *sbp, *ebp, *bp; 840 ufs2_daddr_t *bap, *sbap, *ebap; 841 struct cluster_save *buflist; 842 struct ufsmount *ump; 843 ufs_lbn_t start_lbn, end_lbn; 844 ufs2_daddr_t soff, newblk, blkno, pref; 845 struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; 846 int i, cg, len, start_lvl, end_lvl, ssize; 847 848 vp = ap->a_vp; 849 ip = VTOI(vp); 850 ump = ITOUMP(ip); 851 fs = ump->um_fs; 852 /* 853 * If we are not tracking block clusters or if we have less than 4% 854 * free blocks left, then do not attempt to cluster. Running with 855 * less than 5% free block reserve is not recommended and those that 856 * choose to do so do not expect to have good file layout. 857 */ 858 if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) 859 return (ENOSPC); 860 buflist = ap->a_buflist; 861 len = buflist->bs_nchildren; 862 start_lbn = buflist->bs_children[0]->b_lblkno; 863 end_lbn = start_lbn + len - 1; 864 #ifdef INVARIANTS 865 for (i = 0; i < len; i++) 866 if (!ffs_checkblk(ip, 867 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 868 panic("ffs_reallocblks: unallocated block 1"); 869 for (i = 1; i < len; i++) 870 if (buflist->bs_children[i]->b_lblkno != start_lbn + i) 871 panic("ffs_reallocblks: non-logical cluster"); 872 blkno = buflist->bs_children[0]->b_blkno; 873 ssize = fsbtodb(fs, fs->fs_frag); 874 for (i = 1; i < len - 1; i++) 875 if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) 876 panic("ffs_reallocblks: non-physical cluster %d", i); 877 #endif 878 /* 879 * If the cluster crosses the boundary for the first indirect 880 * block, do not move anything in it. Indirect blocks are 881 * usually initially laid out in a position between the data 882 * blocks. Block reallocation would usually destroy locality by 883 * moving the indirect block out of the way to make room for 884 * data blocks if we didn't compensate here. We should also do 885 * this for other indirect block boundaries, but it is only 886 * important for the first one. 887 */ 888 if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) 889 return (ENOSPC); 890 /* 891 * If the latest allocation is in a new cylinder group, assume that 892 * the filesystem has decided to move and do not force it back to 893 * the previous cylinder group. 894 */ 895 if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != 896 dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) 897 return (ENOSPC); 898 if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || 899 ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) 900 return (ENOSPC); 901 /* 902 * Get the starting offset and block map for the first block. 903 */ 904 if (start_lvl == 0) { 905 sbap = &ip->i_din2->di_db[0]; 906 soff = start_lbn; 907 } else { 908 idp = &start_ap[start_lvl - 1]; 909 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { 910 brelse(sbp); 911 return (ENOSPC); 912 } 913 sbap = (ufs2_daddr_t *)sbp->b_data; 914 soff = idp->in_off; 915 } 916 /* 917 * If the block range spans two block maps, get the second map. 918 */ 919 ebap = NULL; 920 if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { 921 ssize = len; 922 } else { 923 #ifdef INVARIANTS 924 if (start_lvl > 0 && 925 start_ap[start_lvl - 1].in_lbn == idp->in_lbn) 926 panic("ffs_reallocblk: start == end"); 927 #endif 928 ssize = len - (idp->in_off + 1); 929 if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) 930 goto fail; 931 ebap = (ufs2_daddr_t *)ebp->b_data; 932 } 933 /* 934 * Find the preferred location for the cluster. If we have not 935 * previously failed at this endeavor, then follow our standard 936 * preference calculation. If we have failed at it, then pick up 937 * where we last ended our search. 938 */ 939 UFS_LOCK(ump); 940 if (ip->i_nextclustercg == -1) 941 pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); 942 else 943 pref = cgdata(fs, ip->i_nextclustercg); 944 /* 945 * Search the block map looking for an allocation of the desired size. 946 * To avoid wasting too much time, we limit the number of cylinder 947 * groups that we will search. 948 */ 949 cg = dtog(fs, pref); 950 for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { 951 if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) 952 break; 953 cg += 1; 954 if (cg >= fs->fs_ncg) 955 cg = 0; 956 } 957 /* 958 * If we have failed in our search, record where we gave up for 959 * next time. Otherwise, fall back to our usual search citerion. 960 */ 961 if (newblk == 0) { 962 ip->i_nextclustercg = cg; 963 UFS_UNLOCK(ump); 964 goto fail; 965 } 966 ip->i_nextclustercg = -1; 967 /* 968 * We have found a new contiguous block. 969 * 970 * First we have to replace the old block pointers with the new 971 * block pointers in the inode and indirect blocks associated 972 * with the file. 973 */ 974 #ifdef DIAGNOSTIC 975 if (prtrealloc) 976 printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, 977 (intmax_t)start_lbn, (intmax_t)end_lbn); 978 #endif 979 blkno = newblk; 980 for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { 981 if (i == ssize) { 982 bap = ebap; 983 soff = -i; 984 } 985 #ifdef INVARIANTS 986 if (!ffs_checkblk(ip, 987 dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) 988 panic("ffs_reallocblks: unallocated block 2"); 989 if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) 990 panic("ffs_reallocblks: alloc mismatch"); 991 #endif 992 #ifdef DIAGNOSTIC 993 if (prtrealloc) 994 printf(" %jd,", (intmax_t)*bap); 995 #endif 996 if (DOINGSOFTDEP(vp)) { 997 if (sbap == &ip->i_din2->di_db[0] && i < ssize) 998 softdep_setup_allocdirect(ip, start_lbn + i, 999 blkno, *bap, fs->fs_bsize, fs->fs_bsize, 1000 buflist->bs_children[i]); 1001 else 1002 softdep_setup_allocindir_page(ip, start_lbn + i, 1003 i < ssize ? sbp : ebp, soff + i, blkno, 1004 *bap, buflist->bs_children[i]); 1005 } 1006 *bap++ = blkno; 1007 } 1008 /* 1009 * Next we must write out the modified inode and indirect blocks. 1010 * For strict correctness, the writes should be synchronous since 1011 * the old block values may have been written to disk. In practise 1012 * they are almost never written, but if we are concerned about 1013 * strict correctness, the `doasyncfree' flag should be set to zero. 1014 * 1015 * The test on `doasyncfree' should be changed to test a flag 1016 * that shows whether the associated buffers and inodes have 1017 * been written. The flag should be set when the cluster is 1018 * started and cleared whenever the buffer or inode is flushed. 1019 * We can then check below to see if it is set, and do the 1020 * synchronous write only when it has been cleared. 1021 */ 1022 if (sbap != &ip->i_din2->di_db[0]) { 1023 if (doasyncfree) 1024 bdwrite(sbp); 1025 else 1026 bwrite(sbp); 1027 } else { 1028 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 1029 if (!doasyncfree) 1030 ffs_update(vp, 1); 1031 } 1032 if (ssize < len) { 1033 if (doasyncfree) 1034 bdwrite(ebp); 1035 else 1036 bwrite(ebp); 1037 } 1038 /* 1039 * Last, free the old blocks and assign the new blocks to the buffers. 1040 */ 1041 #ifdef DIAGNOSTIC 1042 if (prtrealloc) 1043 printf("\n\tnew:"); 1044 #endif 1045 for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { 1046 bp = buflist->bs_children[i]; 1047 if (!DOINGSOFTDEP(vp)) 1048 /* 1049 * The usual case is that a set of N-contiguous blocks 1050 * that was just allocated has been replaced with a 1051 * set of N+1-contiguous blocks. If they are marked as 1052 * B_DELWRI, the current contents have not been written 1053 * to disk. It is possible that the blocks were written 1054 * earlier, but very uncommon. If the blocks have never 1055 * been written, there is no need to send a BIO_DELETE 1056 * for them when they are freed. The gain from avoiding 1057 * the TRIMs for the common case of unwritten blocks 1058 * far exceeds the cost of the write amplification for 1059 * the uncommon case of failing to send a TRIM for the 1060 * blocks that had been written. 1061 */ 1062 ffs_blkfree(ump, fs, ump->um_devvp, 1063 dbtofsb(fs, bp->b_blkno), 1064 fs->fs_bsize, ip->i_number, vp->v_type, NULL, 1065 (bp->b_flags & B_DELWRI) != 0 ? 1066 NOTRIM_KEY : SINGLETON_KEY); 1067 bp->b_blkno = fsbtodb(fs, blkno); 1068 #ifdef INVARIANTS 1069 if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) 1070 panic("ffs_reallocblks: unallocated block 3"); 1071 #endif 1072 #ifdef DIAGNOSTIC 1073 if (prtrealloc) 1074 printf(" %jd,", (intmax_t)blkno); 1075 #endif 1076 } 1077 #ifdef DIAGNOSTIC 1078 if (prtrealloc) { 1079 prtrealloc--; 1080 printf("\n"); 1081 } 1082 #endif 1083 return (0); 1084 1085 fail: 1086 if (ssize < len) 1087 brelse(ebp); 1088 if (sbap != &ip->i_din2->di_db[0]) 1089 brelse(sbp); 1090 return (ENOSPC); 1091 } 1092 1093 /* 1094 * Allocate an inode in the filesystem. 1095 * 1096 * If allocating a directory, use ffs_dirpref to select the inode. 1097 * If allocating in a directory, the following hierarchy is followed: 1098 * 1) allocate the preferred inode. 1099 * 2) allocate an inode in the same cylinder group. 1100 * 3) quadratically rehash into other cylinder groups, until an 1101 * available inode is located. 1102 * If no inode preference is given the following hierarchy is used 1103 * to allocate an inode: 1104 * 1) allocate an inode in cylinder group 0. 1105 * 2) quadratically rehash into other cylinder groups, until an 1106 * available inode is located. 1107 */ 1108 int 1109 ffs_valloc(struct vnode *pvp, 1110 int mode, 1111 struct ucred *cred, 1112 struct vnode **vpp) 1113 { 1114 struct inode *pip; 1115 struct fs *fs; 1116 struct inode *ip; 1117 struct timespec ts; 1118 struct ufsmount *ump; 1119 ino_t ino, ipref; 1120 u_int cg; 1121 int error, reclaimed; 1122 1123 *vpp = NULL; 1124 pip = VTOI(pvp); 1125 ump = ITOUMP(pip); 1126 fs = ump->um_fs; 1127 1128 UFS_LOCK(ump); 1129 reclaimed = 0; 1130 retry: 1131 if (fs->fs_cstotal.cs_nifree == 0) 1132 goto noinodes; 1133 1134 if ((mode & IFMT) == IFDIR) 1135 ipref = ffs_dirpref(pip); 1136 else 1137 ipref = pip->i_number; 1138 if (ipref >= fs->fs_ncg * fs->fs_ipg) 1139 ipref = 0; 1140 cg = ino_to_cg(fs, ipref); 1141 /* 1142 * Track number of dirs created one after another 1143 * in a same cg without intervening by files. 1144 */ 1145 if ((mode & IFMT) == IFDIR) { 1146 if (fs->fs_contigdirs[cg] < 255) 1147 fs->fs_contigdirs[cg]++; 1148 } else { 1149 if (fs->fs_contigdirs[cg] > 0) 1150 fs->fs_contigdirs[cg]--; 1151 } 1152 ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 1153 (allocfcn_t *)ffs_nodealloccg); 1154 if (ino == 0) 1155 goto noinodes; 1156 /* 1157 * Get rid of the cached old vnode, force allocation of a new vnode 1158 * for this inode. If this fails, release the allocated ino and 1159 * return the error. 1160 */ 1161 if ((error = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp, 1162 FFSV_FORCEINSMQ | FFSV_REPLACE | FFSV_NEWINODE)) != 0) { 1163 ffs_vfree(pvp, ino, mode); 1164 return (error); 1165 } 1166 /* 1167 * We got an inode, so check mode and panic if it is already allocated. 1168 */ 1169 ip = VTOI(*vpp); 1170 if (ip->i_mode) { 1171 printf("mode = 0%o, inum = %ju, fs = %s\n", 1172 ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt); 1173 panic("ffs_valloc: dup alloc"); 1174 } 1175 if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ 1176 printf("free inode %s/%lu had %ld blocks\n", 1177 fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks)); 1178 DIP_SET(ip, i_blocks, 0); 1179 } 1180 ip->i_flags = 0; 1181 DIP_SET(ip, i_flags, 0); 1182 if ((mode & IFMT) == IFDIR) 1183 DIP_SET(ip, i_dirdepth, DIP(pip, i_dirdepth) + 1); 1184 /* 1185 * Set up a new generation number for this inode. 1186 */ 1187 while (ip->i_gen == 0 || ++ip->i_gen == 0) 1188 ip->i_gen = arc4random(); 1189 DIP_SET(ip, i_gen, ip->i_gen); 1190 if (fs->fs_magic == FS_UFS2_MAGIC) { 1191 vfs_timestamp(&ts); 1192 ip->i_din2->di_birthtime = ts.tv_sec; 1193 ip->i_din2->di_birthnsec = ts.tv_nsec; 1194 } 1195 ip->i_flag = 0; 1196 (*vpp)->v_vflag = 0; 1197 (*vpp)->v_type = VNON; 1198 if (fs->fs_magic == FS_UFS2_MAGIC) { 1199 (*vpp)->v_op = &ffs_vnodeops2; 1200 UFS_INODE_SET_FLAG(ip, IN_UFS2); 1201 } else { 1202 (*vpp)->v_op = &ffs_vnodeops1; 1203 } 1204 return (0); 1205 noinodes: 1206 if (reclaimed == 0) { 1207 reclaimed = 1; 1208 softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT); 1209 goto retry; 1210 } 1211 if (ffs_fsfail_cleanup_locked(ump, 0)) { 1212 UFS_UNLOCK(ump); 1213 return (ENXIO); 1214 } 1215 if (ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { 1216 UFS_UNLOCK(ump); 1217 ffs_fserr(fs, pip->i_number, "out of inodes"); 1218 uprintf("\n%s: create/symlink failed, no inodes free\n", 1219 fs->fs_fsmnt); 1220 } else { 1221 UFS_UNLOCK(ump); 1222 } 1223 return (ENOSPC); 1224 } 1225 1226 /* 1227 * Find a cylinder group to place a directory. 1228 * 1229 * The policy implemented by this algorithm is to allocate a 1230 * directory inode in the same cylinder group as its parent 1231 * directory, but also to reserve space for its files inodes 1232 * and data. Restrict the number of directories which may be 1233 * allocated one after another in the same cylinder group 1234 * without intervening allocation of files. 1235 * 1236 * If we allocate a first level directory then force allocation 1237 * in another cylinder group. 1238 */ 1239 static ino_t 1240 ffs_dirpref(struct inode *pip) 1241 { 1242 struct fs *fs; 1243 int cg, prefcg, curcg, dirsize, cgsize; 1244 int depth, range, start, end, numdirs, power, numerator, denominator; 1245 u_int avgifree, avgbfree, avgndir, curdirsize; 1246 u_int minifree, minbfree, maxndir; 1247 u_int maxcontigdirs; 1248 1249 mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED); 1250 fs = ITOFS(pip); 1251 1252 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 1253 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1254 avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; 1255 1256 /* 1257 * Select a preferred cylinder group to place a new directory. 1258 * If we are near the root of the filesystem we aim to spread 1259 * them out as much as possible. As we descend deeper from the 1260 * root we cluster them closer together around their parent as 1261 * we expect them to be more closely interactive. Higher-level 1262 * directories like usr/src/sys and usr/src/bin should be 1263 * separated while the directories in these areas are more 1264 * likely to be accessed together so should be closer. 1265 * 1266 * We pick a range of cylinder groups around the cylinder group 1267 * of the directory in which we are being created. The size of 1268 * the range for our search is based on our depth from the root 1269 * of our filesystem. We then probe that range based on how many 1270 * directories are already present. The first new directory is at 1271 * 1/2 (middle) of the range; the second is in the first 1/4 of the 1272 * range, then at 3/4, 1/8, 3/8, 5/8, 7/8, 1/16, 3/16, 5/16, etc. 1273 */ 1274 depth = DIP(pip, i_dirdepth); 1275 range = fs->fs_ncg / (1 << depth); 1276 curcg = ino_to_cg(fs, pip->i_number); 1277 start = curcg - (range / 2); 1278 if (start < 0) 1279 start += fs->fs_ncg; 1280 end = curcg + (range / 2); 1281 if (end >= fs->fs_ncg) 1282 end -= fs->fs_ncg; 1283 numdirs = pip->i_effnlink - 1; 1284 power = fls(numdirs); 1285 numerator = (numdirs & ~(1 << (power - 1))) * 2 + 1; 1286 denominator = 1 << power; 1287 prefcg = (curcg - (range / 2) + (range * numerator / denominator)); 1288 if (prefcg < 0) 1289 prefcg += fs->fs_ncg; 1290 if (prefcg >= fs->fs_ncg) 1291 prefcg -= fs->fs_ncg; 1292 /* 1293 * If this filesystem is not tracking directory depths, 1294 * revert to the old algorithm. 1295 */ 1296 if (depth == 0 && pip->i_number != UFS_ROOTINO) 1297 prefcg = curcg; 1298 1299 /* 1300 * Count various limits which used for 1301 * optimal allocation of a directory inode. 1302 */ 1303 maxndir = min(avgndir + (1 << depth), fs->fs_ipg); 1304 minifree = avgifree - avgifree / 4; 1305 if (minifree < 1) 1306 minifree = 1; 1307 minbfree = avgbfree - avgbfree / 4; 1308 if (minbfree < 1) 1309 minbfree = 1; 1310 cgsize = fs->fs_fsize * fs->fs_fpg; 1311 dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; 1312 curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; 1313 if (dirsize < curdirsize) 1314 dirsize = curdirsize; 1315 if (dirsize <= 0) 1316 maxcontigdirs = 0; /* dirsize overflowed */ 1317 else 1318 maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); 1319 if (fs->fs_avgfpdir > 0) 1320 maxcontigdirs = min(maxcontigdirs, 1321 fs->fs_ipg / fs->fs_avgfpdir); 1322 if (maxcontigdirs == 0) 1323 maxcontigdirs = 1; 1324 1325 /* 1326 * Limit number of dirs in one cg and reserve space for 1327 * regular files, but only if we have no deficit in 1328 * inodes or space. 1329 * 1330 * We are trying to find a suitable cylinder group nearby 1331 * our preferred cylinder group to place a new directory. 1332 * We scan from our preferred cylinder group forward looking 1333 * for a cylinder group that meets our criterion. If we get 1334 * to the final cylinder group and do not find anything, 1335 * we start scanning forwards from the beginning of the 1336 * filesystem. While it might seem sensible to start scanning 1337 * backwards or even to alternate looking forward and backward, 1338 * this approach fails badly when the filesystem is nearly full. 1339 * Specifically, we first search all the areas that have no space 1340 * and finally try the one preceding that. We repeat this on 1341 * every request and in the case of the final block end up 1342 * searching the entire filesystem. By jumping to the front 1343 * of the filesystem, our future forward searches always look 1344 * in new cylinder groups so finds every possible block after 1345 * one pass over the filesystem. 1346 */ 1347 for (cg = prefcg; cg < fs->fs_ncg; cg++) 1348 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 1349 fs->fs_cs(fs, cg).cs_nifree >= minifree && 1350 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 1351 if (fs->fs_contigdirs[cg] < maxcontigdirs) 1352 return ((ino_t)(fs->fs_ipg * cg)); 1353 } 1354 for (cg = 0; cg < prefcg; cg++) 1355 if (fs->fs_cs(fs, cg).cs_ndir < maxndir && 1356 fs->fs_cs(fs, cg).cs_nifree >= minifree && 1357 fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { 1358 if (fs->fs_contigdirs[cg] < maxcontigdirs) 1359 return ((ino_t)(fs->fs_ipg * cg)); 1360 } 1361 /* 1362 * This is a backstop when we have deficit in space. 1363 */ 1364 for (cg = prefcg; cg < fs->fs_ncg; cg++) 1365 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 1366 return ((ino_t)(fs->fs_ipg * cg)); 1367 for (cg = 0; cg < prefcg; cg++) 1368 if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) 1369 break; 1370 return ((ino_t)(fs->fs_ipg * cg)); 1371 } 1372 1373 /* 1374 * Select the desired position for the next block in a file. The file is 1375 * logically divided into sections. The first section is composed of the 1376 * direct blocks and the next fs_maxbpg blocks. Each additional section 1377 * contains fs_maxbpg blocks. 1378 * 1379 * If no blocks have been allocated in the first section, the policy is to 1380 * request a block in the same cylinder group as the inode that describes 1381 * the file. The first indirect is allocated immediately following the last 1382 * direct block and the data blocks for the first indirect immediately 1383 * follow it. 1384 * 1385 * If no blocks have been allocated in any other section, the indirect 1386 * block(s) are allocated in the same cylinder group as its inode in an 1387 * area reserved immediately following the inode blocks. The policy for 1388 * the data blocks is to place them in a cylinder group with a greater than 1389 * average number of free blocks. An appropriate cylinder group is found 1390 * by using a rotor that sweeps the cylinder groups. When a new group of 1391 * blocks is needed, the sweep begins in the cylinder group following the 1392 * cylinder group from which the previous allocation was made. The sweep 1393 * continues until a cylinder group with greater than the average number 1394 * of free blocks is found. If the allocation is for the first block in an 1395 * indirect block or the previous block is a hole, then the information on 1396 * the previous allocation is unavailable; here a best guess is made based 1397 * on the logical block number being allocated. 1398 * 1399 * If a section is already partially allocated, the policy is to 1400 * allocate blocks contiguously within the section if possible. 1401 */ 1402 ufs2_daddr_t 1403 ffs_blkpref_ufs1(struct inode *ip, 1404 ufs_lbn_t lbn, 1405 int indx, 1406 ufs1_daddr_t *bap) 1407 { 1408 struct fs *fs; 1409 u_int cg, inocg; 1410 u_int avgbfree, startcg; 1411 ufs2_daddr_t pref, prevbn; 1412 1413 KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1414 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1415 fs = ITOFS(ip); 1416 /* 1417 * Allocation of indirect blocks is indicated by passing negative 1418 * values in indx: -1 for single indirect, -2 for double indirect, 1419 * -3 for triple indirect. As noted below, we attempt to allocate 1420 * the first indirect inline with the file data. For all later 1421 * indirect blocks, the data is often allocated in other cylinder 1422 * groups. However to speed random file access and to speed up 1423 * fsck, the filesystem reserves the first fs_metaspace blocks 1424 * (typically half of fs_minfree) of the data area of each cylinder 1425 * group to hold these later indirect blocks. 1426 */ 1427 inocg = ino_to_cg(fs, ip->i_number); 1428 if (indx < 0) { 1429 /* 1430 * Our preference for indirect blocks is the zone at the 1431 * beginning of the inode's cylinder group data area that 1432 * we try to reserve for indirect blocks. 1433 */ 1434 pref = cgmeta(fs, inocg); 1435 /* 1436 * If we are allocating the first indirect block, try to 1437 * place it immediately following the last direct block. 1438 */ 1439 if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && 1440 ip->i_din1->di_db[UFS_NDADDR - 1] != 0) 1441 pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag; 1442 return (pref); 1443 } 1444 /* 1445 * If we are allocating the first data block in the first indirect 1446 * block and the indirect has been allocated in the data block area, 1447 * try to place it immediately following the indirect block. 1448 */ 1449 if (lbn == UFS_NDADDR) { 1450 pref = ip->i_din1->di_ib[0]; 1451 if (pref != 0 && pref >= cgdata(fs, inocg) && 1452 pref < cgbase(fs, inocg + 1)) 1453 return (pref + fs->fs_frag); 1454 } 1455 /* 1456 * If we are at the beginning of a file, or we have already allocated 1457 * the maximum number of blocks per cylinder group, or we do not 1458 * have a block allocated immediately preceding us, then we need 1459 * to decide where to start allocating new blocks. 1460 */ 1461 if (indx == 0) { 1462 prevbn = 0; 1463 } else { 1464 prevbn = bap[indx - 1]; 1465 if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, 1466 fs->fs_bsize) != 0) 1467 prevbn = 0; 1468 } 1469 if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { 1470 /* 1471 * If we are allocating a directory data block, we want 1472 * to place it in the metadata area. 1473 */ 1474 if ((ip->i_mode & IFMT) == IFDIR) 1475 return (cgmeta(fs, inocg)); 1476 /* 1477 * Until we fill all the direct and all the first indirect's 1478 * blocks, we try to allocate in the data area of the inode's 1479 * cylinder group. 1480 */ 1481 if (lbn < UFS_NDADDR + NINDIR(fs)) 1482 return (cgdata(fs, inocg)); 1483 /* 1484 * Find a cylinder with greater than average number of 1485 * unused data blocks. 1486 */ 1487 if (indx == 0 || prevbn == 0) 1488 startcg = inocg + lbn / fs->fs_maxbpg; 1489 else 1490 startcg = dtog(fs, prevbn) + 1; 1491 startcg %= fs->fs_ncg; 1492 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1493 for (cg = startcg; cg < fs->fs_ncg; cg++) 1494 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1495 fs->fs_cgrotor = cg; 1496 return (cgdata(fs, cg)); 1497 } 1498 for (cg = 0; cg <= startcg; cg++) 1499 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1500 fs->fs_cgrotor = cg; 1501 return (cgdata(fs, cg)); 1502 } 1503 return (0); 1504 } 1505 /* 1506 * Otherwise, we just always try to lay things out contiguously. 1507 */ 1508 return (prevbn + fs->fs_frag); 1509 } 1510 1511 /* 1512 * Same as above, but for UFS2 1513 */ 1514 ufs2_daddr_t 1515 ffs_blkpref_ufs2(struct inode *ip, 1516 ufs_lbn_t lbn, 1517 int indx, 1518 ufs2_daddr_t *bap) 1519 { 1520 struct fs *fs; 1521 u_int cg, inocg; 1522 u_int avgbfree, startcg; 1523 ufs2_daddr_t pref, prevbn; 1524 1525 KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); 1526 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1527 fs = ITOFS(ip); 1528 /* 1529 * Allocation of indirect blocks is indicated by passing negative 1530 * values in indx: -1 for single indirect, -2 for double indirect, 1531 * -3 for triple indirect. As noted below, we attempt to allocate 1532 * the first indirect inline with the file data. For all later 1533 * indirect blocks, the data is often allocated in other cylinder 1534 * groups. However to speed random file access and to speed up 1535 * fsck, the filesystem reserves the first fs_metaspace blocks 1536 * (typically half of fs_minfree) of the data area of each cylinder 1537 * group to hold these later indirect blocks. 1538 */ 1539 inocg = ino_to_cg(fs, ip->i_number); 1540 if (indx < 0) { 1541 /* 1542 * Our preference for indirect blocks is the zone at the 1543 * beginning of the inode's cylinder group data area that 1544 * we try to reserve for indirect blocks. 1545 */ 1546 pref = cgmeta(fs, inocg); 1547 /* 1548 * If we are allocating the first indirect block, try to 1549 * place it immediately following the last direct block. 1550 */ 1551 if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && 1552 ip->i_din2->di_db[UFS_NDADDR - 1] != 0) 1553 pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag; 1554 return (pref); 1555 } 1556 /* 1557 * If we are allocating the first data block in the first indirect 1558 * block and the indirect has been allocated in the data block area, 1559 * try to place it immediately following the indirect block. 1560 */ 1561 if (lbn == UFS_NDADDR) { 1562 pref = ip->i_din2->di_ib[0]; 1563 if (pref != 0 && pref >= cgdata(fs, inocg) && 1564 pref < cgbase(fs, inocg + 1)) 1565 return (pref + fs->fs_frag); 1566 } 1567 /* 1568 * If we are at the beginning of a file, or we have already allocated 1569 * the maximum number of blocks per cylinder group, or we do not 1570 * have a block allocated immediately preceding us, then we need 1571 * to decide where to start allocating new blocks. 1572 */ 1573 if (indx == 0) { 1574 prevbn = 0; 1575 } else { 1576 prevbn = bap[indx - 1]; 1577 if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, 1578 fs->fs_bsize) != 0) 1579 prevbn = 0; 1580 } 1581 if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { 1582 /* 1583 * If we are allocating a directory data block, we want 1584 * to place it in the metadata area. 1585 */ 1586 if ((ip->i_mode & IFMT) == IFDIR) 1587 return (cgmeta(fs, inocg)); 1588 /* 1589 * Until we fill all the direct and all the first indirect's 1590 * blocks, we try to allocate in the data area of the inode's 1591 * cylinder group. 1592 */ 1593 if (lbn < UFS_NDADDR + NINDIR(fs)) 1594 return (cgdata(fs, inocg)); 1595 /* 1596 * Find a cylinder with greater than average number of 1597 * unused data blocks. 1598 */ 1599 if (indx == 0 || prevbn == 0) 1600 startcg = inocg + lbn / fs->fs_maxbpg; 1601 else 1602 startcg = dtog(fs, prevbn) + 1; 1603 startcg %= fs->fs_ncg; 1604 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 1605 for (cg = startcg; cg < fs->fs_ncg; cg++) 1606 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1607 fs->fs_cgrotor = cg; 1608 return (cgdata(fs, cg)); 1609 } 1610 for (cg = 0; cg <= startcg; cg++) 1611 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 1612 fs->fs_cgrotor = cg; 1613 return (cgdata(fs, cg)); 1614 } 1615 return (0); 1616 } 1617 /* 1618 * Otherwise, we just always try to lay things out contiguously. 1619 */ 1620 return (prevbn + fs->fs_frag); 1621 } 1622 1623 /* 1624 * Implement the cylinder overflow algorithm. 1625 * 1626 * The policy implemented by this algorithm is: 1627 * 1) allocate the block in its requested cylinder group. 1628 * 2) quadratically rehash on the cylinder group number. 1629 * 3) brute force search for a free block. 1630 * 1631 * Must be called with the UFS lock held. Will release the lock on success 1632 * and return with it held on failure. 1633 */ 1634 /*VARARGS5*/ 1635 static ufs2_daddr_t 1636 ffs_hashalloc(struct inode *ip, 1637 u_int cg, 1638 ufs2_daddr_t pref, 1639 int size, /* Search size for data blocks, mode for inodes */ 1640 int rsize, /* Real allocated size. */ 1641 allocfcn_t *allocator) 1642 { 1643 struct fs *fs; 1644 ufs2_daddr_t result; 1645 u_int i, icg = cg; 1646 1647 mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); 1648 #ifdef INVARIANTS 1649 if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) 1650 panic("ffs_hashalloc: allocation on suspended filesystem"); 1651 #endif 1652 fs = ITOFS(ip); 1653 /* 1654 * 1: preferred cylinder group 1655 */ 1656 result = (*allocator)(ip, cg, pref, size, rsize); 1657 if (result) 1658 return (result); 1659 /* 1660 * 2: quadratic rehash 1661 */ 1662 for (i = 1; i < fs->fs_ncg; i *= 2) { 1663 cg += i; 1664 if (cg >= fs->fs_ncg) 1665 cg -= fs->fs_ncg; 1666 result = (*allocator)(ip, cg, 0, size, rsize); 1667 if (result) 1668 return (result); 1669 } 1670 /* 1671 * 3: brute force search 1672 * Note that we start at i == 2, since 0 was checked initially, 1673 * and 1 is always checked in the quadratic rehash. 1674 */ 1675 cg = (icg + 2) % fs->fs_ncg; 1676 for (i = 2; i < fs->fs_ncg; i++) { 1677 result = (*allocator)(ip, cg, 0, size, rsize); 1678 if (result) 1679 return (result); 1680 cg++; 1681 if (cg == fs->fs_ncg) 1682 cg = 0; 1683 } 1684 return (0); 1685 } 1686 1687 /* 1688 * Determine whether a fragment can be extended. 1689 * 1690 * Check to see if the necessary fragments are available, and 1691 * if they are, allocate them. 1692 */ 1693 static ufs2_daddr_t 1694 ffs_fragextend(struct inode *ip, 1695 u_int cg, 1696 ufs2_daddr_t bprev, 1697 int osize, 1698 int nsize) 1699 { 1700 struct fs *fs; 1701 struct cg *cgp; 1702 struct buf *bp; 1703 struct ufsmount *ump; 1704 int nffree; 1705 long bno; 1706 int frags, bbase; 1707 int i, error; 1708 u_int8_t *blksfree; 1709 1710 ump = ITOUMP(ip); 1711 fs = ump->um_fs; 1712 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) 1713 return (0); 1714 frags = numfrags(fs, nsize); 1715 bbase = fragnum(fs, bprev); 1716 if (bbase > fragnum(fs, (bprev + frags - 1))) { 1717 /* cannot extend across a block boundary */ 1718 return (0); 1719 } 1720 UFS_UNLOCK(ump); 1721 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) 1722 goto fail; 1723 bno = dtogd(fs, bprev); 1724 blksfree = cg_blksfree(cgp); 1725 for (i = numfrags(fs, osize); i < frags; i++) 1726 if (isclr(blksfree, bno + i)) 1727 goto fail; 1728 /* 1729 * the current fragment can be extended 1730 * deduct the count on fragment being extended into 1731 * increase the count on the remaining fragment (if any) 1732 * allocate the extended piece 1733 */ 1734 for (i = frags; i < fs->fs_frag - bbase; i++) 1735 if (isclr(blksfree, bno + i)) 1736 break; 1737 cgp->cg_frsum[i - numfrags(fs, osize)]--; 1738 if (i != frags) 1739 cgp->cg_frsum[i - frags]++; 1740 for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) { 1741 clrbit(blksfree, bno + i); 1742 cgp->cg_cs.cs_nffree--; 1743 nffree++; 1744 } 1745 UFS_LOCK(ump); 1746 fs->fs_cstotal.cs_nffree -= nffree; 1747 fs->fs_cs(fs, cg).cs_nffree -= nffree; 1748 fs->fs_fmod = 1; 1749 ACTIVECLEAR(fs, cg); 1750 UFS_UNLOCK(ump); 1751 if (DOINGSOFTDEP(ITOV(ip))) 1752 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, 1753 frags, numfrags(fs, osize)); 1754 bdwrite(bp); 1755 return (bprev); 1756 1757 fail: 1758 brelse(bp); 1759 UFS_LOCK(ump); 1760 return (0); 1761 1762 } 1763 1764 /* 1765 * Determine whether a block can be allocated. 1766 * 1767 * Check to see if a block of the appropriate size is available, 1768 * and if it is, allocate it. 1769 */ 1770 static ufs2_daddr_t 1771 ffs_alloccg(struct inode *ip, 1772 u_int cg, 1773 ufs2_daddr_t bpref, 1774 int size, 1775 int rsize) 1776 { 1777 struct fs *fs; 1778 struct cg *cgp; 1779 struct buf *bp; 1780 struct ufsmount *ump; 1781 ufs1_daddr_t bno; 1782 ufs2_daddr_t blkno; 1783 int i, allocsiz, error, frags; 1784 u_int8_t *blksfree; 1785 1786 ump = ITOUMP(ip); 1787 fs = ump->um_fs; 1788 if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) 1789 return (0); 1790 UFS_UNLOCK(ump); 1791 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0 || 1792 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) 1793 goto fail; 1794 if (size == fs->fs_bsize) { 1795 UFS_LOCK(ump); 1796 blkno = ffs_alloccgblk(ip, bp, bpref, rsize); 1797 ACTIVECLEAR(fs, cg); 1798 UFS_UNLOCK(ump); 1799 bdwrite(bp); 1800 return (blkno); 1801 } 1802 /* 1803 * check to see if any fragments are already available 1804 * allocsiz is the size which will be allocated, hacking 1805 * it down to a smaller size if necessary 1806 */ 1807 blksfree = cg_blksfree(cgp); 1808 frags = numfrags(fs, size); 1809 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) 1810 if (cgp->cg_frsum[allocsiz] != 0) 1811 break; 1812 if (allocsiz == fs->fs_frag) { 1813 /* 1814 * no fragments were available, so a block will be 1815 * allocated, and hacked up 1816 */ 1817 if (cgp->cg_cs.cs_nbfree == 0) 1818 goto fail; 1819 UFS_LOCK(ump); 1820 blkno = ffs_alloccgblk(ip, bp, bpref, rsize); 1821 ACTIVECLEAR(fs, cg); 1822 UFS_UNLOCK(ump); 1823 bdwrite(bp); 1824 return (blkno); 1825 } 1826 KASSERT(size == rsize, 1827 ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); 1828 bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); 1829 if (bno < 0) 1830 goto fail; 1831 for (i = 0; i < frags; i++) 1832 clrbit(blksfree, bno + i); 1833 cgp->cg_cs.cs_nffree -= frags; 1834 cgp->cg_frsum[allocsiz]--; 1835 if (frags != allocsiz) 1836 cgp->cg_frsum[allocsiz - frags]++; 1837 UFS_LOCK(ump); 1838 fs->fs_cstotal.cs_nffree -= frags; 1839 fs->fs_cs(fs, cg).cs_nffree -= frags; 1840 fs->fs_fmod = 1; 1841 blkno = cgbase(fs, cg) + bno; 1842 ACTIVECLEAR(fs, cg); 1843 UFS_UNLOCK(ump); 1844 if (DOINGSOFTDEP(ITOV(ip))) 1845 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); 1846 bdwrite(bp); 1847 return (blkno); 1848 1849 fail: 1850 brelse(bp); 1851 UFS_LOCK(ump); 1852 return (0); 1853 } 1854 1855 /* 1856 * Allocate a block in a cylinder group. 1857 * 1858 * This algorithm implements the following policy: 1859 * 1) allocate the requested block. 1860 * 2) allocate a rotationally optimal block in the same cylinder. 1861 * 3) allocate the next available block on the block rotor for the 1862 * specified cylinder group. 1863 * Note that this routine only allocates fs_bsize blocks; these 1864 * blocks may be fragmented by the routine that allocates them. 1865 */ 1866 static ufs2_daddr_t 1867 ffs_alloccgblk(struct inode *ip, 1868 struct buf *bp, 1869 ufs2_daddr_t bpref, 1870 int size) 1871 { 1872 struct fs *fs; 1873 struct cg *cgp; 1874 struct ufsmount *ump; 1875 ufs1_daddr_t bno; 1876 ufs2_daddr_t blkno; 1877 u_int8_t *blksfree; 1878 int i, cgbpref; 1879 1880 ump = ITOUMP(ip); 1881 fs = ump->um_fs; 1882 mtx_assert(UFS_MTX(ump), MA_OWNED); 1883 cgp = (struct cg *)bp->b_data; 1884 blksfree = cg_blksfree(cgp); 1885 if (bpref == 0) { 1886 bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag; 1887 } else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) { 1888 /* map bpref to correct zone in this cg */ 1889 if (bpref < cgdata(fs, cgbpref)) 1890 bpref = cgmeta(fs, cgp->cg_cgx); 1891 else 1892 bpref = cgdata(fs, cgp->cg_cgx); 1893 } 1894 /* 1895 * if the requested block is available, use it 1896 */ 1897 bno = dtogd(fs, blknum(fs, bpref)); 1898 if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) 1899 goto gotit; 1900 /* 1901 * Take the next available block in this cylinder group. 1902 */ 1903 bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); 1904 if (bno < 0) 1905 return (0); 1906 /* Update cg_rotor only if allocated from the data zone */ 1907 if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx))) 1908 cgp->cg_rotor = bno; 1909 gotit: 1910 blkno = fragstoblks(fs, bno); 1911 ffs_clrblock(fs, blksfree, (long)blkno); 1912 ffs_clusteracct(fs, cgp, blkno, -1); 1913 cgp->cg_cs.cs_nbfree--; 1914 fs->fs_cstotal.cs_nbfree--; 1915 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; 1916 fs->fs_fmod = 1; 1917 blkno = cgbase(fs, cgp->cg_cgx) + bno; 1918 /* 1919 * If the caller didn't want the whole block free the frags here. 1920 */ 1921 size = numfrags(fs, size); 1922 if (size != fs->fs_frag) { 1923 bno = dtogd(fs, blkno); 1924 for (i = size; i < fs->fs_frag; i++) 1925 setbit(blksfree, bno + i); 1926 i = fs->fs_frag - size; 1927 cgp->cg_cs.cs_nffree += i; 1928 fs->fs_cstotal.cs_nffree += i; 1929 fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; 1930 fs->fs_fmod = 1; 1931 cgp->cg_frsum[i]++; 1932 } 1933 /* XXX Fixme. */ 1934 UFS_UNLOCK(ump); 1935 if (DOINGSOFTDEP(ITOV(ip))) 1936 softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0); 1937 UFS_LOCK(ump); 1938 return (blkno); 1939 } 1940 1941 /* 1942 * Determine whether a cluster can be allocated. 1943 * 1944 * We do not currently check for optimal rotational layout if there 1945 * are multiple choices in the same cylinder group. Instead we just 1946 * take the first one that we find following bpref. 1947 */ 1948 static ufs2_daddr_t 1949 ffs_clusteralloc(struct inode *ip, 1950 u_int cg, 1951 ufs2_daddr_t bpref, 1952 int len) 1953 { 1954 struct fs *fs; 1955 struct cg *cgp; 1956 struct buf *bp; 1957 struct ufsmount *ump; 1958 int i, run, bit, map, got, error; 1959 ufs2_daddr_t bno; 1960 u_char *mapp; 1961 int32_t *lp; 1962 u_int8_t *blksfree; 1963 1964 ump = ITOUMP(ip); 1965 fs = ump->um_fs; 1966 if (fs->fs_maxcluster[cg] < len) 1967 return (0); 1968 UFS_UNLOCK(ump); 1969 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { 1970 UFS_LOCK(ump); 1971 return (0); 1972 } 1973 /* 1974 * Check to see if a cluster of the needed size (or bigger) is 1975 * available in this cylinder group. 1976 */ 1977 lp = &cg_clustersum(cgp)[len]; 1978 for (i = len; i <= fs->fs_contigsumsize; i++) 1979 if (*lp++ > 0) 1980 break; 1981 if (i > fs->fs_contigsumsize) { 1982 /* 1983 * This is the first time looking for a cluster in this 1984 * cylinder group. Update the cluster summary information 1985 * to reflect the true maximum sized cluster so that 1986 * future cluster allocation requests can avoid reading 1987 * the cylinder group map only to find no clusters. 1988 */ 1989 lp = &cg_clustersum(cgp)[len - 1]; 1990 for (i = len - 1; i > 0; i--) 1991 if (*lp-- > 0) 1992 break; 1993 UFS_LOCK(ump); 1994 fs->fs_maxcluster[cg] = i; 1995 brelse(bp); 1996 return (0); 1997 } 1998 /* 1999 * Search the cluster map to find a big enough cluster. 2000 * We take the first one that we find, even if it is larger 2001 * than we need as we prefer to get one close to the previous 2002 * block allocation. We do not search before the current 2003 * preference point as we do not want to allocate a block 2004 * that is allocated before the previous one (as we will 2005 * then have to wait for another pass of the elevator 2006 * algorithm before it will be read). We prefer to fail and 2007 * be recalled to try an allocation in the next cylinder group. 2008 */ 2009 if (dtog(fs, bpref) != cg) 2010 bpref = cgdata(fs, cg); 2011 else 2012 bpref = blknum(fs, bpref); 2013 bpref = fragstoblks(fs, dtogd(fs, bpref)); 2014 mapp = &cg_clustersfree(cgp)[bpref / NBBY]; 2015 map = *mapp++; 2016 bit = 1 << (bpref % NBBY); 2017 for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { 2018 if ((map & bit) == 0) { 2019 run = 0; 2020 } else { 2021 run++; 2022 if (run == len) 2023 break; 2024 } 2025 if ((got & (NBBY - 1)) != (NBBY - 1)) { 2026 bit <<= 1; 2027 } else { 2028 map = *mapp++; 2029 bit = 1; 2030 } 2031 } 2032 if (got >= cgp->cg_nclusterblks) { 2033 UFS_LOCK(ump); 2034 brelse(bp); 2035 return (0); 2036 } 2037 /* 2038 * Allocate the cluster that we have found. 2039 */ 2040 blksfree = cg_blksfree(cgp); 2041 for (i = 1; i <= len; i++) 2042 if (!ffs_isblock(fs, blksfree, got - run + i)) 2043 panic("ffs_clusteralloc: map mismatch"); 2044 bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1); 2045 if (dtog(fs, bno) != cg) 2046 panic("ffs_clusteralloc: allocated out of group"); 2047 len = blkstofrags(fs, len); 2048 UFS_LOCK(ump); 2049 for (i = 0; i < len; i += fs->fs_frag) 2050 if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) 2051 panic("ffs_clusteralloc: lost block"); 2052 ACTIVECLEAR(fs, cg); 2053 UFS_UNLOCK(ump); 2054 bdwrite(bp); 2055 return (bno); 2056 } 2057 2058 static inline struct buf * 2059 getinobuf(struct inode *ip, 2060 u_int cg, 2061 u_int32_t cginoblk, 2062 int gbflags) 2063 { 2064 struct fs *fs; 2065 2066 fs = ITOFS(ip); 2067 return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, 2068 cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0, 2069 gbflags)); 2070 } 2071 2072 /* 2073 * Synchronous inode initialization is needed only when barrier writes do not 2074 * work as advertised, and will impose a heavy cost on file creation in a newly 2075 * created filesystem. 2076 */ 2077 static int doasyncinodeinit = 1; 2078 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN, 2079 &doasyncinodeinit, 0, 2080 "Perform inode block initialization using asynchronous writes"); 2081 2082 /* 2083 * Determine whether an inode can be allocated. 2084 * 2085 * Check to see if an inode is available, and if it is, 2086 * allocate it using the following policy: 2087 * 1) allocate the requested inode. 2088 * 2) allocate the next available inode after the requested 2089 * inode in the specified cylinder group. 2090 */ 2091 static ufs2_daddr_t 2092 ffs_nodealloccg(struct inode *ip, 2093 u_int cg, 2094 ufs2_daddr_t ipref, 2095 int mode, 2096 int unused) 2097 { 2098 struct fs *fs; 2099 struct cg *cgp; 2100 struct buf *bp, *ibp; 2101 struct ufsmount *ump; 2102 u_int8_t *inosused, *loc; 2103 struct ufs2_dinode *dp2; 2104 int error, start, len, i; 2105 u_int32_t old_initediblk; 2106 2107 ump = ITOUMP(ip); 2108 fs = ump->um_fs; 2109 check_nifree: 2110 if (fs->fs_cs(fs, cg).cs_nifree == 0) 2111 return (0); 2112 UFS_UNLOCK(ump); 2113 if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { 2114 UFS_LOCK(ump); 2115 return (0); 2116 } 2117 restart: 2118 if (cgp->cg_cs.cs_nifree == 0) { 2119 brelse(bp); 2120 UFS_LOCK(ump); 2121 return (0); 2122 } 2123 inosused = cg_inosused(cgp); 2124 if (ipref) { 2125 ipref %= fs->fs_ipg; 2126 if (isclr(inosused, ipref)) 2127 goto gotit; 2128 } 2129 start = cgp->cg_irotor / NBBY; 2130 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); 2131 loc = memcchr(&inosused[start], 0xff, len); 2132 if (loc == NULL) { 2133 len = start + 1; 2134 start = 0; 2135 loc = memcchr(&inosused[start], 0xff, len); 2136 if (loc == NULL) { 2137 printf("cg = %d, irotor = %ld, fs = %s\n", 2138 cg, (long)cgp->cg_irotor, fs->fs_fsmnt); 2139 panic("ffs_nodealloccg: map corrupted"); 2140 /* NOTREACHED */ 2141 } 2142 } 2143 ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1; 2144 gotit: 2145 /* 2146 * Check to see if we need to initialize more inodes. 2147 */ 2148 if (fs->fs_magic == FS_UFS2_MAGIC && 2149 ipref + INOPB(fs) > cgp->cg_initediblk && 2150 cgp->cg_initediblk < cgp->cg_niblk) { 2151 old_initediblk = cgp->cg_initediblk; 2152 2153 /* 2154 * Free the cylinder group lock before writing the 2155 * initialized inode block. Entering the 2156 * babarrierwrite() with the cylinder group lock 2157 * causes lock order violation between the lock and 2158 * snaplk. 2159 * 2160 * Another thread can decide to initialize the same 2161 * inode block, but whichever thread first gets the 2162 * cylinder group lock after writing the newly 2163 * allocated inode block will update it and the other 2164 * will realize that it has lost and leave the 2165 * cylinder group unchanged. 2166 */ 2167 ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); 2168 brelse(bp); 2169 if (ibp == NULL) { 2170 /* 2171 * The inode block buffer is already owned by 2172 * another thread, which must initialize it. 2173 * Wait on the buffer to allow another thread 2174 * to finish the updates, with dropped cg 2175 * buffer lock, then retry. 2176 */ 2177 ibp = getinobuf(ip, cg, old_initediblk, 0); 2178 brelse(ibp); 2179 UFS_LOCK(ump); 2180 goto check_nifree; 2181 } 2182 bzero(ibp->b_data, (int)fs->fs_bsize); 2183 dp2 = (struct ufs2_dinode *)(ibp->b_data); 2184 for (i = 0; i < INOPB(fs); i++) { 2185 while (dp2->di_gen == 0) 2186 dp2->di_gen = arc4random(); 2187 dp2++; 2188 } 2189 2190 /* 2191 * Rather than adding a soft updates dependency to ensure 2192 * that the new inode block is written before it is claimed 2193 * by the cylinder group map, we just do a barrier write 2194 * here. The barrier write will ensure that the inode block 2195 * gets written before the updated cylinder group map can be 2196 * written. The barrier write should only slow down bulk 2197 * loading of newly created filesystems. 2198 */ 2199 if (doasyncinodeinit) 2200 babarrierwrite(ibp); 2201 else 2202 bwrite(ibp); 2203 2204 /* 2205 * After the inode block is written, try to update the 2206 * cg initediblk pointer. If another thread beat us 2207 * to it, then leave it unchanged as the other thread 2208 * has already set it correctly. 2209 */ 2210 error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp); 2211 UFS_LOCK(ump); 2212 ACTIVECLEAR(fs, cg); 2213 UFS_UNLOCK(ump); 2214 if (error != 0) 2215 return (error); 2216 if (cgp->cg_initediblk == old_initediblk) 2217 cgp->cg_initediblk += INOPB(fs); 2218 goto restart; 2219 } 2220 cgp->cg_irotor = ipref; 2221 UFS_LOCK(ump); 2222 ACTIVECLEAR(fs, cg); 2223 setbit(inosused, ipref); 2224 cgp->cg_cs.cs_nifree--; 2225 fs->fs_cstotal.cs_nifree--; 2226 fs->fs_cs(fs, cg).cs_nifree--; 2227 fs->fs_fmod = 1; 2228 if ((mode & IFMT) == IFDIR) { 2229 cgp->cg_cs.cs_ndir++; 2230 fs->fs_cstotal.cs_ndir++; 2231 fs->fs_cs(fs, cg).cs_ndir++; 2232 } 2233 UFS_UNLOCK(ump); 2234 if (DOINGSOFTDEP(ITOV(ip))) 2235 softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode); 2236 bdwrite(bp); 2237 return ((ino_t)(cg * fs->fs_ipg + ipref)); 2238 } 2239 2240 /* 2241 * Free a block or fragment. 2242 * 2243 * The specified block or fragment is placed back in the 2244 * free map. If a fragment is deallocated, a possible 2245 * block reassembly is checked. 2246 */ 2247 static void 2248 ffs_blkfree_cg(struct ufsmount *ump, 2249 struct fs *fs, 2250 struct vnode *devvp, 2251 ufs2_daddr_t bno, 2252 long size, 2253 ino_t inum, 2254 struct workhead *dephd) 2255 { 2256 struct mount *mp; 2257 struct cg *cgp; 2258 struct buf *bp; 2259 daddr_t dbn; 2260 ufs1_daddr_t fragno, cgbno; 2261 int i, blk, frags, bbase, error; 2262 u_int cg; 2263 u_int8_t *blksfree; 2264 struct cdev *dev; 2265 2266 cg = dtog(fs, bno); 2267 if (devvp->v_type == VREG) { 2268 /* devvp is a snapshot */ 2269 MPASS(devvp->v_mount->mnt_data == ump); 2270 dev = ump->um_devvp->v_rdev; 2271 } else if (devvp->v_type == VCHR) { 2272 /* 2273 * devvp is a normal disk device 2274 * XXXKIB: devvp is not locked there, v_rdev access depends on 2275 * busy mount, which prevents mntfs devvp from reclamation. 2276 */ 2277 dev = devvp->v_rdev; 2278 } else 2279 return; 2280 #ifdef INVARIANTS 2281 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 || 2282 fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { 2283 printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n", 2284 devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize, 2285 size, fs->fs_fsmnt); 2286 panic("ffs_blkfree_cg: bad size"); 2287 } 2288 #endif 2289 if ((u_int)bno >= fs->fs_size) { 2290 printf("bad block %jd, ino %lu\n", (intmax_t)bno, 2291 (u_long)inum); 2292 ffs_fserr(fs, inum, "bad block"); 2293 return; 2294 } 2295 if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { 2296 if (!ffs_fsfail_cleanup(ump, error) || 2297 !MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) 2298 return; 2299 if (devvp->v_type == VREG) 2300 dbn = fragstoblks(fs, cgtod(fs, cg)); 2301 else 2302 dbn = fsbtodb(fs, cgtod(fs, cg)); 2303 error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); 2304 KASSERT(error == 0, ("getblkx failed")); 2305 softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, 2306 numfrags(fs, size), dephd); 2307 bp->b_flags |= B_RELBUF | B_NOCACHE; 2308 bp->b_flags &= ~B_CACHE; 2309 bawrite(bp); 2310 return; 2311 } 2312 cgbno = dtogd(fs, bno); 2313 blksfree = cg_blksfree(cgp); 2314 UFS_LOCK(ump); 2315 if (size == fs->fs_bsize) { 2316 fragno = fragstoblks(fs, cgbno); 2317 if (!ffs_isfreeblock(fs, blksfree, fragno)) { 2318 if (devvp->v_type == VREG) { 2319 UFS_UNLOCK(ump); 2320 /* devvp is a snapshot */ 2321 brelse(bp); 2322 return; 2323 } 2324 printf("dev = %s, block = %jd, fs = %s\n", 2325 devtoname(dev), (intmax_t)bno, fs->fs_fsmnt); 2326 panic("ffs_blkfree_cg: freeing free block"); 2327 } 2328 ffs_setblock(fs, blksfree, fragno); 2329 ffs_clusteracct(fs, cgp, fragno, 1); 2330 cgp->cg_cs.cs_nbfree++; 2331 fs->fs_cstotal.cs_nbfree++; 2332 fs->fs_cs(fs, cg).cs_nbfree++; 2333 } else { 2334 bbase = cgbno - fragnum(fs, cgbno); 2335 /* 2336 * decrement the counts associated with the old frags 2337 */ 2338 blk = blkmap(fs, blksfree, bbase); 2339 ffs_fragacct(fs, blk, cgp->cg_frsum, -1); 2340 /* 2341 * deallocate the fragment 2342 */ 2343 frags = numfrags(fs, size); 2344 for (i = 0; i < frags; i++) { 2345 if (isset(blksfree, cgbno + i)) { 2346 printf("dev = %s, block = %jd, fs = %s\n", 2347 devtoname(dev), (intmax_t)(bno + i), 2348 fs->fs_fsmnt); 2349 panic("ffs_blkfree_cg: freeing free frag"); 2350 } 2351 setbit(blksfree, cgbno + i); 2352 } 2353 cgp->cg_cs.cs_nffree += i; 2354 fs->fs_cstotal.cs_nffree += i; 2355 fs->fs_cs(fs, cg).cs_nffree += i; 2356 /* 2357 * add back in counts associated with the new frags 2358 */ 2359 blk = blkmap(fs, blksfree, bbase); 2360 ffs_fragacct(fs, blk, cgp->cg_frsum, 1); 2361 /* 2362 * if a complete block has been reassembled, account for it 2363 */ 2364 fragno = fragstoblks(fs, bbase); 2365 if (ffs_isblock(fs, blksfree, fragno)) { 2366 cgp->cg_cs.cs_nffree -= fs->fs_frag; 2367 fs->fs_cstotal.cs_nffree -= fs->fs_frag; 2368 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; 2369 ffs_clusteracct(fs, cgp, fragno, 1); 2370 cgp->cg_cs.cs_nbfree++; 2371 fs->fs_cstotal.cs_nbfree++; 2372 fs->fs_cs(fs, cg).cs_nbfree++; 2373 } 2374 } 2375 fs->fs_fmod = 1; 2376 ACTIVECLEAR(fs, cg); 2377 UFS_UNLOCK(ump); 2378 mp = UFSTOVFS(ump); 2379 if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR) 2380 softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, 2381 numfrags(fs, size), dephd); 2382 bdwrite(bp); 2383 } 2384 2385 /* 2386 * Structures and routines associated with trim management. 2387 * 2388 * The following requests are passed to trim_lookup to indicate 2389 * the actions that should be taken. 2390 */ 2391 #define NEW 1 /* if found, error else allocate and hash it */ 2392 #define OLD 2 /* if not found, error, else return it */ 2393 #define REPLACE 3 /* if not found, error else unhash and reallocate it */ 2394 #define DONE 4 /* if not found, error else unhash and return it */ 2395 #define SINGLE 5 /* don't look up, just allocate it and don't hash it */ 2396 2397 MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures"); 2398 2399 #define TRIMLIST_HASH(ump, key) \ 2400 (&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize]) 2401 2402 /* 2403 * These structures describe each of the block free requests aggregated 2404 * together to make up a trim request. 2405 */ 2406 struct trim_blkreq { 2407 TAILQ_ENTRY(trim_blkreq) blkreqlist; 2408 ufs2_daddr_t bno; 2409 long size; 2410 struct workhead *pdephd; 2411 struct workhead dephd; 2412 }; 2413 2414 /* 2415 * Description of a trim request. 2416 */ 2417 struct ffs_blkfree_trim_params { 2418 TAILQ_HEAD(, trim_blkreq) blklist; 2419 LIST_ENTRY(ffs_blkfree_trim_params) hashlist; 2420 struct task task; 2421 struct ufsmount *ump; 2422 struct vnode *devvp; 2423 ino_t inum; 2424 ufs2_daddr_t bno; 2425 long size; 2426 long key; 2427 }; 2428 2429 static void ffs_blkfree_trim_completed(struct buf *); 2430 static void ffs_blkfree_trim_task(void *ctx, int pending __unused); 2431 static struct ffs_blkfree_trim_params *trim_lookup(struct ufsmount *, 2432 struct vnode *, ufs2_daddr_t, long, ino_t, u_long, int); 2433 static void ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *); 2434 2435 /* 2436 * Called on trim completion to start a task to free the associated block(s). 2437 */ 2438 static void 2439 ffs_blkfree_trim_completed(struct buf *bp) 2440 { 2441 struct ffs_blkfree_trim_params *tp; 2442 2443 tp = bp->b_fsprivate1; 2444 free(bp, M_TRIM); 2445 TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); 2446 taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); 2447 } 2448 2449 /* 2450 * Trim completion task that free associated block(s). 2451 */ 2452 static void 2453 ffs_blkfree_trim_task(void *ctx, int pending) 2454 { 2455 struct ffs_blkfree_trim_params *tp; 2456 struct trim_blkreq *blkelm; 2457 struct ufsmount *ump; 2458 2459 tp = ctx; 2460 ump = tp->ump; 2461 while ((blkelm = TAILQ_FIRST(&tp->blklist)) != NULL) { 2462 ffs_blkfree_cg(ump, ump->um_fs, tp->devvp, blkelm->bno, 2463 blkelm->size, tp->inum, blkelm->pdephd); 2464 TAILQ_REMOVE(&tp->blklist, blkelm, blkreqlist); 2465 free(blkelm, M_TRIM); 2466 } 2467 vn_finished_secondary_write(UFSTOVFS(ump)); 2468 UFS_LOCK(ump); 2469 ump->um_trim_inflight -= 1; 2470 ump->um_trim_inflight_blks -= numfrags(ump->um_fs, tp->size); 2471 UFS_UNLOCK(ump); 2472 free(tp, M_TRIM); 2473 } 2474 2475 /* 2476 * Lookup a trim request by inode number. 2477 * Allocate if requested (NEW, REPLACE, SINGLE). 2478 */ 2479 static struct ffs_blkfree_trim_params * 2480 trim_lookup(struct ufsmount *ump, 2481 struct vnode *devvp, 2482 ufs2_daddr_t bno, 2483 long size, 2484 ino_t inum, 2485 u_long key, 2486 int alloctype) 2487 { 2488 struct trimlist_hashhead *tphashhead; 2489 struct ffs_blkfree_trim_params *tp, *ntp; 2490 2491 ntp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK); 2492 if (alloctype != SINGLE) { 2493 KASSERT(key >= FIRST_VALID_KEY, ("trim_lookup: invalid key")); 2494 UFS_LOCK(ump); 2495 tphashhead = TRIMLIST_HASH(ump, key); 2496 LIST_FOREACH(tp, tphashhead, hashlist) 2497 if (key == tp->key) 2498 break; 2499 } 2500 switch (alloctype) { 2501 case NEW: 2502 KASSERT(tp == NULL, ("trim_lookup: found trim")); 2503 break; 2504 case OLD: 2505 KASSERT(tp != NULL, 2506 ("trim_lookup: missing call to ffs_blkrelease_start()")); 2507 UFS_UNLOCK(ump); 2508 free(ntp, M_TRIM); 2509 return (tp); 2510 case REPLACE: 2511 KASSERT(tp != NULL, ("trim_lookup: missing REPLACE trim")); 2512 LIST_REMOVE(tp, hashlist); 2513 /* tp will be freed by caller */ 2514 break; 2515 case DONE: 2516 KASSERT(tp != NULL, ("trim_lookup: missing DONE trim")); 2517 LIST_REMOVE(tp, hashlist); 2518 UFS_UNLOCK(ump); 2519 free(ntp, M_TRIM); 2520 return (tp); 2521 } 2522 TAILQ_INIT(&ntp->blklist); 2523 ntp->ump = ump; 2524 ntp->devvp = devvp; 2525 ntp->bno = bno; 2526 ntp->size = size; 2527 ntp->inum = inum; 2528 ntp->key = key; 2529 if (alloctype != SINGLE) { 2530 LIST_INSERT_HEAD(tphashhead, ntp, hashlist); 2531 UFS_UNLOCK(ump); 2532 } 2533 return (ntp); 2534 } 2535 2536 /* 2537 * Dispatch a trim request. 2538 */ 2539 static void 2540 ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *tp) 2541 { 2542 struct ufsmount *ump; 2543 struct mount *mp; 2544 struct buf *bp; 2545 2546 /* 2547 * Postpone the set of the free bit in the cg bitmap until the 2548 * BIO_DELETE is completed. Otherwise, due to disk queue 2549 * reordering, TRIM might be issued after we reuse the block 2550 * and write some new data into it. 2551 */ 2552 ump = tp->ump; 2553 bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO); 2554 bp->b_iocmd = BIO_DELETE; 2555 bp->b_iooffset = dbtob(fsbtodb(ump->um_fs, tp->bno)); 2556 bp->b_iodone = ffs_blkfree_trim_completed; 2557 bp->b_bcount = tp->size; 2558 bp->b_fsprivate1 = tp; 2559 UFS_LOCK(ump); 2560 ump->um_trim_total += 1; 2561 ump->um_trim_inflight += 1; 2562 ump->um_trim_inflight_blks += numfrags(ump->um_fs, tp->size); 2563 ump->um_trim_total_blks += numfrags(ump->um_fs, tp->size); 2564 UFS_UNLOCK(ump); 2565 2566 mp = UFSTOVFS(ump); 2567 vn_start_secondary_write(NULL, &mp, 0); 2568 g_vfs_strategy(ump->um_bo, bp); 2569 } 2570 2571 /* 2572 * Allocate a new key to use to identify a range of blocks. 2573 */ 2574 u_long 2575 ffs_blkrelease_start(struct ufsmount *ump, 2576 struct vnode *devvp, 2577 ino_t inum) 2578 { 2579 static u_long masterkey; 2580 u_long key; 2581 2582 if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) 2583 return (SINGLETON_KEY); 2584 do { 2585 key = atomic_fetchadd_long(&masterkey, 1); 2586 } while (key < FIRST_VALID_KEY); 2587 (void) trim_lookup(ump, devvp, 0, 0, inum, key, NEW); 2588 return (key); 2589 } 2590 2591 /* 2592 * Deallocate a key that has been used to identify a range of blocks. 2593 */ 2594 void 2595 ffs_blkrelease_finish(struct ufsmount *ump, u_long key) 2596 { 2597 struct ffs_blkfree_trim_params *tp; 2598 2599 if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) 2600 return; 2601 /* 2602 * If the vfs.ffs.dotrimcons sysctl option is enabled while 2603 * a file deletion is active, specifically after a call 2604 * to ffs_blkrelease_start() but before the call to 2605 * ffs_blkrelease_finish(), ffs_blkrelease_start() will 2606 * have handed out SINGLETON_KEY rather than starting a 2607 * collection sequence. Thus if we get a SINGLETON_KEY 2608 * passed to ffs_blkrelease_finish(), we just return rather 2609 * than trying to finish the nonexistent sequence. 2610 */ 2611 if (key == SINGLETON_KEY) { 2612 #ifdef INVARIANTS 2613 printf("%s: vfs.ffs.dotrimcons enabled on active filesystem\n", 2614 ump->um_mountp->mnt_stat.f_mntonname); 2615 #endif 2616 return; 2617 } 2618 /* 2619 * We are done with sending blocks using this key. Look up the key 2620 * using the DONE alloctype (in tp) to request that it be unhashed 2621 * as we will not be adding to it. If the key has never been used, 2622 * tp->size will be zero, so we can just free tp. Otherwise the call 2623 * to ffs_blkfree_sendtrim(tp) causes the block range described by 2624 * tp to be issued (and then tp to be freed). 2625 */ 2626 tp = trim_lookup(ump, NULL, 0, 0, 0, key, DONE); 2627 if (tp->size == 0) 2628 free(tp, M_TRIM); 2629 else 2630 ffs_blkfree_sendtrim(tp); 2631 } 2632 2633 /* 2634 * Setup to free a block or fragment. 2635 * 2636 * Check for snapshots that might want to claim the block. 2637 * If trims are requested, prepare a trim request. Attempt to 2638 * aggregate consecutive blocks into a single trim request. 2639 */ 2640 void 2641 ffs_blkfree(struct ufsmount *ump, 2642 struct fs *fs, 2643 struct vnode *devvp, 2644 ufs2_daddr_t bno, 2645 long size, 2646 ino_t inum, 2647 enum vtype vtype, 2648 struct workhead *dephd, 2649 u_long key) 2650 { 2651 struct ffs_blkfree_trim_params *tp, *ntp; 2652 struct trim_blkreq *blkelm; 2653 2654 /* 2655 * Check to see if a snapshot wants to claim the block. 2656 * Check that devvp is a normal disk device, not a snapshot, 2657 * it has a snapshot(s) associated with it, and one of the 2658 * snapshots wants to claim the block. 2659 */ 2660 if (devvp->v_type == VCHR && 2661 (devvp->v_vflag & VV_COPYONWRITE) && 2662 ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { 2663 return; 2664 } 2665 /* 2666 * Nothing to delay if TRIM is not required for this block or TRIM 2667 * is disabled or the operation is performed on a snapshot. 2668 */ 2669 if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) || 2670 devvp->v_type == VREG) { 2671 ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); 2672 return; 2673 } 2674 blkelm = malloc(sizeof(struct trim_blkreq), M_TRIM, M_WAITOK); 2675 blkelm->bno = bno; 2676 blkelm->size = size; 2677 if (dephd == NULL) { 2678 blkelm->pdephd = NULL; 2679 } else { 2680 LIST_INIT(&blkelm->dephd); 2681 LIST_SWAP(dephd, &blkelm->dephd, worklist, wk_list); 2682 blkelm->pdephd = &blkelm->dephd; 2683 } 2684 if (key == SINGLETON_KEY) { 2685 /* 2686 * Just a single non-contiguous piece. Use the SINGLE 2687 * alloctype to return a trim request that will not be 2688 * hashed for future lookup. 2689 */ 2690 tp = trim_lookup(ump, devvp, bno, size, inum, key, SINGLE); 2691 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2692 ffs_blkfree_sendtrim(tp); 2693 return; 2694 } 2695 /* 2696 * The callers of this function are not tracking whether or not 2697 * the blocks are contiguous. They are just saying that they 2698 * are freeing a set of blocks. It is this code that determines 2699 * the pieces of that range that are actually contiguous. 2700 * 2701 * Calling ffs_blkrelease_start() will have created an entry 2702 * that we will use. 2703 */ 2704 tp = trim_lookup(ump, devvp, bno, size, inum, key, OLD); 2705 if (tp->size == 0) { 2706 /* 2707 * First block of a potential range, set block and size 2708 * for the trim block. 2709 */ 2710 tp->bno = bno; 2711 tp->size = size; 2712 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2713 return; 2714 } 2715 /* 2716 * If this block is a continuation of the range (either 2717 * follows at the end or preceeds in the front) then we 2718 * add it to the front or back of the list and return. 2719 * 2720 * If it is not a continuation of the trim that we were 2721 * building, using the REPLACE alloctype, we request that 2722 * the old trim request (still in tp) be unhashed and a 2723 * new range started (in ntp). The ffs_blkfree_sendtrim(tp) 2724 * call causes the block range described by tp to be issued 2725 * (and then tp to be freed). 2726 */ 2727 if (bno + numfrags(fs, size) == tp->bno) { 2728 TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); 2729 tp->bno = bno; 2730 tp->size += size; 2731 return; 2732 } else if (bno == tp->bno + numfrags(fs, tp->size)) { 2733 TAILQ_INSERT_TAIL(&tp->blklist, blkelm, blkreqlist); 2734 tp->size += size; 2735 return; 2736 } 2737 ntp = trim_lookup(ump, devvp, bno, size, inum, key, REPLACE); 2738 TAILQ_INSERT_HEAD(&ntp->blklist, blkelm, blkreqlist); 2739 ffs_blkfree_sendtrim(tp); 2740 } 2741 2742 #ifdef INVARIANTS 2743 /* 2744 * Verify allocation of a block or fragment. Returns true if block or 2745 * fragment is allocated, false if it is free. 2746 */ 2747 static int 2748 ffs_checkblk(struct inode *ip, 2749 ufs2_daddr_t bno, 2750 long size) 2751 { 2752 struct fs *fs; 2753 struct cg *cgp; 2754 struct buf *bp; 2755 ufs1_daddr_t cgbno; 2756 int i, error, frags, free; 2757 u_int8_t *blksfree; 2758 2759 fs = ITOFS(ip); 2760 if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { 2761 printf("bsize = %ld, size = %ld, fs = %s\n", 2762 (long)fs->fs_bsize, size, fs->fs_fsmnt); 2763 panic("ffs_checkblk: bad size"); 2764 } 2765 if ((u_int)bno >= fs->fs_size) 2766 panic("ffs_checkblk: bad block %jd", (intmax_t)bno); 2767 error = ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), 0, &bp, &cgp); 2768 if (error) 2769 panic("ffs_checkblk: cylinder group read failed"); 2770 blksfree = cg_blksfree(cgp); 2771 cgbno = dtogd(fs, bno); 2772 if (size == fs->fs_bsize) { 2773 free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); 2774 } else { 2775 frags = numfrags(fs, size); 2776 for (free = 0, i = 0; i < frags; i++) 2777 if (isset(blksfree, cgbno + i)) 2778 free++; 2779 if (free != 0 && free != frags) 2780 panic("ffs_checkblk: partially free fragment"); 2781 } 2782 brelse(bp); 2783 return (!free); 2784 } 2785 #endif /* INVARIANTS */ 2786 2787 /* 2788 * Free an inode. 2789 */ 2790 int 2791 ffs_vfree(struct vnode *pvp, 2792 ino_t ino, 2793 int mode) 2794 { 2795 struct ufsmount *ump; 2796 2797 if (DOINGSOFTDEP(pvp)) { 2798 softdep_freefile(pvp, ino, mode); 2799 return (0); 2800 } 2801 ump = VFSTOUFS(pvp->v_mount); 2802 return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL)); 2803 } 2804 2805 /* 2806 * Do the actual free operation. 2807 * The specified inode is placed back in the free map. 2808 */ 2809 int 2810 ffs_freefile(struct ufsmount *ump, 2811 struct fs *fs, 2812 struct vnode *devvp, 2813 ino_t ino, 2814 int mode, 2815 struct workhead *wkhd) 2816 { 2817 struct cg *cgp; 2818 struct buf *bp; 2819 daddr_t dbn; 2820 int error; 2821 u_int cg; 2822 u_int8_t *inosused; 2823 struct cdev *dev; 2824 ino_t cgino; 2825 2826 cg = ino_to_cg(fs, ino); 2827 if (devvp->v_type == VREG) { 2828 /* devvp is a snapshot */ 2829 MPASS(devvp->v_mount->mnt_data == ump); 2830 dev = ump->um_devvp->v_rdev; 2831 } else if (devvp->v_type == VCHR) { 2832 /* devvp is a normal disk device */ 2833 dev = devvp->v_rdev; 2834 } else { 2835 bp = NULL; 2836 return (0); 2837 } 2838 if (ino >= fs->fs_ipg * fs->fs_ncg) 2839 panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s", 2840 devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); 2841 if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { 2842 if (!ffs_fsfail_cleanup(ump, error) || 2843 !MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) 2844 return (error); 2845 if (devvp->v_type == VREG) 2846 dbn = fragstoblks(fs, cgtod(fs, cg)); 2847 else 2848 dbn = fsbtodb(fs, cgtod(fs, cg)); 2849 error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); 2850 KASSERT(error == 0, ("getblkx failed")); 2851 softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd); 2852 bp->b_flags |= B_RELBUF | B_NOCACHE; 2853 bp->b_flags &= ~B_CACHE; 2854 bawrite(bp); 2855 return (error); 2856 } 2857 inosused = cg_inosused(cgp); 2858 cgino = ino % fs->fs_ipg; 2859 if (isclr(inosused, cgino)) { 2860 printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev), 2861 (uintmax_t)ino, fs->fs_fsmnt); 2862 if (fs->fs_ronly == 0) 2863 panic("ffs_freefile: freeing free inode"); 2864 } 2865 clrbit(inosused, cgino); 2866 if (cgino < cgp->cg_irotor) 2867 cgp->cg_irotor = cgino; 2868 cgp->cg_cs.cs_nifree++; 2869 UFS_LOCK(ump); 2870 fs->fs_cstotal.cs_nifree++; 2871 fs->fs_cs(fs, cg).cs_nifree++; 2872 if ((mode & IFMT) == IFDIR) { 2873 cgp->cg_cs.cs_ndir--; 2874 fs->fs_cstotal.cs_ndir--; 2875 fs->fs_cs(fs, cg).cs_ndir--; 2876 } 2877 fs->fs_fmod = 1; 2878 ACTIVECLEAR(fs, cg); 2879 UFS_UNLOCK(ump); 2880 if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR) 2881 softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd); 2882 bdwrite(bp); 2883 return (0); 2884 } 2885 2886 /* 2887 * Check to see if a file is free. 2888 * Used to check for allocated files in snapshots. 2889 */ 2890 int 2891 ffs_checkfreefile(struct fs *fs, 2892 struct vnode *devvp, 2893 ino_t ino) 2894 { 2895 struct cg *cgp; 2896 struct buf *bp; 2897 int ret, error; 2898 u_int cg; 2899 u_int8_t *inosused; 2900 2901 cg = ino_to_cg(fs, ino); 2902 if ((devvp->v_type != VREG) && (devvp->v_type != VCHR)) 2903 return (1); 2904 if (ino >= fs->fs_ipg * fs->fs_ncg) 2905 return (1); 2906 if ((error = ffs_getcg(fs, devvp, cg, 0, &bp, &cgp)) != 0) 2907 return (1); 2908 inosused = cg_inosused(cgp); 2909 ino %= fs->fs_ipg; 2910 ret = isclr(inosused, ino); 2911 brelse(bp); 2912 return (ret); 2913 } 2914 2915 /* 2916 * Find a block of the specified size in the specified cylinder group. 2917 * 2918 * It is a panic if a request is made to find a block if none are 2919 * available. 2920 */ 2921 static ufs1_daddr_t 2922 ffs_mapsearch(struct fs *fs, 2923 struct cg *cgp, 2924 ufs2_daddr_t bpref, 2925 int allocsiz) 2926 { 2927 ufs1_daddr_t bno; 2928 int start, len, loc, i; 2929 int blk, field, subfield, pos; 2930 u_int8_t *blksfree; 2931 2932 /* 2933 * find the fragment by searching through the free block 2934 * map for an appropriate bit pattern 2935 */ 2936 if (bpref) 2937 start = dtogd(fs, bpref) / NBBY; 2938 else 2939 start = cgp->cg_frotor / NBBY; 2940 blksfree = cg_blksfree(cgp); 2941 len = howmany(fs->fs_fpg, NBBY) - start; 2942 loc = scanc((u_int)len, (u_char *)&blksfree[start], 2943 fragtbl[fs->fs_frag], 2944 (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 2945 if (loc == 0) { 2946 len = start + 1; 2947 start = 0; 2948 loc = scanc((u_int)len, (u_char *)&blksfree[0], 2949 fragtbl[fs->fs_frag], 2950 (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 2951 if (loc == 0) { 2952 printf("start = %d, len = %d, fs = %s\n", 2953 start, len, fs->fs_fsmnt); 2954 panic("ffs_alloccg: map corrupted"); 2955 /* NOTREACHED */ 2956 } 2957 } 2958 bno = (start + len - loc) * NBBY; 2959 cgp->cg_frotor = bno; 2960 /* 2961 * found the byte in the map 2962 * sift through the bits to find the selected frag 2963 */ 2964 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { 2965 blk = blkmap(fs, blksfree, bno); 2966 blk <<= 1; 2967 field = around[allocsiz]; 2968 subfield = inside[allocsiz]; 2969 for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { 2970 if ((blk & field) == subfield) 2971 return (bno + pos); 2972 field <<= 1; 2973 subfield <<= 1; 2974 } 2975 } 2976 printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt); 2977 panic("ffs_alloccg: block not in map"); 2978 return (-1); 2979 } 2980 2981 static const struct statfs * 2982 ffs_getmntstat(struct vnode *devvp) 2983 { 2984 2985 if (devvp->v_type == VCHR) 2986 return (&devvp->v_rdev->si_mountpt->mnt_stat); 2987 return (ffs_getmntstat(VFSTOUFS(devvp->v_mount)->um_devvp)); 2988 } 2989 2990 /* 2991 * Fetch and verify a cylinder group. 2992 */ 2993 int 2994 ffs_getcg(struct fs *fs, 2995 struct vnode *devvp, 2996 u_int cg, 2997 int flags, 2998 struct buf **bpp, 2999 struct cg **cgpp) 3000 { 3001 struct buf *bp; 3002 struct cg *cgp; 3003 const struct statfs *sfs; 3004 daddr_t blkno; 3005 int error; 3006 3007 *bpp = NULL; 3008 *cgpp = NULL; 3009 if ((fs->fs_metackhash & CK_CYLGRP) != 0) 3010 flags |= GB_CKHASH; 3011 if (devvp->v_type == VREG) 3012 blkno = fragstoblks(fs, cgtod(fs, cg)); 3013 else 3014 blkno = fsbtodb(fs, cgtod(fs, cg)); 3015 error = breadn_flags(devvp, blkno, blkno, (int)fs->fs_cgsize, NULL, 3016 NULL, 0, NOCRED, flags, ffs_ckhash_cg, &bp); 3017 if (error != 0) 3018 return (error); 3019 cgp = (struct cg *)bp->b_data; 3020 if ((fs->fs_metackhash & CK_CYLGRP) != 0 && 3021 (bp->b_flags & B_CKHASH) != 0 && 3022 cgp->cg_ckhash != bp->b_ckhash) { 3023 sfs = ffs_getmntstat(devvp); 3024 printf("UFS %s%s (%s) cylinder checksum failed: cg %u, cgp: " 3025 "0x%x != bp: 0x%jx\n", 3026 devvp->v_type == VCHR ? "" : "snapshot of ", 3027 sfs->f_mntfromname, sfs->f_mntonname, 3028 cg, cgp->cg_ckhash, (uintmax_t)bp->b_ckhash); 3029 bp->b_flags &= ~B_CKHASH; 3030 bp->b_flags |= B_INVAL | B_NOCACHE; 3031 brelse(bp); 3032 return (EIO); 3033 } 3034 if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) { 3035 sfs = ffs_getmntstat(devvp); 3036 printf("UFS %s%s (%s)", 3037 devvp->v_type == VCHR ? "" : "snapshot of ", 3038 sfs->f_mntfromname, sfs->f_mntonname); 3039 if (!cg_chkmagic(cgp)) 3040 printf(" cg %u: bad magic number 0x%x should be 0x%x\n", 3041 cg, cgp->cg_magic, CG_MAGIC); 3042 else 3043 printf(": wrong cylinder group cg %u != cgx %u\n", cg, 3044 cgp->cg_cgx); 3045 bp->b_flags &= ~B_CKHASH; 3046 bp->b_flags |= B_INVAL | B_NOCACHE; 3047 brelse(bp); 3048 return (EIO); 3049 } 3050 bp->b_flags &= ~B_CKHASH; 3051 bp->b_xflags |= BX_BKGRDWRITE; 3052 /* 3053 * If we are using check hashes on the cylinder group then we want 3054 * to limit changing the cylinder group time to when we are actually 3055 * going to write it to disk so that its check hash remains correct 3056 * in memory. If the CK_CYLGRP flag is set the time is updated in 3057 * ffs_bufwrite() as the buffer is queued for writing. Otherwise we 3058 * update the time here as we have done historically. 3059 */ 3060 if ((fs->fs_metackhash & CK_CYLGRP) != 0) 3061 bp->b_xflags |= BX_CYLGRP; 3062 else 3063 cgp->cg_old_time = cgp->cg_time = time_second; 3064 *bpp = bp; 3065 *cgpp = cgp; 3066 return (0); 3067 } 3068 3069 static void 3070 ffs_ckhash_cg(struct buf *bp) 3071 { 3072 uint32_t ckhash; 3073 struct cg *cgp; 3074 3075 cgp = (struct cg *)bp->b_data; 3076 ckhash = cgp->cg_ckhash; 3077 cgp->cg_ckhash = 0; 3078 bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); 3079 cgp->cg_ckhash = ckhash; 3080 } 3081 3082 /* 3083 * Fserr prints the name of a filesystem with an error diagnostic. 3084 * 3085 * The form of the error message is: 3086 * fs: error message 3087 */ 3088 void 3089 ffs_fserr(struct fs *fs, 3090 ino_t inum, 3091 char *cp) 3092 { 3093 struct thread *td = curthread; /* XXX */ 3094 struct proc *p = td->td_proc; 3095 3096 log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n", 3097 p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum, 3098 fs->fs_fsmnt, cp); 3099 } 3100 3101 /* 3102 * This function provides the capability for the fsck program to 3103 * update an active filesystem. Fourteen operations are provided: 3104 * 3105 * adjrefcnt(inode, amt) - adjusts the reference count on the 3106 * specified inode by the specified amount. Under normal 3107 * operation the count should always go down. Decrementing 3108 * the count to zero will cause the inode to be freed. 3109 * adjblkcnt(inode, amt) - adjust the number of blocks used by the 3110 * inode by the specified amount. 3111 * adjdepth(inode, amt) - adjust the depth of the specified directory 3112 * inode by the specified amount. 3113 * setsize(inode, size) - set the size of the inode to the 3114 * specified size. 3115 * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - 3116 * adjust the superblock summary. 3117 * freedirs(inode, count) - directory inodes [inode..inode + count - 1] 3118 * are marked as free. Inodes should never have to be marked 3119 * as in use. 3120 * freefiles(inode, count) - file inodes [inode..inode + count - 1] 3121 * are marked as free. Inodes should never have to be marked 3122 * as in use. 3123 * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] 3124 * are marked as free. Blocks should never have to be marked 3125 * as in use. 3126 * setflags(flags, set/clear) - the fs_flags field has the specified 3127 * flags set (second parameter +1) or cleared (second parameter -1). 3128 * setcwd(dirinode) - set the current directory to dirinode in the 3129 * filesystem associated with the snapshot. 3130 * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".." 3131 * in the current directory is oldvalue then change it to newvalue. 3132 * unlink(nameptr, oldvalue) - Verify that the inode number associated 3133 * with nameptr in the current directory is oldvalue then unlink it. 3134 */ 3135 3136 static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); 3137 3138 SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, 3139 CTLFLAG_WR | CTLTYPE_STRUCT | CTLFLAG_NEEDGIANT, 3140 0, 0, sysctl_ffs_fsck, "S,fsck", 3141 "Adjust Inode Reference Count"); 3142 3143 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, 3144 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3145 "Adjust Inode Used Blocks Count"); 3146 3147 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_DEPTH, adjdepth, 3148 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3149 "Adjust Directory Inode Depth"); 3150 3151 static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize, 3152 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3153 "Set the inode size"); 3154 3155 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, 3156 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3157 "Adjust number of directories"); 3158 3159 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, 3160 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3161 "Adjust number of free blocks"); 3162 3163 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, 3164 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3165 "Adjust number of free inodes"); 3166 3167 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, 3168 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3169 "Adjust number of free frags"); 3170 3171 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, 3172 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3173 "Adjust number of free clusters"); 3174 3175 static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, 3176 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3177 "Free Range of Directory Inodes"); 3178 3179 static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, 3180 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3181 "Free Range of File Inodes"); 3182 3183 static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, 3184 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3185 "Free Range of Blocks"); 3186 3187 static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, 3188 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3189 "Change Filesystem Flags"); 3190 3191 static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, 3192 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3193 "Set Current Working Directory"); 3194 3195 static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, 3196 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3197 "Change Value of .. Entry"); 3198 3199 static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, 3200 CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, 3201 "Unlink a Duplicate Name"); 3202 3203 #ifdef DIAGNOSTIC 3204 static int fsckcmds = 0; 3205 SYSCTL_INT(_debug, OID_AUTO, ffs_fsckcmds, CTLFLAG_RW, &fsckcmds, 0, 3206 "print out fsck_ffs-based filesystem update commands"); 3207 #endif /* DIAGNOSTIC */ 3208 3209 static int 3210 sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) 3211 { 3212 struct thread *td = curthread; 3213 struct fsck_cmd cmd; 3214 struct ufsmount *ump; 3215 struct vnode *vp, *dvp, *fdvp; 3216 struct inode *ip, *dp; 3217 struct mount *mp; 3218 struct fs *fs; 3219 struct pwd *pwd; 3220 ufs2_daddr_t blkno; 3221 long blkcnt, blksize; 3222 u_long key; 3223 struct file *fp; 3224 cap_rights_t rights; 3225 int filetype, error; 3226 3227 if (req->newptr == NULL || req->newlen > sizeof(cmd)) 3228 return (EBADRPC); 3229 if ((error = SYSCTL_IN(req, &cmd, sizeof(cmd))) != 0) 3230 return (error); 3231 if (cmd.version != FFS_CMD_VERSION) 3232 return (ERPCMISMATCH); 3233 if ((error = getvnode(td, cmd.handle, 3234 cap_rights_init_one(&rights, CAP_FSCK), &fp)) != 0) 3235 return (error); 3236 vp = fp->f_vnode; 3237 if (vp->v_type != VREG && vp->v_type != VDIR) { 3238 fdrop(fp, td); 3239 return (EINVAL); 3240 } 3241 vn_start_write(vp, &mp, V_WAIT); 3242 if (mp == NULL || 3243 strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { 3244 vn_finished_write(mp); 3245 fdrop(fp, td); 3246 return (EINVAL); 3247 } 3248 ump = VFSTOUFS(mp); 3249 if (mp->mnt_flag & MNT_RDONLY) { 3250 vn_finished_write(mp); 3251 fdrop(fp, td); 3252 return (EROFS); 3253 } 3254 fs = ump->um_fs; 3255 filetype = IFREG; 3256 3257 switch (oidp->oid_number) { 3258 case FFS_SET_FLAGS: 3259 #ifdef DIAGNOSTIC 3260 if (fsckcmds) 3261 printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, 3262 cmd.size > 0 ? "set" : "clear"); 3263 #endif /* DIAGNOSTIC */ 3264 if (cmd.size > 0) 3265 fs->fs_flags |= (long)cmd.value; 3266 else 3267 fs->fs_flags &= ~(long)cmd.value; 3268 break; 3269 3270 case FFS_ADJ_REFCNT: 3271 #ifdef DIAGNOSTIC 3272 if (fsckcmds) { 3273 printf("%s: adjust inode %jd link count by %jd\n", 3274 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3275 (intmax_t)cmd.size); 3276 } 3277 #endif /* DIAGNOSTIC */ 3278 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3279 break; 3280 ip = VTOI(vp); 3281 ip->i_nlink += cmd.size; 3282 DIP_SET(ip, i_nlink, ip->i_nlink); 3283 ip->i_effnlink += cmd.size; 3284 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); 3285 error = ffs_update(vp, 1); 3286 if (DOINGSOFTDEP(vp)) 3287 softdep_change_linkcnt(ip); 3288 vput(vp); 3289 break; 3290 3291 case FFS_ADJ_BLKCNT: 3292 #ifdef DIAGNOSTIC 3293 if (fsckcmds) { 3294 printf("%s: adjust inode %jd block count by %jd\n", 3295 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3296 (intmax_t)cmd.size); 3297 } 3298 #endif /* DIAGNOSTIC */ 3299 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3300 break; 3301 ip = VTOI(vp); 3302 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size); 3303 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); 3304 error = ffs_update(vp, 1); 3305 vput(vp); 3306 break; 3307 3308 case FFS_ADJ_DEPTH: 3309 #ifdef DIAGNOSTIC 3310 if (fsckcmds) { 3311 printf("%s: adjust directory inode %jd depth by %jd\n", 3312 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3313 (intmax_t)cmd.size); 3314 } 3315 #endif /* DIAGNOSTIC */ 3316 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3317 break; 3318 if (vp->v_type != VDIR) { 3319 vput(vp); 3320 error = ENOTDIR; 3321 break; 3322 } 3323 ip = VTOI(vp); 3324 DIP_SET(ip, i_dirdepth, DIP(ip, i_dirdepth) + cmd.size); 3325 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); 3326 error = ffs_update(vp, 1); 3327 vput(vp); 3328 break; 3329 3330 case FFS_SET_SIZE: 3331 #ifdef DIAGNOSTIC 3332 if (fsckcmds) { 3333 printf("%s: set inode %jd size to %jd\n", 3334 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3335 (intmax_t)cmd.size); 3336 } 3337 #endif /* DIAGNOSTIC */ 3338 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) 3339 break; 3340 ip = VTOI(vp); 3341 DIP_SET(ip, i_size, cmd.size); 3342 UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_MODIFIED); 3343 error = ffs_update(vp, 1); 3344 vput(vp); 3345 break; 3346 3347 case FFS_DIR_FREE: 3348 filetype = IFDIR; 3349 /* fall through */ 3350 3351 case FFS_FILE_FREE: 3352 #ifdef DIAGNOSTIC 3353 if (fsckcmds) { 3354 if (cmd.size == 1) 3355 printf("%s: free %s inode %ju\n", 3356 mp->mnt_stat.f_mntonname, 3357 filetype == IFDIR ? "directory" : "file", 3358 (uintmax_t)cmd.value); 3359 else 3360 printf("%s: free %s inodes %ju-%ju\n", 3361 mp->mnt_stat.f_mntonname, 3362 filetype == IFDIR ? "directory" : "file", 3363 (uintmax_t)cmd.value, 3364 (uintmax_t)(cmd.value + cmd.size - 1)); 3365 } 3366 #endif /* DIAGNOSTIC */ 3367 while (cmd.size > 0) { 3368 if ((error = ffs_freefile(ump, fs, ump->um_devvp, 3369 cmd.value, filetype, NULL))) 3370 break; 3371 cmd.size -= 1; 3372 cmd.value += 1; 3373 } 3374 break; 3375 3376 case FFS_BLK_FREE: 3377 #ifdef DIAGNOSTIC 3378 if (fsckcmds) { 3379 if (cmd.size == 1) 3380 printf("%s: free block %jd\n", 3381 mp->mnt_stat.f_mntonname, 3382 (intmax_t)cmd.value); 3383 else 3384 printf("%s: free blocks %jd-%jd\n", 3385 mp->mnt_stat.f_mntonname, 3386 (intmax_t)cmd.value, 3387 (intmax_t)cmd.value + cmd.size - 1); 3388 } 3389 #endif /* DIAGNOSTIC */ 3390 blkno = cmd.value; 3391 blkcnt = cmd.size; 3392 blksize = fs->fs_frag - (blkno % fs->fs_frag); 3393 key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO); 3394 while (blkcnt > 0) { 3395 if (blkcnt < blksize) 3396 blksize = blkcnt; 3397 ffs_blkfree(ump, fs, ump->um_devvp, blkno, 3398 blksize * fs->fs_fsize, UFS_ROOTINO, 3399 VDIR, NULL, key); 3400 blkno += blksize; 3401 blkcnt -= blksize; 3402 blksize = fs->fs_frag; 3403 } 3404 ffs_blkrelease_finish(ump, key); 3405 break; 3406 3407 /* 3408 * Adjust superblock summaries. fsck(8) is expected to 3409 * submit deltas when necessary. 3410 */ 3411 case FFS_ADJ_NDIR: 3412 #ifdef DIAGNOSTIC 3413 if (fsckcmds) { 3414 printf("%s: adjust number of directories by %jd\n", 3415 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3416 } 3417 #endif /* DIAGNOSTIC */ 3418 fs->fs_cstotal.cs_ndir += cmd.value; 3419 break; 3420 3421 case FFS_ADJ_NBFREE: 3422 #ifdef DIAGNOSTIC 3423 if (fsckcmds) { 3424 printf("%s: adjust number of free blocks by %+jd\n", 3425 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3426 } 3427 #endif /* DIAGNOSTIC */ 3428 fs->fs_cstotal.cs_nbfree += cmd.value; 3429 break; 3430 3431 case FFS_ADJ_NIFREE: 3432 #ifdef DIAGNOSTIC 3433 if (fsckcmds) { 3434 printf("%s: adjust number of free inodes by %+jd\n", 3435 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3436 } 3437 #endif /* DIAGNOSTIC */ 3438 fs->fs_cstotal.cs_nifree += cmd.value; 3439 break; 3440 3441 case FFS_ADJ_NFFREE: 3442 #ifdef DIAGNOSTIC 3443 if (fsckcmds) { 3444 printf("%s: adjust number of free frags by %+jd\n", 3445 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3446 } 3447 #endif /* DIAGNOSTIC */ 3448 fs->fs_cstotal.cs_nffree += cmd.value; 3449 break; 3450 3451 case FFS_ADJ_NUMCLUSTERS: 3452 #ifdef DIAGNOSTIC 3453 if (fsckcmds) { 3454 printf("%s: adjust number of free clusters by %+jd\n", 3455 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3456 } 3457 #endif /* DIAGNOSTIC */ 3458 fs->fs_cstotal.cs_numclusters += cmd.value; 3459 break; 3460 3461 case FFS_SET_CWD: 3462 #ifdef DIAGNOSTIC 3463 if (fsckcmds) { 3464 printf("%s: set current directory to inode %jd\n", 3465 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); 3466 } 3467 #endif /* DIAGNOSTIC */ 3468 if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp))) 3469 break; 3470 AUDIT_ARG_VNODE1(vp); 3471 if ((error = change_dir(vp, td)) != 0) { 3472 vput(vp); 3473 break; 3474 } 3475 VOP_UNLOCK(vp); 3476 pwd_chdir(td, vp); 3477 break; 3478 3479 case FFS_SET_DOTDOT: 3480 #ifdef DIAGNOSTIC 3481 if (fsckcmds) { 3482 printf("%s: change .. in cwd from %jd to %jd\n", 3483 mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, 3484 (intmax_t)cmd.size); 3485 } 3486 #endif /* DIAGNOSTIC */ 3487 /* 3488 * First we have to get and lock the parent directory 3489 * to which ".." points. 3490 */ 3491 error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp); 3492 if (error) 3493 break; 3494 /* 3495 * Now we get and lock the child directory containing "..". 3496 */ 3497 pwd = pwd_hold(td); 3498 dvp = pwd->pwd_cdir; 3499 if ((error = vget(dvp, LK_EXCLUSIVE)) != 0) { 3500 vput(fdvp); 3501 pwd_drop(pwd); 3502 break; 3503 } 3504 dp = VTOI(dvp); 3505 SET_I_OFFSET(dp, 12); /* XXX mastertemplate.dot_reclen */ 3506 error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, 3507 DT_DIR, 0); 3508 cache_purge(fdvp); 3509 cache_purge(dvp); 3510 vput(dvp); 3511 vput(fdvp); 3512 pwd_drop(pwd); 3513 break; 3514 3515 case FFS_UNLINK: 3516 #ifdef DIAGNOSTIC 3517 if (fsckcmds) { 3518 char buf[32]; 3519 3520 if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL)) 3521 strncpy(buf, "Name_too_long", 32); 3522 printf("%s: unlink %s (inode %jd)\n", 3523 mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size); 3524 } 3525 #endif /* DIAGNOSTIC */ 3526 /* 3527 * kern_funlinkat will do its own start/finish writes and 3528 * they do not nest, so drop ours here. Setting mp == NULL 3529 * indicates that vn_finished_write is not needed down below. 3530 */ 3531 vn_finished_write(mp); 3532 mp = NULL; 3533 error = kern_funlinkat(td, AT_FDCWD, 3534 (char *)(intptr_t)cmd.value, FD_NONE, UIO_USERSPACE, 3535 0, (ino_t)cmd.size); 3536 break; 3537 3538 default: 3539 #ifdef DIAGNOSTIC 3540 if (fsckcmds) { 3541 printf("Invalid request %d from fsck\n", 3542 oidp->oid_number); 3543 } 3544 #endif /* DIAGNOSTIC */ 3545 error = EINVAL; 3546 break; 3547 } 3548 fdrop(fp, td); 3549 vn_finished_write(mp); 3550 return (error); 3551 } 3552