/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/fs/ufs_fs.h>
#include <sys/cmn_err.h>

#ifdef	_KERNEL

#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/debug.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/kmem.h>
#include <sys/policy.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/pvn.h>
#include <vm/seg_map.h>
#include <sys/swap.h>
#include <vm/seg_kmem.h>

#else  /* _KERNEL */

#define	ASSERT(x)		/* don't use asserts for fsck et al */

#endif  /* _KERNEL */

#ifdef _KERNEL

/*
 * Used to verify that a given entry on the ufs_instances list (see below)
 * still refers to a mounted file system.
 *
 * XXX:	This is a crock that substitutes for proper locking to coordinate
 *	updates to and uses of the entries in ufs_instances.
 */
struct check_node {
	struct vfs *vfsp;
	struct ufsvfs *ufsvfs;
	dev_t vfs_dev;
};

static vfs_t *still_mounted(struct check_node *);

/*
 * All ufs file system instances are linked together into a list starting at
 * ufs_instances.  The list is updated as part of mount and unmount.  It's
 * consulted in ufs_update, to allow syncing out all ufs file system instances
 * in a batch.
 *
 * ufsvfs_mutex guards access to this list and to the {,old}ufsvfslist
 * manipulated in ufs_funmount_cleanup.  (A given ufs instance is always on
 * exactly one of these lists except while it's being allocated or
 * deallocated.)
 */
struct ufsvfs	*ufs_instances;
extern kmutex_t	ufsvfs_mutex;	/* XXX: move this to ufs_inode.h? */

/*
 * ufsvfs list manipulation routines
 */

/*
 * Link ufsp in at the head of the list of ufs_instances.
 */
void
ufs_vfs_add(struct ufsvfs *ufsp)
{
	mutex_enter(&ufsvfs_mutex);
	ufsp->vfs_next = ufs_instances;
	ufs_instances = ufsp;
	mutex_exit(&ufsvfs_mutex);
}

/*
 * Remove ufsp from the list of ufs_instances.
 *
 * Does no error checking; ufsp is assumed to actually be on the list.
 */
void
ufs_vfs_remove(struct ufsvfs *ufsp)
{
	struct ufsvfs **delpt = &ufs_instances;

	mutex_enter(&ufsvfs_mutex);
	for (; *delpt != NULL; delpt = &((*delpt)->vfs_next)) {
		if (*delpt == ufsp) {
			*delpt = ufsp->vfs_next;
			ufsp->vfs_next = NULL;
			break;
		}
	}
	mutex_exit(&ufsvfs_mutex);
}

/*
 * Clean up state resulting from a forcible unmount that couldn't be handled
 * directly during the unmount.  (See commentary in the unmount code for more
 * info.)
 */
static void
ufs_funmount_cleanup()
{
	struct ufsvfs		*ufsvfsp;
	extern struct ufsvfs	*oldufsvfslist, *ufsvfslist;

	/*
	 * Assumption: it's now safe to blow away the entries on
	 * oldufsvfslist.
	 */
	mutex_enter(&ufsvfs_mutex);
	while ((ufsvfsp = oldufsvfslist) != NULL) {
		oldufsvfslist = ufsvfsp->vfs_next;

		mutex_destroy(&ufsvfsp->vfs_lock);
		kmem_free(ufsvfsp, sizeof (struct ufsvfs));
	}
	/*
	 * Rotate more recent unmount entries into place in preparation for
	 * the next time around.
	 */
	oldufsvfslist = ufsvfslist;
	ufsvfslist = NULL;
	mutex_exit(&ufsvfs_mutex);
}


/*
 * ufs_update performs the ufs part of `sync'.  It goes through the disk
 * queues to initiate sandbagged IO; goes through the inodes to write
 * modified nodes; and it goes through the mount table to initiate
 * the writing of the modified super blocks.
 */
extern time_t	time;
time_t		ufs_sync_time;
time_t		ufs_sync_time_secs = 1;

extern kmutex_t	ufs_scan_lock;

void
ufs_update(int flag)
{
	struct vfs *vfsp;
	struct fs *fs;
	struct ufsvfs *ufsp;
	struct ufsvfs *ufsnext;
	struct ufsvfs *update_list = NULL;
	int check_cnt = 0;
	size_t check_size;
	struct check_node *check_list, *ptr;
	int cheap = flag & SYNC_ATTR;

	/*
	 * This is a hack.  A design flaw in the forced unmount protocol
	 * could allow a thread to attempt to use a kmem_freed ufsvfs
	 * structure in ufs_lockfs_begin/ufs_check_lockfs.  This window
	 * is difficult to hit, even during the lockfs stress tests.
	 * So the hacky fix is to wait awhile before kmem_free'ing the
	 * ufsvfs structures for forcibly unmounted file systems.  `Awhile'
	 * is defined as every other call from fsflush (~60 seconds).
	 */
	if (cheap)
		ufs_funmount_cleanup();

	/*
	 * Examine all ufsvfs structures and add those that we can lock to the
	 * update list.  This is so that we don't hold the list lock for a
	 * long time.  If vfs_lock fails for a file system instance, then skip
	 * it because somebody is doing an unmount on it.
	 */
	mutex_enter(&ufsvfs_mutex);
	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
		vfsp = ufsp->vfs_vfs;
		if (vfs_lock(vfsp) != 0)
			continue;
		ufsp->vfs_wnext = update_list;
		update_list = ufsp;
		check_cnt++;
	}
	mutex_exit(&ufsvfs_mutex);

	if (update_list == NULL)
		return;

	check_size = sizeof (struct check_node) * check_cnt;
	check_list = ptr = kmem_alloc(check_size, KM_NOSLEEP);

	/*
	 * Write back modified superblocks.
	 * Consistency check that the superblock of
	 * each file system is still in the buffer cache.
	 *
	 * Note that the update_list traversal is done without the protection
	 * of an overall list lock, so it's necessary to rely on the fact that
	 * each entry of the list is vfs_locked when moving from one entry to
	 * the next.  This works because a concurrent attempt to add an entry
	 * to another thread's update_list won't find it, since it'll already
	 * be locked.
	 */
	check_cnt = 0;
	for (ufsp = update_list; ufsp != NULL; ufsp = ufsnext) {
		/*
		 * Need to grab the next ptr before we unlock this one so
		 * another thread doesn't grab it and change it before we move
		 * on to the next vfs.  (Once we unlock it, it's ok if another
		 * thread finds it to add it to its own update_list; we don't
		 * attempt to refer to it through our list any more.)
		 */
		ufsnext = ufsp->vfs_wnext;
		vfsp = ufsp->vfs_vfs;

		/*
		 * Seems like this can't happen, so perhaps it should become
		 * an ASSERT(vfsp->vfs_data != NULL).
		 */
		if (!vfsp->vfs_data) {
			vfs_unlock(vfsp);
			continue;
		}

		fs = ufsp->vfs_fs;

		/*
		 * don't update a locked superblock during a panic; it
		 * may be in an inconsistent state
		 */
		if (panicstr) {
			if (!mutex_tryenter(&ufsp->vfs_lock)) {
				vfs_unlock(vfsp);
				continue;
			}
		} else
			mutex_enter(&ufsp->vfs_lock);
		/*
		 * Build up the STABLE check list, so we can unlock the vfs
		 * until we do the actual checking.
		 */
		if (check_list != NULL) {
			if ((fs->fs_ronly == 0) &&
			    (fs->fs_clean != FSBAD) &&
			    (fs->fs_clean != FSSUSPEND)) {
				ptr->vfsp = vfsp;
				ptr->ufsvfs = ufsp;
				ptr->vfs_dev = vfsp->vfs_dev;
				ptr++;
				check_cnt++;
			}
		}

		/*
		 * superblock is not modified
		 */
		if (fs->fs_fmod == 0) {
			mutex_exit(&ufsp->vfs_lock);
			vfs_unlock(vfsp);
			continue;
		}
		if (fs->fs_ronly != 0) {
			mutex_exit(&ufsp->vfs_lock);
			vfs_unlock(vfsp);
			(void) ufs_fault(ufsp->vfs_root,
			    "fs = %s update: ro fs mod\n", fs->fs_fsmnt);
			/*
			 * XXX: Why is this a return instead of a continue?
			 *	This may be an attempt to replace a panic with
			 *	something less drastic, but there's cleanup we
			 *	should be doing that's not being done (e.g.,
			 *	unlocking the remaining entries on the list).
			 */
			return;
		}
		fs->fs_fmod = 0;
		mutex_exit(&ufsp->vfs_lock);
		TRANS_SBUPDATE(ufsp, vfsp, TOP_SBUPDATE_UPDATE);
		vfs_unlock(vfsp);
	}

	ufs_sync_time = time;

	/*
	 * Avoid racing with ufs_unmount() and ufs_sync().
	 */
	mutex_enter(&ufs_scan_lock);

	(void) ufs_scan_inodes(1, ufs_sync_inode, (void *)(uintptr_t)cheap,
	    NULL);

	mutex_exit(&ufs_scan_lock);

	/*
	 * Force stale buffer cache information to be flushed,
	 * for all devices.  This should cause any remaining control
	 * information (e.g., cg and inode info) to be flushed back.
	 */
	bflush((dev_t)NODEV);

	if (check_list == NULL)
		return;

	/*
	 * For each UFS filesystem in the STABLE check_list, update
	 * the clean flag if warranted.
	 */
	for (ptr = check_list; check_cnt > 0; check_cnt--, ptr++) {
		int	error;

		/*
		 * still_mounted() returns with vfsp and the vfs_reflock
		 * held if ptr refers to a vfs that is still mounted.
		 */
		if ((vfsp = still_mounted(ptr)) == NULL)
			continue;
		ufs_checkclean(vfsp);
		/*
		 * commit any outstanding async transactions
		 */
		ufsp = (struct ufsvfs *)vfsp->vfs_data;
		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_SYNC(ufsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE,
		    error);
		if (!error) {
			TRANS_END_SYNC(ufsp, error, TOP_COMMIT_UPDATE,
			    TOP_COMMIT_SIZE);
		}
		curthread->t_flag &= ~T_DONTBLOCK;

		vfs_unlock(vfsp);
	}

	kmem_free(check_list, check_size);
}

int
ufs_sync_inode(struct inode *ip, void *arg)
{
	int cheap = (int)(uintptr_t)arg;
	struct ufsvfs *ufsvfsp;
	uint_t flag = ip->i_flag;

	if (cheap && ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) == 0))
		return (0);

	/*
	 * if we are panic'ing; then don't update the inode if this
	 * file system is FSSTABLE.  Otherwise, we would have to
	 * force the superblock to FSACTIVE and the superblock
	 * may not be in a good state.  Also, if the inode is
	 * IREF'ed then it may be in an inconsistent state.  Don't
	 * push it.  Finally, don't push the inode if the fs is
	 * logging; the transaction will be discarded at boot.
	 */
	if (panicstr) {

		if (flag & IREF)
			return (0);

		if (ip->i_ufsvfs == NULL ||
		    (ip->i_fs->fs_clean == FSSTABLE ||
		    ip->i_fs->fs_clean == FSLOG))
			return (0);
	}

	ufsvfsp = ip->i_ufsvfs;

	/*
	 * Limit access time only updates
	 */
	if (((flag & (IMOD|IMODACC|IUPD|ICHG|IACC)) == IMODACC) && ufsvfsp) {
		/*
		 * if file system has deferred access time turned on and there
		 * was no IO recently, don't bother flushing it.  It will be
		 * flushed when I/Os start again.
		 */
		if (cheap && (ufsvfsp->vfs_dfritime & UFS_DFRATIME) &&
		    (ufsvfsp->vfs_iotstamp + ufs_iowait < ddi_get_lbolt()))
			return (0);
		/*
		 * an app issuing a sync() can take forever on a trans device
		 * when NetWorker or find is running because all of the
		 * directories' access times have to be updated.  So, we limit
		 * the time we spend updating access times per sync.
		 */
		if (TRANS_ISTRANS(ufsvfsp) && ((ufs_sync_time +
		    ufs_sync_time_secs) < time))
			return (0);
	}

	/*
	 * if we are running on behalf of the flush thread or this is
	 * a swap file, then simply do a delayed update of the inode.
	 * Otherwise, push the pages and then do a delayed inode update.
	 */
	if (cheap || IS_SWAPVP(ITOV(ip))) {
		TRANS_IUPDAT(ip, 0);
	} else {
		(void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_SYNC);
	}
	return (0);
}

/*
 * Flush all the pages associated with an inode using the given 'flags',
 * then force inode information to be written back using the given 'waitfor'.
 */
int
ufs_syncip(struct inode *ip, int flags, int waitfor, top_t topid)
{
	int		error;
	struct vnode	*vp = ITOV(ip);
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	int		dotrans = 0;

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return (EIO);
	/*
	 * don't need to VOP_PUTPAGE if there are no pages
	 */
	if (!vn_has_cached_data(vp) || vp->v_type == VCHR) {
		error = 0;
	} else {
		/*
		 * if the inode we're working on is a shadow inode
		 * or quota inode we need to make sure that the
		 * ufs_putpage call is inside a transaction as this
		 * could include meta data changes.
		 */
		if ((ip->i_mode & IFMT) == IFSHAD ||
		    ufsvfsp->vfs_qinod == ip) {
			dotrans = 1;
			curthread->t_flag |= T_DONTBLOCK;
			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE,
			    TOP_PUTPAGE_SIZE(ip));
		}
		error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
		    flags, CRED(), NULL);
		if (dotrans) {
			TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE,
			    TOP_PUTPAGE_SIZE(ip));
			curthread->t_flag &= ~T_DONTBLOCK;
			dotrans = 0;
		}
	}
	if (panicstr && TRANS_ISTRANS(ufsvfsp))
		goto out;
	/*
	 * waitfor represents two things -
	 * 1. whether data sync or file sync.
	 * 2. if file sync then ufs_iupdat should 'waitfor' disk i/o or not.
	 */
	if (waitfor == I_DSYNC) {
		/*
		 * If data sync, only IATTCHG (size/block change) requires
		 * inode update, fdatasync()/FDSYNC implementation.
		 */
		if (ip->i_flag & (IBDWRITE|IATTCHG)) {
			/*
			 * Enter a transaction to provide mutual exclusion
			 * with deltamap_push and avoid a race where
			 * the inode flush could get dropped.
			 */
			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
				dotrans = 1;
				curthread->t_flag |= T_DONTBLOCK;
				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
			}
			rw_enter(&ip->i_contents, RW_READER);
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IMODTIME;
			mutex_exit(&ip->i_tlock);
			ufs_iupdat(ip, 1);
			rw_exit(&ip->i_contents);
			if (dotrans) {
				TRANS_END_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
				curthread->t_flag &= ~T_DONTBLOCK;
			}
		}
	} else {
		/* For file sync, any inode change requires inode update */
		if (ip->i_flag & (IBDWRITE|IUPD|IACC|ICHG|IMOD|IMODACC)) {
			/*
			 * Enter a transaction to provide mutual exclusion
			 * with deltamap_push and avoid a race where
			 * the inode flush could get dropped.
			 */
			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
				dotrans = 1;
				curthread->t_flag |= T_DONTBLOCK;
				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
			}
			rw_enter(&ip->i_contents, RW_READER);
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IMODTIME;
			mutex_exit(&ip->i_tlock);
			ufs_iupdat(ip, waitfor);
			rw_exit(&ip->i_contents);
			if (dotrans) {
				TRANS_END_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
				curthread->t_flag &= ~T_DONTBLOCK;
			}
		}
	}

out:
	return (error);
}
/*
 * Flush all indirect blocks related to an inode.
 * Supports triple indirect blocks also.
 */
int
ufs_sync_indir(struct inode *ip)
{
	int i;
	daddr_t blkno;
	daddr_t lbn;	/* logical blkno of last blk in file */
	daddr_t clbn;	/* current logical blk */
	daddr32_t *bap;
	struct fs *fs;
	struct buf *bp;
	int bsize;
	struct ufsvfs *ufsvfsp;
	int j;
	daddr_t indirect_blkno;
	daddr32_t *indirect_bap;
	struct buf *indirect_bp;

	ufsvfsp = ip->i_ufsvfs;
	/*
	 * unnecessary when logging; allocation blocks are kept up-to-date
	 */
	if (TRANS_ISTRANS(ufsvfsp))
		return (0);

	fs = ufsvfsp->vfs_fs;
	bsize = fs->fs_bsize;
	lbn = (daddr_t)lblkno(fs, ip->i_size - 1);
	if (lbn < NDADDR)
		return (0);	/* No indirect blocks used */
	if (lbn < NDADDR + NINDIR(fs)) {
		/* File has one indirect block. */
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, ip->i_ib[0]));
		return (0);
	}

	/* Write out all the first level indirect blocks */
	for (i = 0; i < NIADDR; i++) {
		if ((blkno = ip->i_ib[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
	}
	/* Write out second level of indirect blocks */
	if ((blkno = ip->i_ib[1]) == 0)
		return (0);
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	bap = bp->b_un.b_daddr;
	clbn = NDADDR + NINDIR(fs);
	for (i = 0; i < NINDIR(fs); i++) {
		if (clbn > lbn)
			break;
		clbn += NINDIR(fs);
		if ((blkno = bap[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
	}

	brelse(bp);
	/* write out third level indirect blocks */

	if ((blkno = ip->i_ib[2]) == 0)
		return (0);

	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	bap = bp->b_un.b_daddr;
	clbn = NDADDR + NINDIR(fs) + (NINDIR(fs) * NINDIR(fs));

	for (i = 0; i < NINDIR(fs); i++) {
		if (clbn > lbn)
			break;
		if ((indirect_blkno = bap[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, indirect_blkno));
		indirect_bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fs, indirect_blkno), bsize);
		if (indirect_bp->b_flags & B_ERROR) {
			brelse(indirect_bp);
			brelse(bp);
			return (EIO);
		}
		indirect_bap = indirect_bp->b_un.b_daddr;
		for (j = 0; j < NINDIR(fs); j++) {
			if (clbn > lbn)
				break;
			clbn += NINDIR(fs);
			if ((blkno = indirect_bap[j]) == 0)
				continue;
			blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
		}
		brelse(indirect_bp);
	}
	brelse(bp);

	return (0);
}

/*
 * Flush all indirect blocks related to an offset of a file.
 * read/write in sync mode may have to flush indirect blocks.
 */
int
ufs_indirblk_sync(struct inode *ip, offset_t off)
{
	daddr_t	lbn;
	struct fs *fs;
	struct buf *bp;
	int i, j, shft;
	daddr_t	ob, nb, tbn;
	daddr32_t *bap;
	int nindirshift, nindiroffset;
	struct ufsvfs *ufsvfsp;

	ufsvfsp = ip->i_ufsvfs;
	/*
	 * unnecessary when logging; allocation blocks are kept up-to-date
	 */
	if (TRANS_ISTRANS(ufsvfsp))
		return (0);

	fs = ufsvfsp->vfs_fs;

	lbn = (daddr_t)lblkno(fs, off);
	if (lbn < 0)
		return (EFBIG);

	/* The first NDADDR are direct so nothing to do */
	if (lbn < NDADDR)
		return (0);

	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;

	/* Determine level of indirect blocks */
	shft = 0;
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t	sh;

		shft += nindirshift;
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= (daddr_t)sh;
	}

	if (j == 0)
		return (EFBIG);

	if ((nb = ip->i_ib[NIADDR - j]) == 0)
		return (0);		/* UFS Hole */

	/* Flush first level indirect block */
	blkflush(ip->i_dev, fsbtodb(fs, nb));

	/* Fetch through next levels */
	for (; j < NIADDR; j++) {
		ob = nb;
		bp = UFS_BREAD(ufsvfsp,
		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			brelse(bp);
			return (EIO);
		}
		bap = bp->b_un.b_daddr;
		shft -= nindirshift;		/* sh / nindir */
		i = (tbn >> shft) & nindiroffset;	/* (tbn / sh) & nindir */
		nb = bap[i];
		brelse(bp);
		if (nb == 0) {
			return (0);		/* UFS hole */
		}
		blkflush(ip->i_dev, fsbtodb(fs, nb));
	}
	return (0);
}

#ifdef DEBUG

/*
 * The bad block checking routines: ufs_indir_badblock() and ufs_badblock()
 * are very expensive.  It's been found from profiling that we're
 * spending 6-7% of our time in ufs_badblock, and another 1-2% in
 * ufs_indir_badblock.  They are only called via ASSERTs (from debug kernels).
 * In addition, from experience no failures have been found in recent
 * years.  So the following tunable can be set to enable checking.
 */
int ufs_badblock_checks = 0;

/*
 * Check that a given indirect block contains blocks in range
 */
int
ufs_indir_badblock(struct inode *ip, daddr32_t *bap)
{
	int i;
	int err = 0;

	if (ufs_badblock_checks) {
		for (i = 0; i < NINDIR(ip->i_fs) - 1; i++)
			if (bap[i] != 0 && (err = ufs_badblock(ip, bap[i])))
				break;
	}
	return (err);
}

/*
 * Check that a specified block number is in range.
 */
int
ufs_badblock(struct inode *ip, daddr_t bn)
{
	long	c;
	daddr_t	sum;

	if (!ufs_badblock_checks)
		return (0);
	ASSERT(bn);
	if (bn <= 0 || bn > ip->i_fs->fs_size)
		return (bn);

	sum = 0;
	c = dtog(ip->i_fs, bn);
	if (c == 0) {
		sum = howmany(ip->i_fs->fs_cssize, ip->i_fs->fs_fsize);
	}
	/*
	 * if block no. is below this cylinder group,
	 * within the space reserved for superblock, inodes, (summary data)
	 * or if it is above this cylinder group
	 * then it's invalid
	 * It's hard to see how we'd be outside this cyl, but let's be careful.
	 */
	if ((bn < cgbase(ip->i_fs, c)) ||
	    (bn >= cgsblock(ip->i_fs, c) && bn < cgdmin(ip->i_fs, c)+sum) ||
	    (bn >= (unsigned)cgbase(ip->i_fs, c+1)))
		return (bn);

	return (0);	/* not a bad block */
}

#endif /* DEBUG */

/*
 * When i_rwlock is write-locked or has a writer pending, then the inode
 * is going to change in a way that the filesystem will be marked as
 * active.  So there is no need to let the filesystem be marked as stable now.
 * Also, to ensure filesystem consistency during directory operations,
 * the filesystem cannot be marked as stable if i_rwlock of
 * the directory inode is write-locked.
 */

/*
 * Check for busy inodes for this filesystem.
 * NOTE: Needs better way to do this expensive operation in the future.
 */
static void
ufs_icheck(struct ufsvfs *ufsvfsp, int *isbusyp, int *isreclaimp)
{
	union ihead	*ih;
	struct inode	*ip;
	int		i;
	int		isnottrans = !TRANS_ISTRANS(ufsvfsp);
	int		isbusy = *isbusyp;
	int		isreclaim = *isreclaimp;

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0];
		    ip != (struct inode *)ih;
		    ip = ip->i_forw) {
			/*
			 * if inode is busy/modified/deleted, filesystem is busy
			 */
			if (ip->i_ufsvfs != ufsvfsp)
				continue;
			if ((ip->i_flag & (IMOD | IUPD | ICHG)) ||
			    (RW_ISWRITER(&ip->i_rwlock)))
				isbusy = 1;
			if ((ip->i_nlink <= 0) && (ip->i_flag & IREF))
				isreclaim = 1;
			if (isbusy && (isreclaim || isnottrans))
				break;
		}
		mutex_exit(&ih_lock[i]);
		if (isbusy && (isreclaim || isnottrans))
			break;
	}
	*isbusyp = isbusy;
	*isreclaimp = isreclaim;
}

/*
 * As part of the ufs 'sync' operation, this routine is called to mark
 * the filesystem as STABLE if there is no modified metadata in memory.
 */
void
ufs_checkclean(struct vfs *vfsp)
{
	struct ufsvfs	*ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	struct fs	*fs = ufsvfsp->vfs_fs;
	int		isbusy;
	int		isreclaim;
	int		updatesb;

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * filesystem is stable or cleanflag processing is disabled; do nothing
	 * no transitions when panic'ing
	 */
	if (fs->fs_ronly ||
	    fs->fs_clean == FSBAD ||
	    fs->fs_clean == FSSUSPEND ||
	    fs->fs_clean == FSSTABLE ||
	    panicstr)
		return;

	/*
	 * if logging and nothing to reclaim; do nothing
	 */
	if ((fs->fs_clean == FSLOG) &&
	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
	    (fs->fs_reclaim & FS_RECLAIMING)))
		return;

	/*
	 * FS_CHECKCLEAN is reset if the file system goes dirty
	 * FS_CHECKRECLAIM is reset if a file gets deleted
	 */
	mutex_enter(&ufsvfsp->vfs_lock);
	fs->fs_reclaim |= (FS_CHECKCLEAN | FS_CHECKRECLAIM);
	mutex_exit(&ufsvfsp->vfs_lock);

	updatesb = 0;

	/*
	 * if logging or buffers are busy; do nothing
	 */
	isbusy = isreclaim = 0;
	if ((fs->fs_clean == FSLOG) ||
	    (bcheck(vfsp->vfs_dev, ufsvfsp->vfs_bufp)))
		isbusy = 1;

	/*
	 * isreclaim == TRUE means can't change the state of fs_reclaim
	 */
	isreclaim =
	    ((fs->fs_clean == FSLOG) &&
	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
	    (fs->fs_reclaim & FS_RECLAIMING)));

	/*
	 * if fs is busy or can't change the state of fs_reclaim; do nothing
	 */
	if (isbusy && isreclaim)
		return;

	/*
	 * look for busy or deleted inodes; (deleted == needs reclaim)
	 */
	ufs_icheck(ufsvfsp, &isbusy, &isreclaim);

	mutex_enter(&ufsvfsp->vfs_lock);

	/*
	 * IF POSSIBLE, RESET RECLAIM
	 */
	/*
	 * the reclaim thread is not running
	 */
	if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
		/*
		 * no files were deleted during the scan
		 */
		if (fs->fs_reclaim & FS_CHECKRECLAIM)
			/*
			 * no deleted files were found in the inode cache
			 */
			if ((isreclaim == 0) && (fs->fs_reclaim & FS_RECLAIM)) {
				fs->fs_reclaim &= ~FS_RECLAIM;
				updatesb = 1;
			}
	/*
	 * IF POSSIBLE, SET STABLE
	 */
	/*
	 * not logging
	 */
	if (fs->fs_clean != FSLOG)
		/*
		 * file system has not gone dirty since the scan began
		 */
		if (fs->fs_reclaim & FS_CHECKCLEAN)
			/*
			 * nothing dirty was found in the buffer or inode cache
			 */
			if ((isbusy == 0) && (isreclaim == 0) &&
			    (fs->fs_clean != FSSTABLE)) {
				fs->fs_clean = FSSTABLE;
				updatesb = 1;
			}

	mutex_exit(&ufsvfsp->vfs_lock);
	if (updatesb) {
		TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
	}
}

/*
 * called whenever an unlink occurs
 */
void
ufs_setreclaim(struct inode *ip)
{
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct fs	*fs = ufsvfsp->vfs_fs;

	if (ip->i_nlink || fs->fs_ronly || (fs->fs_clean != FSLOG))
		return;

	/*
	 * reclaim-needed bit is already set or we need to tell
	 * ufs_checkclean that a file has been deleted
	 */
	if ((fs->fs_reclaim & (FS_RECLAIM | FS_CHECKRECLAIM)) == FS_RECLAIM)
		return;

	mutex_enter(&ufsvfsp->vfs_lock);
	/*
	 * inform ufs_checkclean that the file system has gone dirty
	 */
	fs->fs_reclaim &= ~FS_CHECKRECLAIM;

	/*
	 * set the reclaim-needed bit
	 */
	if ((fs->fs_reclaim & FS_RECLAIM) == 0) {
		fs->fs_reclaim |= FS_RECLAIM;
		ufs_sbwrite(ufsvfsp);
	}
	mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * Before any modified metadata is written back to the disk, this routine
 * is called to mark the filesystem as ACTIVE.
 */
void
ufs_notclean(struct ufsvfs *ufsvfsp)
{
	struct fs *fs = ufsvfsp->vfs_fs;

	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));

	/*
	 * inform ufs_checkclean that the file system has gone dirty
	 */
	fs->fs_reclaim &= ~FS_CHECKCLEAN;

	/*
	 * ignore if active or bad or suspended or readonly or logging
	 */
	if ((fs->fs_clean == FSACTIVE) || (fs->fs_clean == FSLOG) ||
	    (fs->fs_clean == FSBAD) || (fs->fs_clean == FSSUSPEND) ||
	    (fs->fs_ronly)) {
		mutex_exit(&ufsvfsp->vfs_lock);
		return;
	}
	fs->fs_clean = FSACTIVE;
	/*
	 * write superblock synchronously
	 */
	ufs_sbwrite(ufsvfsp);
	mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * ufs specific fbwrite()
 */
int
ufs_fbwrite(struct fbuf *fbp, struct inode *ip)
{
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;

	if (TRANS_ISTRANS(ufsvfsp))
		return (fbwrite(fbp));
	mutex_enter(&ufsvfsp->vfs_lock);
	ufs_notclean(ufsvfsp);
	return ((ufsvfsp->vfs_dio) ? fbdwrite(fbp) : fbwrite(fbp));
}

/*
 * ufs specific fbiwrite()
 */
int
ufs_fbiwrite(struct fbuf *fbp, struct inode *ip, daddr_t bn, long bsize)
{
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	o_mode_t	ifmt = ip->i_mode & IFMT;
	buf_t		*bp;
	int		error;

	mutex_enter(&ufsvfsp->vfs_lock);
	ufs_notclean(ufsvfsp);
	if (ifmt == IFDIR || ifmt == IFSHAD || ifmt == IFATTRDIR ||
	    (ip->i_ufsvfs->vfs_qinod == ip)) {
		TRANS_DELTA(ufsvfsp, ldbtob(bn * (offset_t)(btod(bsize))),
		    fbp->fb_count, DT_FBI, 0, 0);
	}
	/*
	 * Inlined version of fbiwrite()
	 */
	bp = pageio_setup((struct page *)NULL, fbp->fb_count,
	    ip->i_devvp, B_WRITE);
	bp->b_flags &= ~B_PAGEIO;
	bp->b_un.b_addr = fbp->fb_addr;

	bp->b_blkno = bn * btod(bsize);
	bp->b_dev = cmpdev(ip->i_dev);	/* store in old dev format */
	bp->b_edev = ip->i_dev;
	bp->b_proc = NULL;		/* i.e. the kernel */
	bp->b_file = ip->i_vnode;
	bp->b_offset = -1;

	if (ufsvfsp->vfs_log) {
		lufs_write_strategy(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
		ub.ub_fbiwrites.value.ul++;
		(void) bdev_strategy(bp);
		lwp_stat_update(LWP_STAT_OUBLK, 1);
	}
	error = biowait(bp);
	pageio_done(bp);
	fbrelse(fbp, S_OTHER);
	return (error);
}

/*
 * Write the ufs superblock only.
 */
void
ufs_sbwrite(struct ufsvfs *ufsvfsp)
{
	char		sav_fs_fmod;
	struct fs	*fs = ufsvfsp->vfs_fs;
	struct buf	*bp = ufsvfsp->vfs_bufp;

	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));

	/*
	 * for ulockfs processing, limit the superblock writes
	 */
	if ((ufsvfsp->vfs_ulockfs.ul_sbowner) &&
	    (curthread != ufsvfsp->vfs_ulockfs.ul_sbowner)) {
		/* try again later */
		fs->fs_fmod = 1;
		return;
	}

	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
	/*
	 * update superblock timestamp and fs_clean checksum
	 * if marked FSBAD, we always want an erroneous
	 * checksum to force repair
	 */
	fs->fs_time = gethrestime_sec();
	fs->fs_state = (fs->fs_clean != FSBAD) ?
	    FSOKAY - fs->fs_time : -(FSOKAY - fs->fs_time);
	switch (fs->fs_clean) {
	case FSCLEAN:
	case FSSTABLE:
		fs->fs_reclaim &= ~FS_RECLAIM;
		break;
	case FSACTIVE:
	case FSSUSPEND:
	case FSBAD:
	case FSLOG:
		break;
	default:
		fs->fs_clean = FSACTIVE;
		break;
	}
	/*
	 * reset incore only bits
	 */
	fs->fs_reclaim &= ~(FS_CHECKCLEAN | FS_CHECKRECLAIM);

	/*
	 * delta the whole superblock
	 */
	TRANS_DELTA(ufsvfsp, ldbtob(SBLOCK), sizeof (struct fs),
	    DT_SB, NULL, 0);
	/*
	 * retain the incore state of fs_fmod; set the ondisk state to 0
	 */
	sav_fs_fmod = fs->fs_fmod;
	fs->fs_fmod = 0;

	/*
	 * Don't release the buffer after it is written to the disk
	 */
	UFS_BWRITE2(ufsvfsp, bp);
	fs->fs_fmod = sav_fs_fmod;	/* reset fs_fmod's incore state */
}

/*
 * Returns the vfs pointer if the vfs is still mounted; the vfs lock is held.
 * Otherwise, returns NULL.
 *
 * For our purposes, "still mounted" means that the file system still appears
 * on the list of UFS file system instances.
 */
static vfs_t *
still_mounted(struct check_node *checkp)
{
	struct vfs	*vfsp;
	struct ufsvfs	*ufsp;

	mutex_enter(&ufsvfs_mutex);
	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
		if (ufsp != checkp->ufsvfs)
			continue;
		/*
		 * Tentative match:  verify it and try to lock.  (It's not at
		 * all clear how the verification could fail, given that we've
		 * gotten this far.  We would have had to reallocate the
		 * ufsvfs struct at hand for a new incarnation; is that really
		 * possible in the interval from constructing the check_node
		 * to here?)
		 */
		vfsp = ufsp->vfs_vfs;
		if (vfsp != checkp->vfsp)
			continue;
		if (vfsp->vfs_dev != checkp->vfs_dev)
			continue;
		if (vfs_lock(vfsp) != 0)
			continue;

		mutex_exit(&ufsvfs_mutex);
		return (vfsp);
	}
	mutex_exit(&ufsvfs_mutex);
	return (NULL);
}

int
ufs_si_io_done(struct buf *bp)
{
	sema_v(&bp->b_io);
	return (0);
}

#define	SI_BUFSZ roundup(sizeof (struct cg), DEV_BSIZE)
#define	NSIBUF 32

/*
 * ufs_construct_si()
 * Read each cylinder group in turn and construct the summary information
 */
static int
ufs_construct_si(dev_t dev, struct fs *fs, struct ufsvfs *ufsvfsp)
{
	buf_t *bps, *bp;
	char *bufs;
	struct csum *sip = fs->fs_u.fs_csp;
	struct cg *cgp;
	int i, ncg;
	int error = 0, cg = 0;

	bps = kmem_alloc(NSIBUF * sizeof (buf_t), KM_SLEEP);
	bufs = kmem_alloc(NSIBUF * SI_BUFSZ, KM_SLEEP);

	/*
	 * Initialise the buffer headers
	 */
	for (bp = bps, i = 0; i < NSIBUF; i++, bp++) {
		bioinit(bp);
		bp->b_iodone = ufs_si_io_done;
		bp->b_bufsize = bp->b_bcount = SI_BUFSZ;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = bufs + (i * SI_BUFSZ);
		bp->b_edev = dev;
	}

	/*
	 * Repeat while there are cylinder groups left to read.
	 */
	do {
		/*
		 * Issue up to NSIBUF asynchronous reads
		 */
		ncg = MIN(NSIBUF, (fs->fs_ncg - cg));
		for (bp = bps, i = 0; i < ncg; i++, bp++) {
			bp->b_blkno = (daddr_t)fsbtodb(fs, cgtod(fs, cg + i));
			if (ufsvfsp->vfs_log) {
				lufs_read_strategy(ufsvfsp->vfs_log, bp);
			} else {
				(void) bdev_strategy(bp);
			}
		}

		/*
		 * wait for each read to finish;
		 * check for errors and copy the csum info
		 */
		for (bp = bps, i = 0; i < ncg; i++, bp++) {
			sema_p(&bp->b_io);
			if (!error) {
				cgp = bp->b_un.b_cg;
				sip[cg + i] = cgp->cg_cs;
				error = geterror(bp);
			}
		}
		if (error) {
			goto err;
		}
		cg += ncg;
	} while (cg < fs->fs_ncg);

err:
	kmem_free(bps, NSIBUF * sizeof (buf_t));
	kmem_free(bufs, NSIBUF * SI_BUFSZ);
	return (error);
}

/*
 * ufs_getsummaryinfo
 */
int
ufs_getsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
	int		i;	/* `for' loop counter */
	ssize_t		size;	/* bytes of summary info to read */
	daddr_t		frags;	/* frags of summary info to read */
	caddr_t		sip;	/* summary info */
	struct buf	*tp;	/* tmp buf */

	/*
	 * maintain metadata map for trans device (debug only)
	 */
	TRANS_MATA_SI(ufsvfsp, fs);

	/*
	 * Compute #frags and allocate space for summary info
	 */
	frags = howmany(fs->fs_cssize, fs->fs_fsize);
	sip = kmem_alloc((size_t)fs->fs_cssize, KM_SLEEP);
	fs->fs_u.fs_csp = (struct csum *)sip;

	if (fs->fs_si == FS_SI_BAD) {
		/*
		 * The summary information is unknown, read it in from
		 * the cylinder groups.
		 */
		if (TRANS_ISTRANS(ufsvfsp) && !TRANS_ISERROR(ufsvfsp) &&
		    ufsvfsp->vfs_log->un_logmap) {
			logmap_roll_dev(ufsvfsp->vfs_log); /* flush the log */
		}
		bzero(sip, (size_t)fs->fs_cssize);
		if (ufs_construct_si(dev, fs, ufsvfsp)) {
			kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
			fs->fs_u.fs_csp = NULL;
			return (EIO);
		}
	} else {
		/* Read summary info a fs block at a time */
		size = fs->fs_bsize;
		for (i = 0; i < frags; i += fs->fs_frag) {
			if (i + fs->fs_frag > frags)
				/*
				 * This happens only on the last iteration, so
				 * don't worry about size being reset
				 */
				size = (frags - i) * fs->fs_fsize;
			tp = UFS_BREAD(ufsvfsp, dev,
			    (daddr_t)fsbtodb(fs, fs->fs_csaddr+i), size);
			tp->b_flags |= B_STALE | B_AGE;
			if (tp->b_flags & B_ERROR) {
				kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
				fs->fs_u.fs_csp = NULL;
				brelse(tp);
				return (EIO);
			}
			bcopy(tp->b_un.b_addr, sip, size);
			sip += size;
			brelse(tp);
		}
	}
	bzero((caddr_t)&fs->fs_cstotal, sizeof (fs->fs_cstotal));
	for (i = 0; i < fs->fs_ncg; ++i) {
		fs->fs_cstotal.cs_ndir += fs->fs_cs(fs, i).cs_ndir;
		fs->fs_cstotal.cs_nbfree += fs->fs_cs(fs, i).cs_nbfree;
		fs->fs_cstotal.cs_nifree += fs->fs_cs(fs, i).cs_nifree;
		fs->fs_cstotal.cs_nffree += fs->fs_cs(fs, i).cs_nffree;
	}
	return (0);
}

/*
 * ufs_putsummaryinfo() stores all the cylinder group summary information.
 * This is only used when logging, but the file system may not
 * be logging at the time, e.g. a read-only mount to flush the log
 * may push the summary info out.
 */
int
ufs_putsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
	struct buf	b, *bp;	/* tmp buf */
	caddr_t		sip;	/* summary info */
	ssize_t		size;	/* bytes of summary info to write */
	daddr_t		frags;	/* frags of summary info to write */
	int		i;	/* `for' loop counter */
	int		error;	/* error */

	if (TRANS_ISERROR(ufsvfsp)) {
		return (EIO);
	}

	if ((fs->fs_si != FS_SI_BAD) || !ufsvfsp->vfs_nolog_si) {
		return (0);
	}

	bp = &b;
	bioinit(bp);
	bp->b_iodone = ufs_si_io_done;
	bp->b_bufsize = size = fs->fs_bsize;
	bp->b_flags = B_WRITE;
	bp->b_un.b_addr = kmem_alloc(size, KM_SLEEP);
	bp->b_edev = dev;
	frags = howmany(fs->fs_cssize, fs->fs_fsize);
	sip = (caddr_t)fs->fs_u.fs_csp;

	/* Write summary info one fs block at a time */
	for (error = 0, i = 0; (i < frags) && (error == 0); i += fs->fs_frag) {
		if (i + fs->fs_frag > frags) {
			/*
			 * This happens only on the last iteration, so
			 * don't worry about size being reset
			 */
			size = (frags - i) * fs->fs_fsize;
		}
		bcopy(sip, bp->b_un.b_addr, size);
		bp->b_blkno = (daddr_t)fsbtodb(fs, fs->fs_csaddr+i);
		bp->b_bcount = size;
		(void) bdev_strategy(bp);
		sema_p(&bp->b_io);	/* wait for write to complete */
		error = geterror(bp);
		sip += size;
	}
	kmem_free(bp->b_un.b_addr, fs->fs_bsize);
	if (!error) {
		fs->fs_si = FS_SI_OK;
	}
	return (error);
}

/*
 * Decide whether it is okay to remove within a sticky directory.
 * Two conditions need to be met:  write access to the directory
 * is needed.  In sticky directories, write access is not sufficient;
 * you can remove entries from a directory only if you own the directory,
 * if you are privileged, if you own the entry or if the entry is
 * a plain file and you have write access to that file.
 * Function returns 0 if remove access is granted.
 * Note, the caller is responsible for holding the i_contents lock
 * at least as reader on the inquired inode 'ip'.
 */
int
ufs_sticky_remove_access(struct inode *dp, struct inode *ip, struct cred *cr)
{
	uid_t uid;

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	if ((dp->i_mode & ISVTX) &&
	    (uid = crgetuid(cr)) != dp->i_uid &&
	    uid != ip->i_uid &&
	    ((ip->i_mode & IFMT) != IFREG ||
	    ufs_iaccess(ip, IWRITE, cr, 0) != 0))
		return (secpolicy_vnode_remove(cr));

	return (0);
}
#endif	/* _KERNEL */

extern	int around[9];
extern	int inside[9];
extern	uchar_t *fragtbl[];

/*
 * Update the frsum fields to reflect addition or deletion
 * of some frags.
 */
void
fragacct(struct fs *fs, int fragmap, int32_t *fraglist, int cnt)
{
	int inblk;
	int field, subfield;
	int siz, pos;

	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
	fragmap <<= 1;
	for (siz = 1; siz < fs->fs_frag; siz++) {
		if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
			continue;
		field = around[siz];
		subfield = inside[siz];
		for (pos = siz; pos <= fs->fs_frag; pos++) {
			if ((fragmap & field) == subfield) {
				fraglist[siz] += cnt;
				ASSERT(fraglist[siz] >= 0);
				pos += siz;
				field <<= siz;
				subfield <<= siz;
			}
			field <<= 1;
			subfield <<= 1;
		}
	}
}

/*
 * Block operations
 */

/*
 * Check if a block is available
 */
int
isblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	uchar_t mask;

	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		return (cp[h] == 0xff);
	case 4:
		mask = 0x0f << ((h & 0x1) << 2);
		return ((cp[h >> 1] & mask) == mask);
	case 2:
		mask = 0x03 << ((h & 0x3) << 1);
		return ((cp[h >> 2] & mask) == mask);
	case 1:
		mask = 0x01 << (h & 0x7);
		return ((cp[h >> 3] & mask) == mask);
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "isblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		return (0);
	}
}

/*
 * Take a block out of the map
 */
void
clrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		cp[h] = 0;
		return;
	case 4:
		cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
		return;
	case 2:
		cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
		return;
	case 1:
		cp[h >> 3] &= ~(0x01 << (h & 0x7));
		return;
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "clrblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		return;
	}
}

/*
 * Is block allocated?
 */
int
isclrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	uchar_t	mask;
	int	frag;
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	frag = fs->fs_frag;
	ASSERT(frag == 8 || frag == 4 || frag == 2 || frag == 1);
	switch (frag) {
	case 8:
		return (cp[h] == 0);
	case 4:
		mask = ~(0x0f << ((h & 0x1) << 2));
		return (cp[h >> 1] == (cp[h >> 1] & mask));
	case 2:
		mask = ~(0x03 << ((h & 0x3) << 1));
		return (cp[h >> 2] == (cp[h >> 2] & mask));
	case 1:
		mask = ~(0x01 << (h & 0x7));
		return (cp[h >> 3] == (cp[h >> 3] & mask));
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "isclrblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		break;
	}
	return (0);
}

/*
 * Put a block into the map
 */
void
setblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		cp[h] = 0xff;
		return;
	case 4:
		cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
		return;
	case 2:
		cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
		return;
	case 1:
		cp[h >> 3] |= (0x01 << (h & 0x7));
		return;
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "setblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		return;
	}
}

/*
 * Skip over bytes equal to 'c', starting at 'cp', for at most 'len' bytes.
 * Returns the count of bytes remaining from the first non-matching byte,
 * or 0 if all 'len' bytes matched.
 */
int
skpc(char c, uint_t len, char *cp)
{
	if (len == 0)
		return (0);
	while (*cp++ == c && --len)
		;
	return (len);
}