/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */


#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/fs/ufs_fs.h>
#include <sys/cmn_err.h>

#ifdef _KERNEL

#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/debug.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/kmem.h>
#include <sys/policy.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/pvn.h>
#include <vm/seg_map.h>
#include <sys/swap.h>
#include <vm/seg_kmem.h>

#else  /* _KERNEL */

#define	ASSERT(x)	/* don't use asserts for fsck et al */

#endif /* _KERNEL */

#ifdef _KERNEL

/*
 * Used to verify that a given entry on the ufs_instances list (see below)
 * still refers to a mounted file system.
 *
 * XXX:	This is a crock that substitutes for proper locking to coordinate
 *	updates to and uses of the entries in ufs_instances.
 */
struct check_node {
        struct vfs *vfsp;
        struct ufsvfs *ufsvfs;
        dev_t vfs_dev;
};

static vfs_t *still_mounted(struct check_node *);

/*
 * All ufs file system instances are linked together into a list starting at
 * ufs_instances.  The list is updated as part of mount and unmount.  It's
 * consulted in ufs_update, to allow syncing out all ufs file system instances
 * in a batch.
 *
 * ufsvfs_mutex guards access to this list and to the {,old}ufsvfslist
 * manipulated in ufs_funmount_cleanup.  (A given ufs instance is always on
 * exactly one of these lists except while it's being allocated or
 * deallocated.)
 */
struct ufsvfs	*ufs_instances;
extern kmutex_t	ufsvfs_mutex;	/* XXX: move this to ufs_inode.h? */

/*
 * ufsvfs list manipulation routines
 */

/*
 * Link ufsp in at the head of the list of ufs_instances.
 */
void
ufs_vfs_add(struct ufsvfs *ufsp)
{
        mutex_enter(&ufsvfs_mutex);
        ufsp->vfs_next = ufs_instances;
        ufs_instances = ufsp;
        mutex_exit(&ufsvfs_mutex);
}

/*
 * Remove ufsp from the list of ufs_instances.
 *
 * Does no error checking; ufsp is assumed to actually be on the list.
 */
void
ufs_vfs_remove(struct ufsvfs *ufsp)
{
        struct ufsvfs **delpt = &ufs_instances;

        mutex_enter(&ufsvfs_mutex);
        for (; *delpt != NULL; delpt = &((*delpt)->vfs_next)) {
                if (*delpt == ufsp) {
                        *delpt = ufsp->vfs_next;
                        ufsp->vfs_next = NULL;
                        break;
                }
        }
        mutex_exit(&ufsvfs_mutex);
}

/*
 * Clean up state resulting from a forcible unmount that couldn't be handled
 * directly during the unmount.  (See commentary in the unmount code for more
 * info.)
 */
static void
ufs_funmount_cleanup()
{
        struct ufsvfs *ufsvfsp;
        extern struct ufsvfs *oldufsvfslist, *ufsvfslist;

        /*
         * Assumption: it's now safe to blow away the entries on
         * oldufsvfslist.
         */
        mutex_enter(&ufsvfs_mutex);
        while ((ufsvfsp = oldufsvfslist) != NULL) {
                oldufsvfslist = ufsvfsp->vfs_next;

                mutex_destroy(&ufsvfsp->vfs_lock);
                kmem_free(ufsvfsp, sizeof (struct ufsvfs));
        }
        /*
         * Rotate more recent unmount entries into place in preparation for
         * the next time around.
         */
        oldufsvfslist = ufsvfslist;
        ufsvfslist = NULL;
        mutex_exit(&ufsvfs_mutex);
}


/*
 * ufs_update performs the ufs part of `sync'.  It goes through the disk
 * queues to initiate sandbagged IO; goes through the inodes to write
 * modified nodes; and it goes through the mount table to initiate
 * the writing of the modified super blocks.
 */
extern time_t	time;
time_t		ufs_sync_time;
time_t		ufs_sync_time_secs = 1;

extern kmutex_t	ufs_scan_lock;

void
ufs_update(int flag)
{
        struct vfs *vfsp;
        struct fs *fs;
        struct ufsvfs *ufsp;
        struct ufsvfs *ufsnext;
        struct ufsvfs *update_list = NULL;
        int check_cnt = 0;
        size_t check_size;
        struct check_node *check_list, *ptr;
        int cheap = flag & SYNC_ATTR;

        /*
         * This is a hack.  A design flaw in the forced unmount protocol
         * could allow a thread to attempt to use a kmem_freed ufsvfs
         * structure in ufs_lockfs_begin/ufs_check_lockfs.  This window
         * is difficult to hit, even during the lockfs stress tests.
         * So the hacky fix is to wait awhile before kmem_free'ing the
         * ufsvfs structures for forcibly unmounted file systems.  `Awhile'
         * is defined as every other call from fsflush (~60 seconds).
         */
        if (cheap)
                ufs_funmount_cleanup();

        /*
         * Examine all ufsvfs structures and add those that we can lock to the
         * update list.  This is so that we don't hold the list lock for a
         * long time.  If vfs_lock fails for a file system instance, then skip
         * it because somebody is doing an unmount on it.
         */
        mutex_enter(&ufsvfs_mutex);
        for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
                vfsp = ufsp->vfs_vfs;
                if (vfs_lock(vfsp) != 0)
                        continue;
                ufsp->vfs_wnext = update_list;
                update_list = ufsp;
                check_cnt++;
        }
        mutex_exit(&ufsvfs_mutex);

        if (update_list == NULL)
                return;

        check_size = sizeof (struct check_node) * check_cnt;
        check_list = ptr = kmem_alloc(check_size, KM_NOSLEEP);

        /*
         * Write back modified superblocks.
         * Consistency check that the superblock of
         * each file system is still in the buffer cache.
         *
         * Note that the update_list traversal is done without the protection
         * of an overall list lock, so it's necessary to rely on the fact that
         * each entry of the list is vfs_locked when moving from one entry to
         * the next.  This works because a concurrent attempt to add an entry
         * to another thread's update_list won't find it, since it'll already
         * be locked.
         */
        check_cnt = 0;
        for (ufsp = update_list; ufsp != NULL; ufsp = ufsnext) {
                /*
                 * Need to grab the next ptr before we unlock this one so
                 * another thread doesn't grab it and change it before we move
                 * on to the next vfs.  (Once we unlock it, it's ok if another
                 * thread finds it to add it to its own update_list; we don't
                 * attempt to refer to it through our list any more.)
                 */
                ufsnext = ufsp->vfs_wnext;
                vfsp = ufsp->vfs_vfs;

                /*
                 * Seems like this can't happen, so perhaps it should become
                 * an ASSERT(vfsp->vfs_data != NULL).
                 */
                if (!vfsp->vfs_data) {
                        vfs_unlock(vfsp);
                        continue;
                }

                fs = ufsp->vfs_fs;

                /*
                 * don't update a locked superblock during a panic; it
                 * may be in an inconsistent state
                 */
                if (panicstr) {
                        if (!mutex_tryenter(&ufsp->vfs_lock)) {
                                vfs_unlock(vfsp);
                                continue;
                        }
                } else
                        mutex_enter(&ufsp->vfs_lock);
                /*
                 * Build up the STABLE check list, so we can unlock the vfs
                 * until we do the actual checking.
                 */
                if (check_list != NULL) {
                        if ((fs->fs_ronly == 0) &&
                            (fs->fs_clean != FSBAD) &&
                            (fs->fs_clean != FSSUSPEND)) {
                                ptr->vfsp = vfsp;
                                ptr->ufsvfs = ufsp;
                                ptr->vfs_dev = vfsp->vfs_dev;
                                ptr++;
                                check_cnt++;
                        }
                }

                /*
                 * superblock is not modified
                 */
                if (fs->fs_fmod == 0) {
                        mutex_exit(&ufsp->vfs_lock);
                        vfs_unlock(vfsp);
                        continue;
                }
                if (fs->fs_ronly != 0) {
                        mutex_exit(&ufsp->vfs_lock);
                        vfs_unlock(vfsp);
                        (void) ufs_fault(ufsp->vfs_root,
                            "fs = %s update: ro fs mod\n", fs->fs_fsmnt);
                        /*
                         * XXX:	Why is this a return instead of a continue?
                         *	This may be an attempt to replace a panic with
                         *	something less drastic, but there's cleanup we
                         *	should be doing that's not being done (e.g.,
                         *	unlocking the remaining entries on the list).
                         */
                        return;
                }
                fs->fs_fmod = 0;
                mutex_exit(&ufsp->vfs_lock);
                TRANS_SBUPDATE(ufsp, vfsp, TOP_SBUPDATE_UPDATE);
                vfs_unlock(vfsp);
        }

        ufs_sync_time = time;

        /*
         * Avoid racing with ufs_unmount() and ufs_sync().
         */
        mutex_enter(&ufs_scan_lock);

        (void) ufs_scan_inodes(1, ufs_sync_inode, (void *)(uintptr_t)cheap,
            NULL);

        mutex_exit(&ufs_scan_lock);

        /*
         * Force stale buffer cache information to be flushed,
         * for all devices.  This should cause any remaining control
         * information (e.g., cg and inode info) to be flushed back.
         */
        bflush((dev_t)NODEV);

        if (check_list == NULL)
                return;

        /*
         * For each UFS filesystem in the STABLE check_list, update
         * the clean flag if warranted.
         */
        for (ptr = check_list; check_cnt > 0; check_cnt--, ptr++) {
                int error;

                /*
                 * still_mounted() returns with vfsp and the vfs_reflock
                 * held if ptr refers to a vfs that is still mounted.
                 */
                if ((vfsp = still_mounted(ptr)) == NULL)
                        continue;
                ufs_checkclean(vfsp);
                /*
                 * commit any outstanding async transactions
                 */
                ufsp = (struct ufsvfs *)vfsp->vfs_data;
                curthread->t_flag |= T_DONTBLOCK;
                TRANS_BEGIN_SYNC(ufsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE,
                    error);
                if (!error) {
                        TRANS_END_SYNC(ufsp, error, TOP_COMMIT_UPDATE,
                            TOP_COMMIT_SIZE);
                }
                curthread->t_flag &= ~T_DONTBLOCK;

                vfs_unlock(vfsp);
        }

        kmem_free(check_list, check_size);
}

int
ufs_sync_inode(struct inode *ip, void *arg)
{
        int cheap = (int)(uintptr_t)arg;
        struct ufsvfs *ufsvfsp;
        uint_t flag = ip->i_flag;

        if (cheap && ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) == 0))
                return (0);

        /*
         * if we are panic'ing; then don't update the inode if this
         * file system is FSSTABLE.  Otherwise, we would have to
         * force the superblock to FSACTIVE and the superblock
         * may not be in a good state.  Also, if the inode is
         * IREF'ed then it may be in an inconsistent state.  Don't
         * push it.  Finally, don't push the inode if the fs is
         * logging; the transaction will be discarded at boot.
         */
        if (panicstr) {

                if (flag & IREF)
                        return (0);

                if (ip->i_ufsvfs == NULL ||
                    (ip->i_fs->fs_clean == FSSTABLE ||
                    ip->i_fs->fs_clean == FSLOG))
                        return (0);
        }

        ufsvfsp = ip->i_ufsvfs;

        /*
         * Limit access time only updates
         */
        if (((flag & (IMOD|IMODACC|IUPD|ICHG|IACC)) == IMODACC) && ufsvfsp) {
                /*
                 * if file system has deferred access time turned on and there
                 * was no IO recently, don't bother flushing it.  It will be
                 * flushed when I/Os start again.
                 */
                if (cheap && (ufsvfsp->vfs_dfritime & UFS_DFRATIME) &&
                    (ufsvfsp->vfs_iotstamp + ufs_iowait < lbolt))
                        return (0);
                /*
                 * an app issuing a sync() can take forever on a trans device
                 * when NetWorker or find is running because all of the
                 * directories' access times have to be updated.  So, we limit
                 * the time we spend updating access times per sync.
                 */
                if (TRANS_ISTRANS(ufsvfsp) && ((ufs_sync_time +
                    ufs_sync_time_secs) < time))
                        return (0);
        }

        /*
         * if we are running on behalf of the flush thread or this is
         * a swap file, then simply do a delayed update of the inode.
         * Otherwise, push the pages and then do a delayed inode update.
         */
        if (cheap || IS_SWAPVP(ITOV(ip))) {
                TRANS_IUPDAT(ip, 0);
        } else {
                (void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_SYNC);
        }
        return (0);
}

/*
 * Flush all the pages associated with an inode using the given 'flags',
 * then force inode information to be written back using the given 'waitfor'.
 */
int
ufs_syncip(struct inode *ip, int flags, int waitfor, top_t topid)
{
        int error;
        struct vnode *vp = ITOV(ip);
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
        int dotrans = 0;

        /*
         * Return if file system has been forcibly umounted.
         */
        if (ufsvfsp == NULL)
                return (EIO);
        /*
         * don't need to VOP_PUTPAGE if there are no pages
         */
        if (!vn_has_cached_data(vp) || vp->v_type == VCHR) {
                error = 0;
        } else {
                /*
                 * if the inode we're working on is a shadow inode
                 * or quota inode we need to make sure that the
                 * ufs_putpage call is inside a transaction as this
                 * could include meta data changes.
                 */
                if ((ip->i_mode & IFMT) == IFSHAD ||
                    ufsvfsp->vfs_qinod == ip) {
                        dotrans = 1;
                        curthread->t_flag |= T_DONTBLOCK;
                        TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE,
                            TOP_PUTPAGE_SIZE(ip));
                }
                error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, flags, CRED());
                if (dotrans) {
                        TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE,
                            TOP_PUTPAGE_SIZE(ip));
                        curthread->t_flag &= ~T_DONTBLOCK;
                        dotrans = 0;
                }
        }
        if (panicstr && TRANS_ISTRANS(ufsvfsp))
                goto out;
        /*
         * waitfor represents two things -
         * 1. whether data sync or file sync.
         * 2. if file sync then ufs_iupdat should 'waitfor' disk i/o or not.
         */
        if (waitfor == I_DSYNC) {
                /*
                 * If data sync, only IATTCHG (size/block change) requires
                 * inode update, fdatasync()/FDSYNC implementation.
                 */
                if (ip->i_flag & (IBDWRITE|IATTCHG)) {
                        /*
                         * Enter a transaction to provide mutual exclusion
                         * with deltamap_push and avoid a race where
                         * the inode flush could get dropped.
                         */
                        if ((curthread->t_flag & T_DONTBLOCK) == 0) {
                                dotrans = 1;
                                curthread->t_flag |= T_DONTBLOCK;
                                TRANS_BEGIN_ASYNC(ufsvfsp, topid,
                                    TOP_SYNCIP_SIZE);
                        }
                        rw_enter(&ip->i_contents, RW_READER);
                        mutex_enter(&ip->i_tlock);
                        ip->i_flag &= ~IMODTIME;
                        mutex_exit(&ip->i_tlock);
                        ufs_iupdat(ip, 1);
                        rw_exit(&ip->i_contents);
                        if (dotrans) {
                                TRANS_END_ASYNC(ufsvfsp, topid,
                                    TOP_SYNCIP_SIZE);
                                curthread->t_flag &= ~T_DONTBLOCK;
                        }
                }
        } else {
                /* For file sync, any inode change requires inode update */
                if (ip->i_flag & (IBDWRITE|IUPD|IACC|ICHG|IMOD|IMODACC)) {
                        /*
                         * Enter a transaction to provide mutual exclusion
                         * with deltamap_push and avoid a race where
                         * the inode flush could get dropped.
                         */
                        if ((curthread->t_flag & T_DONTBLOCK) == 0) {
                                dotrans = 1;
                                curthread->t_flag |= T_DONTBLOCK;
                                TRANS_BEGIN_ASYNC(ufsvfsp, topid,
                                    TOP_SYNCIP_SIZE);
                        }
                        rw_enter(&ip->i_contents, RW_READER);
                        mutex_enter(&ip->i_tlock);
                        ip->i_flag &= ~IMODTIME;
                        mutex_exit(&ip->i_tlock);
                        ufs_iupdat(ip, waitfor);
                        rw_exit(&ip->i_contents);
                        if (dotrans) {
                                TRANS_END_ASYNC(ufsvfsp, topid,
                                    TOP_SYNCIP_SIZE);
                                curthread->t_flag &= ~T_DONTBLOCK;
                        }
                }
        }

out:
        return (error);
}
/*
 * Flush all indirect blocks related to an inode.
 * Supports triple indirect blocks also.
 */
int
ufs_sync_indir(struct inode *ip)
{
        int i;
        daddr_t blkno;
        daddr_t lbn;	/* logical blkno of last blk in file */
        daddr_t clbn;	/* current logical blk */
        daddr32_t *bap;
        struct fs *fs;
        struct buf *bp;
        int bsize;
        struct ufsvfs *ufsvfsp;
        int j;
        daddr_t indirect_blkno;
        daddr32_t *indirect_bap;
        struct buf *indirect_bp;

        ufsvfsp = ip->i_ufsvfs;
        /*
         * unnecessary when logging; allocation blocks are kept up-to-date
         */
        if (TRANS_ISTRANS(ufsvfsp))
                return (0);

        fs = ufsvfsp->vfs_fs;
        bsize = fs->fs_bsize;
        lbn = (daddr_t)lblkno(fs, ip->i_size - 1);
        if (lbn < NDADDR)
                return (0);	/* No indirect blocks used */
        if (lbn < NDADDR + NINDIR(fs)) {
                /* File has one indirect block. */
                blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, ip->i_ib[0]));
                return (0);
        }

        /* Write out all the first level indirect blocks */
        for (i = 0; i < NIADDR; i++) {
                if ((blkno = ip->i_ib[i]) == 0)
                        continue;
                blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
        }
        /* Write out second level of indirect blocks */
        if ((blkno = ip->i_ib[1]) == 0)
                return (0);
        bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
        if (bp->b_flags & B_ERROR) {
                brelse(bp);
                return (EIO);
        }
        bap = bp->b_un.b_daddr;
        clbn = NDADDR + NINDIR(fs);
        for (i = 0; i < NINDIR(fs); i++) {
                if (clbn > lbn)
                        break;
                clbn += NINDIR(fs);
                if ((blkno = bap[i]) == 0)
                        continue;
                blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
        }

        brelse(bp);
        /* write out third level indirect blocks */

        if ((blkno = ip->i_ib[2]) == 0)
                return (0);

        bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
        if (bp->b_flags & B_ERROR) {
                brelse(bp);
                return (EIO);
        }
        bap = bp->b_un.b_daddr;
        clbn = NDADDR + NINDIR(fs) + (NINDIR(fs) * NINDIR(fs));

        for (i = 0; i < NINDIR(fs); i++) {
                if (clbn > lbn)
                        break;
                if ((indirect_blkno = bap[i]) == 0)
                        continue;
                blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, indirect_blkno));
                indirect_bp = UFS_BREAD(ufsvfsp, ip->i_dev,
                    (daddr_t)fsbtodb(fs, indirect_blkno), bsize);
                if (indirect_bp->b_flags & B_ERROR) {
                        brelse(indirect_bp);
                        brelse(bp);
                        return (EIO);
                }
                indirect_bap = indirect_bp->b_un.b_daddr;
                for (j = 0; j < NINDIR(fs); j++) {
                        if (clbn > lbn)
                                break;
                        clbn += NINDIR(fs);
                        if ((blkno = indirect_bap[j]) == 0)
                                continue;
                        blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
                }
                brelse(indirect_bp);
        }
        brelse(bp);

        return (0);
}

/*
 * Flush all indirect blocks related to an offset of a file.
 * read/write in sync mode may have to flush indirect blocks.
 */
int
ufs_indirblk_sync(struct inode *ip, offset_t off)
{
        daddr_t lbn;
        struct fs *fs;
        struct buf *bp;
        int i, j, shft;
        daddr_t ob, nb, tbn;
        daddr32_t *bap;
        int nindirshift, nindiroffset;
        struct ufsvfs *ufsvfsp;

        ufsvfsp = ip->i_ufsvfs;
        /*
         * unnecessary when logging; allocation blocks are kept up-to-date
         */
        if (TRANS_ISTRANS(ufsvfsp))
                return (0);

        fs = ufsvfsp->vfs_fs;

        lbn = (daddr_t)lblkno(fs, off);
        if (lbn < 0)
                return (EFBIG);

        /* The first NDADDR are direct so nothing to do */
        if (lbn < NDADDR)
                return (0);

        nindirshift = ip->i_ufsvfs->vfs_nindirshift;
        nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;

        /* Determine level of indirect blocks */
        shft = 0;
        tbn = lbn - NDADDR;
        for (j = NIADDR; j > 0; j--) {
                longlong_t sh;

                shft += nindirshift;
                sh = 1LL << shft;
                if (tbn < sh)
                        break;
                tbn -= (daddr_t)sh;
        }

        if (j == 0)
                return (EFBIG);

        if ((nb = ip->i_ib[NIADDR - j]) == 0)
                return (0);	/* UFS Hole */

        /* Flush first level indirect block */
        blkflush(ip->i_dev, fsbtodb(fs, nb));

        /* Fetch through next levels */
        for (; j < NIADDR; j++) {
                ob = nb;
                bp = UFS_BREAD(ufsvfsp,
                    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
                if (bp->b_flags & B_ERROR) {
                        brelse(bp);
                        return (EIO);
                }
                bap = bp->b_un.b_daddr;
                shft -= nindirshift;			/* sh / nindir */
                i = (tbn >> shft) & nindiroffset;	/* (tbn / sh) & nindir */
                nb = bap[i];
                brelse(bp);
                if (nb == 0) {
                        return (0); 	/* UFS hole */
                }
                blkflush(ip->i_dev, fsbtodb(fs, nb));
        }
        return (0);
}

#ifdef DEBUG

/*
 * The bad block checking routines: ufs_indir_badblock() and ufs_badblock()
 * are very expensive.  It's been found from profiling that we're
 * spending 6-7% of our time in ufs_badblock, and another 1-2% in
 * ufs_indir_badblock.  They are only called via ASSERTs (from debug kernels).
 * In addition from experience no failures have been found in recent
 * years.  So the following tunable can be set to enable checking.
 */
int ufs_badblock_checks = 0;

/*
 * Check that a given indirect block contains blocks in range
 */
int
ufs_indir_badblock(struct inode *ip, daddr32_t *bap)
{
        int i;
        int err = 0;

        if (ufs_badblock_checks) {
                for (i = 0; i < NINDIR(ip->i_fs) - 1; i++)
                        if (bap[i] != 0 && (err = ufs_badblock(ip, bap[i])))
                                break;
        }
        return (err);
}

/*
 * Check that a specified block number is in range.
 */
int
ufs_badblock(struct inode *ip, daddr_t bn)
{
        long c;
        daddr_t sum;

        if (!ufs_badblock_checks)
                return (0);
        ASSERT(bn);
        if (bn <= 0 || bn > ip->i_fs->fs_size)
                return (bn);

        sum = 0;
        c = dtog(ip->i_fs, bn);
        if (c == 0) {
                sum = howmany(ip->i_fs->fs_cssize, ip->i_fs->fs_fsize);
        }
        /*
         * if block no. is below this cylinder group,
         * within the space reserved for superblock, inodes, (summary data)
         * or if it is above this cylinder group
         * then it's invalid
         * It's hard to see how we'd be outside this cyl, but let's be careful.
         */
        if ((bn < cgbase(ip->i_fs, c)) ||
            (bn >= cgsblock(ip->i_fs, c) && bn < cgdmin(ip->i_fs, c) + sum) ||
            (bn >= (unsigned)cgbase(ip->i_fs, c + 1)))
                return (bn);

        return (0);	/* not a bad block */
}

#endif /* DEBUG */

/*
 * When i_rwlock is write-locked or has a writer pended, then the inode
 * is going to change in a way that the filesystem will be marked as
 * active. So no need to let the filesystem be marked as stable now.
 * Also to ensure the filesystem consistency during the directory
 * operations, filesystem cannot be marked as stable if i_rwlock of
 * the directory inode is write-locked.
 */

/*
 * Check for busy inodes for this filesystem.
 * NOTE: Needs better way to do this expensive operation in the future.
 */
static void
ufs_icheck(struct ufsvfs *ufsvfsp, int *isbusyp, int *isreclaimp)
{
        union ihead	*ih;
        struct inode	*ip;
        int		i;
        int		isnottrans	= !TRANS_ISTRANS(ufsvfsp);
        int		isbusy		= *isbusyp;
        int		isreclaim	= *isreclaimp;

        for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
                mutex_enter(&ih_lock[i]);
                for (ip = ih->ih_chain[0];
                    ip != (struct inode *)ih;
                    ip = ip->i_forw) {
                        /*
                         * if inode is busy/modified/deleted, filesystem is busy
                         */
                        if (ip->i_ufsvfs != ufsvfsp)
                                continue;
                        if ((ip->i_flag & (IMOD | IUPD | ICHG)) ||
                            (RW_ISWRITER(&ip->i_rwlock)))
                                isbusy = 1;
                        if ((ip->i_nlink <= 0) && (ip->i_flag & IREF))
                                isreclaim = 1;
                        if (isbusy && (isreclaim || isnottrans))
                                break;
                }
                mutex_exit(&ih_lock[i]);
                if (isbusy && (isreclaim || isnottrans))
                        break;
        }
        *isbusyp = isbusy;
        *isreclaimp = isreclaim;
}

/*
 * As part of the ufs 'sync' operation, this routine is called to mark
 * the filesystem as STABLE if there is no modified metadata in memory.
 */
void
ufs_checkclean(struct vfs *vfsp)
{
        struct ufsvfs	*ufsvfsp	= (struct ufsvfs *)vfsp->vfs_data;
        struct fs	*fs		= ufsvfsp->vfs_fs;
        int		isbusy;
        int		isreclaim;
        int		updatesb;

        ASSERT(vfs_lock_held(vfsp));

        /*
         * filesystem is stable or cleanflag processing is disabled; do nothing
         * no transitions when panic'ing
         */
        if (fs->fs_ronly ||
            fs->fs_clean == FSBAD ||
            fs->fs_clean == FSSUSPEND ||
            fs->fs_clean == FSSTABLE ||
            panicstr)
                return;

        /*
         * if logging and nothing to reclaim; do nothing
         */
        if ((fs->fs_clean == FSLOG) &&
            (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
            (fs->fs_reclaim & FS_RECLAIMING)))
                return;

        /*
         * FS_CHECKCLEAN is reset if the file system goes dirty
         * FS_CHECKRECLAIM is reset if a file gets deleted
         */
        mutex_enter(&ufsvfsp->vfs_lock);
        fs->fs_reclaim |= (FS_CHECKCLEAN | FS_CHECKRECLAIM);
        mutex_exit(&ufsvfsp->vfs_lock);

        updatesb = 0;

        /*
         * if logging or buffers are busy; do nothing
         */
        isbusy = isreclaim = 0;
        if ((fs->fs_clean == FSLOG) ||
            (bcheck(vfsp->vfs_dev, ufsvfsp->vfs_bufp)))
                isbusy = 1;

        /*
         * isreclaim == TRUE means can't change the state of fs_reclaim
         */
        isreclaim =
            ((fs->fs_clean == FSLOG) &&
            (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
            (fs->fs_reclaim & FS_RECLAIMING)));

        /*
         * if fs is busy or can't change the state of fs_reclaim; do nothing
         */
        if (isbusy && isreclaim)
                return;

        /*
         * look for busy or deleted inodes; (deleted == needs reclaim)
         */
        ufs_icheck(ufsvfsp, &isbusy, &isreclaim);

        mutex_enter(&ufsvfsp->vfs_lock);

        /*
         * IF POSSIBLE, RESET RECLAIM
         */
        /*
         * the reclaim thread is not running
         */
        if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
                /*
                 * no files were deleted during the scan
                 */
                if (fs->fs_reclaim & FS_CHECKRECLAIM)
                        /*
                         * no deleted files were found in the inode cache
                         */
                        if ((isreclaim == 0) && (fs->fs_reclaim & FS_RECLAIM)) {
                                fs->fs_reclaim &= ~FS_RECLAIM;
                                updatesb = 1;
                        }
        /*
         * IF POSSIBLE, SET STABLE
         */
        /*
         * not logging
         */
        if (fs->fs_clean != FSLOG)
                /*
                 * file system has not gone dirty since the scan began
                 */
                if (fs->fs_reclaim & FS_CHECKCLEAN)
                        /*
                         * nothing dirty was found in the buffer or inode cache
                         */
                        if ((isbusy == 0) && (isreclaim == 0) &&
                            (fs->fs_clean != FSSTABLE)) {
                                fs->fs_clean = FSSTABLE;
                                updatesb = 1;
                        }

        mutex_exit(&ufsvfsp->vfs_lock);
        if (updatesb) {
                TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
        }
}

/*
 * called whenever an unlink occurs
 */
void
ufs_setreclaim(struct inode *ip)
{
        struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
        struct fs	*fs		= ufsvfsp->vfs_fs;

        if (ip->i_nlink || fs->fs_ronly || (fs->fs_clean != FSLOG))
                return;

        /*
         * reclaim-needed bit is already set or we need to tell
         * ufs_checkclean that a file has been deleted
         */
        if ((fs->fs_reclaim & (FS_RECLAIM | FS_CHECKRECLAIM)) == FS_RECLAIM)
                return;

        mutex_enter(&ufsvfsp->vfs_lock);
        /*
         * inform ufs_checkclean that the file system has gone dirty
         */
        fs->fs_reclaim &= ~FS_CHECKRECLAIM;

        /*
         * set the reclaim-needed bit
         */
        if ((fs->fs_reclaim & FS_RECLAIM) == 0) {
                fs->fs_reclaim |= FS_RECLAIM;
                ufs_sbwrite(ufsvfsp);
        }
        mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * Before any modified metadata is written back to the disk, this routine
 * is called to mark the filesystem as ACTIVE.
 */
void
ufs_notclean(struct ufsvfs *ufsvfsp)
{
        struct fs *fs = ufsvfsp->vfs_fs;

        ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
        ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));

        /*
         * inform ufs_checkclean that the file system has gone dirty
         */
        fs->fs_reclaim &= ~FS_CHECKCLEAN;

        /*
         * ignore if active or bad or suspended or readonly or logging
         */
        if ((fs->fs_clean == FSACTIVE) || (fs->fs_clean == FSLOG) ||
            (fs->fs_clean == FSBAD) || (fs->fs_clean == FSSUSPEND) ||
            (fs->fs_ronly)) {
                mutex_exit(&ufsvfsp->vfs_lock);
                return;
        }
        fs->fs_clean = FSACTIVE;
        /*
         * write superblock synchronously
         */
        ufs_sbwrite(ufsvfsp);
        mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * ufs specific fbwrite()
 */
int
ufs_fbwrite(struct fbuf *fbp, struct inode *ip)
{
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;

        if (TRANS_ISTRANS(ufsvfsp))
                return (fbwrite(fbp));
        mutex_enter(&ufsvfsp->vfs_lock);
        ufs_notclean(ufsvfsp);
        return ((ufsvfsp->vfs_dio) ? fbdwrite(fbp) : fbwrite(fbp));
}

/*
 * ufs specific fbiwrite()
 */
int
ufs_fbiwrite(struct fbuf *fbp, struct inode *ip, daddr_t bn, long bsize)
{
        struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
        o_mode_t	ifmt		= ip->i_mode & IFMT;
        buf_t		*bp;
        int		error;

        mutex_enter(&ufsvfsp->vfs_lock);
        ufs_notclean(ufsvfsp);
        if (ifmt == IFDIR || ifmt == IFSHAD || ifmt == IFATTRDIR ||
            (ip->i_ufsvfs->vfs_qinod == ip)) {
                TRANS_DELTA(ufsvfsp, ldbtob(bn * (offset_t)(btod(bsize))),
                    fbp->fb_count, DT_FBI, 0, 0);
        }
        /*
         * Inlined version of fbiwrite()
         */
        bp = pageio_setup((struct page *)NULL, fbp->fb_count,
            ip->i_devvp, B_WRITE);
        bp->b_flags &= ~B_PAGEIO;
        bp->b_un.b_addr = fbp->fb_addr;

        bp->b_blkno = bn * btod(bsize);
        bp->b_dev = cmpdev(ip->i_dev);	/* store in old dev format */
        bp->b_edev = ip->i_dev;
        bp->b_proc = NULL;		/* i.e. the kernel */
        bp->b_file = ip->i_vnode;
        bp->b_offset = -1;

        if (ufsvfsp->vfs_log) {
                lufs_write_strategy(ufsvfsp->vfs_log, bp);
        } else if (ufsvfsp->vfs_snapshot) {
                fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
        } else {
                ufsvfsp->vfs_iotstamp = lbolt;
                ub.ub_fbiwrites.value.ul++;
                (void) bdev_strategy(bp);
                lwp_stat_update(LWP_STAT_OUBLK, 1);
        }
        error = biowait(bp);
        pageio_done(bp);
        fbrelse(fbp, S_OTHER);
        return (error);
}

/*
 * Write the ufs superblock only.
 */
void
ufs_sbwrite(struct ufsvfs *ufsvfsp)
{
        char		sav_fs_fmod;
        struct fs	*fs	= ufsvfsp->vfs_fs;
        struct buf	*bp	= ufsvfsp->vfs_bufp;

        ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));

        /*
         * for ulockfs processing, limit the superblock writes
         */
        if ((ufsvfsp->vfs_ulockfs.ul_sbowner) &&
            (curthread != ufsvfsp->vfs_ulockfs.ul_sbowner)) {
                /* try again later */
                fs->fs_fmod = 1;
                return;
        }

        ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
        /*
         * update superblock timestamp and fs_clean checksum;
         * if marked FSBAD, we always want an erroneous
         * checksum to force repair
         */
        fs->fs_time = gethrestime_sec();
        fs->fs_state = (fs->fs_clean != FSBAD) ?
            FSOKAY - fs->fs_time : -(FSOKAY - fs->fs_time);
        switch (fs->fs_clean) {
        case FSCLEAN:
        case FSSTABLE:
                fs->fs_reclaim &= ~FS_RECLAIM;
                break;
        case FSACTIVE:
        case FSSUSPEND:
        case FSBAD:
        case FSLOG:
                break;
        default:
                fs->fs_clean = FSACTIVE;
                break;
        }
        /*
         * reset incore only bits
         */
        fs->fs_reclaim &= ~(FS_CHECKCLEAN | FS_CHECKRECLAIM);

        /*
         * delta the whole superblock
         */
        TRANS_DELTA(ufsvfsp, ldbtob(SBLOCK), sizeof (struct fs),
            DT_SB, NULL, 0);
        /*
         * retain the incore state of fs_fmod; set the ondisk state to 0
         */
        sav_fs_fmod = fs->fs_fmod;
        fs->fs_fmod = 0;

        /*
         * Don't release the buffer after it is written to the disk
         */
        UFS_BWRITE2(ufsvfsp, bp);
        fs->fs_fmod = sav_fs_fmod;	/* reset fs_fmod's incore state */
}

/*
 * Returns the vfs pointer if the vfs is still being mounted; the vfs lock
 * is held.  Otherwise, returns NULL.
 *
 * For our purposes, "still mounted" means that the file system still appears
 * on the list of UFS file system instances.
 */
static vfs_t *
still_mounted(struct check_node *checkp)
{
        struct vfs	*vfsp;
        struct ufsvfs	*ufsp;

        mutex_enter(&ufsvfs_mutex);
        for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
                if (ufsp != checkp->ufsvfs)
                        continue;
                /*
                 * Tentative match:  verify it and try to lock.  (It's not at
                 * all clear how the verification could fail, given that we've
                 * gotten this far.  We would have had to reallocate the
                 * ufsvfs struct at hand for a new incarnation; is that really
                 * possible in the interval from constructing the check_node
                 * to here?)
                 */
                vfsp = ufsp->vfs_vfs;
                if (vfsp != checkp->vfsp)
                        continue;
                if (vfsp->vfs_dev != checkp->vfs_dev)
                        continue;
                if (vfs_lock(vfsp) != 0)
                        continue;

                mutex_exit(&ufsvfs_mutex);
                return (vfsp);
        }
        mutex_exit(&ufsvfs_mutex);
        return (NULL);
}

/*
 * I/O completion routine for the summary-info buffers; wakes up the
 * thread waiting in sema_p() on b_io.
 */
int
ufs_si_io_done(struct buf *bp)
{
        sema_v(&bp->b_io);
        return (0);
}

#define	SI_BUFSZ roundup(sizeof (struct cg), DEV_BSIZE)
#define	NSIBUF 32

/*
 * ufs_construct_si()
 * Read each cylinder group in turn and construct the summary information
 */
static int
ufs_construct_si(dev_t dev, struct fs *fs, struct ufsvfs *ufsvfsp)
{
        buf_t *bps, *bp;
        char *bufs;
        struct csum *sip = fs->fs_u.fs_csp;
        struct cg *cgp;
        int i, ncg;
        int error = 0, cg = 0;

        bps = kmem_alloc(NSIBUF * sizeof (buf_t), KM_SLEEP);
        bufs = kmem_alloc(NSIBUF * SI_BUFSZ, KM_SLEEP);

        /*
         * Initialise the buffer headers
         */
        for (bp = bps, i = 0; i < NSIBUF; i++, bp++) {
                bioinit(bp);
                bp->b_iodone = ufs_si_io_done;
                bp->b_bufsize = bp->b_bcount = SI_BUFSZ;
                bp->b_flags = B_READ;
                bp->b_un.b_addr = bufs + (i * SI_BUFSZ);
                bp->b_edev = dev;
        }

        /*
         * Repeat while there are cylinder groups left to read.
         */
        do {
                /*
                 * Issue up to NSIBUF asynchronous reads
                 */
                ncg = MIN(NSIBUF, (fs->fs_ncg - cg));
                for (bp = bps, i = 0; i < ncg; i++, bp++) {
                        bp->b_blkno = (daddr_t)fsbtodb(fs, cgtod(fs, cg + i));
                        if (ufsvfsp->vfs_log) {
                                lufs_read_strategy(ufsvfsp->vfs_log, bp);
                        } else {
                                (void) bdev_strategy(bp);
                        }
                }

                /*
                 * wait for each read to finish;
                 * check for errors and copy the csum info
                 */
                for (bp = bps, i = 0; i < ncg; i++, bp++) {
                        sema_p(&bp->b_io);
                        if (!error) {
                                cgp = bp->b_un.b_cg;
                                sip[cg + i] = cgp->cg_cs;
                                error = geterror(bp);
                        }
                }
                if (error) {
                        goto err;
                }
                cg += ncg;
        } while (cg < fs->fs_ncg);

err:
        kmem_free(bps, NSIBUF * sizeof (buf_t));
        kmem_free(bufs, NSIBUF * SI_BUFSZ);
        return (error);
}

/*
 * ufs_getsummaryinfo
 */
int
ufs_getsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
        int		i;	/* `for' loop counter */
        ssize_t		size;	/* bytes of summary info to read */
        daddr_t		frags;	/* frags of summary info to read */
        caddr_t		sip;	/* summary info */
        struct buf	*tp;	/* tmp buf */

        /*
         * maintain metadata map for trans device (debug only)
         */
        TRANS_MATA_SI(ufsvfsp, fs);

        /*
         * Compute #frags and allocate space for summary info
         */
        frags = howmany(fs->fs_cssize, fs->fs_fsize);
        sip = kmem_alloc((size_t)fs->fs_cssize, KM_SLEEP);
        fs->fs_u.fs_csp = (struct csum *)sip;

        if (fs->fs_si == FS_SI_BAD) {
                /*
                 * The summary information is unknown, read it in from
                 * the cylinder groups.
                 */
                if (TRANS_ISTRANS(ufsvfsp) && !TRANS_ISERROR(ufsvfsp) &&
                    ufsvfsp->vfs_log->un_logmap) {
                        logmap_roll_dev(ufsvfsp->vfs_log); /* flush the log */
                }
                bzero(sip, (size_t)fs->fs_cssize);
                if (ufs_construct_si(dev, fs, ufsvfsp)) {
                        kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
                        fs->fs_u.fs_csp = NULL;
                        return (EIO);
                }
        } else {
                /* Read summary info a fs block at a time */
                size = fs->fs_bsize;
                for (i = 0; i < frags; i += fs->fs_frag) {
                        if (i + fs->fs_frag > frags)
                                /*
                                 * This happens only on the last iteration, so
                                 * don't worry about size being reset
                                 */
                                size = (frags - i) * fs->fs_fsize;
                        tp = UFS_BREAD(ufsvfsp, dev,
                            (daddr_t)fsbtodb(fs, fs->fs_csaddr + i), size);
                        tp->b_flags |= B_STALE | B_AGE;
                        if (tp->b_flags & B_ERROR) {
                                kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
                                fs->fs_u.fs_csp = NULL;
                                brelse(tp);
                                return (EIO);
                        }
                        bcopy(tp->b_un.b_addr, sip, size);
                        sip += size;
                        brelse(tp);
                }
        }
        bzero((caddr_t)&fs->fs_cstotal, sizeof (fs->fs_cstotal));
        for (i = 0; i < fs->fs_ncg; ++i) {
                fs->fs_cstotal.cs_ndir += fs->fs_cs(fs, i).cs_ndir;
                fs->fs_cstotal.cs_nbfree += fs->fs_cs(fs, i).cs_nbfree;
                fs->fs_cstotal.cs_nifree += fs->fs_cs(fs, i).cs_nifree;
                fs->fs_cstotal.cs_nffree += fs->fs_cs(fs, i).cs_nffree;
        }
        return (0);
}

/*
 * ufs_putsummaryinfo() stores all the cylinder group summary information.
 * This is only used when logging, but the file system may not
 * be logging at the time, e.g., a read-only mount to flush the log
 * may push the summary info out.
 */
int
ufs_putsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
        struct buf	b, *bp;	/* tmp buf */
        caddr_t		sip;	/* summary info */
        ssize_t		size;	/* bytes of summary info to write */
        daddr_t		frags;	/* frags of summary info to write */
        int		i;	/* `for' loop counter */
        int		error;	/* error */

        if (TRANS_ISERROR(ufsvfsp)) {
                return (EIO);
        }

        if ((fs->fs_si != FS_SI_BAD) || !ufsvfsp->vfs_nolog_si) {
                return (0);
        }

        bp = &b;
        bioinit(bp);
        bp->b_iodone = ufs_si_io_done;
        bp->b_bufsize = size = fs->fs_bsize;
        bp->b_flags = B_WRITE;
        bp->b_un.b_addr = kmem_alloc(size, KM_SLEEP);
        bp->b_edev = dev;
        frags = howmany(fs->fs_cssize, fs->fs_fsize);
        sip = (caddr_t)fs->fs_u.fs_csp;

        /* Write summary info one fs block at a time */
        for (error = 0, i = 0; (i < frags) && (error == 0); i += fs->fs_frag) {
                if (i + fs->fs_frag > frags) {
                        /*
                         * This happens only on the last iteration, so
                         * don't worry about size being reset
                         */
                        size = (frags - i) * fs->fs_fsize;
                }
                bcopy(sip, bp->b_un.b_addr, size);
                bp->b_blkno = (daddr_t)fsbtodb(fs, fs->fs_csaddr + i);
                bp->b_bcount = size;
                (void) bdev_strategy(bp);
                sema_p(&bp->b_io);	/* wait for write to complete */
                error = geterror(bp);
                sip += size;
        }
        kmem_free(bp->b_un.b_addr, fs->fs_bsize);
        if (!error) {
                fs->fs_si = FS_SI_OK;
        }
        return (error);
}

/*
 * Decide whether it is okay to remove within a sticky directory.
 * Two conditions need to be met:  write access to the directory
 * is needed.  In sticky directories, write access is not sufficient;
 * you can remove entries from a directory only if you own the directory,
 * if you are privileged, if you own the entry or if the entry is
 * a plain file and you have write access to that file.
 * Function returns 0 if remove access is granted.
 */
int
ufs_sticky_remove_access(struct inode *dp, struct inode *ip, struct cred *cr)
{
        uid_t uid;

        if ((dp->i_mode & ISVTX) &&
            (uid = crgetuid(cr)) != dp->i_uid &&
            uid != ip->i_uid &&
            ((ip->i_mode & IFMT) != IFREG ||
            ufs_iaccess(ip, IWRITE, cr) != 0))
                return (secpolicy_vnode_remove(cr));

        return (0);
}
#endif	/* _KERNEL */

extern	int around[9];
extern	int inside[9];
extern	uchar_t *fragtbl[];

/*
 * Update the frsum fields to reflect addition or deletion
 * of some frags.
 */
void
fragacct(struct fs *fs, int fragmap, int32_t *fraglist, int cnt)
{
        int inblk;
        int field, subfield;
        int siz, pos;

        /*
         * ufsvfsp->vfs_lock is held when calling this.
         */
        inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
        fragmap <<= 1;
        for (siz = 1; siz < fs->fs_frag; siz++) {
                if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
                        continue;
                field = around[siz];
                subfield = inside[siz];
                for (pos = siz; pos <= fs->fs_frag; pos++) {
                        if ((fragmap & field) == subfield) {
                                fraglist[siz] += cnt;
                                ASSERT(fraglist[siz] >= 0);
                                pos += siz;
                                field <<= siz;
                                subfield <<= siz;
                        }
                        field <<= 1;
                        subfield <<= 1;
                }
        }
}

/*
 * Block operations
 */

/*
 * Check if a block is available
 */
int
isblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
        uchar_t mask;

        ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
            fs->fs_frag == 1);
        /*
         * ufsvfsp->vfs_lock is held when calling this.
         */
        switch ((int)fs->fs_frag) {
        case 8:
                return (cp[h] == 0xff);
        case 4:
                mask = 0x0f << ((h & 0x1) << 2);
                return ((cp[h >> 1] & mask) == mask);
        case 2:
                mask = 0x03 << ((h & 0x3) << 1);
                return ((cp[h >> 2] & mask) == mask);
        case 1:
                mask = 0x01 << (h & 0x7);
                return ((cp[h >> 3] & mask) == mask);
        default:
#ifndef _KERNEL
                cmn_err(CE_PANIC, "isblock: illegal fs->fs_frag value (%d)",
                    fs->fs_frag);
#endif /* _KERNEL */
                return (0);
        }
}

/*
 * Take a block out of the map
 */
void
clrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
        ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
            fs->fs_frag == 1);
        /*
         * ufsvfsp->vfs_lock is held when calling this.
         */
        switch ((int)fs->fs_frag) {
        case 8:
                cp[h] = 0;
                return;
        case 4:
                cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
                return;
        case 2:
                cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
                return;
        case 1:
                cp[h >> 3] &= ~(0x01 << (h & 0x7));
                return;
        default:
#ifndef _KERNEL
                cmn_err(CE_PANIC, "clrblock: illegal fs->fs_frag value (%d)",
                    fs->fs_frag);
#endif /* _KERNEL */
                return;
        }
}

/*
 * Is block allocated?
 */
int
isclrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
        uchar_t mask;
        int frag;
        /*
         * ufsvfsp->vfs_lock is held when calling this.
         */
        frag = fs->fs_frag;
        ASSERT(frag == 8 || frag == 4 || frag == 2 || frag == 1);
        switch (frag) {
        case 8:
                return (cp[h] == 0);
        case 4:
                mask = ~(0x0f << ((h & 0x1) << 2));
                return (cp[h >> 1] == (cp[h >> 1] & mask));
        case 2:
                mask = ~(0x03 << ((h & 0x3) << 1));
                return (cp[h >> 2] == (cp[h >> 2] & mask));
        case 1:
                mask = ~(0x01 << (h & 0x7));
                return (cp[h >> 3] == (cp[h >> 3] & mask));
        default:
#ifndef _KERNEL
                cmn_err(CE_PANIC, "isclrblock: illegal fs->fs_frag value (%d)",
                    fs->fs_frag);
#endif /* _KERNEL */
                break;
        }
        return (0);
}

/*
 * Put a block into the map
 */
void
setblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
        ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
            fs->fs_frag == 1);
        /*
         * ufsvfsp->vfs_lock is held when calling this.
         */
        switch ((int)fs->fs_frag) {
        case 8:
                cp[h] = 0xff;
                return;
        case 4:
                cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
                return;
        case 2:
                cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
                return;
        case 1:
                cp[h >> 3] |= (0x01 << (h & 0x7));
                return;
        default:
#ifndef _KERNEL
                cmn_err(CE_PANIC, "setblock: illegal fs->fs_frag value (%d)",
                    fs->fs_frag);
#endif /* _KERNEL */
                return;
        }
}

/*
 * Skip over leading characters equal to 'c'; return the number of the
 * 'len' characters remaining at the first mismatch, or 0 if all match.
 */
int
skpc(char c, uint_t len, char *cp)
{
        if (len == 0)
                return (0);
        while (*cp++ == c && --len)
                ;
        return (len);
}