/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/fs/ufs_fs.h>
#include <sys/cmn_err.h>

#ifdef _KERNEL

#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/debug.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/kmem.h>
#include <sys/vtrace.h>
#include <sys/policy.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/pvn.h>
#include <vm/seg_map.h>
#include <sys/swap.h>
#include <vm/seg_kmem.h>

#else /* _KERNEL */

#define	ASSERT(x)	/* don't use asserts for fsck et al */

#endif /* _KERNEL */

#ifdef _KERNEL

/*
 * Used to verify that a given entry on the ufs_instances list (see below)
 * still refers to a mounted file system.
 *
 * XXX: This is a crock that substitutes for proper locking to coordinate
 *      updates to and uses of the entries in ufs_instances.
 */
struct check_node {
        struct vfs *vfsp;
        struct ufsvfs *ufsvfs;
        dev_t vfs_dev;
};

static vfs_t *still_mounted(struct check_node *);

/*
 * All ufs file system instances are linked together into a list starting at
 * ufs_instances.  The list is updated as part of mount and unmount.  It's
 * consulted in ufs_update, to allow syncing out all ufs file system instances
 * in a batch.
 *
 * ufsvfs_mutex guards access to this list and to the {,old}ufsvfslist
 * manipulated in ufs_funmount_cleanup.  (A given ufs instance is always on
 * exactly one of these lists except while it's being allocated or
 * deallocated.)
 */
struct ufsvfs   *ufs_instances;
extern kmutex_t ufsvfs_mutex;   /* XXX: move this to ufs_inode.h? */

/*
 * ufsvfs list manipulation routines
 */

/*
 * Link ufsp in at the head of the list of ufs_instances.
 */
void
ufs_vfs_add(struct ufsvfs *ufsp)
{
        mutex_enter(&ufsvfs_mutex);
        ufsp->vfs_next = ufs_instances;
        ufs_instances = ufsp;
        mutex_exit(&ufsvfs_mutex);
}

/*
 * Remove ufsp from the list of ufs_instances.
 *
 * Does no error checking; ufsp is assumed to actually be on the list.
 */
void
ufs_vfs_remove(struct ufsvfs *ufsp)
{
        struct ufsvfs **delpt = &ufs_instances;

        mutex_enter(&ufsvfs_mutex);
        for (; *delpt != NULL; delpt = &((*delpt)->vfs_next)) {
                if (*delpt == ufsp) {
                        *delpt = ufsp->vfs_next;
                        ufsp->vfs_next = NULL;
                        break;
                }
        }
        mutex_exit(&ufsvfs_mutex);
}

/*
 * Clean up state resulting from a forcible unmount that couldn't be handled
 * directly during the unmount.  (See commentary in the unmount code for more
 * info.)
 */
static void
ufs_funmount_cleanup()
{
        struct ufsvfs           *ufsvfsp;
        extern struct ufsvfs    *oldufsvfslist, *ufsvfslist;

        /*
         * Assumption: it's now safe to blow away the entries on
         * oldufsvfslist.
         */
        mutex_enter(&ufsvfs_mutex);
        while ((ufsvfsp = oldufsvfslist) != NULL) {
                oldufsvfslist = ufsvfsp->vfs_next;

                mutex_destroy(&ufsvfsp->vfs_lock);
                kmem_free(ufsvfsp, sizeof (struct ufsvfs));
        }
        /*
         * Rotate more recent unmount entries into place in preparation for
         * the next time around.
         */
        oldufsvfslist = ufsvfslist;
        ufsvfslist = NULL;
        mutex_exit(&ufsvfs_mutex);
}
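
/*
 * Editor's note (illustrative, not from the original source): the two
 * lists above implement a two-generation deferred free.  A forced
 * unmount first places its ufsvfs on ufsvfslist; the next cleanup call
 * rotates it onto oldufsvfslist, and the call after that finally
 * kmem_free()s it.  Since ufs_funmount_cleanup() runs on every other
 * fsflush call (roughly once a minute), every entry survives at least
 * one full interval after its unmount; that is the grace period the
 * hack in ufs_update() below depends on.
 */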

/*
 * ufs_update performs the ufs part of `sync'.  It goes through the disk
 * queues to initiate sandbagged IO; goes through the inodes to write
 * modified nodes; and it goes through the mount table to initiate
 * the writing of the modified super blocks.
 */
extern time_t   time;
time_t          ufs_sync_time;
time_t          ufs_sync_time_secs = 1;

extern kmutex_t ufs_scan_lock;

void
ufs_update(int flag)
{
        struct vfs *vfsp;
        struct fs *fs;
        struct ufsvfs *ufsp;
        struct ufsvfs *ufsnext;
        struct ufsvfs *update_list = NULL;
        int check_cnt = 0;
        size_t check_size;
        struct check_node *check_list, *ptr;
        int cheap = flag & SYNC_ATTR;

        /*
         * This is a hack.  A design flaw in the forced unmount protocol
         * could allow a thread to attempt to use a kmem_freed ufsvfs
         * structure in ufs_lockfs_begin/ufs_check_lockfs.  This window
         * is difficult to hit, even during the lockfs stress tests.
         * So the hacky fix is to wait awhile before kmem_free'ing the
         * ufsvfs structures for forcibly unmounted file systems.  `Awhile'
         * is defined as every other call from fsflush (~60 seconds).
         */
        if (cheap)
                ufs_funmount_cleanup();

        /*
         * Examine all ufsvfs structures and add those that we can lock to the
         * update list.  This is so that we don't hold the list lock for a
         * long time.  If vfs_lock fails for a file system instance, then skip
         * it because somebody is doing an unmount on it.
         */
        mutex_enter(&ufsvfs_mutex);
        for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
                vfsp = ufsp->vfs_vfs;
                if (vfs_lock(vfsp) != 0)
                        continue;
                ufsp->vfs_wnext = update_list;
                update_list = ufsp;
                check_cnt++;
        }
        mutex_exit(&ufsvfs_mutex);

        if (update_list == NULL)
                return;

        check_size = sizeof (struct check_node) * check_cnt;
        check_list = ptr = kmem_alloc(check_size, KM_NOSLEEP);

        /*
         * Write back modified superblocks.
         * Consistency check that the superblock of
         * each file system is still in the buffer cache.
         *
         * Note that the update_list traversal is done without the protection
         * of an overall list lock, so it's necessary to rely on the fact that
         * each entry of the list is vfs_locked when moving from one entry to
         * the next.  This works because a concurrent attempt to add an entry
         * to another thread's update_list won't find it, since it'll already
         * be locked.
         */
        check_cnt = 0;
        for (ufsp = update_list; ufsp != NULL; ufsp = ufsnext) {
                /*
                 * Need to grab the next ptr before we unlock this one so
                 * another thread doesn't grab it and change it before we move
                 * on to the next vfs.  (Once we unlock it, it's ok if another
                 * thread finds it to add it to its own update_list; we don't
                 * attempt to refer to it through our list any more.)
                 */
                ufsnext = ufsp->vfs_wnext;
                vfsp = ufsp->vfs_vfs;

                /*
                 * Seems like this can't happen, so perhaps it should become
                 * an ASSERT(vfsp->vfs_data != NULL).
                 */
                if (!vfsp->vfs_data) {
                        vfs_unlock(vfsp);
                        continue;
                }

                fs = ufsp->vfs_fs;

                /*
                 * don't update a locked superblock during a panic; it
                 * may be in an inconsistent state
                 */
                if (panicstr) {
                        if (!mutex_tryenter(&ufsp->vfs_lock)) {
                                vfs_unlock(vfsp);
                                continue;
                        }
                } else
                        mutex_enter(&ufsp->vfs_lock);
                /*
                 * Build up the STABLE check list, so we can unlock the vfs
                 * until we do the actual checking.
                 */
                if (check_list != NULL) {
                        if ((fs->fs_ronly == 0) &&
                            (fs->fs_clean != FSBAD) &&
                            (fs->fs_clean != FSSUSPEND)) {
                                ptr->vfsp = vfsp;
                                ptr->ufsvfs = ufsp;
                                ptr->vfs_dev = vfsp->vfs_dev;
                                ptr++;
                                check_cnt++;
                        }
                }

                /*
                 * superblock is not modified
                 */
                if (fs->fs_fmod == 0) {
                        mutex_exit(&ufsp->vfs_lock);
                        vfs_unlock(vfsp);
                        continue;
                }
                if (fs->fs_ronly != 0) {
                        mutex_exit(&ufsp->vfs_lock);
                        vfs_unlock(vfsp);
                        (void) ufs_fault(ufsp->vfs_root,
                            "fs = %s update: ro fs mod\n", fs->fs_fsmnt);
                        /*
                         * XXX: Why is this a return instead of a continue?
                         *      This may be an attempt to replace a panic with
                         *      something less drastic, but there's cleanup we
                         *      should be doing that's not being done (e.g.,
                         *      unlocking the remaining entries on the list).
                         */
                        return;
                }
                fs->fs_fmod = 0;
                mutex_exit(&ufsp->vfs_lock);
                TRANS_SBUPDATE(ufsp, vfsp, TOP_SBUPDATE_UPDATE);
                vfs_unlock(vfsp);
        }

        ufs_sync_time = time;

        /*
         * Avoid racing with ufs_unmount() and ufs_sync().
         */
        mutex_enter(&ufs_scan_lock);

        (void) ufs_scan_inodes(1, ufs_sync_inode, (void *)(uintptr_t)cheap,
            NULL);

        mutex_exit(&ufs_scan_lock);

        /*
         * Force stale buffer cache information to be flushed,
         * for all devices.  This should cause any remaining control
         * information (e.g., cg and inode info) to be flushed back.
         */
        bflush((dev_t)NODEV);

        if (check_list == NULL)
                return;

        /*
         * For each UFS filesystem in the STABLE check_list, update
         * the clean flag if warranted.
         */
        for (ptr = check_list; check_cnt > 0; check_cnt--, ptr++) {
                int error;

                /*
                 * still_mounted() returns with vfsp and the vfs_reflock
                 * held if ptr refers to a vfs that is still mounted.
                 */
                if ((vfsp = still_mounted(ptr)) == NULL)
                        continue;
                ufs_checkclean(vfsp);
                /*
                 * commit any outstanding async transactions
                 */
                ufsp = (struct ufsvfs *)vfsp->vfs_data;
                curthread->t_flag |= T_DONTBLOCK;
                TRANS_BEGIN_SYNC(ufsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE,
                    error);
                if (!error) {
                        TRANS_END_SYNC(ufsp, error, TOP_COMMIT_UPDATE,
                            TOP_COMMIT_SIZE);
                }
                curthread->t_flag &= ~T_DONTBLOCK;

                vfs_unlock(vfsp);
        }

        kmem_free(check_list, check_size);
}

int
ufs_sync_inode(struct inode *ip, void *arg)
{
        int cheap = (int)(uintptr_t)arg;
        struct ufsvfs *ufsvfsp;
        uint_t flag = ip->i_flag;

        if (cheap && ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) == 0))
                return (0);

        /*
         * if we are panicking, then don't update the inode if this
         * file system is FSSTABLE.  Otherwise, we would have to
         * force the superblock to FSACTIVE and the superblock
         * may not be in a good state.  Also, if the inode is
         * IREF'ed then it may be in an inconsistent state.  Don't
         * push it.  Finally, don't push the inode if the fs is
         * logging; the transaction will be discarded at boot.
         */
        if (panicstr) {
                if (flag & IREF)
                        return (0);

                if (ip->i_ufsvfs == NULL ||
                    (ip->i_fs->fs_clean == FSSTABLE ||
                    ip->i_fs->fs_clean == FSLOG))
                        return (0);
        }

        ufsvfsp = ip->i_ufsvfs;

        /*
         * Limit access time only updates
         */
        if (((flag & (IMOD|IMODACC|IUPD|ICHG|IACC)) == IMODACC) && ufsvfsp) {
                /*
                 * if the file system has deferred access time turned on and
                 * there was no IO recently, don't bother flushing it.  It
                 * will be flushed when I/Os start again.
                 */
                if (cheap && (ufsvfsp->vfs_dfritime & UFS_DFRATIME) &&
                    (ufsvfsp->vfs_iotstamp + ufs_iowait < lbolt))
                        return (0);
                /*
                 * an app issuing a sync() can take forever on a trans device
                 * when NetWorker or find is running because all of the
                 * directories' access times have to be updated.  So, we limit
                 * the time we spend updating access times per sync.
                 */
                if (TRANS_ISTRANS(ufsvfsp) && ((ufs_sync_time +
                    ufs_sync_time_secs) < time))
                        return (0);
        }

        /*
         * if we are running on behalf of the flush thread or this is
         * a swap file, then simply do a delayed update of the inode.
         * Otherwise, push the pages and then do a delayed inode update.
         */
        if (cheap || IS_SWAPVP(ITOV(ip))) {
                TRANS_IUPDAT(ip, 0);
        } else {
                (void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_SYNC);
        }
        return (0);
}
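
/*
 * Worked example of the access-time throttles above (editor's sketch;
 * values illustrative): with deferred access times (UFS_DFRATIME) set,
 * an inode whose only pending change is IMODACC is skipped once more
 * than ufs_iowait ticks have elapsed since the last I/O (vfs_iotstamp).
 * And on a logging file system, once a sync has spent more than
 * ufs_sync_time_secs (default 1) seconds pushing access-time-only
 * updates, the remaining IMODACC inodes are left for a later sync.
 */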

/*
 * Flush all the pages associated with an inode using the given 'flags',
 * then force inode information to be written back using the given 'waitfor'.
 */
int
ufs_syncip(struct inode *ip, int flags, int waitfor, top_t topid)
{
        int error;
        struct vnode *vp = ITOV(ip);
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
        int dotrans = 0;

        TRACE_3(TR_FAC_UFS, TR_UFS_SYNCIP_START,
            "ufs_syncip_start:vp %p flags %x waitfor %x",
            vp, flags, waitfor);

        /*
         * Return if file system has been forcibly umounted.
         */
        if (ufsvfsp == NULL)
                return (EIO);
        /*
         * don't need to VOP_PUTPAGE if there are no pages
         */
        if (!vn_has_cached_data(vp) || vp->v_type == VCHR) {
                error = 0;
        } else {
                /*
                 * if the inode we're working on is a shadow inode
                 * or quota inode we need to make sure that the
                 * ufs_putpage call is inside a transaction as this
                 * could include meta data changes.
                 */
                if ((ip->i_mode & IFMT) == IFSHAD ||
                    ufsvfsp->vfs_qinod == ip) {
                        dotrans = 1;
                        curthread->t_flag |= T_DONTBLOCK;
                        TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE,
                            TOP_PUTPAGE_SIZE(ip));
                }
                error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, flags, CRED());
                if (dotrans) {
                        TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE,
                            TOP_PUTPAGE_SIZE(ip));
                        curthread->t_flag &= ~T_DONTBLOCK;
                        dotrans = 0;
                }
        }
        if (panicstr && TRANS_ISTRANS(ufsvfsp))
                goto out;
        /*
         * waitfor represents two things:
         * 1. whether this is a data sync or a file sync.
         * 2. if it is a file sync, whether ufs_iupdat should 'waitfor'
         *    disk i/o or not.
         */
        if (waitfor == I_DSYNC) {
                /*
                 * If data sync, only IATTCHG (size/block change) requires
                 * an inode update; this is the fdatasync()/FDSYNC
                 * implementation.
                 */
                if (ip->i_flag & (IBDWRITE|IATTCHG)) {
                        /*
                         * Enter a transaction to provide mutual exclusion
                         * with deltamap_push and avoid a race where
                         * the inode flush could get dropped.
                         */
                        if ((curthread->t_flag & T_DONTBLOCK) == 0) {
                                dotrans = 1;
                                curthread->t_flag |= T_DONTBLOCK;
                                TRANS_BEGIN_ASYNC(ufsvfsp, topid,
                                    TOP_SYNCIP_SIZE);
                        }
                        rw_enter(&ip->i_contents, RW_READER);
                        mutex_enter(&ip->i_tlock);
                        ip->i_flag &= ~IMODTIME;
                        mutex_exit(&ip->i_tlock);
                        ufs_iupdat(ip, 1);
                        rw_exit(&ip->i_contents);
                        if (dotrans) {
                                TRANS_END_ASYNC(ufsvfsp, topid,
                                    TOP_SYNCIP_SIZE);
                                curthread->t_flag &= ~T_DONTBLOCK;
                        }
                }
        } else {
                /* For file sync, any inode change requires inode update */
                if (ip->i_flag & (IBDWRITE|IUPD|IACC|ICHG|IMOD|IMODACC)) {
                        /*
                         * Enter a transaction to provide mutual exclusion
                         * with deltamap_push and avoid a race where
                         * the inode flush could get dropped.
                         */
                        if ((curthread->t_flag & T_DONTBLOCK) == 0) {
                                dotrans = 1;
                                curthread->t_flag |= T_DONTBLOCK;
                                TRANS_BEGIN_ASYNC(ufsvfsp, topid,
                                    TOP_SYNCIP_SIZE);
                        }
                        rw_enter(&ip->i_contents, RW_READER);
                        mutex_enter(&ip->i_tlock);
                        ip->i_flag &= ~IMODTIME;
                        mutex_exit(&ip->i_tlock);
                        ufs_iupdat(ip, waitfor);
                        rw_exit(&ip->i_contents);
                        if (dotrans) {
                                TRANS_END_ASYNC(ufsvfsp, topid,
                                    TOP_SYNCIP_SIZE);
                                curthread->t_flag &= ~T_DONTBLOCK;
                        }
                }
        }

out:
        TRACE_2(TR_FAC_UFS, TR_UFS_SYNCIP_END,
            "ufs_syncip_end:vp %p error %d",
            vp, error);

        return (error);
}
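
/*
 * Usage sketch (editor's note): fdatasync(2) reaches ufs_syncip() with
 * waitfor == I_DSYNC, so the inode itself is rewritten only when its
 * size or block list changed (IATTCHG|IBDWRITE); fsync(2) takes the
 * I_SYNC path and rewrites the inode on any outstanding change
 * (IBDWRITE|IUPD|IACC|ICHG|IMOD|IMODACC).
 */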

/*
 * Flush all indirect blocks related to an inode.
 * Supports triple indirect blocks also.
 */
int
ufs_sync_indir(struct inode *ip)
{
        int i;
        daddr_t blkno;
        daddr_t lbn;    /* logical blkno of last blk in file */
        daddr_t clbn;   /* current logical blk */
        daddr32_t *bap;
        struct fs *fs;
        struct buf *bp;
        int bsize;
        struct ufsvfs *ufsvfsp;
        int j;
        daddr_t indirect_blkno;
        daddr32_t *indirect_bap;
        struct buf *indirect_bp;

        ufsvfsp = ip->i_ufsvfs;
        /*
         * unnecessary when logging; allocation blocks are kept up-to-date
         */
        if (TRANS_ISTRANS(ufsvfsp))
                return (0);

        fs = ufsvfsp->vfs_fs;
        bsize = fs->fs_bsize;
        lbn = (daddr_t)lblkno(fs, ip->i_size - 1);
        if (lbn < NDADDR)
                return (0);     /* No indirect blocks used */
        if (lbn < NDADDR + NINDIR(fs)) {
                /* File has one indirect block. */
                blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, ip->i_ib[0]));
                return (0);
        }

        /* Write out all the first level indirect blocks */
        for (i = 0; i < NIADDR; i++) {
                if ((blkno = ip->i_ib[i]) == 0)
                        continue;
                blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
        }
        /* Write out second level of indirect blocks */
        if ((blkno = ip->i_ib[1]) == 0)
                return (0);
        bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
        if (bp->b_flags & B_ERROR) {
                brelse(bp);
                return (EIO);
        }
        bap = bp->b_un.b_daddr;
        clbn = NDADDR + NINDIR(fs);
        for (i = 0; i < NINDIR(fs); i++) {
                if (clbn > lbn)
                        break;
                clbn += NINDIR(fs);
                if ((blkno = bap[i]) == 0)
                        continue;
                blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
        }

        brelse(bp);

        /* write out third level indirect blocks */
        if ((blkno = ip->i_ib[2]) == 0)
                return (0);

        bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
        if (bp->b_flags & B_ERROR) {
                brelse(bp);
                return (EIO);
        }
        bap = bp->b_un.b_daddr;
        clbn = NDADDR + NINDIR(fs) + (NINDIR(fs) * NINDIR(fs));

        for (i = 0; i < NINDIR(fs); i++) {
                if (clbn > lbn)
                        break;
                if ((indirect_blkno = bap[i]) == 0)
                        continue;
                blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, indirect_blkno));
                indirect_bp = UFS_BREAD(ufsvfsp, ip->i_dev,
                    (daddr_t)fsbtodb(fs, indirect_blkno), bsize);
                if (indirect_bp->b_flags & B_ERROR) {
                        brelse(indirect_bp);
                        brelse(bp);
                        return (EIO);
                }
                indirect_bap = indirect_bp->b_un.b_daddr;
                for (j = 0; j < NINDIR(fs); j++) {
                        if (clbn > lbn)
                                break;
                        clbn += NINDIR(fs);
                        if ((blkno = indirect_bap[j]) == 0)
                                continue;
                        blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
                }
                brelse(indirect_bp);
        }
        brelse(bp);

        return (0);
}
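
/*
 * Example geometry for the routine above (editor's illustration,
 * assuming fs_bsize = 8K and 4-byte disk addresses, so
 * NINDIR(fs) = 2048): logical blocks [0, NDADDR) are direct; the next
 * 2048 go through the single indirect block i_ib[0]; the next 2048^2
 * through the double indirect block i_ib[1]; and the next 2048^3
 * through the triple indirect block i_ib[2].  ufs_sync_indir() flushes
 * each level that the file's last logical block (lbn) reaches.
 */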

/*
 * Flush all indirect blocks related to an offset of a file.
 * read/write in sync mode may have to flush indirect blocks.
 */
int
ufs_indirblk_sync(struct inode *ip, offset_t off)
{
        daddr_t lbn;
        struct fs *fs;
        struct buf *bp;
        int i, j, shft;
        daddr_t ob, nb, tbn;
        daddr32_t *bap;
        int nindirshift, nindiroffset;
        struct ufsvfs *ufsvfsp;

        ufsvfsp = ip->i_ufsvfs;
        /*
         * unnecessary when logging; allocation blocks are kept up-to-date
         */
        if (TRANS_ISTRANS(ufsvfsp))
                return (0);

        fs = ufsvfsp->vfs_fs;

        lbn = (daddr_t)lblkno(fs, off);
        if (lbn < 0)
                return (EFBIG);

        /* The first NDADDR are direct so nothing to do */
        if (lbn < NDADDR)
                return (0);

        nindirshift = ip->i_ufsvfs->vfs_nindirshift;
        nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;

        /* Determine level of indirect blocks */
        shft = 0;
        tbn = lbn - NDADDR;
        for (j = NIADDR; j > 0; j--) {
                longlong_t sh;

                shft += nindirshift;
                sh = 1LL << shft;
                if (tbn < sh)
                        break;
                tbn -= (daddr_t)sh;
        }

        if (j == 0)
                return (EFBIG);

        if ((nb = ip->i_ib[NIADDR - j]) == 0)
                return (0);             /* UFS Hole */

        /* Flush first level indirect block */
        blkflush(ip->i_dev, fsbtodb(fs, nb));

        /* Fetch through next levels */
        for (; j < NIADDR; j++) {
                ob = nb;
                bp = UFS_BREAD(ufsvfsp,
                    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
                if (bp->b_flags & B_ERROR) {
                        brelse(bp);
                        return (EIO);
                }
                bap = bp->b_un.b_daddr;
                shft -= nindirshift;                    /* sh / nindir */
                i = (tbn >> shft) & nindiroffset;       /* (tbn / sh) & nindir */
                nb = bap[i];
                brelse(bp);
                if (nb == 0) {
                        return (0);     /* UFS hole */
                }
                blkflush(ip->i_dev, fsbtodb(fs, nb));
        }
        return (0);
}
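
/*
 * Worked example of the descent above (editor's illustration, assuming
 * NDADDR = 12 and nindirshift = 11, i.e. NINDIR(fs) = 2048): for
 * lbn = 5000, tbn starts at 4988; the level loop subtracts the
 * single-indirect span of 2048, leaving tbn = 2940, and breaks at the
 * double-indirect level (j == 2, shft == 22).  i_ib[1] is flushed,
 * then one fetch step reads it, computes i = 2940 >> 11 = 1, and
 * flushes the second single-indirect block beneath it.
 */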

#ifdef DEBUG

/*
 * The bad block checking routines, ufs_indir_badblock() and ufs_badblock(),
 * are very expensive.  It's been found from profiling that we're
 * spending 6-7% of our time in ufs_badblock, and another 1-2% in
 * ufs_indir_badblock.  They are only called via ASSERTs (from debug
 * kernels).  In addition, from experience no failures have been found
 * in recent years.  So the following tunable can be set to enable
 * checking.
 */
int ufs_badblock_checks = 0;

/*
 * Check that a given indirect block contains blocks in range
 */
int
ufs_indir_badblock(struct inode *ip, daddr32_t *bap)
{
        int i;
        int err = 0;

        if (ufs_badblock_checks) {
                for (i = 0; i < NINDIR(ip->i_fs) - 1; i++)
                        if (bap[i] != 0 && (err = ufs_badblock(ip, bap[i])))
                                break;
        }
        return (err);
}

/*
 * Check that a specified block number is in range.
 */
int
ufs_badblock(struct inode *ip, daddr_t bn)
{
        long c;
        daddr_t sum;

        if (!ufs_badblock_checks)
                return (0);
        ASSERT(bn);
        if (bn <= 0 || bn > ip->i_fs->fs_size)
                return (bn);

        sum = 0;
        c = dtog(ip->i_fs, bn);
        if (c == 0) {
                sum = howmany(ip->i_fs->fs_cssize, ip->i_fs->fs_fsize);
        }
        /*
         * if the block number is below this cylinder group, within the
         * space reserved for the superblock, inodes, and (summary data),
         * or above this cylinder group, then it's invalid.
         * It's hard to see how we'd be outside this cyl, but let's be careful.
         */
        if ((bn < cgbase(ip->i_fs, c)) ||
            (bn >= cgsblock(ip->i_fs, c) && bn < cgdmin(ip->i_fs, c) + sum) ||
            (bn >= (unsigned)cgbase(ip->i_fs, c + 1)))
                return (bn);

        return (0);     /* not a bad block */
}

#endif /* DEBUG */

/*
 * When i_rwlock is write-locked or has a writer pending, the inode is
 * going to change in a way that makes the filesystem active, so there
 * is no need to let the filesystem be marked as stable now.  Also, to
 * ensure filesystem consistency during directory operations, the
 * filesystem cannot be marked as stable if the i_rwlock of the
 * directory inode is write-locked.
 */

/*
 * Check for busy inodes for this filesystem.
 * NOTE: Needs better way to do this expensive operation in the future.
 */
static void
ufs_icheck(struct ufsvfs *ufsvfsp, int *isbusyp, int *isreclaimp)
{
        union ihead     *ih;
        struct inode    *ip;
        int             i;
        int             isnottrans = !TRANS_ISTRANS(ufsvfsp);
        int             isbusy = *isbusyp;
        int             isreclaim = *isreclaimp;

        for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
                mutex_enter(&ih_lock[i]);
                for (ip = ih->ih_chain[0];
                    ip != (struct inode *)ih;
                    ip = ip->i_forw) {
                        /*
                         * if the inode is busy/modified/deleted, the
                         * filesystem is busy
                         */
                        if (ip->i_ufsvfs != ufsvfsp)
                                continue;
                        if ((ip->i_flag & (IMOD | IUPD | ICHG)) ||
                            (RW_ISWRITER(&ip->i_rwlock)))
                                isbusy = 1;
                        if ((ip->i_nlink <= 0) && (ip->i_flag & IREF))
                                isreclaim = 1;
                        if (isbusy && (isreclaim || isnottrans))
                                break;
                }
                mutex_exit(&ih_lock[i]);
                if (isbusy && (isreclaim || isnottrans))
                        break;
        }
        *isbusyp = isbusy;
        *isreclaimp = isreclaim;
}

/*
 * As part of the ufs 'sync' operation, this routine is called to mark
 * the filesystem as STABLE if there is no modified metadata in memory.
 */
void
ufs_checkclean(struct vfs *vfsp)
{
        struct ufsvfs   *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
        struct fs       *fs = ufsvfsp->vfs_fs;
        int             isbusy;
        int             isreclaim;
        int             updatesb;

        ASSERT(vfs_lock_held(vfsp));

        /*
         * if the filesystem is stable or clean-flag processing is
         * disabled, do nothing; also, no state transitions when panicking
         */
        if (fs->fs_ronly ||
            fs->fs_clean == FSBAD ||
            fs->fs_clean == FSSUSPEND ||
            fs->fs_clean == FSSTABLE ||
            panicstr)
                return;

        /*
         * if logging and nothing to reclaim; do nothing
         */
        if ((fs->fs_clean == FSLOG) &&
            (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
            (fs->fs_reclaim & FS_RECLAIMING)))
                return;

        /*
         * FS_CHECKCLEAN is reset if the file system goes dirty
         * FS_CHECKRECLAIM is reset if a file gets deleted
         */
        mutex_enter(&ufsvfsp->vfs_lock);
        fs->fs_reclaim |= (FS_CHECKCLEAN | FS_CHECKRECLAIM);
        mutex_exit(&ufsvfsp->vfs_lock);

        updatesb = 0;

        /*
         * if logging or buffers are busy; do nothing
         */
        isbusy = isreclaim = 0;
        if ((fs->fs_clean == FSLOG) ||
            (bcheck(vfsp->vfs_dev, ufsvfsp->vfs_bufp)))
                isbusy = 1;

        /*
         * isreclaim == TRUE means can't change the state of fs_reclaim
         */
        isreclaim =
            ((fs->fs_clean == FSLOG) &&
            (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
            (fs->fs_reclaim & FS_RECLAIMING)));

        /*
         * if fs is busy or can't change the state of fs_reclaim; do nothing
         */
        if (isbusy && isreclaim)
                return;

        /*
         * look for busy or deleted inodes; (deleted == needs reclaim)
         */
        ufs_icheck(ufsvfsp, &isbusy, &isreclaim);

        mutex_enter(&ufsvfsp->vfs_lock);

        /*
         * IF POSSIBLE, RESET RECLAIM
         */
        /*
         * the reclaim thread is not running
         */
        if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
                /*
                 * no files were deleted during the scan
                 */
                if (fs->fs_reclaim & FS_CHECKRECLAIM)
                        /*
                         * no deleted files were found in the inode cache
                         */
                        if ((isreclaim == 0) &&
                            (fs->fs_reclaim & FS_RECLAIM)) {
                                fs->fs_reclaim &= ~FS_RECLAIM;
                                updatesb = 1;
                        }
        /*
         * IF POSSIBLE, SET STABLE
         */
        /*
         * not logging
         */
        if (fs->fs_clean != FSLOG)
                /*
                 * file system has not gone dirty since the scan began
                 */
                if (fs->fs_reclaim & FS_CHECKCLEAN)
                        /*
                         * nothing dirty was found in the buffer or inode cache
                         */
                        if ((isbusy == 0) && (isreclaim == 0) &&
                            (fs->fs_clean != FSSTABLE)) {
                                fs->fs_clean = FSSTABLE;
                                updatesb = 1;
                        }

        mutex_exit(&ufsvfsp->vfs_lock);
        if (updatesb) {
                TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
        }
}
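
/*
 * Editor's sketch of the FS_CHECKCLEAN handshake above:
 * ufs_checkclean() sets FS_CHECKCLEAN, drops vfs_lock, and scans the
 * buffer and inode caches.  If a write slips in during the scan,
 * ufs_notclean() clears FS_CHECKCLEAN, so the final
 * (fs_reclaim & FS_CHECKCLEAN) test fails and the file system is not
 * marked FSSTABLE on the basis of a stale scan.  FS_CHECKRECLAIM and
 * ufs_setreclaim() below interact the same way for deletions.
 */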

/*
 * called whenever an unlink occurs
 */
void
ufs_setreclaim(struct inode *ip)
{
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        struct fs       *fs = ufsvfsp->vfs_fs;

        if (ip->i_nlink || fs->fs_ronly || (fs->fs_clean != FSLOG))
                return;

        /*
         * reclaim-needed bit is already set or we need to tell
         * ufs_checkclean that a file has been deleted
         */
        if ((fs->fs_reclaim & (FS_RECLAIM | FS_CHECKRECLAIM)) == FS_RECLAIM)
                return;

        mutex_enter(&ufsvfsp->vfs_lock);
        /*
         * inform ufs_checkclean that the file system has gone dirty
         */
        fs->fs_reclaim &= ~FS_CHECKRECLAIM;

        /*
         * set the reclaim-needed bit
         */
        if ((fs->fs_reclaim & FS_RECLAIM) == 0) {
                fs->fs_reclaim |= FS_RECLAIM;
                ufs_sbwrite(ufsvfsp);
        }

        mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * Before any modified metadata is written back to the disk, this routine
 * is called to mark the filesystem as ACTIVE.
 */
void
ufs_notclean(struct ufsvfs *ufsvfsp)
{
        struct fs *fs = ufsvfsp->vfs_fs;

        ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
        ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));

        /*
         * inform ufs_checkclean that the file system has gone dirty
         */
        fs->fs_reclaim &= ~FS_CHECKCLEAN;

        /*
         * ignore if active or bad or suspended or readonly or logging
         */
        if ((fs->fs_clean == FSACTIVE) || (fs->fs_clean == FSLOG) ||
            (fs->fs_clean == FSBAD) || (fs->fs_clean == FSSUSPEND) ||
            (fs->fs_ronly)) {
                mutex_exit(&ufsvfsp->vfs_lock);
                return;
        }
        fs->fs_clean = FSACTIVE;
        /*
         * write superblock synchronously
         */
        ufs_sbwrite(ufsvfsp);
        mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * ufs specific fbwrite()
 */
int
ufs_fbwrite(struct fbuf *fbp, struct inode *ip)
{
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;

        if (TRANS_ISTRANS(ufsvfsp))
                return (fbwrite(fbp));
        mutex_enter(&ufsvfsp->vfs_lock);
        ufs_notclean(ufsvfsp);
        return ((ufsvfsp->vfs_dio) ? fbdwrite(fbp) : fbwrite(fbp));
}

/*
 * ufs specific fbiwrite()
 */
int
ufs_fbiwrite(struct fbuf *fbp, struct inode *ip, daddr_t bn, long bsize)
{
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        o_mode_t        ifmt = ip->i_mode & IFMT;
        buf_t           *bp;
        int             error;

        mutex_enter(&ufsvfsp->vfs_lock);
        ufs_notclean(ufsvfsp);
        if (ifmt == IFDIR || ifmt == IFSHAD || ifmt == IFATTRDIR ||
            (ip->i_ufsvfs->vfs_qinod == ip)) {
                TRANS_DELTA(ufsvfsp, ldbtob(bn * (offset_t)(btod(bsize))),
                    fbp->fb_count, DT_FBI, 0, 0);
        }
        /*
         * Inlined version of fbiwrite()
         */
        bp = pageio_setup((struct page *)NULL, fbp->fb_count,
            ip->i_devvp, B_WRITE);
        bp->b_flags &= ~B_PAGEIO;
        bp->b_un.b_addr = fbp->fb_addr;

        bp->b_blkno = bn * btod(bsize);
        bp->b_dev = cmpdev(ip->i_dev);  /* store in old dev format */
        bp->b_edev = ip->i_dev;
        bp->b_proc = NULL;              /* i.e. the kernel */
        bp->b_file = ip->i_vnode;
        bp->b_offset = -1;

        if (ufsvfsp->vfs_log) {
                lufs_write_strategy(ufsvfsp->vfs_log, bp);
        } else if (ufsvfsp->vfs_snapshot) {
                fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
        } else {
                ufsvfsp->vfs_iotstamp = lbolt;
                ub.ub_fbiwrites.value.ul++;
                (void) bdev_strategy(bp);
                lwp_stat_update(LWP_STAT_OUBLK, 1);
        }
        error = biowait(bp);
        pageio_done(bp);
        fbrelse(fbp, S_OTHER);
        return (error);
}

/*
 * Write the ufs superblock only.
 */
void
ufs_sbwrite(struct ufsvfs *ufsvfsp)
{
        char            sav_fs_fmod;
        struct fs       *fs = ufsvfsp->vfs_fs;
        struct buf      *bp = ufsvfsp->vfs_bufp;

        ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));

        /*
         * for ulockfs processing, limit the superblock writes
         */
        if ((ufsvfsp->vfs_ulockfs.ul_sbowner) &&
            (curthread != ufsvfsp->vfs_ulockfs.ul_sbowner)) {
                /* try again later */
                fs->fs_fmod = 1;
                return;
        }

        ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
        /*
         * update the superblock timestamp and the fs_clean checksum;
         * if marked FSBAD, we always want an erroneous checksum to
         * force repair
         */
        fs->fs_time = gethrestime_sec();
        fs->fs_state = fs->fs_clean != FSBAD ? FSOKAY - fs->fs_time :
            -(FSOKAY - fs->fs_time);
        switch (fs->fs_clean) {
        case FSCLEAN:
        case FSSTABLE:
                fs->fs_reclaim &= ~FS_RECLAIM;
                break;
        case FSACTIVE:
        case FSSUSPEND:
        case FSBAD:
        case FSLOG:
                break;
        default:
                fs->fs_clean = FSACTIVE;
                break;
        }
        /*
         * reset incore only bits
         */
        fs->fs_reclaim &= ~(FS_CHECKCLEAN | FS_CHECKRECLAIM);

        /*
         * delta the whole superblock
         */
        TRANS_DELTA(ufsvfsp, ldbtob(SBLOCK), sizeof (struct fs),
            DT_SB, NULL, 0);
        /*
         * retain the incore state of fs_fmod; set the ondisk state to 0
         */
        sav_fs_fmod = fs->fs_fmod;
        fs->fs_fmod = 0;

        /*
         * Don't release the buffer after it is written to the disk
         */
        UFS_BWRITE2(ufsvfsp, bp);
        fs->fs_fmod = sav_fs_fmod;      /* reset fs_fmod's incore state */
}
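
/*
 * Example of the fs_state convention above (editor's note): a healthy
 * superblock stores fs_state = FSOKAY - fs_time, so a consumer such as
 * fsck can verify that
 *
 *      fs->fs_state + fs->fs_time == FSOKAY
 *
 * still holds.  An FSBAD file system stores the negated value, which
 * guarantees the check fails and repair is forced.
 */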

/*
 * Returns the vfs pointer, with the vfs lock held, if the file system is
 * still mounted.  Otherwise, returns NULL.
 *
 * For our purposes, "still mounted" means that the file system still appears
 * on the list of UFS file system instances.
 */
static vfs_t *
still_mounted(struct check_node *checkp)
{
        struct vfs      *vfsp;
        struct ufsvfs   *ufsp;

        mutex_enter(&ufsvfs_mutex);
        for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
                if (ufsp != checkp->ufsvfs)
                        continue;
                /*
                 * Tentative match: verify it and try to lock.  (It's not at
                 * all clear how the verification could fail, given that we've
                 * gotten this far.  We would have had to reallocate the
                 * ufsvfs struct at hand for a new incarnation; is that really
                 * possible in the interval from constructing the check_node
                 * to here?)
                 */
                vfsp = ufsp->vfs_vfs;
                if (vfsp != checkp->vfsp)
                        continue;
                if (vfsp->vfs_dev != checkp->vfs_dev)
                        continue;
                if (vfs_lock(vfsp) != 0)
                        continue;

                mutex_exit(&ufsvfs_mutex);
                return (vfsp);
        }
        mutex_exit(&ufsvfs_mutex);
        return (NULL);
}

int
ufs_si_io_done(struct buf *bp)
{
        sema_v(&bp->b_io);
        return (0);
}

#define	SI_BUFSZ roundup(sizeof (struct cg), DEV_BSIZE)
#define	NSIBUF 32

/*
 * ufs_construct_si()
 * Read each cylinder group in turn and construct the summary information
 */
static int
ufs_construct_si(dev_t dev, struct fs *fs, struct ufsvfs *ufsvfsp)
{
        buf_t *bps, *bp;
        char *bufs;
        struct csum *sip = fs->fs_u.fs_csp;
        struct cg *cgp;
        int i, ncg;
        int error = 0, cg = 0;

        bps = kmem_alloc(NSIBUF * sizeof (buf_t), KM_SLEEP);
        bufs = kmem_alloc(NSIBUF * SI_BUFSZ, KM_SLEEP);

        /*
         * Initialise the buffer headers
         */
        for (bp = bps, i = 0; i < NSIBUF; i++, bp++) {
                bioinit(bp);
                bp->b_iodone = ufs_si_io_done;
                bp->b_bufsize = bp->b_bcount = SI_BUFSZ;
                bp->b_flags = B_READ;
                bp->b_un.b_addr = bufs + (i * SI_BUFSZ);
                bp->b_edev = dev;
        }

        /*
         * Repeat while there are cylinder groups left to read.
         */
        do {
                /*
                 * Issue up to NSIBUF asynchronous reads
                 */
                ncg = MIN(NSIBUF, (fs->fs_ncg - cg));
                for (bp = bps, i = 0; i < ncg; i++, bp++) {
                        bp->b_blkno = (daddr_t)fsbtodb(fs, cgtod(fs, cg + i));
                        if (ufsvfsp->vfs_log) {
                                lufs_read_strategy(ufsvfsp->vfs_log, bp);
                        } else {
                                (void) bdev_strategy(bp);
                        }
                }

                /*
                 * wait for each read to finish;
                 * check for errors and copy the csum info
                 */
                for (bp = bps, i = 0; i < ncg; i++, bp++) {
                        sema_p(&bp->b_io);
                        if (!error) {
                                cgp = bp->b_un.b_cg;
                                sip[cg + i] = cgp->cg_cs;
                                error = geterror(bp);
                        }
                }
                if (error) {
                        goto err;
                }
                cg += ncg;
        } while (cg < fs->fs_ncg);

err:
        kmem_free(bps, NSIBUF * sizeof (buf_t));
        kmem_free(bufs, NSIBUF * SI_BUFSZ);
        return (error);
}
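
/*
 * Batching example (editor's illustration): with fs_ncg = 100 cylinder
 * groups and NSIBUF = 32, the do/while loop above issues four rounds
 * of asynchronous reads (32, 32, 32, then 4), blocking on each
 * buffer's b_io semaphore before harvesting its cg_cs summary.
 */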

/*
 * ufs_getsummaryinfo
 */
int
ufs_getsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
        int             i;      /* `for' loop counter */
        ssize_t         size;   /* bytes of summary info to read */
        daddr_t         frags;  /* frags of summary info to read */
        caddr_t         sip;    /* summary info */
        struct buf      *tp;    /* tmp buf */

        /*
         * maintain metadata map for trans device (debug only)
         */
        TRANS_MATA_SI(ufsvfsp, fs);

        /*
         * Compute #frags and allocate space for summary info
         */
        frags = howmany(fs->fs_cssize, fs->fs_fsize);
        sip = kmem_alloc((size_t)fs->fs_cssize, KM_SLEEP);
        fs->fs_u.fs_csp = (struct csum *)sip;

        if (fs->fs_si == FS_SI_BAD) {
                /*
                 * The summary information is unknown, read it in from
                 * the cylinder groups.
                 */
                if (TRANS_ISTRANS(ufsvfsp) && !TRANS_ISERROR(ufsvfsp) &&
                    ufsvfsp->vfs_log->un_logmap) {
                        logmap_roll_dev(ufsvfsp->vfs_log); /* flush the log */
                }
                bzero(sip, (size_t)fs->fs_cssize);
                if (ufs_construct_si(dev, fs, ufsvfsp)) {
                        kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
                        fs->fs_u.fs_csp = NULL;
                        return (EIO);
                }
        } else {
                /* Read summary info a fs block at a time */
                size = fs->fs_bsize;
                for (i = 0; i < frags; i += fs->fs_frag) {
                        if (i + fs->fs_frag > frags)
                                /*
                                 * This happens only on the last iteration,
                                 * so don't worry about size being reset
                                 */
                                size = (frags - i) * fs->fs_fsize;
                        tp = UFS_BREAD(ufsvfsp, dev,
                            (daddr_t)fsbtodb(fs, fs->fs_csaddr + i), size);
                        tp->b_flags |= B_STALE | B_AGE;
                        if (tp->b_flags & B_ERROR) {
                                kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
                                fs->fs_u.fs_csp = NULL;
                                brelse(tp);
                                return (EIO);
                        }
                        bcopy(tp->b_un.b_addr, sip, size);
                        sip += size;
                        brelse(tp);
                }
        }
        bzero((caddr_t)&fs->fs_cstotal, sizeof (fs->fs_cstotal));
        for (i = 0; i < fs->fs_ncg; ++i) {
                fs->fs_cstotal.cs_ndir += fs->fs_cs(fs, i).cs_ndir;
                fs->fs_cstotal.cs_nbfree += fs->fs_cs(fs, i).cs_nbfree;
                fs->fs_cstotal.cs_nifree += fs->fs_cs(fs, i).cs_nifree;
                fs->fs_cstotal.cs_nffree += fs->fs_cs(fs, i).cs_nffree;
        }
        return (0);
}

/*
 * ufs_putsummaryinfo() stores all the cylinder group summary information.
 * This is only used when logging, but the file system may not be logging
 * at the time, e.g., a read-only mount to flush the log may push the
 * summary info out.
 */
int
ufs_putsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
        struct buf      b, *bp; /* tmp buf */
        caddr_t         sip;    /* summary info */
        ssize_t         size;   /* bytes of summary info to write */
        daddr_t         frags;  /* frags of summary info to write */
        int             i;      /* `for' loop counter */
        int             error;  /* error */

        if (TRANS_ISERROR(ufsvfsp)) {
                return (EIO);
        }

        if ((fs->fs_si != FS_SI_BAD) || !ufsvfsp->vfs_nolog_si) {
                return (0);
        }

        bp = &b;
        bioinit(bp);
        bp->b_iodone = ufs_si_io_done;
        bp->b_bufsize = size = fs->fs_bsize;
        bp->b_flags = B_WRITE;
        bp->b_un.b_addr = kmem_alloc(size, KM_SLEEP);
        bp->b_edev = dev;
        frags = howmany(fs->fs_cssize, fs->fs_fsize);
        sip = (caddr_t)fs->fs_u.fs_csp;

        /* Write summary info one fs block at a time */
        for (error = 0, i = 0; (i < frags) && (error == 0); i += fs->fs_frag) {
                if (i + fs->fs_frag > frags) {
                        /*
                         * This happens only on the last iteration, so
                         * don't worry about size being reset
                         */
                        size = (frags - i) * fs->fs_fsize;
                }
                bcopy(sip, bp->b_un.b_addr, size);
                bp->b_blkno = (daddr_t)fsbtodb(fs, fs->fs_csaddr + i);
                bp->b_bcount = size;
                (void) bdev_strategy(bp);
                sema_p(&bp->b_io);      /* wait for write to complete */
                error = geterror(bp);
                sip += size;
        }
        kmem_free(bp->b_un.b_addr, fs->fs_bsize);
        if (!error) {
                fs->fs_si = FS_SI_OK;
        }
        return (error);
}

/*
 * Decide whether it is okay to remove within a sticky directory.  Write
 * access to the directory is needed, but in sticky directories write
 * access alone is not sufficient: you can remove entries from the
 * directory only if you own the directory, you are privileged, you own
 * the entry, or the entry is a plain file and you have write access to
 * that file.
 * Function returns 0 if remove access is granted.
 */
int
ufs_sticky_remove_access(struct inode *dp, struct inode *ip, struct cred *cr)
{
        uid_t uid;

        if ((dp->i_mode & ISVTX) &&
            (uid = crgetuid(cr)) != dp->i_uid &&
            uid != ip->i_uid &&
            ((ip->i_mode & IFMT) != IFREG ||
            ufs_iaccess(ip, IWRITE, cr) != 0))
                return (secpolicy_vnode_remove(cr));

        return (0);
}
#endif /* _KERNEL */

extern  int around[9];
extern  int inside[9];
extern  uchar_t *fragtbl[];

/*
 * Update the frsum fields to reflect addition or deletion
 * of some frags.
 */
void
fragacct(struct fs *fs, int fragmap, int32_t *fraglist, int cnt)
{
        int inblk;
        int field, subfield;
        int siz, pos;

        /*
         * ufsvfsp->vfs_lock is held when calling this.
         */
        inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
        fragmap <<= 1;
        for (siz = 1; siz < fs->fs_frag; siz++) {
                if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
                        continue;
                field = around[siz];
                subfield = inside[siz];
                for (pos = siz; pos <= fs->fs_frag; pos++) {
                        if ((fragmap & field) == subfield) {
                                fraglist[siz] += cnt;
                                ASSERT(fraglist[siz] >= 0);
                                pos += siz;
                                field <<= siz;
                                subfield <<= siz;
                        }
                        field <<= 1;
                        subfield <<= 1;
                }
        }
}
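
/*
 * Example (editor's illustration, fs_frag = 8): a fragment bitmap with
 * a single free run of three fragments, say fragmap = 0x38 (bits 3-5
 * set), contributes cnt to fraglist[3] and to no other size.  Calling
 * fragacct() with cnt = 1 when a run is created and cnt = -1 when it
 * is consumed keeps the per-size free-fragment counts balanced.
 */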

/*
 * Block operations
 */

/*
 * Check if a block is available
 */
int
isblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
        uchar_t mask;

        ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
            fs->fs_frag == 1);
        /*
         * ufsvfsp->vfs_lock is held when calling this.
         */
        switch ((int)fs->fs_frag) {
        case 8:
                return (cp[h] == 0xff);
        case 4:
                mask = 0x0f << ((h & 0x1) << 2);
                return ((cp[h >> 1] & mask) == mask);
        case 2:
                mask = 0x03 << ((h & 0x3) << 1);
                return ((cp[h >> 2] & mask) == mask);
        case 1:
                mask = 0x01 << (h & 0x7);
                return ((cp[h >> 3] & mask) == mask);
        default:
#ifndef _KERNEL
                cmn_err(CE_PANIC, "isblock: illegal fs->fs_frag value (%d)",
                    fs->fs_frag);
#endif /* _KERNEL */
                return (0);
        }
}
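
/*
 * Mask example for the frag-map routines (editor's illustration,
 * fs_frag = 4, i.e. two blocks per map byte): block h = 5 lives in
 * cp[5 >> 1] = cp[2], in the upper nibble since (h & 1) == 1, giving
 * mask = 0x0f << 4 = 0xf0.  isblock() reports the block free only when
 * all four fragment bits under the mask are set; clrblock() and
 * setblock() below clear or set the same nibble.
 */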

/*
 * Take a block out of the map
 */
void
clrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
        ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
            fs->fs_frag == 1);
        /*
         * ufsvfsp->vfs_lock is held when calling this.
         */
        switch ((int)fs->fs_frag) {
        case 8:
                cp[h] = 0;
                return;
        case 4:
                cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
                return;
        case 2:
                cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
                return;
        case 1:
                cp[h >> 3] &= ~(0x01 << (h & 0x7));
                return;
        default:
#ifndef _KERNEL
                cmn_err(CE_PANIC, "clrblock: illegal fs->fs_frag value (%d)",
                    fs->fs_frag);
#endif /* _KERNEL */
                return;
        }
}

/*
 * Is block allocated?
 */
int
isclrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
        uchar_t mask;
        int frag;

        /*
         * ufsvfsp->vfs_lock is held when calling this.
         */
        frag = fs->fs_frag;
        ASSERT(frag == 8 || frag == 4 || frag == 2 || frag == 1);
        switch (frag) {
        case 8:
                return (cp[h] == 0);
        case 4:
                mask = ~(0x0f << ((h & 0x1) << 2));
                return (cp[h >> 1] == (cp[h >> 1] & mask));
        case 2:
                mask = ~(0x03 << ((h & 0x3) << 1));
                return (cp[h >> 2] == (cp[h >> 2] & mask));
        case 1:
                mask = ~(0x01 << (h & 0x7));
                return (cp[h >> 3] == (cp[h >> 3] & mask));
        default:
#ifndef _KERNEL
                cmn_err(CE_PANIC, "isclrblock: illegal fs->fs_frag value (%d)",
                    fs->fs_frag);
#endif /* _KERNEL */
                break;
        }
        return (0);
}

/*
 * Put a block into the map
 */
void
setblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
        ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
            fs->fs_frag == 1);
        /*
         * ufsvfsp->vfs_lock is held when calling this.
         */
        switch ((int)fs->fs_frag) {
        case 8:
                cp[h] = 0xff;
                return;
        case 4:
                cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
                return;
        case 2:
                cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
                return;
        case 1:
                cp[h >> 3] |= (0x01 << (h & 0x7));
                return;
        default:
#ifndef _KERNEL
                cmn_err(CE_PANIC, "setblock: illegal fs->fs_frag value (%d)",
                    fs->fs_frag);
#endif /* _KERNEL */
                return;
        }
}

int
skpc(char c, uint_t len, char *cp)
{
        if (len == 0)
                return (0);
        while (*cp++ == c && --len)
                ;
        return (len);
}