/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */


#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/fs/ufs_fs.h>
#include <sys/cmn_err.h>

#ifdef _KERNEL

#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/debug.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/kmem.h>
#include <sys/policy.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/pvn.h>
#include <vm/seg_map.h>
#include <sys/swap.h>
#include <vm/seg_kmem.h>

#else  /* _KERNEL */

#define	ASSERT(x)	/* don't use asserts for fsck et al */

#endif  /* _KERNEL */

#ifdef _KERNEL

/*
 * Used to verify that a given entry on the ufs_instances list (see below)
 * still refers to a mounted file system.
 *
 * XXX:	This is a crock that substitutes for proper locking to coordinate
 *	updates to and uses of the entries in ufs_instances.
 */
struct check_node {
	struct vfs *vfsp;
	struct ufsvfs *ufsvfs;
	dev_t vfs_dev;
};

static vfs_t *still_mounted(struct check_node *);

/*
 * All ufs file system instances are linked together into a list starting at
 * ufs_instances.  The list is updated as part of mount and unmount.  It's
 * consulted in ufs_update, to allow syncing out all ufs file system instances
 * in a batch.
 *
 * ufsvfs_mutex guards access to this list and to the {,old}ufsvfslist
 * manipulated in ufs_funmount_cleanup.  (A given ufs instance is always on
 * exactly one of these lists except while it's being allocated or
 * deallocated.)
 */
struct ufsvfs	*ufs_instances;
extern kmutex_t	ufsvfs_mutex;	/* XXX: move this to ufs_inode.h? */

/*
 * ufsvfs list manipulation routines
 */

/*
 * Link ufsp in at the head of the list of ufs_instances.
 */
void
ufs_vfs_add(struct ufsvfs *ufsp)
{
	mutex_enter(&ufsvfs_mutex);
	ufsp->vfs_next = ufs_instances;
	ufs_instances = ufsp;
	mutex_exit(&ufsvfs_mutex);
}

/*
 * Remove ufsp from the list of ufs_instances.
 *
 * Does no error checking; ufsp is assumed to actually be on the list.
 */
void
ufs_vfs_remove(struct ufsvfs *ufsp)
{
	struct ufsvfs **delpt = &ufs_instances;

	mutex_enter(&ufsvfs_mutex);
	for (; *delpt != NULL; delpt = &((*delpt)->vfs_next)) {
		if (*delpt == ufsp) {
			*delpt = ufsp->vfs_next;
			ufsp->vfs_next = NULL;
			break;
		}
	}
	mutex_exit(&ufsvfs_mutex);
}

/*
 * Clean up state resulting from a forcible unmount that couldn't be handled
 * directly during the unmount.  (See commentary in the unmount code for more
 * info.)
 */
static void
ufs_funmount_cleanup()
{
	struct ufsvfs *ufsvfsp;
	extern struct ufsvfs *oldufsvfslist, *ufsvfslist;

	/*
	 * Assumption: it's now safe to blow away the entries on
	 * oldufsvfslist.
	 */
	mutex_enter(&ufsvfs_mutex);
	while ((ufsvfsp = oldufsvfslist) != NULL) {
		oldufsvfslist = ufsvfsp->vfs_next;

		mutex_destroy(&ufsvfsp->vfs_lock);
		kmem_free(ufsvfsp, sizeof (struct ufsvfs));
	}
	/*
	 * Rotate more recent unmount entries into place in preparation for
	 * the next time around.
	 */
	oldufsvfslist = ufsvfslist;
	ufsvfslist = NULL;
	mutex_exit(&ufsvfs_mutex);
}


/*
 * ufs_update performs the ufs part of `sync'.  It goes through the disk
 * queues to initiate sandbagged IO; goes through the inodes to write
 * modified nodes; and it goes through the mount table to initiate
 * the writing of the modified super blocks.
 */
extern time_t	time;
time_t		ufs_sync_time;
time_t		ufs_sync_time_secs = 1;

extern kmutex_t	ufs_scan_lock;

void
ufs_update(int flag)
{
	struct vfs *vfsp;
	struct fs *fs;
	struct ufsvfs *ufsp;
	struct ufsvfs *ufsnext;
	struct ufsvfs *update_list = NULL;
	int check_cnt = 0;
	size_t check_size;
	struct check_node *check_list, *ptr;
	int cheap = flag & SYNC_ATTR;

	/*
	 * This is a hack.  A design flaw in the forced unmount protocol
	 * could allow a thread to attempt to use a kmem_freed ufsvfs
	 * structure in ufs_lockfs_begin/ufs_check_lockfs.  This window
	 * is difficult to hit, even during the lockfs stress tests.
	 * So the hacky fix is to wait awhile before kmem_free'ing the
	 * ufsvfs structures for forcibly unmounted file systems.  `Awhile'
	 * is defined as every other call from fsflush (~60 seconds).
	 */
	if (cheap)
		ufs_funmount_cleanup();

	/*
	 * Examine all ufsvfs structures and add those that we can lock to the
	 * update list.  This is so that we don't hold the list lock for a
	 * long time.  If vfs_lock fails for a file system instance, then skip
	 * it because somebody is doing an unmount on it.
	 */
	mutex_enter(&ufsvfs_mutex);
	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
		vfsp = ufsp->vfs_vfs;
		if (vfs_lock(vfsp) != 0)
			continue;
		ufsp->vfs_wnext = update_list;
		update_list = ufsp;
		check_cnt++;
	}
	mutex_exit(&ufsvfs_mutex);

	if (update_list == NULL)
		return;

	check_size = sizeof (struct check_node) * check_cnt;
	check_list = ptr = kmem_alloc(check_size, KM_NOSLEEP);

	/*
	 * Write back modified superblocks.
	 * Consistency check that the superblock of
	 * each file system is still in the buffer cache.
	 *
	 * Note that the update_list traversal is done without the protection
	 * of an overall list lock, so it's necessary to rely on the fact that
	 * each entry of the list is vfs_locked when moving from one entry to
	 * the next.  This works because a concurrent attempt to add an entry
	 * to another thread's update_list won't find it, since it'll already
	 * be locked.
	 */
	check_cnt = 0;
	for (ufsp = update_list; ufsp != NULL; ufsp = ufsnext) {
		/*
		 * Need to grab the next ptr before we unlock this one so
		 * another thread doesn't grab it and change it before we move
		 * on to the next vfs.  (Once we unlock it, it's ok if another
		 * thread finds it to add it to its own update_list; we don't
		 * attempt to refer to it through our list any more.)
		 */
		ufsnext = ufsp->vfs_wnext;
		vfsp = ufsp->vfs_vfs;

		/*
		 * Seems like this can't happen, so perhaps it should become
		 * an ASSERT(vfsp->vfs_data != NULL).
		 */
		if (!vfsp->vfs_data) {
			vfs_unlock(vfsp);
			continue;
		}

		fs = ufsp->vfs_fs;

		/*
		 * don't update a locked superblock during a panic; it
		 * may be in an inconsistent state
		 */
		if (panicstr) {
			if (!mutex_tryenter(&ufsp->vfs_lock)) {
				vfs_unlock(vfsp);
				continue;
			}
		} else
			mutex_enter(&ufsp->vfs_lock);
		/*
		 * Build up the STABLE check list, so we can unlock the vfs
		 * until we do the actual checking.
		 */
		if (check_list != NULL) {
			if ((fs->fs_ronly == 0) &&
			    (fs->fs_clean != FSBAD) &&
			    (fs->fs_clean != FSSUSPEND)) {
				ptr->vfsp = vfsp;
				ptr->ufsvfs = ufsp;
				ptr->vfs_dev = vfsp->vfs_dev;
				ptr++;
				check_cnt++;
			}
		}

		/*
		 * superblock is not modified
		 */
		if (fs->fs_fmod == 0) {
			mutex_exit(&ufsp->vfs_lock);
			vfs_unlock(vfsp);
			continue;
		}
		if (fs->fs_ronly != 0) {
			mutex_exit(&ufsp->vfs_lock);
			vfs_unlock(vfsp);
			(void) ufs_fault(ufsp->vfs_root,
			    "fs = %s update: ro fs mod\n", fs->fs_fsmnt);
			/*
			 * XXX:	Why is this a return instead of a continue?
			 *	This may be an attempt to replace a panic with
			 *	something less drastic, but there's cleanup we
			 *	should be doing that's not being done (e.g.,
			 *	unlocking the remaining entries on the list).
			 */
			return;
		}
		fs->fs_fmod = 0;
		mutex_exit(&ufsp->vfs_lock);
		TRANS_SBUPDATE(ufsp, vfsp, TOP_SBUPDATE_UPDATE);
		vfs_unlock(vfsp);
	}

	ufs_sync_time = time;

	/*
	 * Avoid racing with ufs_unmount() and ufs_sync().
	 */
	mutex_enter(&ufs_scan_lock);

	(void) ufs_scan_inodes(1, ufs_sync_inode, (void *)(uintptr_t)cheap,
	    NULL);

	mutex_exit(&ufs_scan_lock);

	/*
	 * Force stale buffer cache information to be flushed,
	 * for all devices.  This should cause any remaining control
	 * information (e.g., cg and inode info) to be flushed back.
	 */
	bflush((dev_t)NODEV);

	if (check_list == NULL)
		return;

	/*
	 * For each UFS filesystem in the STABLE check_list, update
	 * the clean flag if warranted.
	 */
	for (ptr = check_list; check_cnt > 0; check_cnt--, ptr++) {
		int error;

		/*
		 * still_mounted() returns with vfsp and the vfs_reflock
		 * held if ptr refers to a vfs that is still mounted.
		 */
		if ((vfsp = still_mounted(ptr)) == NULL)
			continue;
		ufs_checkclean(vfsp);
		/*
		 * commit any outstanding async transactions
		 */
		ufsp = (struct ufsvfs *)vfsp->vfs_data;
		curthread->t_flag |= T_DONTBLOCK;
		TRANS_BEGIN_SYNC(ufsp, TOP_COMMIT_UPDATE, TOP_COMMIT_SIZE,
		    error);
		if (!error) {
			TRANS_END_SYNC(ufsp, error, TOP_COMMIT_UPDATE,
			    TOP_COMMIT_SIZE);
		}
		curthread->t_flag &= ~T_DONTBLOCK;

		vfs_unlock(vfsp);
	}

	kmem_free(check_list, check_size);
}

int
ufs_sync_inode(struct inode *ip, void *arg)
{
	int cheap = (int)(uintptr_t)arg;
	struct ufsvfs *ufsvfsp;
	uint_t flag = ip->i_flag;

	if (cheap && ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) == 0))
		return (0);

	/*
	 * if we are panic'ing, then don't update the inode if this
	 * file system is FSSTABLE.  Otherwise, we would have to
	 * force the superblock to FSACTIVE and the superblock
	 * may not be in a good state.  Also, if the inode is
	 * IREF'ed then it may be in an inconsistent state.  Don't
	 * push it.  Finally, don't push the inode if the fs is
	 * logging; the transaction will be discarded at boot.
	 */
	if (panicstr) {

		if (flag & IREF)
			return (0);

		if (ip->i_ufsvfs == NULL ||
		    (ip->i_fs->fs_clean == FSSTABLE ||
		    ip->i_fs->fs_clean == FSLOG))
			return (0);
	}

	ufsvfsp = ip->i_ufsvfs;

	/*
	 * Limit access time only updates
	 */
	if (((flag & (IMOD|IMODACC|IUPD|ICHG|IACC)) == IMODACC) && ufsvfsp) {
		/*
		 * if file system has deferred access time turned on and there
		 * was no IO recently, don't bother flushing it.  It will be
		 * flushed when I/Os start again.
		 */
		if (cheap && (ufsvfsp->vfs_dfritime & UFS_DFRATIME) &&
		    (ufsvfsp->vfs_iotstamp + ufs_iowait < lbolt))
			return (0);
		/*
		 * an app issuing a sync() can take forever on a trans device
		 * when NetWorker or find is running because all of the
		 * directories' access times have to be updated.  So, we limit
		 * the time we spend updating access times per sync.
		 */
		if (TRANS_ISTRANS(ufsvfsp) && ((ufs_sync_time +
		    ufs_sync_time_secs) < time))
			return (0);
	}

	/*
	 * if we are running on behalf of the flush thread or this is
	 * a swap file, then simply do a delayed update of the inode.
	 * Otherwise, push the pages and then do a delayed inode update.
	 */
	if (cheap || IS_SWAPVP(ITOV(ip))) {
		TRANS_IUPDAT(ip, 0);
	} else {
		(void) TRANS_SYNCIP(ip, B_ASYNC, I_ASYNC, TOP_SYNCIP_SYNC);
	}
	return (0);
}

/*
 * Flush all the pages associated with an inode using the given 'flags',
 * then force inode information to be written back using the given 'waitfor'.
 */
int
ufs_syncip(struct inode *ip, int flags, int waitfor, top_t topid)
{
	int error;
	struct vnode *vp = ITOV(ip);
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	int dotrans = 0;

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return (EIO);
	/*
	 * don't need to VOP_PUTPAGE if there are no pages
	 */
	if (!vn_has_cached_data(vp) || vp->v_type == VCHR) {
		error = 0;
	} else {
		/*
		 * if the inode we're working on is a shadow inode
		 * or quota inode we need to make sure that the
		 * ufs_putpage call is inside a transaction as this
		 * could include meta data changes.
		 */
		if ((ip->i_mode & IFMT) == IFSHAD ||
		    ufsvfsp->vfs_qinod == ip) {
			dotrans = 1;
			curthread->t_flag |= T_DONTBLOCK;
			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE,
			    TOP_PUTPAGE_SIZE(ip));
		}
		error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
		    flags, CRED(), NULL);
		if (dotrans) {
			TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE,
			    TOP_PUTPAGE_SIZE(ip));
			curthread->t_flag &= ~T_DONTBLOCK;
			dotrans = 0;
		}
	}
	if (panicstr && TRANS_ISTRANS(ufsvfsp))
		goto out;
	/*
	 * waitfor represents two things -
	 * 1. whether data sync or file sync.
	 * 2. if file sync then ufs_iupdat should 'waitfor' disk i/o or not.
	 */
	if (waitfor == I_DSYNC) {
		/*
		 * If data sync, only IATTCHG (size/block change) requires
		 * inode update, fdatasync()/FDSYNC implementation.
		 */
		if (ip->i_flag & (IBDWRITE|IATTCHG)) {
			/*
			 * Enter a transaction to provide mutual exclusion
			 * with deltamap_push and avoid a race where
			 * the inode flush could get dropped.
			 */
			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
				dotrans = 1;
				curthread->t_flag |= T_DONTBLOCK;
				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
			}
			rw_enter(&ip->i_contents, RW_READER);
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IMODTIME;
			mutex_exit(&ip->i_tlock);
			ufs_iupdat(ip, 1);
			rw_exit(&ip->i_contents);
			if (dotrans) {
				TRANS_END_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
				curthread->t_flag &= ~T_DONTBLOCK;
			}
		}
	} else {
		/* For file sync, any inode change requires inode update */
		if (ip->i_flag & (IBDWRITE|IUPD|IACC|ICHG|IMOD|IMODACC)) {
			/*
			 * Enter a transaction to provide mutual exclusion
			 * with deltamap_push and avoid a race where
			 * the inode flush could get dropped.
			 */
			if ((curthread->t_flag & T_DONTBLOCK) == 0) {
				dotrans = 1;
				curthread->t_flag |= T_DONTBLOCK;
				TRANS_BEGIN_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
			}
			rw_enter(&ip->i_contents, RW_READER);
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IMODTIME;
			mutex_exit(&ip->i_tlock);
			ufs_iupdat(ip, waitfor);
			rw_exit(&ip->i_contents);
			if (dotrans) {
				TRANS_END_ASYNC(ufsvfsp, topid,
				    TOP_SYNCIP_SIZE);
				curthread->t_flag &= ~T_DONTBLOCK;
			}
		}
	}

out:
	return (error);
}
/*
 * Flush all indirect blocks related to an inode.
 * Supports triple indirect blocks also.
 */
int
ufs_sync_indir(struct inode *ip)
{
	int i;
	daddr_t blkno;
	daddr_t lbn;	/* logical blkno of last blk in file */
	daddr_t clbn;	/* current logical blk */
	daddr32_t *bap;
	struct fs *fs;
	struct buf *bp;
	int bsize;
	struct ufsvfs *ufsvfsp;
	int j;
	daddr_t indirect_blkno;
	daddr32_t *indirect_bap;
	struct buf *indirect_bp;

	ufsvfsp = ip->i_ufsvfs;
	/*
	 * unnecessary when logging; allocation blocks are kept up-to-date
	 */
	if (TRANS_ISTRANS(ufsvfsp))
		return (0);

	fs = ufsvfsp->vfs_fs;
	bsize = fs->fs_bsize;
	lbn = (daddr_t)lblkno(fs, ip->i_size - 1);
	if (lbn < NDADDR)
		return (0);	/* No indirect blocks used */
	if (lbn < NDADDR + NINDIR(fs)) {
		/* File has one indirect block. */
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, ip->i_ib[0]));
		return (0);
	}

	/* Write out all the first level indirect blocks */
	for (i = 0; i < NIADDR; i++) {
		if ((blkno = ip->i_ib[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
	}
	/* Write out second level of indirect blocks */
	if ((blkno = ip->i_ib[1]) == 0)
		return (0);
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	bap = bp->b_un.b_daddr;
	clbn = NDADDR + NINDIR(fs);
	for (i = 0; i < NINDIR(fs); i++) {
		if (clbn > lbn)
			break;
		clbn += NINDIR(fs);
		if ((blkno = bap[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
	}

	brelse(bp);
	/* write out third level indirect blocks */

	if ((blkno = ip->i_ib[2]) == 0)
		return (0);

	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, blkno), bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	bap = bp->b_un.b_daddr;
	clbn = NDADDR + NINDIR(fs) + (NINDIR(fs) * NINDIR(fs));

	for (i = 0; i < NINDIR(fs); i++) {
		if (clbn > lbn)
			break;
		if ((indirect_blkno = bap[i]) == 0)
			continue;
		blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, indirect_blkno));
		indirect_bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fs, indirect_blkno), bsize);
		if (indirect_bp->b_flags & B_ERROR) {
			brelse(indirect_bp);
			brelse(bp);
			return (EIO);
		}
		indirect_bap = indirect_bp->b_un.b_daddr;
		for (j = 0; j < NINDIR(fs); j++) {
			if (clbn > lbn)
				break;
			clbn += NINDIR(fs);
			if ((blkno = indirect_bap[j]) == 0)
				continue;
			blkflush(ip->i_dev, (daddr_t)fsbtodb(fs, blkno));
		}
		brelse(indirect_bp);
	}
	brelse(bp);

	return (0);
}

/*
 * Flush all indirect blocks related to an offset of a file.
 * read/write in sync mode may have to flush indirect blocks.
 */
int
ufs_indirblk_sync(struct inode *ip, offset_t off)
{
	daddr_t lbn;
	struct fs *fs;
	struct buf *bp;
	int i, j, shft;
	daddr_t ob, nb, tbn;
	daddr32_t *bap;
	int nindirshift, nindiroffset;
	struct ufsvfs *ufsvfsp;

	ufsvfsp = ip->i_ufsvfs;
	/*
	 * unnecessary when logging; allocation blocks are kept up-to-date
	 */
	if (TRANS_ISTRANS(ufsvfsp))
		return (0);

	fs = ufsvfsp->vfs_fs;

	lbn = (daddr_t)lblkno(fs, off);
	if (lbn < 0)
		return (EFBIG);

	/* The first NDADDR are direct so nothing to do */
	if (lbn < NDADDR)
		return (0);

	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;

	/* Determine level of indirect blocks */
	shft = 0;
	tbn = lbn - NDADDR;
	for (j = NIADDR; j > 0; j--) {
		longlong_t sh;

		shft += nindirshift;
		sh = 1LL << shft;
		if (tbn < sh)
			break;
		tbn -= (daddr_t)sh;
	}

	if (j == 0)
		return (EFBIG);

	if ((nb = ip->i_ib[NIADDR - j]) == 0)
		return (0);		/* UFS Hole */

	/* Flush first level indirect block */
	blkflush(ip->i_dev, fsbtodb(fs, nb));

	/* Fetch through next levels */
	for (; j < NIADDR; j++) {
		ob = nb;
		bp = UFS_BREAD(ufsvfsp,
		    ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			brelse(bp);
			return (EIO);
		}
		bap = bp->b_un.b_daddr;
		shft -= nindirshift;			/* sh / nindir */
		i = (tbn >> shft) & nindiroffset;	/* (tbn / sh) & nindir */
		nb = bap[i];
		brelse(bp);
		if (nb == 0) {
			return (0);	/* UFS hole */
		}
		blkflush(ip->i_dev, fsbtodb(fs, nb));
	}
	return (0);
}

#ifdef DEBUG

/*
 * The bad block checking routines: ufs_indir_badblock() and ufs_badblock()
 * are very expensive.  It's been found from profiling that we're
 * spending 6-7% of our time in ufs_badblock, and another 1-2% in
 * ufs_indir_badblock.  They are only called via ASSERTs (from debug kernels).
 * In addition from experience no failures have been found in recent
 * years.  So the following tunable can be set to enable checking.
 */
int ufs_badblock_checks = 0;

/*
 * Check that a given indirect block contains blocks in range
 */
int
ufs_indir_badblock(struct inode *ip, daddr32_t *bap)
{
	int i;
	int err = 0;

	if (ufs_badblock_checks) {
		for (i = 0; i < NINDIR(ip->i_fs) - 1; i++)
			if (bap[i] != 0 && (err = ufs_badblock(ip, bap[i])))
				break;
	}
	return (err);
}

/*
 * Check that a specified block number is in range.
 */
int
ufs_badblock(struct inode *ip, daddr_t bn)
{
	long c;
	daddr_t sum;

	if (!ufs_badblock_checks)
		return (0);
	ASSERT(bn);
	if (bn <= 0 || bn > ip->i_fs->fs_size)
		return (bn);

	sum = 0;
	c = dtog(ip->i_fs, bn);
	if (c == 0) {
		sum = howmany(ip->i_fs->fs_cssize, ip->i_fs->fs_fsize);
	}
	/*
	 * if block no. is below this cylinder group,
	 * within the space reserved for superblock, inodes, (summary data)
	 * or if it is above this cylinder group
	 * then it's invalid
	 * It's hard to see how we'd be outside this cyl, but let's be careful.
	 */
	if ((bn < cgbase(ip->i_fs, c)) ||
	    (bn >= cgsblock(ip->i_fs, c) && bn < cgdmin(ip->i_fs, c) + sum) ||
	    (bn >= (unsigned)cgbase(ip->i_fs, c + 1)))
		return (bn);

	return (0);	/* not a bad block */
}

#endif /* DEBUG */

/*
 * When i_rwlock is write-locked or has a writer pending, the inode is
 * going to change in a way that requires the filesystem to be marked as
 * active, so there is no need to mark the filesystem stable now.
 * Also, to ensure filesystem consistency during directory operations,
 * the filesystem cannot be marked stable if the i_rwlock of the
 * directory inode is write-locked.
 */

/*
 * Check for busy inodes for this filesystem.
 * NOTE: Needs better way to do this expensive operation in the future.
 */
static void
ufs_icheck(struct ufsvfs *ufsvfsp, int *isbusyp, int *isreclaimp)
{
	union ihead *ih;
	struct inode *ip;
	int i;
	int isnottrans = !TRANS_ISTRANS(ufsvfsp);
	int isbusy = *isbusyp;
	int isreclaim = *isreclaimp;

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0];
		    ip != (struct inode *)ih;
		    ip = ip->i_forw) {
			/*
			 * if inode is busy/modified/deleted, filesystem is busy
			 */
			if (ip->i_ufsvfs != ufsvfsp)
				continue;
			if ((ip->i_flag & (IMOD | IUPD | ICHG)) ||
			    (RW_ISWRITER(&ip->i_rwlock)))
				isbusy = 1;
			if ((ip->i_nlink <= 0) && (ip->i_flag & IREF))
				isreclaim = 1;
			if (isbusy && (isreclaim || isnottrans))
				break;
		}
		mutex_exit(&ih_lock[i]);
		if (isbusy && (isreclaim || isnottrans))
			break;
	}
	*isbusyp = isbusy;
	*isreclaimp = isreclaim;
}

/*
 * As part of the ufs 'sync' operation, this routine is called to mark
 * the filesystem as STABLE if there is no modified metadata in memory.
 */
void
ufs_checkclean(struct vfs *vfsp)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	struct fs *fs = ufsvfsp->vfs_fs;
	int isbusy;
	int isreclaim;
	int updatesb;

	ASSERT(vfs_lock_held(vfsp));

	/*
	 * filesystem is stable or cleanflag processing is disabled; do nothing
	 * no transitions when panic'ing
	 */
	if (fs->fs_ronly ||
	    fs->fs_clean == FSBAD ||
	    fs->fs_clean == FSSUSPEND ||
	    fs->fs_clean == FSSTABLE ||
	    panicstr)
		return;

	/*
	 * if logging and nothing to reclaim; do nothing
	 */
	if ((fs->fs_clean == FSLOG) &&
	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
	    (fs->fs_reclaim & FS_RECLAIMING)))
		return;

	/*
	 * FS_CHECKCLEAN is reset if the file system goes dirty
	 * FS_CHECKRECLAIM is reset if a file gets deleted
	 */
	mutex_enter(&ufsvfsp->vfs_lock);
	fs->fs_reclaim |= (FS_CHECKCLEAN | FS_CHECKRECLAIM);
	mutex_exit(&ufsvfsp->vfs_lock);

	updatesb = 0;

	/*
	 * if logging or buffers are busy; do nothing
	 */
	isbusy = isreclaim = 0;
	if ((fs->fs_clean == FSLOG) ||
	    (bcheck(vfsp->vfs_dev, ufsvfsp->vfs_bufp)))
		isbusy = 1;

	/*
	 * isreclaim == TRUE means can't change the state of fs_reclaim
	 */
	isreclaim =
	    ((fs->fs_clean == FSLOG) &&
	    (((fs->fs_reclaim & FS_RECLAIM) == 0) ||
	    (fs->fs_reclaim & FS_RECLAIMING)));

	/*
	 * if fs is busy or can't change the state of fs_reclaim; do nothing
	 */
	if (isbusy && isreclaim)
		return;

	/*
	 * look for busy or deleted inodes; (deleted == needs reclaim)
	 */
	ufs_icheck(ufsvfsp, &isbusy, &isreclaim);

	mutex_enter(&ufsvfsp->vfs_lock);

	/*
	 * IF POSSIBLE, RESET RECLAIM
	 */
	/*
	 * the reclaim thread is not running
	 */
	if ((fs->fs_reclaim & FS_RECLAIMING) == 0)
		/*
		 * no files were deleted during the scan
		 */
		if (fs->fs_reclaim & FS_CHECKRECLAIM)
			/*
			 * no deleted files were found in the inode cache
			 */
			if ((isreclaim == 0) && (fs->fs_reclaim & FS_RECLAIM)) {
				fs->fs_reclaim &= ~FS_RECLAIM;
				updatesb = 1;
			}
	/*
	 * IF POSSIBLE, SET STABLE
	 */
	/*
	 * not logging
	 */
	if (fs->fs_clean != FSLOG)
		/*
		 * file system has not gone dirty since the scan began
		 */
		if (fs->fs_reclaim & FS_CHECKCLEAN)
			/*
			 * nothing dirty was found in the buffer or inode cache
			 */
			if ((isbusy == 0) && (isreclaim == 0) &&
			    (fs->fs_clean != FSSTABLE)) {
				fs->fs_clean = FSSTABLE;
				updatesb = 1;
			}

	mutex_exit(&ufsvfsp->vfs_lock);
	if (updatesb) {
		TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE);
	}
}

/*
 * called whenever an unlink occurs
 */
void
ufs_setreclaim(struct inode *ip)
{
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;

	if (ip->i_nlink || fs->fs_ronly || (fs->fs_clean != FSLOG))
		return;

	/*
	 * reclaim-needed bit is already set or we need to tell
	 * ufs_checkclean that a file has been deleted
	 */
	if ((fs->fs_reclaim & (FS_RECLAIM | FS_CHECKRECLAIM)) == FS_RECLAIM)
		return;

	mutex_enter(&ufsvfsp->vfs_lock);
	/*
	 * inform ufs_checkclean that the file system has gone dirty
	 */
	fs->fs_reclaim &= ~FS_CHECKRECLAIM;

	/*
	 * set the reclaim-needed bit
	 */
	if ((fs->fs_reclaim & FS_RECLAIM) == 0) {
		fs->fs_reclaim |= FS_RECLAIM;
		ufs_sbwrite(ufsvfsp);
	}
	mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * Before any modified metadata is written back to disk, this routine
 * is called to mark the filesystem as ACTIVE.
 */
void
ufs_notclean(struct ufsvfs *ufsvfsp)
{
	struct fs *fs = ufsvfsp->vfs_fs;

	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));

	/*
	 * inform ufs_checkclean that the file system has gone dirty
	 */
	fs->fs_reclaim &= ~FS_CHECKCLEAN;

	/*
	 * ignore if active or bad or suspended or readonly or logging
	 */
	if ((fs->fs_clean == FSACTIVE) || (fs->fs_clean == FSLOG) ||
	    (fs->fs_clean == FSBAD) || (fs->fs_clean == FSSUSPEND) ||
	    (fs->fs_ronly)) {
		mutex_exit(&ufsvfsp->vfs_lock);
		return;
	}
	fs->fs_clean = FSACTIVE;
	/*
	 * write superblock synchronously
	 */
	ufs_sbwrite(ufsvfsp);
	mutex_exit(&ufsvfsp->vfs_lock);
}

/*
 * ufs specific fbwrite()
 */
int
ufs_fbwrite(struct fbuf *fbp, struct inode *ip)
{
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;

	if (TRANS_ISTRANS(ufsvfsp))
		return (fbwrite(fbp));
	mutex_enter(&ufsvfsp->vfs_lock);
	ufs_notclean(ufsvfsp);
	return ((ufsvfsp->vfs_dio) ? fbdwrite(fbp) : fbwrite(fbp));
}

/*
 * ufs specific fbiwrite()
 */
int
ufs_fbiwrite(struct fbuf *fbp, struct inode *ip, daddr_t bn, long bsize)
{
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	o_mode_t ifmt = ip->i_mode & IFMT;
	buf_t *bp;
	int error;

	mutex_enter(&ufsvfsp->vfs_lock);
	ufs_notclean(ufsvfsp);
	if (ifmt == IFDIR || ifmt == IFSHAD || ifmt == IFATTRDIR ||
	    (ip->i_ufsvfs->vfs_qinod == ip)) {
		TRANS_DELTA(ufsvfsp, ldbtob(bn * (offset_t)(btod(bsize))),
		    fbp->fb_count, DT_FBI, 0, 0);
	}
	/*
	 * Inlined version of fbiwrite()
	 */
	bp = pageio_setup((struct page *)NULL, fbp->fb_count,
	    ip->i_devvp, B_WRITE);
	bp->b_flags &= ~B_PAGEIO;
	bp->b_un.b_addr = fbp->fb_addr;

	bp->b_blkno = bn * btod(bsize);
	bp->b_dev = cmpdev(ip->i_dev);	/* store in old dev format */
	bp->b_edev = ip->i_dev;
	bp->b_proc = NULL;		/* i.e. the kernel */
	bp->b_file = ip->i_vnode;
	bp->b_offset = -1;

	if (ufsvfsp->vfs_log) {
		lufs_write_strategy(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = lbolt;
		ub.ub_fbiwrites.value.ul++;
		(void) bdev_strategy(bp);
		lwp_stat_update(LWP_STAT_OUBLK, 1);
	}
	error = biowait(bp);
	pageio_done(bp);
	fbrelse(fbp, S_OTHER);
	return (error);
}

/*
 * Write the ufs superblock only.
 */
void
ufs_sbwrite(struct ufsvfs *ufsvfsp)
{
	char sav_fs_fmod;
	struct fs *fs = ufsvfsp->vfs_fs;
	struct buf *bp = ufsvfsp->vfs_bufp;

	ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));

	/*
	 * for ulockfs processing, limit the superblock writes
	 */
	if ((ufsvfsp->vfs_ulockfs.ul_sbowner) &&
	    (curthread != ufsvfsp->vfs_ulockfs.ul_sbowner)) {
		/* try again later */
		fs->fs_fmod = 1;
		return;
	}

	ULOCKFS_SET_MOD((&ufsvfsp->vfs_ulockfs));
	/*
	 * update superblock timestamp and fs_clean checksum
	 * if marked FSBAD, we always want an erroneous
	 * checksum to force repair
	 */
	fs->fs_time = gethrestime_sec();
	fs->fs_state = (fs->fs_clean != FSBAD) ?
	    FSOKAY - fs->fs_time : -(FSOKAY - fs->fs_time);
	switch (fs->fs_clean) {
	case FSCLEAN:
	case FSSTABLE:
		fs->fs_reclaim &= ~FS_RECLAIM;
		break;
	case FSACTIVE:
	case FSSUSPEND:
	case FSBAD:
	case FSLOG:
		break;
	default:
		fs->fs_clean = FSACTIVE;
		break;
	}
	/*
	 * reset incore only bits
	 */
	fs->fs_reclaim &= ~(FS_CHECKCLEAN | FS_CHECKRECLAIM);

	/*
	 * delta the whole superblock
	 */
	TRANS_DELTA(ufsvfsp, ldbtob(SBLOCK), sizeof (struct fs),
	    DT_SB, NULL, 0);
	/*
	 * retain the incore state of fs_fmod; set the ondisk state to 0
	 */
	sav_fs_fmod = fs->fs_fmod;
	fs->fs_fmod = 0;

	/*
	 * Don't release the buffer after it is written to the disk
	 */
	UFS_BWRITE2(ufsvfsp, bp);
	fs->fs_fmod = sav_fs_fmod;	/* reset fs_fmod's incore state */
}

/*
 * Returns vfs pointer if vfs is still mounted; vfs lock is held.
 * Otherwise, returns NULL.
 *
 * For our purposes, "still mounted" means that the file system still appears
 * on the list of UFS file system instances.
 */
static vfs_t *
still_mounted(struct check_node *checkp)
{
	struct vfs *vfsp;
	struct ufsvfs *ufsp;

	mutex_enter(&ufsvfs_mutex);
	for (ufsp = ufs_instances; ufsp != NULL; ufsp = ufsp->vfs_next) {
		if (ufsp != checkp->ufsvfs)
			continue;
		/*
		 * Tentative match:  verify it and try to lock.  (It's not at
		 * all clear how the verification could fail, given that we've
		 * gotten this far.  We would have had to reallocate the
		 * ufsvfs struct at hand for a new incarnation; is that really
		 * possible in the interval from constructing the check_node
		 * to here?)
		 */
		vfsp = ufsp->vfs_vfs;
		if (vfsp != checkp->vfsp)
			continue;
		if (vfsp->vfs_dev != checkp->vfs_dev)
			continue;
		if (vfs_lock(vfsp) != 0)
			continue;

		mutex_exit(&ufsvfs_mutex);
		return (vfsp);
	}
	mutex_exit(&ufsvfs_mutex);
	return (NULL);
}

int
ufs_si_io_done(struct buf *bp)
{
	sema_v(&bp->b_io);
	return (0);
}

#define	SI_BUFSZ	roundup(sizeof (struct cg), DEV_BSIZE)
#define	NSIBUF		32

/*
 * ufs_construct_si()
 * Read each cylinder group in turn and construct the summary information
 */
static int
ufs_construct_si(dev_t dev, struct fs *fs, struct ufsvfs *ufsvfsp)
{
	buf_t *bps, *bp;
	char *bufs;
	struct csum *sip = fs->fs_u.fs_csp;
	struct cg *cgp;
	int i, ncg;
	int error = 0, cg = 0;

	bps = kmem_alloc(NSIBUF * sizeof (buf_t), KM_SLEEP);
	bufs = kmem_alloc(NSIBUF * SI_BUFSZ, KM_SLEEP);

	/*
	 * Initialise the buffer headers
	 */
	for (bp = bps, i = 0; i < NSIBUF; i++, bp++) {
		bioinit(bp);
		bp->b_iodone = ufs_si_io_done;
		bp->b_bufsize = bp->b_bcount = SI_BUFSZ;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = bufs + (i * SI_BUFSZ);
		bp->b_edev = dev;
	}

	/*
	 * Repeat while there are cylinder groups left to read.
	 */
	do {
		/*
		 * Issue up to NSIBUF asynchronous reads
		 */
		ncg = MIN(NSIBUF, (fs->fs_ncg - cg));
		for (bp = bps, i = 0; i < ncg; i++, bp++) {
			bp->b_blkno = (daddr_t)fsbtodb(fs, cgtod(fs, cg + i));
			if (ufsvfsp->vfs_log) {
				lufs_read_strategy(ufsvfsp->vfs_log, bp);
			} else {
				(void) bdev_strategy(bp);
			}
		}

		/*
		 * wait for each read to finish;
		 * check for errors and copy the csum info
		 */
		for (bp = bps, i = 0; i < ncg; i++, bp++) {
			sema_p(&bp->b_io);
			if (!error) {
				cgp = bp->b_un.b_cg;
				sip[cg + i] = cgp->cg_cs;
				error = geterror(bp);
			}
		}
		if (error) {
			goto err;
		}
		cg += ncg;
	} while (cg < fs->fs_ncg);

err:
	kmem_free(bps, NSIBUF * sizeof (buf_t));
	kmem_free(bufs, NSIBUF * SI_BUFSZ);
	return (error);
}

/*
 * ufs_getsummaryinfo
 */
int
ufs_getsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
	int		i;	/* `for' loop counter */
	ssize_t		size;	/* bytes of summary info to read */
	daddr_t		frags;	/* frags of summary info to read */
	caddr_t		sip;	/* summary info */
	struct buf	*tp;	/* tmp buf */

	/*
	 * maintain metadata map for trans device (debug only)
	 */
	TRANS_MATA_SI(ufsvfsp, fs);

	/*
	 * Compute #frags and allocate space for summary info
	 */
	frags = howmany(fs->fs_cssize, fs->fs_fsize);
	sip = kmem_alloc((size_t)fs->fs_cssize, KM_SLEEP);
	fs->fs_u.fs_csp = (struct csum *)sip;

	if (fs->fs_si == FS_SI_BAD) {
		/*
		 * The summary information is unknown, read it in from
		 * the cylinder groups.
		 */
		if (TRANS_ISTRANS(ufsvfsp) && !TRANS_ISERROR(ufsvfsp) &&
		    ufsvfsp->vfs_log->un_logmap) {
			logmap_roll_dev(ufsvfsp->vfs_log); /* flush the log */
		}
		bzero(sip, (size_t)fs->fs_cssize);
		if (ufs_construct_si(dev, fs, ufsvfsp)) {
			kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
			fs->fs_u.fs_csp = NULL;
			return (EIO);
		}
	} else {
		/* Read summary info a fs block at a time */
		size = fs->fs_bsize;
		for (i = 0; i < frags; i += fs->fs_frag) {
			if (i + fs->fs_frag > frags)
				/*
				 * This happens only on the last iteration, so
				 * don't worry about size being reset
				 */
				size = (frags - i) * fs->fs_fsize;
			tp = UFS_BREAD(ufsvfsp, dev,
			    (daddr_t)fsbtodb(fs, fs->fs_csaddr + i), size);
			tp->b_flags |= B_STALE | B_AGE;
			if (tp->b_flags & B_ERROR) {
				kmem_free(fs->fs_u.fs_csp, fs->fs_cssize);
				fs->fs_u.fs_csp = NULL;
				brelse(tp);
				return (EIO);
			}
			bcopy(tp->b_un.b_addr, sip, size);
			sip += size;
			brelse(tp);
		}
	}
	bzero((caddr_t)&fs->fs_cstotal, sizeof (fs->fs_cstotal));
	for (i = 0; i < fs->fs_ncg; ++i) {
		fs->fs_cstotal.cs_ndir += fs->fs_cs(fs, i).cs_ndir;
		fs->fs_cstotal.cs_nbfree += fs->fs_cs(fs, i).cs_nbfree;
		fs->fs_cstotal.cs_nifree += fs->fs_cs(fs, i).cs_nifree;
		fs->fs_cstotal.cs_nffree += fs->fs_cs(fs, i).cs_nffree;
	}
	return (0);
}

/*
 * ufs_putsummaryinfo() stores all the cylinder group summary information.
 * This is only used when logging, but the file system may not
 * be logging at the time, e.g. a read-only mount to flush the log
 * may push the summary info out.
 */
int
ufs_putsummaryinfo(dev_t dev, struct ufsvfs *ufsvfsp, struct fs *fs)
{
	struct buf	b, *bp;	/* tmp buf */
	caddr_t		sip;	/* summary info */
	ssize_t		size;	/* bytes of summary info to write */
	daddr_t		frags;	/* frags of summary info to write */
	int		i;	/* `for' loop counter */
	int		error;	/* error */

	if (TRANS_ISERROR(ufsvfsp)) {
		return (EIO);
	}

	if ((fs->fs_si != FS_SI_BAD) || !ufsvfsp->vfs_nolog_si) {
		return (0);
	}

	bp = &b;
	bioinit(bp);
	bp->b_iodone = ufs_si_io_done;
	bp->b_bufsize = size = fs->fs_bsize;
	bp->b_flags = B_WRITE;
	bp->b_un.b_addr = kmem_alloc(size, KM_SLEEP);
	bp->b_edev = dev;
	frags = howmany(fs->fs_cssize, fs->fs_fsize);
	sip = (caddr_t)fs->fs_u.fs_csp;

	/* Write summary info one fs block at a time */
	for (error = 0, i = 0; (i < frags) && (error == 0); i += fs->fs_frag) {
		if (i + fs->fs_frag > frags) {
			/*
			 * This happens only on the last iteration, so
			 * don't worry about size being reset
			 */
			size = (frags - i) * fs->fs_fsize;
		}
		bcopy(sip, bp->b_un.b_addr, size);
		bp->b_blkno = (daddr_t)fsbtodb(fs, fs->fs_csaddr + i);
		bp->b_bcount = size;
		(void) bdev_strategy(bp);
		sema_p(&bp->b_io);	/* wait for write to complete */
		error = geterror(bp);
		sip += size;
	}
	kmem_free(bp->b_un.b_addr, fs->fs_bsize);
	if (!error) {
		fs->fs_si = FS_SI_OK;
	}
	return (error);
}

/*
 * Decide whether it is okay to remove within a sticky directory.
 * Two conditions need to be met:  write access to the directory
 * is needed.
 * In sticky directories, write access is not sufficient;
 * you can remove entries from a directory only if you own the directory,
 * if you are privileged, if you own the entry or if the entry is
 * a plain file and you have write access to that file.
 * Function returns 0 if remove access is granted.
 */
int
ufs_sticky_remove_access(struct inode *dp, struct inode *ip, struct cred *cr)
{
	uid_t uid;

	if ((dp->i_mode & ISVTX) &&
	    (uid = crgetuid(cr)) != dp->i_uid &&
	    uid != ip->i_uid &&
	    ((ip->i_mode & IFMT) != IFREG ||
	    ufs_iaccess(ip, IWRITE, cr) != 0))
		return (secpolicy_vnode_remove(cr));

	return (0);
}
#endif	/* _KERNEL */

extern	int around[9];
extern	int inside[9];
extern	uchar_t *fragtbl[];

/*
 * Update the frsum fields to reflect addition or deletion
 * of some frags.
 */
void
fragacct(struct fs *fs, int fragmap, int32_t *fraglist, int cnt)
{
	int inblk;
	int field, subfield;
	int siz, pos;

	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
	fragmap <<= 1;
	for (siz = 1; siz < fs->fs_frag; siz++) {
		if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0)
			continue;
		field = around[siz];
		subfield = inside[siz];
		for (pos = siz; pos <= fs->fs_frag; pos++) {
			if ((fragmap & field) == subfield) {
				fraglist[siz] += cnt;
				ASSERT(fraglist[siz] >= 0);
				pos += siz;
				field <<= siz;
				subfield <<= siz;
			}
			field <<= 1;
			subfield <<= 1;
		}
	}
}

/*
 * Block operations
 */

/*
 * Check if a block is available
 */
int
isblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	uchar_t mask;

	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		return (cp[h] == 0xff);
	case 4:
		mask = 0x0f << ((h & 0x1) << 2);
		return ((cp[h >> 1] & mask) == mask);
	case 2:
		mask = 0x03 << ((h & 0x3) << 1);
		return ((cp[h >> 2] & mask) == mask);
	case 1:
		mask = 0x01 << (h & 0x7);
		return ((cp[h >> 3] & mask) == mask);
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "isblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		return (0);
	}
}

/*
 * Take a block out of the map
 */
void
clrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		cp[h] = 0;
		return;
	case 4:
		cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
		return;
	case 2:
		cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
		return;
	case 1:
		cp[h >> 3] &= ~(0x01 << (h & 0x7));
		return;
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "clrblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		return;
	}
}

/*
 * Is block allocated?
 */
int
isclrblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	uchar_t mask;
	int frag;
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	frag = fs->fs_frag;
	ASSERT(frag == 8 || frag == 4 || frag == 2 || frag == 1);
	switch (frag) {
	case 8:
		return (cp[h] == 0);
	case 4:
		mask = ~(0x0f << ((h & 0x1) << 2));
		return (cp[h >> 1] == (cp[h >> 1] & mask));
	case 2:
		mask = ~(0x03 << ((h & 0x3) << 1));
		return (cp[h >> 2] == (cp[h >> 2] & mask));
	case 1:
		mask = ~(0x01 << (h & 0x7));
		return (cp[h >> 3] == (cp[h >> 3] & mask));
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "isclrblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		break;
	}
	return (0);
}

/*
 * Put a block into the map
 */
void
setblock(struct fs *fs, uchar_t *cp, daddr_t h)
{
	ASSERT(fs->fs_frag == 8 || fs->fs_frag == 4 || fs->fs_frag == 2 || \
	    fs->fs_frag == 1);
	/*
	 * ufsvfsp->vfs_lock is held when calling this.
	 */
	switch ((int)fs->fs_frag) {
	case 8:
		cp[h] = 0xff;
		return;
	case 4:
		cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
		return;
	case 2:
		cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
		return;
	case 1:
		cp[h >> 3] |= (0x01 << (h & 0x7));
		return;
	default:
#ifndef _KERNEL
		cmn_err(CE_PANIC, "setblock: illegal fs->fs_frag value (%d)",
		    fs->fs_frag);
#endif /* _KERNEL */
		return;
	}
}

/*
 * Skip over bytes equal to `c'; return the number of bytes remaining,
 * counting from the first byte that differs, or 0 if all `len' bytes match.
 */
int
skpc(char c, uint_t len, char *cp)
{
	if (len == 0)
		return (0);
	while (*cp++ == c && --len)
		;
	return (len);
}
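#if !defined(_KERNEL) && defined(UFS_SUBR_EXAMPLE)
/*
 * Illustrative sketch only: a tiny user-level exerciser for the block-map
 * helpers above (isblock/setblock/clrblock/isclrblock), assuming a
 * hypothetical standalone build with -DUFS_SUBR_EXAMPLE.  The function name
 * and the guard macro are placeholders, not part of any kernel or fsck
 * build.  It marks block 3 in-use in an 8-fragments-per-block map, checks
 * the predicates, then frees the block again.
 */
#include <stdio.h>

int
ufs_subr_example(void)
{
	struct fs fs;
	uchar_t map[16] = { 0 };	/* small block map, all blocks free */

	fs.fs_frag = 8;			/* 8 fragments per block */
	setblock(&fs, map, 3);		/* mark block 3 allocated */
	(void) printf("isblock(3)=%d isclrblock(3)=%d\n",
	    isblock(&fs, map, 3), isclrblock(&fs, map, 3));
	clrblock(&fs, map, 3);		/* and free it again */
	(void) printf("after clrblock: isclrblock(3)=%d\n",
	    isclrblock(&fs, map, 3));
	return (0);
}
#endif	/* !_KERNEL && UFS_SUBR_EXAMPLE */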