1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 41 #pragma ident "%Z%%M% %I% %E% SMI" 42 43 #include <sys/condvar_impl.h> 44 #include <sys/types.h> 45 #include <sys/t_lock.h> 46 #include <sys/debug.h> 47 #include <sys/param.h> 48 #include <sys/systm.h> 49 #include <sys/signal.h> 50 #include <sys/cred.h> 51 #include <sys/proc.h> 52 #include <sys/disp.h> 53 #include <sys/user.h> 54 #include <sys/buf.h> 55 #include <sys/vfs.h> 56 #include <sys/vnode.h> 57 #include <sys/acl.h> 58 #include <sys/fs/ufs_fs.h> 59 #include <sys/fs/ufs_inode.h> 60 #include <sys/fs/ufs_acl.h> 61 #include <sys/fs/ufs_bio.h> 62 #include <sys/fs/ufs_quota.h> 63 #include <sys/kmem.h> 64 #include <sys/fs/ufs_trans.h> 65 #include <sys/fs/ufs_panic.h> 66 #include <sys/errno.h> 67 #include <sys/time.h> 68 #include <sys/sysmacros.h> 69 #include <sys/file.h> 70 #include <sys/fcntl.h> 71 #include <sys/flock.h> 72 #include <fs/fs_subr.h> 73 #include <sys/cmn_err.h> 74 #include <sys/policy.h> 75 76 static ino_t hashalloc(); 77 static daddr_t fragextend(); 78 static daddr_t alloccg(); 79 static daddr_t alloccgblk(); 80 static ino_t ialloccg(); 81 static daddr_t mapsearch(); 82 83 extern int inside[], around[]; 84 extern uchar_t *fragtbl[]; 85 void delay(); 86 87 /* 88 * Allocate a block in the file system. 89 * 90 * The size of the requested block is given, which must be some 91 * multiple of fs_fsize and <= fs_bsize. 92 * A preference may be optionally specified. If a preference is given 93 * the following hierarchy is used to allocate a block: 94 * 1) allocate the requested block. 95 * 2) allocate a rotationally optimal block in the same cylinder. 96 * 3) allocate a block in the same cylinder group. 97 * 4) quadratically rehash into other cylinder groups, until an 98 * available block is located. 99 * If no block preference is given the following hierarchy is used 100 * to allocate a block: 101 * 1) allocate a block in the cylinder group that contains the 102 * inode for the file. 103 * 2) quadratically rehash into other cylinder groups, until an 104 * available block is located. 105 */ 106 int 107 alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr) 108 { 109 struct fs *fs; 110 struct ufsvfs *ufsvfsp; 111 daddr_t bno; 112 int cg; 113 int err; 114 char *errmsg = NULL; 115 size_t len; 116 117 ufsvfsp = ip->i_ufsvfs; 118 fs = ufsvfsp->vfs_fs; 119 if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) { 120 err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx," 121 " bsize = %d, size = %d, fs = %s\n", 122 ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); 123 return (err); 124 } 125 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 126 goto nospace; 127 if (freespace(fs, ufsvfsp) <= 0 && 128 secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0) 129 goto nospace; 130 err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len); 131 /* Note that may not have err, but may have errmsg */ 132 if (errmsg != NULL) { 133 uprintf(errmsg); 134 kmem_free(errmsg, len); 135 errmsg = NULL; 136 } 137 if (err) 138 return (err); 139 if (bpref >= fs->fs_size) 140 bpref = 0; 141 if (bpref == 0) 142 cg = (int)itog(fs, ip->i_number); 143 else 144 cg = dtog(fs, bpref); 145 146 bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size, 147 (ulong_t (*)())alloccg); 148 if (bno > 0) { 149 *bnp = bno; 150 return (0); 151 } 152 153 /* 154 * hashalloc() failed because some other thread grabbed 155 * the last block so unwind the quota operation. We can 156 * ignore the return because subtractions don't fail and 157 * size is guaranteed to be >= zero by our caller. 158 */ 159 (void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL, 160 (size_t *)NULL); 161 162 nospace: 163 mutex_enter(&ufsvfsp->vfs_lock); 164 if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) && 165 (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) { 166 ufsvfsp->vfs_lastwhinetime = lbolt; 167 cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt); 168 } 169 mutex_exit(&ufsvfsp->vfs_lock); 170 return (ENOSPC); 171 } 172 173 /* 174 * Reallocate a fragment to a bigger size 175 * 176 * The number and size of the old block is given, and a preference 177 * and new size is also specified. The allocator attempts to extend 178 * the original block. Failing that, the regular block allocator is 179 * invoked to get an appropriate block. 180 */ 181 int 182 realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize, 183 int nsize, daddr_t *bnp, cred_t *cr) 184 { 185 daddr_t bno; 186 struct fs *fs; 187 struct ufsvfs *ufsvfsp; 188 int cg, request; 189 int err; 190 char *errmsg = NULL; 191 size_t len; 192 193 ufsvfsp = ip->i_ufsvfs; 194 fs = ufsvfsp->vfs_fs; 195 if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || 196 (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { 197 err = ufs_fault(ITOV(ip), 198 "realloccg: bad size, dev=0x%lx, bsize=%d, " 199 "osize=%d, nsize=%d, fs=%s\n", 200 ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt); 201 return (err); 202 } 203 if (freespace(fs, ufsvfsp) <= 0 && 204 secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0) 205 goto nospace; 206 if (bprev == 0) { 207 err = ufs_fault(ITOV(ip), 208 "realloccg: bad bprev, dev = 0x%lx, bsize = %d," 209 " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev, 210 fs->fs_fsmnt); 211 return (err); 212 } 213 err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len); 214 /* Note that may not have err, but may have errmsg */ 215 if (errmsg != NULL) { 216 uprintf(errmsg); 217 kmem_free(errmsg, len); 218 errmsg = NULL; 219 } 220 if (err) 221 return (err); 222 cg = dtog(fs, bprev); 223 bno = fragextend(ip, cg, (long)bprev, osize, nsize); 224 if (bno != 0) { 225 *bnp = bno; 226 return (0); 227 } 228 if (bpref >= fs->fs_size) 229 bpref = 0; 230 231 /* 232 * When optimizing for time we allocate a full block and 233 * then only use the upper portion for this request. When 234 * this file grows again it will grow into the unused portion 235 * of the block (See fragextend() above). This saves time 236 * because an extra disk write would be needed if the frags 237 * following the current allocation were not free. The extra 238 * disk write is needed to move the data from its current 239 * location into the newly allocated position. 240 * 241 * When optimizing for space we allocate a run of frags 242 * that is just the right size for this request. 243 */ 244 request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize; 245 bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request, 246 (ulong_t (*)())alloccg); 247 if (bno > 0) { 248 *bnp = bno; 249 if (nsize < request) 250 (void) free(ip, bno + numfrags(fs, nsize), 251 (off_t)(request - nsize), I_NOCANCEL); 252 return (0); 253 } 254 255 /* 256 * hashalloc() failed because some other thread grabbed 257 * the last block so unwind the quota operation. We can 258 * ignore the return because subtractions don't fail, and 259 * our caller guarantees nsize >= osize. 260 */ 261 (void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL, 262 (size_t *)NULL); 263 264 nospace: 265 mutex_enter(&ufsvfsp->vfs_lock); 266 if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) && 267 (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) { 268 ufsvfsp->vfs_lastwhinetime = lbolt; 269 cmn_err(CE_NOTE, 270 "realloccg %s: file system full", fs->fs_fsmnt); 271 } 272 mutex_exit(&ufsvfsp->vfs_lock); 273 return (ENOSPC); 274 } 275 276 /* 277 * Allocate an inode in the file system. 278 * 279 * A preference may be optionally specified. If a preference is given 280 * the following hierarchy is used to allocate an inode: 281 * 1) allocate the requested inode. 282 * 2) allocate an inode in the same cylinder group. 283 * 3) quadratically rehash into other cylinder groups, until an 284 * available inode is located. 285 * If no inode preference is given the following hierarchy is used 286 * to allocate an inode: 287 * 1) allocate an inode in cylinder group 0. 288 * 2) quadratically rehash into other cylinder groups, until an 289 * available inode is located. 290 */ 291 int 292 ufs_ialloc(struct inode *pip, 293 ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr) 294 { 295 struct inode *ip; 296 struct fs *fs; 297 int cg; 298 ino_t ino; 299 int err; 300 int nifree; 301 struct ufsvfs *ufsvfsp = pip->i_ufsvfs; 302 char *errmsg = NULL; 303 size_t len; 304 305 ASSERT(RW_WRITE_HELD(&pip->i_rwlock)); 306 fs = pip->i_fs; 307 loop: 308 nifree = fs->fs_cstotal.cs_nifree; 309 310 if (nifree == 0) 311 goto noinodes; 312 /* 313 * Shadow inodes don't count against a user's inode allocation. 314 * They are an implementation method and not a resource. 315 */ 316 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 317 err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data, 318 /* change */ 1, (struct inode *)NULL, crgetuid(cr), 0, 319 cr, &errmsg, &len); 320 /* 321 * As we haven't acquired any locks yet, dump the message 322 * now. 323 */ 324 if (errmsg != NULL) { 325 uprintf(errmsg); 326 kmem_free(errmsg, len); 327 errmsg = NULL; 328 } 329 if (err) 330 return (err); 331 } 332 333 if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg)) 334 ipref = 0; 335 cg = (int)itog(fs, ipref); 336 ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode, 337 (ulong_t (*)())ialloccg); 338 if (ino == 0) { 339 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 340 /* 341 * We can safely ignore the return from chkiq() 342 * because deallocations can only fail if we 343 * can't get the user's quota info record off 344 * the disk due to an I/O error. In that case, 345 * the quota subsystem is already messed up. 346 */ 347 (void) chkiq(ufsvfsp, /* change */ -1, 348 (struct inode *)NULL, crgetuid(cr), 0, cr, 349 (char **)NULL, (size_t *)NULL); 350 } 351 goto noinodes; 352 } 353 err = ufs_iget(pip->i_vfs, ino, ipp, cr); 354 if (err) { 355 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 356 /* 357 * See above comment about why it is safe to ignore an 358 * error return here. 359 */ 360 (void) chkiq(ufsvfsp, /* change */ -1, 361 (struct inode *)NULL, crgetuid(cr), 0, cr, 362 (char **)NULL, (size_t *)NULL); 363 } 364 ufs_ifree(pip, ino, 0); 365 return (err); 366 } 367 ip = *ipp; 368 ASSERT(!ip->i_ufs_acl); 369 ASSERT(!ip->i_dquot); 370 rw_enter(&ip->i_contents, RW_WRITER); 371 372 /* 373 * Check if we really got a free inode, if not then complain 374 * and mark the inode ISTALE so that it will be freed by the 375 * ufs idle thread eventually and will not be sent to ufs_delete(). 376 */ 377 if (ip->i_mode || (ip->i_nlink > 0)) { 378 ip->i_flag |= ISTALE; 379 rw_exit(&ip->i_contents); 380 VN_RELE(ITOV(ip)); 381 cmn_err(CE_WARN, 382 "%s: unexpected allocated inode %d, run fsck(1M)%s", 383 fs->fs_fsmnt, (int)ino, 384 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); 385 goto loop; 386 } 387 388 /* 389 * Check the inode has no size or data blocks. 390 * This could have happened if the truncation failed when 391 * deleting the inode. It used to be possible for this to occur 392 * if a block allocation failed when iteratively truncating a 393 * large file using logging and with a full file system. 394 * This was fixed with bug fix 4348738. However, truncation may 395 * still fail on an IO error. So in all cases for safety and 396 * security we clear out the size; the blocks allocated; and 397 * pointers to the blocks. This will ultimately cause a fsck 398 * error of un-accounted for blocks, but its a fairly benign error, 399 * and possibly the correct thing to do anyway as accesssing those 400 * blocks agains may lead to more IO errors. 401 */ 402 if (ip->i_size || ip->i_blocks) { 403 int i; 404 405 if (ip->i_size) { 406 cmn_err(CE_WARN, 407 "%s: free inode %d had size 0x%llx, run fsck(1M)%s", 408 fs->fs_fsmnt, (int)ino, ip->i_size, 409 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); 410 } 411 /* 412 * Clear any garbage left behind. 413 */ 414 ip->i_size = (u_offset_t)0; 415 ip->i_blocks = 0; 416 for (i = 0; i < NDADDR; i++) 417 ip->i_db[i] = 0; 418 for (i = 0; i < NIADDR; i++) 419 ip->i_ib[i] = 0; 420 } 421 422 /* 423 * Initialize the link count 424 */ 425 ip->i_nlink = 0; 426 427 /* 428 * Clear the old flags 429 */ 430 ip->i_flag &= IREF; 431 432 /* 433 * Access times are not really defined if the fs is mounted 434 * with 'noatime'. But it can cause nfs clients to fail 435 * open() if the atime is not a legal value. Set a legal value 436 * here when the inode is allocated. 437 */ 438 if (ufsvfsp->vfs_noatime) { 439 mutex_enter(&ufs_iuniqtime_lock); 440 ip->i_atime = iuniqtime; 441 mutex_exit(&ufs_iuniqtime_lock); 442 } 443 rw_exit(&ip->i_contents); 444 return (0); 445 noinodes: 446 if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET)) 447 cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt); 448 return (ENOSPC); 449 } 450 451 /* 452 * Find a cylinder group to place a directory. 453 * Returns an inumber within the selected cylinder group. 454 * Note, the vfs_lock is not needed as we don't require exact cg summary info. 455 * 456 * If the switch ufs_close_dirs is set, then the policy is to use 457 * the current cg if it has more than 25% free inodes and more 458 * than 25% free blocks. Otherwise the cgs are searched from 459 * the beginning and the first cg with the same criteria is 460 * used. If that is also null then we revert to the old algorithm. 461 * This tends to cluster files at the beginning of the disk 462 * until the disk gets full. 463 * 464 * Otherwise if ufs_close_dirs is not set then the original policy is 465 * used which is to select from among those cylinder groups with 466 * above the average number of free inodes, the one with the smallest 467 * number of directories. 468 */ 469 470 int ufs_close_dirs = 1; /* allocate directories close as possible */ 471 472 ino_t 473 dirpref(inode_t *dp) 474 { 475 int cg, minndir, mincg, avgifree, mininode, minbpg, ifree; 476 struct fs *fs = dp->i_fs; 477 478 cg = itog(fs, dp->i_number); 479 mininode = fs->fs_ipg >> 2; 480 minbpg = fs->fs_maxbpg >> 2; 481 if (ufs_close_dirs && 482 (fs->fs_cs(fs, cg).cs_nifree > mininode) && 483 (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) { 484 return (dp->i_number); 485 } 486 487 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 488 minndir = fs->fs_ipg; 489 mincg = 0; 490 for (cg = 0; cg < fs->fs_ncg; cg++) { 491 ifree = fs->fs_cs(fs, cg).cs_nifree; 492 if (ufs_close_dirs && 493 (ifree > mininode) && 494 (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) { 495 return ((ino_t)(fs->fs_ipg * cg)); 496 } 497 if ((fs->fs_cs(fs, cg).cs_ndir < minndir) && 498 (ifree >= avgifree)) { 499 mincg = cg; 500 minndir = fs->fs_cs(fs, cg).cs_ndir; 501 } 502 } 503 return ((ino_t)(fs->fs_ipg * mincg)); 504 } 505 506 /* 507 * Select the desired position for the next block in a file. The file is 508 * logically divided into sections. The first section is composed of the 509 * direct blocks. Each additional section contains fs_maxbpg blocks. 510 * 511 * If no blocks have been allocated in the first section, the policy is to 512 * request a block in the same cylinder group as the inode that describes 513 * the file. If no blocks have been allocated in any other section, the 514 * policy is to place the section in a cylinder group with a greater than 515 * average number of free blocks. An appropriate cylinder group is found 516 * by using a rotor that sweeps the cylinder groups. When a new group of 517 * blocks is needed, the sweep begins in the cylinder group following the 518 * cylinder group from which the previous allocation was made. The sweep 519 * continues until a cylinder group with greater than the average number 520 * of free blocks is found. If the allocation is for the first block in an 521 * indirect block, the information on the previous allocation is unavailable; 522 * here a best guess is made based upon the logical block number being 523 * allocated. 524 * 525 * If a section is already partially allocated, the policy is to 526 * contiguously allocate fs_maxcontig blocks. The end of one of these 527 * contiguous blocks and the beginning of the next is physically separated 528 * so that the disk head will be in transit between them for at least 529 * fs_rotdelay milliseconds. This is to allow time for the processor to 530 * schedule another I/O transfer. 531 */ 532 daddr_t 533 blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap) 534 { 535 struct fs *fs; 536 struct ufsvfs *ufsvfsp; 537 int cg; 538 int avgbfree, startcg; 539 daddr_t nextblk; 540 541 ufsvfsp = ip->i_ufsvfs; 542 fs = ip->i_fs; 543 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 544 if (lbn < NDADDR) { 545 cg = itog(fs, ip->i_number); 546 return (fs->fs_fpg * cg + fs->fs_frag); 547 } 548 /* 549 * Find a cylinder with greater than average 550 * number of unused data blocks. 551 */ 552 if (indx == 0 || bap[indx - 1] == 0) 553 startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg; 554 else 555 startcg = dtog(fs, bap[indx - 1]) + 1; 556 startcg %= fs->fs_ncg; 557 558 mutex_enter(&ufsvfsp->vfs_lock); 559 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 560 /* 561 * used for computing log space for writes/truncs 562 */ 563 ufsvfsp->vfs_avgbfree = avgbfree; 564 for (cg = startcg; cg < fs->fs_ncg; cg++) 565 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 566 fs->fs_cgrotor = cg; 567 mutex_exit(&ufsvfsp->vfs_lock); 568 return (fs->fs_fpg * cg + fs->fs_frag); 569 } 570 for (cg = 0; cg <= startcg; cg++) 571 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 572 fs->fs_cgrotor = cg; 573 mutex_exit(&ufsvfsp->vfs_lock); 574 return (fs->fs_fpg * cg + fs->fs_frag); 575 } 576 mutex_exit(&ufsvfsp->vfs_lock); 577 return (NULL); 578 } 579 /* 580 * One or more previous blocks have been laid out. If less 581 * than fs_maxcontig previous blocks are contiguous, the 582 * next block is requested contiguously, otherwise it is 583 * requested rotationally delayed by fs_rotdelay milliseconds. 584 */ 585 586 nextblk = bap[indx - 1]; 587 /* 588 * Provision for fallocate to return positive 589 * blk preference based on last allocation 590 */ 591 if (nextblk < 0 && nextblk != UFS_HOLE) { 592 nextblk = (-bap[indx - 1]) + fs->fs_frag; 593 } else { 594 nextblk = bap[indx - 1] + fs->fs_frag; 595 } 596 597 if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] + 598 blkstofrags(fs, fs->fs_maxcontig) != nextblk) { 599 return (nextblk); 600 } 601 if (fs->fs_rotdelay != 0) 602 /* 603 * Here we convert ms of delay to frags as: 604 * (frags) = (ms) * (rev/sec) * (sect/rev) / 605 * ((sect/frag) * (ms/sec)) 606 * then round up to the next block. 607 */ 608 nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect / 609 (NSPF(fs) * 1000), fs->fs_frag); 610 return (nextblk); 611 } 612 613 /* 614 * Free a block or fragment. 615 * 616 * The specified block or fragment is placed back in the 617 * free map. If a fragment is deallocated, a possible 618 * block reassembly is checked. 619 */ 620 void 621 free(struct inode *ip, daddr_t bno, off_t size, int flags) 622 { 623 struct fs *fs = ip->i_fs; 624 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 625 struct ufs_q *delq = &ufsvfsp->vfs_delete; 626 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 627 struct cg *cgp; 628 struct buf *bp; 629 int cg, bmap, bbase; 630 int i; 631 uchar_t *blksfree; 632 int *blktot; 633 short *blks; 634 daddr_t blkno, cylno, rpos; 635 636 /* 637 * fallocate'd files will have negative block address. 638 * So negate it again to get original block address. 639 */ 640 if (bno < 0 && bno % fs->fs_bsize == 0 && bno != UFS_HOLE) { 641 bno = -bno; 642 } 643 644 if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) { 645 (void) ufs_fault(ITOV(ip), 646 "free: bad size, dev = 0x%lx, bsize = %d, size = %d, " 647 "fs = %s\n", ip->i_dev, fs->fs_bsize, 648 (int)size, fs->fs_fsmnt); 649 return; 650 } 651 cg = dtog(fs, bno); 652 ASSERT(!ufs_badblock(ip, bno)); 653 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 654 (int)fs->fs_cgsize); 655 656 cgp = bp->b_un.b_cg; 657 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 658 brelse(bp); 659 return; 660 } 661 662 if (!(flags & I_NOCANCEL)) 663 TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags); 664 if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) { 665 TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size); 666 } 667 blksfree = cg_blksfree(cgp); 668 blktot = cg_blktot(cgp); 669 mutex_enter(&ufsvfsp->vfs_lock); 670 cgp->cg_time = gethrestime_sec(); 671 bno = dtogd(fs, bno); 672 if (size == fs->fs_bsize) { 673 blkno = fragstoblks(fs, bno); 674 cylno = cbtocylno(fs, bno); 675 rpos = cbtorpos(ufsvfsp, bno); 676 blks = cg_blks(ufsvfsp, cgp, cylno); 677 if (!isclrblock(fs, blksfree, blkno)) { 678 mutex_exit(&ufsvfsp->vfs_lock); 679 brelse(bp); 680 (void) ufs_fault(ITOV(ip), "free: freeing free block, " 681 "dev:0x%lx, block:%ld, ino:%lu, fs:%s", 682 ip->i_dev, bno, ip->i_number, fs->fs_fsmnt); 683 return; 684 } 685 setblock(fs, blksfree, blkno); 686 blks[rpos]++; 687 blktot[cylno]++; 688 cgp->cg_cs.cs_nbfree++; /* Log below */ 689 fs->fs_cstotal.cs_nbfree++; 690 fs->fs_cs(fs, cg).cs_nbfree++; 691 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) { 692 mutex_enter(&delq->uq_mutex); 693 delq_info->delq_unreclaimed_blocks -= 694 btodb(fs->fs_bsize); 695 mutex_exit(&delq->uq_mutex); 696 } 697 } else { 698 bbase = bno - fragnum(fs, bno); 699 /* 700 * Decrement the counts associated with the old frags 701 */ 702 bmap = blkmap(fs, blksfree, bbase); 703 fragacct(fs, bmap, cgp->cg_frsum, -1); 704 /* 705 * Deallocate the fragment 706 */ 707 for (i = 0; i < numfrags(fs, size); i++) { 708 if (isset(blksfree, bno + i)) { 709 brelse(bp); 710 mutex_exit(&ufsvfsp->vfs_lock); 711 (void) ufs_fault(ITOV(ip), 712 "free: freeing free frag, " 713 "dev:0x%lx, blk:%ld, cg:%d, " 714 "ino:%lu, fs:%s", 715 ip->i_dev, 716 bno + i, 717 cgp->cg_cgx, 718 ip->i_number, 719 fs->fs_fsmnt); 720 return; 721 } 722 setbit(blksfree, bno + i); 723 } 724 cgp->cg_cs.cs_nffree += i; 725 fs->fs_cstotal.cs_nffree += i; 726 fs->fs_cs(fs, cg).cs_nffree += i; 727 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) { 728 mutex_enter(&delq->uq_mutex); 729 delq_info->delq_unreclaimed_blocks -= 730 btodb(i * fs->fs_fsize); 731 mutex_exit(&delq->uq_mutex); 732 } 733 /* 734 * Add back in counts associated with the new frags 735 */ 736 bmap = blkmap(fs, blksfree, bbase); 737 fragacct(fs, bmap, cgp->cg_frsum, 1); 738 /* 739 * If a complete block has been reassembled, account for it 740 */ 741 blkno = fragstoblks(fs, bbase); 742 if (isblock(fs, blksfree, blkno)) { 743 cylno = cbtocylno(fs, bbase); 744 rpos = cbtorpos(ufsvfsp, bbase); 745 blks = cg_blks(ufsvfsp, cgp, cylno); 746 blks[rpos]++; 747 blktot[cylno]++; 748 cgp->cg_cs.cs_nffree -= fs->fs_frag; 749 fs->fs_cstotal.cs_nffree -= fs->fs_frag; 750 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; 751 cgp->cg_cs.cs_nbfree++; 752 fs->fs_cstotal.cs_nbfree++; 753 fs->fs_cs(fs, cg).cs_nbfree++; 754 } 755 } 756 fs->fs_fmod = 1; 757 ufs_notclean(ufsvfsp); 758 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 759 TRANS_SI(ufsvfsp, fs, cg); 760 bdrwrite(bp); 761 } 762 763 /* 764 * Free an inode. 765 * 766 * The specified inode is placed back in the free map. 767 */ 768 void 769 ufs_ifree(struct inode *ip, ino_t ino, mode_t mode) 770 { 771 struct fs *fs = ip->i_fs; 772 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 773 struct cg *cgp; 774 struct buf *bp; 775 unsigned int inot; 776 int cg; 777 char *iused; 778 779 if (ip->i_number == ino && ip->i_mode != 0) { 780 (void) ufs_fault(ITOV(ip), 781 "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, " 782 "fs = %s\n", 783 ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt); 784 return; 785 } 786 if (ino >= fs->fs_ipg * fs->fs_ncg) { 787 (void) ufs_fault(ITOV(ip), 788 "ifree: range, dev = 0x%x, ino = %d, fs = %s\n", 789 (int)ip->i_dev, (int)ino, fs->fs_fsmnt); 790 return; 791 } 792 cg = (int)itog(fs, ino); 793 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 794 (int)fs->fs_cgsize); 795 796 cgp = bp->b_un.b_cg; 797 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 798 brelse(bp); 799 return; 800 } 801 mutex_enter(&ufsvfsp->vfs_lock); 802 cgp->cg_time = gethrestime_sec(); 803 iused = cg_inosused(cgp); 804 inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg); 805 if (isclr(iused, inot)) { 806 mutex_exit(&ufsvfsp->vfs_lock); 807 brelse(bp); 808 (void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, " 809 "mode: (imode) %o, (omode) %o, ino:%d, " 810 "fs:%s", 811 ip->i_mode, mode, (int)ino, fs->fs_fsmnt); 812 return; 813 } 814 clrbit(iused, inot); 815 816 if (inot < (ulong_t)cgp->cg_irotor) 817 cgp->cg_irotor = inot; 818 cgp->cg_cs.cs_nifree++; 819 fs->fs_cstotal.cs_nifree++; 820 fs->fs_cs(fs, cg).cs_nifree++; 821 if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) { 822 cgp->cg_cs.cs_ndir--; 823 fs->fs_cstotal.cs_ndir--; 824 fs->fs_cs(fs, cg).cs_ndir--; 825 } 826 fs->fs_fmod = 1; 827 ufs_notclean(ufsvfsp); 828 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 829 TRANS_SI(ufsvfsp, fs, cg); 830 bdrwrite(bp); 831 } 832 833 /* 834 * Implement the cylinder overflow algorithm. 835 * 836 * The policy implemented by this algorithm is: 837 * 1) allocate the block in its requested cylinder group. 838 * 2) quadratically rehash on the cylinder group number. 839 * 3) brute force search for a free block. 840 * The size parameter means size for data blocks, mode for inodes. 841 */ 842 static ino_t 843 hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)()) 844 { 845 struct fs *fs; 846 int i; 847 long result; 848 int icg = cg; 849 850 fs = ip->i_fs; 851 /* 852 * 1: preferred cylinder group 853 */ 854 result = (*allocator)(ip, cg, pref, size); 855 if (result) 856 return (result); 857 /* 858 * 2: quadratic rehash 859 */ 860 for (i = 1; i < fs->fs_ncg; i *= 2) { 861 cg += i; 862 if (cg >= fs->fs_ncg) 863 cg -= fs->fs_ncg; 864 result = (*allocator)(ip, cg, 0, size); 865 if (result) 866 return (result); 867 } 868 /* 869 * 3: brute force search 870 * Note that we start at i == 2, since 0 was checked initially, 871 * and 1 is always checked in the quadratic rehash. 872 */ 873 cg = (icg + 2) % fs->fs_ncg; 874 for (i = 2; i < fs->fs_ncg; i++) { 875 result = (*allocator)(ip, cg, 0, size); 876 if (result) 877 return (result); 878 cg++; 879 if (cg == fs->fs_ncg) 880 cg = 0; 881 } 882 return (NULL); 883 } 884 885 /* 886 * Determine whether a fragment can be extended. 887 * 888 * Check to see if the necessary fragments are available, and 889 * if they are, allocate them. 890 */ 891 static daddr_t 892 fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize) 893 { 894 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 895 struct fs *fs = ip->i_fs; 896 struct buf *bp; 897 struct cg *cgp; 898 uchar_t *blksfree; 899 long bno; 900 int frags, bbase; 901 int i, j; 902 903 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) 904 return (NULL); 905 frags = numfrags(fs, nsize); 906 bbase = (int)fragnum(fs, bprev); 907 if (bbase > fragnum(fs, (bprev + frags - 1))) { 908 /* cannot extend across a block boundary */ 909 return (NULL); 910 } 911 912 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 913 (int)fs->fs_cgsize); 914 cgp = bp->b_un.b_cg; 915 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 916 brelse(bp); 917 return (NULL); 918 } 919 920 blksfree = cg_blksfree(cgp); 921 mutex_enter(&ufsvfsp->vfs_lock); 922 bno = dtogd(fs, bprev); 923 for (i = numfrags(fs, osize); i < frags; i++) { 924 if (isclr(blksfree, bno + i)) { 925 mutex_exit(&ufsvfsp->vfs_lock); 926 brelse(bp); 927 return (NULL); 928 } 929 if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)), 930 fs->fs_fsize))) { 931 mutex_exit(&ufsvfsp->vfs_lock); 932 brelse(bp); 933 return (NULL); 934 } 935 } 936 937 cgp->cg_time = gethrestime_sec(); 938 /* 939 * The current fragment can be extended, 940 * deduct the count on fragment being extended into 941 * increase the count on the remaining fragment (if any) 942 * allocate the extended piece. 943 */ 944 for (i = frags; i < fs->fs_frag - bbase; i++) 945 if (isclr(blksfree, bno + i)) 946 break; 947 j = i - numfrags(fs, osize); 948 cgp->cg_frsum[j]--; 949 ASSERT(cgp->cg_frsum[j] >= 0); 950 if (i != frags) 951 cgp->cg_frsum[i - frags]++; 952 for (i = numfrags(fs, osize); i < frags; i++) { 953 clrbit(blksfree, bno + i); 954 cgp->cg_cs.cs_nffree--; 955 fs->fs_cs(fs, cg).cs_nffree--; 956 fs->fs_cstotal.cs_nffree--; 957 } 958 fs->fs_fmod = 1; 959 ufs_notclean(ufsvfsp); 960 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 961 TRANS_SI(ufsvfsp, fs, cg); 962 bdrwrite(bp); 963 return ((daddr_t)bprev); 964 } 965 966 /* 967 * Determine whether a block can be allocated. 968 * 969 * Check to see if a block of the apprpriate size 970 * is available, and if it is, allocate it. 971 */ 972 static daddr_t 973 alloccg(struct inode *ip, int cg, daddr_t bpref, int size) 974 { 975 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 976 struct fs *fs = ip->i_fs; 977 struct buf *bp; 978 struct cg *cgp; 979 uchar_t *blksfree; 980 int bno, frags; 981 int allocsiz; 982 int i; 983 984 if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) 985 return (0); 986 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 987 (int)fs->fs_cgsize); 988 989 cgp = bp->b_un.b_cg; 990 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) || 991 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { 992 brelse(bp); 993 return (0); 994 } 995 blksfree = cg_blksfree(cgp); 996 mutex_enter(&ufsvfsp->vfs_lock); 997 cgp->cg_time = gethrestime_sec(); 998 if (size == fs->fs_bsize) { 999 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0) 1000 goto errout; 1001 fs->fs_fmod = 1; 1002 ufs_notclean(ufsvfsp); 1003 TRANS_SI(ufsvfsp, fs, cg); 1004 bdrwrite(bp); 1005 return (bno); 1006 } 1007 /* 1008 * Check to see if any fragments are already available 1009 * allocsiz is the size which will be allocated, hacking 1010 * it down to a smaller size if necessary. 1011 */ 1012 frags = numfrags(fs, size); 1013 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) 1014 if (cgp->cg_frsum[allocsiz] != 0) 1015 break; 1016 1017 if (allocsiz != fs->fs_frag) 1018 bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz); 1019 1020 if (allocsiz == fs->fs_frag || bno < 0) { 1021 /* 1022 * No fragments were available, so a block 1023 * will be allocated and hacked up. 1024 */ 1025 if (cgp->cg_cs.cs_nbfree == 0) 1026 goto errout; 1027 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0) 1028 goto errout; 1029 bpref = dtogd(fs, bno); 1030 for (i = frags; i < fs->fs_frag; i++) 1031 setbit(blksfree, bpref + i); 1032 i = fs->fs_frag - frags; 1033 cgp->cg_cs.cs_nffree += i; 1034 fs->fs_cstotal.cs_nffree += i; 1035 fs->fs_cs(fs, cg).cs_nffree += i; 1036 cgp->cg_frsum[i]++; 1037 fs->fs_fmod = 1; 1038 ufs_notclean(ufsvfsp); 1039 TRANS_SI(ufsvfsp, fs, cg); 1040 bdrwrite(bp); 1041 return (bno); 1042 } 1043 1044 for (i = 0; i < frags; i++) 1045 clrbit(blksfree, bno + i); 1046 cgp->cg_cs.cs_nffree -= frags; 1047 fs->fs_cstotal.cs_nffree -= frags; 1048 fs->fs_cs(fs, cg).cs_nffree -= frags; 1049 cgp->cg_frsum[allocsiz]--; 1050 ASSERT(cgp->cg_frsum[allocsiz] >= 0); 1051 if (frags != allocsiz) { 1052 cgp->cg_frsum[allocsiz - frags]++; 1053 } 1054 fs->fs_fmod = 1; 1055 ufs_notclean(ufsvfsp); 1056 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1057 TRANS_SI(ufsvfsp, fs, cg); 1058 bdrwrite(bp); 1059 return (cg * fs->fs_fpg + bno); 1060 errout: 1061 mutex_exit(&ufsvfsp->vfs_lock); 1062 brelse(bp); 1063 return (0); 1064 } 1065 1066 /* 1067 * Allocate a block in a cylinder group. 1068 * 1069 * This algorithm implements the following policy: 1070 * 1) allocate the requested block. 1071 * 2) allocate a rotationally optimal block in the same cylinder. 1072 * 3) allocate the next available block on the block rotor for the 1073 * specified cylinder group. 1074 * Note that this routine only allocates fs_bsize blocks; these 1075 * blocks may be fragmented by the routine that allocates them. 1076 */ 1077 static daddr_t 1078 alloccgblk( 1079 struct ufsvfs *ufsvfsp, 1080 struct cg *cgp, 1081 daddr_t bpref, 1082 struct buf *bp) 1083 { 1084 daddr_t bno; 1085 int cylno, pos, delta, rotbl_size; 1086 short *cylbp; 1087 int i; 1088 struct fs *fs; 1089 uchar_t *blksfree; 1090 daddr_t blkno, rpos, frag; 1091 short *blks; 1092 int32_t *blktot; 1093 1094 ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock)); 1095 fs = ufsvfsp->vfs_fs; 1096 blksfree = cg_blksfree(cgp); 1097 if (bpref == 0) { 1098 bpref = cgp->cg_rotor; 1099 goto norot; 1100 } 1101 bpref = blknum(fs, bpref); 1102 bpref = dtogd(fs, bpref); 1103 /* 1104 * If the requested block is available, use it. 1105 */ 1106 if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) { 1107 bno = bpref; 1108 goto gotit; 1109 } 1110 /* 1111 * Check for a block available on the same cylinder. 1112 */ 1113 cylno = cbtocylno(fs, bpref); 1114 if (cg_blktot(cgp)[cylno] == 0) 1115 goto norot; 1116 if (fs->fs_cpc == 0) { 1117 /* 1118 * Block layout info is not available, so just 1119 * have to take any block in this cylinder. 1120 */ 1121 bpref = howmany(fs->fs_spc * cylno, NSPF(fs)); 1122 goto norot; 1123 } 1124 /* 1125 * Check the summary information to see if a block is 1126 * available in the requested cylinder starting at the 1127 * requested rotational position and proceeding around. 1128 */ 1129 cylbp = cg_blks(ufsvfsp, cgp, cylno); 1130 pos = cbtorpos(ufsvfsp, bpref); 1131 for (i = pos; i < ufsvfsp->vfs_nrpos; i++) 1132 if (cylbp[i] > 0) 1133 break; 1134 if (i == ufsvfsp->vfs_nrpos) 1135 for (i = 0; i < pos; i++) 1136 if (cylbp[i] > 0) 1137 break; 1138 if (cylbp[i] > 0) { 1139 /* 1140 * Found a rotational position, now find the actual 1141 * block. A "panic" if none is actually there. 1142 */ 1143 1144 /* 1145 * Up to this point, "pos" has referred to the rotational 1146 * position of the desired block. From now on, it holds 1147 * the offset of the current cylinder within a cylinder 1148 * cycle. (A cylinder cycle refers to a set of cylinders 1149 * which are described by a single rotational table; the 1150 * size of the cycle is fs_cpc.) 1151 * 1152 * bno is set to the block number of the first block within 1153 * the current cylinder cycle. 1154 */ 1155 1156 pos = cylno % fs->fs_cpc; 1157 bno = (cylno - pos) * fs->fs_spc / NSPB(fs); 1158 1159 /* 1160 * The blocks within a cylinder are grouped into equivalence 1161 * classes according to their "rotational position." There 1162 * are two tables used to determine these classes. 1163 * 1164 * The positional offset table (fs_postbl) has an entry for 1165 * each rotational position of each cylinder in a cylinder 1166 * cycle. This entry contains the relative block number 1167 * (counting from the start of the cylinder cycle) of the 1168 * first block in the equivalence class for that position 1169 * and that cylinder. Positions for which no blocks exist 1170 * are indicated by a -1. 1171 * 1172 * The rotational delta table (fs_rotbl) has an entry for 1173 * each block in a cylinder cycle. This entry contains 1174 * the offset from that block to the next block in the 1175 * same equivalence class. The last block in the class 1176 * is indicated by a zero in the table. 1177 * 1178 * The following code, then, walks through all of the blocks 1179 * in the cylinder (cylno) which we're allocating within 1180 * which are in the equivalence class for the rotational 1181 * position (i) which we're allocating within. 1182 */ 1183 1184 if (fs_postbl(ufsvfsp, pos)[i] == -1) { 1185 (void) ufs_fault(ufsvfsp->vfs_root, 1186 "alloccgblk: cyl groups corrupted, pos = %d, " 1187 "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt); 1188 return (0); 1189 } 1190 1191 /* 1192 * There is one entry in the rotational table for each block 1193 * in the cylinder cycle. These are whole blocks, not frags. 1194 */ 1195 1196 rotbl_size = (fs->fs_cpc * fs->fs_spc) >> 1197 (fs->fs_fragshift + fs->fs_fsbtodb); 1198 1199 /* 1200 * As we start, "i" is the rotational position within which 1201 * we're searching. After the next line, it will be a block 1202 * number (relative to the start of the cylinder cycle) 1203 * within the equivalence class of that rotational position. 1204 */ 1205 1206 i = fs_postbl(ufsvfsp, pos)[i]; 1207 1208 for (;;) { 1209 if (isblock(fs, blksfree, (daddr_t)(bno + i))) { 1210 bno = blkstofrags(fs, (bno + i)); 1211 goto gotit; 1212 } 1213 delta = fs_rotbl(fs)[i]; 1214 if (delta <= 0 || /* End of chain, or */ 1215 delta + i > rotbl_size) /* end of table? */ 1216 break; /* If so, panic. */ 1217 i += delta; 1218 } 1219 (void) ufs_fault(ufsvfsp->vfs_root, 1220 "alloccgblk: can't find blk in cyl, pos:%d, i:%d, " 1221 "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno); 1222 return (0); 1223 } 1224 norot: 1225 /* 1226 * No blocks in the requested cylinder, so take 1227 * next available one in this cylinder group. 1228 */ 1229 bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag); 1230 if (bno < 0) 1231 return (0); 1232 cgp->cg_rotor = bno; 1233 gotit: 1234 blkno = fragstoblks(fs, bno); 1235 frag = (cgp->cg_cgx * fs->fs_fpg) + bno; 1236 if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize)) 1237 goto norot; 1238 clrblock(fs, blksfree, (long)blkno); 1239 /* 1240 * the other cg/sb/si fields are TRANS'ed by the caller 1241 */ 1242 cgp->cg_cs.cs_nbfree--; 1243 fs->fs_cstotal.cs_nbfree--; 1244 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; 1245 cylno = cbtocylno(fs, bno); 1246 blks = cg_blks(ufsvfsp, cgp, cylno); 1247 rpos = cbtorpos(ufsvfsp, bno); 1248 blktot = cg_blktot(cgp); 1249 blks[rpos]--; 1250 blktot[cylno]--; 1251 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1252 fs->fs_fmod = 1; 1253 return (frag); 1254 } 1255 1256 /* 1257 * Determine whether an inode can be allocated. 1258 * 1259 * Check to see if an inode is available, and if it is, 1260 * allocate it using the following policy: 1261 * 1) allocate the requested inode. 1262 * 2) allocate the next available inode after the requested 1263 * inode in the specified cylinder group. 1264 */ 1265 static ino_t 1266 ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode) 1267 { 1268 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 1269 struct fs *fs = ip->i_fs; 1270 struct cg *cgp; 1271 struct buf *bp; 1272 int start, len, loc, map, i; 1273 char *iused; 1274 1275 if (fs->fs_cs(fs, cg).cs_nifree == 0) 1276 return (0); 1277 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 1278 (int)fs->fs_cgsize); 1279 1280 cgp = bp->b_un.b_cg; 1281 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) || 1282 cgp->cg_cs.cs_nifree == 0) { 1283 brelse(bp); 1284 return (0); 1285 } 1286 iused = cg_inosused(cgp); 1287 mutex_enter(&ufsvfsp->vfs_lock); 1288 /* 1289 * While we are waiting for the mutex, someone may have taken 1290 * the last available inode. Need to recheck. 1291 */ 1292 if (cgp->cg_cs.cs_nifree == 0) { 1293 mutex_exit(&ufsvfsp->vfs_lock); 1294 brelse(bp); 1295 return (0); 1296 } 1297 1298 cgp->cg_time = gethrestime_sec(); 1299 if (ipref) { 1300 ipref %= fs->fs_ipg; 1301 if (isclr(iused, ipref)) 1302 goto gotit; 1303 } 1304 start = cgp->cg_irotor / NBBY; 1305 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); 1306 loc = skpc(0xff, (uint_t)len, &iused[start]); 1307 if (loc == 0) { 1308 len = start + 1; 1309 start = 0; 1310 loc = skpc(0xff, (uint_t)len, &iused[0]); 1311 if (loc == 0) { 1312 mutex_exit(&ufsvfsp->vfs_lock); 1313 (void) ufs_fault(ITOV(ip), 1314 "ialloccg: map corrupted, cg = %d, irotor = %d, " 1315 "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt); 1316 return (0); 1317 } 1318 } 1319 i = start + len - loc; 1320 map = iused[i]; 1321 ipref = i * NBBY; 1322 for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) { 1323 if ((map & i) == 0) { 1324 cgp->cg_irotor = ipref; 1325 goto gotit; 1326 } 1327 } 1328 1329 mutex_exit(&ufsvfsp->vfs_lock); 1330 (void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s", 1331 fs->fs_fsmnt); 1332 return (0); 1333 gotit: 1334 setbit(iused, ipref); 1335 cgp->cg_cs.cs_nifree--; 1336 fs->fs_cstotal.cs_nifree--; 1337 fs->fs_cs(fs, cg).cs_nifree--; 1338 if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) { 1339 cgp->cg_cs.cs_ndir++; 1340 fs->fs_cstotal.cs_ndir++; 1341 fs->fs_cs(fs, cg).cs_ndir++; 1342 } 1343 fs->fs_fmod = 1; 1344 ufs_notclean(ufsvfsp); 1345 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1346 TRANS_SI(ufsvfsp, fs, cg); 1347 bdrwrite(bp); 1348 return (cg * fs->fs_ipg + ipref); 1349 } 1350 1351 /* 1352 * Find a block of the specified size in the specified cylinder group. 1353 * 1354 * It is a panic if a request is made to find a block if none are 1355 * available. 1356 */ 1357 static daddr_t 1358 mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref, 1359 int allocsiz) 1360 { 1361 struct fs *fs = ufsvfsp->vfs_fs; 1362 daddr_t bno, cfrag; 1363 int start, len, loc, i, last, first, secondtime; 1364 int blk, field, subfield, pos; 1365 int gotit; 1366 1367 /* 1368 * ufsvfs->vfs_lock is held when calling this. 1369 */ 1370 /* 1371 * Find the fragment by searching through the 1372 * free block map for an appropriate bit pattern. 1373 */ 1374 if (bpref) 1375 start = dtogd(fs, bpref) / NBBY; 1376 else 1377 start = cgp->cg_frotor / NBBY; 1378 /* 1379 * the following loop performs two scans -- the first scan 1380 * searches the bottom half of the array for a match and the 1381 * second scan searches the top half of the array. The loops 1382 * have been merged just to make things difficult. 1383 */ 1384 first = start; 1385 last = howmany(fs->fs_fpg, NBBY); 1386 secondtime = 0; 1387 cfrag = cgp->cg_cgx * fs->fs_fpg; 1388 while (first < last) { 1389 len = last - first; 1390 /* 1391 * search the array for a match 1392 */ 1393 loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first], 1394 (uchar_t *)fragtbl[fs->fs_frag], 1395 (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 1396 /* 1397 * match found 1398 */ 1399 if (loc) { 1400 bno = (last - loc) * NBBY; 1401 1402 /* 1403 * Found the byte in the map, sift 1404 * through the bits to find the selected frag 1405 */ 1406 cgp->cg_frotor = bno; 1407 gotit = 0; 1408 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { 1409 blk = blkmap(fs, cg_blksfree(cgp), bno); 1410 blk <<= 1; 1411 field = around[allocsiz]; 1412 subfield = inside[allocsiz]; 1413 for (pos = 0; 1414 pos <= fs->fs_frag - allocsiz; 1415 pos++) { 1416 if ((blk & field) == subfield) { 1417 gotit++; 1418 break; 1419 } 1420 field <<= 1; 1421 subfield <<= 1; 1422 } 1423 if (gotit) 1424 break; 1425 } 1426 bno += pos; 1427 1428 /* 1429 * success if block is *not* being converted from 1430 * metadata into userdata (harpy). If so, ignore. 1431 */ 1432 if (!TRANS_ISCANCEL(ufsvfsp, 1433 ldbtob(fsbtodb(fs, (cfrag+bno))), 1434 allocsiz * fs->fs_fsize)) 1435 return (bno); 1436 1437 /* 1438 * keep looking -- this block is being converted 1439 */ 1440 first = (last - loc) + 1; 1441 loc = 0; 1442 if (first < last) 1443 continue; 1444 } 1445 /* 1446 * no usable matches in bottom half -- now search the top half 1447 */ 1448 if (secondtime) 1449 /* 1450 * no usable matches in top half -- all done 1451 */ 1452 break; 1453 secondtime = 1; 1454 last = start + 1; 1455 first = 0; 1456 } 1457 /* 1458 * no usable matches 1459 */ 1460 return ((daddr_t)-1); 1461 } 1462 1463 #define UFSNADDR (NDADDR + NIADDR) /* NADDR applies to (obsolete) S5FS */ 1464 #define IB(i) (NDADDR + (i)) /* index of i'th indirect block ptr */ 1465 #define SINGLE 0 /* single indirect block ptr */ 1466 #define DOUBLE 1 /* double indirect block ptr */ 1467 #define TRIPLE 2 /* triple indirect block ptr */ 1468 1469 /* 1470 * Acquire a write lock, and keep trying till we get it 1471 */ 1472 static int 1473 allocsp_wlockfs(struct vnode *vp, struct lockfs *lf) 1474 { 1475 int err = 0; 1476 1477 lockagain: 1478 do { 1479 err = ufs_fiolfss(vp, lf); 1480 if (err) 1481 return (err); 1482 } while (!LOCKFS_IS_ULOCK(lf)); 1483 1484 lf->lf_lock = LOCKFS_WLOCK; 1485 lf->lf_flags = 0; 1486 lf->lf_comment = NULL; 1487 err = ufs__fiolfs(vp, lf, 1, 0); 1488 1489 if (err == EBUSY || err == EINVAL) 1490 goto lockagain; 1491 1492 return (err); 1493 } 1494 1495 /* 1496 * Release the write lock 1497 */ 1498 static int 1499 allocsp_unlockfs(struct vnode *vp, struct lockfs *lf) 1500 { 1501 int err = 0; 1502 1503 lf->lf_lock = LOCKFS_ULOCK; 1504 lf->lf_flags = 0; 1505 err = ufs__fiolfs(vp, lf, 1, 0); 1506 return (err); 1507 } 1508 1509 struct allocsp_undo { 1510 daddr_t offset; 1511 daddr_t blk; 1512 struct allocsp_undo *next; 1513 }; 1514 1515 /* 1516 * ufs_allocsp() can be used to pre-allocate blocks for a file on a given 1517 * file system. The blocks are not initialized and are only marked as allocated. 1518 * These addresses are then stored as negative block numbers in the inode to 1519 * imply special handling. UFS has been modified where necessary to understand 1520 * this new notion. Successfully fallocated files will have IFALLOCATE cflag 1521 * set in the inode. 1522 */ 1523 int 1524 ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr) 1525 { 1526 struct lockfs lf; 1527 int berr, err, resv, issync; 1528 off_t start, istart, len; /* istart, special for idb */ 1529 struct inode *ip; 1530 struct fs *fs; 1531 struct ufsvfs *ufsvfsp; 1532 u_offset_t resid, i; 1533 daddr32_t db_undo[NDADDR]; /* old direct blocks */ 1534 struct allocsp_undo *ib_undo = NULL; /* ib undo */ 1535 struct allocsp_undo *undo = NULL; 1536 u_offset_t osz; /* old file size */ 1537 int chunkblks = 0; /* # of blocks in 1 allocation */ 1538 int cnt = 0; 1539 daddr_t allocblk; 1540 daddr_t totblks = 0; 1541 struct ulockfs *ulp; 1542 1543 ASSERT(vp->v_type == VREG); 1544 1545 ip = VTOI(vp); 1546 fs = ip->i_fs; 1547 if ((ufsvfsp = ip->i_ufsvfs) == NULL) { 1548 err = EIO; 1549 goto out_allocsp; 1550 } 1551 1552 istart = start = blkroundup(fs, (lp->l_start)); 1553 len = blkroundup(fs, (lp->l_len)); 1554 chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize; 1555 ulp = &ufsvfsp->vfs_ulockfs; 1556 1557 if (lp->l_start < 0 || lp->l_len <= 0) 1558 return (EINVAL); 1559 1560 /* Quickly check to make sure we have space before we proceed */ 1561 if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) { 1562 if (TRANS_ISTRANS(ufsvfsp)) { 1563 ufs_delete_drain_wait(ufsvfsp, 1); 1564 if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) 1565 return (ENOSPC); 1566 } else 1567 return (ENOSPC); 1568 } 1569 1570 /* 1571 * We will keep i_rwlock locked as WRITER through out the function 1572 * since we don't want anyone else reading or writing to the inode 1573 * while we are in the middle of fallocating the file. 1574 */ 1575 rw_enter(&ip->i_rwlock, RW_WRITER); 1576 1577 /* Back up the direct block list, used for undo later if necessary */ 1578 rw_enter(&ip->i_contents, RW_READER); 1579 for (i = 0; i < NDADDR; i++) 1580 db_undo[i] = ip->i_db[i]; 1581 osz = ip->i_size; 1582 rw_exit(&ip->i_contents); 1583 1584 /* Allocate any direct blocks now before we write lock the fs */ 1585 if (lblkno(fs, start) < NDADDR) { 1586 ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize), 1587 &resv, &resid); 1588 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1589 1590 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1591 rw_enter(&ip->i_contents, RW_WRITER); 1592 1593 for (i = start; (i < len) && (lblkno(fs, i) < NDADDR); 1594 i += fs->fs_bsize) { 1595 berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE, 1596 &allocblk, cr); 1597 /* Yikes error, quit */ 1598 if (berr) { 1599 TRANS_INODE(ufsvfsp, ip); 1600 rw_exit(&ip->i_contents); 1601 rw_exit(&ufsvfsp->vfs_dqrwlock); 1602 TRANS_END_CSYNC(ufsvfsp, err, issync, 1603 TOP_ALLOCSP, resv); 1604 goto exit; 1605 } 1606 1607 if (allocblk) { 1608 totblks++; 1609 ip->i_size += fs->fs_bsize; 1610 } 1611 } 1612 1613 TRANS_INODE(ufsvfsp, ip); 1614 rw_exit(&ip->i_contents); 1615 rw_exit(&ufsvfsp->vfs_dqrwlock); 1616 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1617 1618 istart = i; /* start offset for indirect allocation */ 1619 } 1620 1621 /* Write lock the file system */ 1622 if (err = allocsp_wlockfs(vp, &lf)) 1623 goto exit; 1624 1625 /* Break the transactions into vfs_iotransz units */ 1626 ufs_trans_trunc_resv(ip, ip->i_size + 1627 blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid); 1628 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1629 1630 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1631 rw_enter(&ip->i_contents, RW_WRITER); 1632 1633 /* Now go about fallocating necessary indirect blocks */ 1634 for (i = istart; i < len; i += fs->fs_bsize) { 1635 berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE, 1636 &allocblk, cr); 1637 if (berr) { 1638 TRANS_INODE(ufsvfsp, ip); 1639 rw_exit(&ip->i_contents); 1640 rw_exit(&ufsvfsp->vfs_dqrwlock); 1641 TRANS_END_CSYNC(ufsvfsp, err, issync, 1642 TOP_ALLOCSP, resv); 1643 err = allocsp_unlockfs(vp, &lf); 1644 goto exit; 1645 } 1646 1647 /* Update the blk counter only if new block was added */ 1648 if (allocblk) { 1649 /* Save undo information */ 1650 undo = kmem_alloc(sizeof (struct allocsp_undo), 1651 KM_SLEEP); 1652 undo->offset = i; 1653 undo->blk = allocblk; 1654 undo->next = ib_undo; 1655 ib_undo = undo; 1656 totblks++; 1657 ip->i_size += fs->fs_bsize; 1658 } 1659 cnt++; 1660 1661 /* Being a good UFS citizen, let others get a share */ 1662 if (cnt == chunkblks) { 1663 /* 1664 * If there are waiters or the fs is hard locked, 1665 * error locked, or read-only error locked, 1666 * quit with EIO 1667 */ 1668 if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) || 1669 ULOCKFS_IS_ROELOCK(ulp)) { 1670 ip->i_cflags |= IFALLOCATE; 1671 TRANS_INODE(ufsvfsp, ip); 1672 rw_exit(&ip->i_contents); 1673 rw_exit(&ufsvfsp->vfs_dqrwlock); 1674 1675 TRANS_END_CSYNC(ufsvfsp, err, issync, 1676 TOP_ALLOCSP, resv); 1677 rw_exit(&ip->i_rwlock); 1678 return (EIO); 1679 } 1680 1681 TRANS_INODE(ufsvfsp, ip); 1682 rw_exit(&ip->i_contents); 1683 rw_exit(&ufsvfsp->vfs_dqrwlock); 1684 1685 /* End the current transaction */ 1686 TRANS_END_CSYNC(ufsvfsp, err, issync, 1687 TOP_ALLOCSP, resv); 1688 1689 if (CV_HAS_WAITERS(&ulp->ul_cv)) { 1690 /* Release the write lock */ 1691 if (err = allocsp_unlockfs(vp, &lf)) 1692 goto exit; 1693 1694 /* Wake up others waiting to do operations */ 1695 mutex_enter(&ulp->ul_lock); 1696 cv_broadcast(&ulp->ul_cv); 1697 mutex_exit(&ulp->ul_lock); 1698 1699 /* Grab the write lock again */ 1700 if (err = allocsp_wlockfs(vp, &lf)) 1701 goto exit; 1702 } /* end of CV_HAS_WAITERS(&ulp->ul_cv) */ 1703 1704 /* Reserve more space in log for this file */ 1705 ufs_trans_trunc_resv(ip, 1706 ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz), 1707 &resv, &resid); 1708 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1709 1710 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1711 rw_enter(&ip->i_contents, RW_WRITER); 1712 1713 cnt = 0; /* reset cnt b/c of new transaction */ 1714 } 1715 } 1716 1717 if (!err && !berr) 1718 ip->i_cflags |= IFALLOCATE; 1719 1720 /* Release locks, end log transaction and unlock fs */ 1721 TRANS_INODE(ufsvfsp, ip); 1722 rw_exit(&ip->i_contents); 1723 rw_exit(&ufsvfsp->vfs_dqrwlock); 1724 1725 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1726 err = allocsp_unlockfs(vp, &lf); 1727 1728 /* 1729 * @ exit label, we should no longer be holding the fs write lock, and 1730 * all logging transactions should have been ended. We still hold 1731 * ip->i_rwlock. 1732 */ 1733 exit: 1734 /* 1735 * File has grown larger than 2GB. Set flag 1736 * in superblock to indicate this, if it 1737 * is not already set. 1738 */ 1739 if ((ip->i_size > MAXOFF32_T) && 1740 !(fs->fs_flags & FSLARGEFILES)) { 1741 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); 1742 mutex_enter(&ufsvfsp->vfs_lock); 1743 fs->fs_flags |= FSLARGEFILES; 1744 ufs_sbwrite(ufsvfsp); 1745 mutex_exit(&ufsvfsp->vfs_lock); 1746 } 1747 1748 /* 1749 * Since we couldn't allocate completely, we will undo the allocations. 1750 */ 1751 if (berr) { 1752 ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid); 1753 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1754 1755 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1756 rw_enter(&ip->i_contents, RW_WRITER); 1757 1758 /* Direct blocks */ 1759 for (i = 0; i < NDADDR; i++) { 1760 /* 1761 * Only free the block if they are not same, and 1762 * the old one isn't zero (the fragment was 1763 * re-allocated). 1764 */ 1765 if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) { 1766 free(ip, ip->i_db[i], fs->fs_bsize, 0); 1767 ip->i_db[i] = 0; 1768 } 1769 } 1770 1771 /* Undo the indirect blocks */ 1772 while (ib_undo != NULL) { 1773 undo = ib_undo; 1774 err = bmap_set_bn(vp, undo->offset, 0); 1775 if (err) 1776 cmn_err(CE_PANIC, "ufs_allocsp(): failed to " 1777 "undo allocation of block %ld", 1778 undo->offset); 1779 free(ip, undo->blk, fs->fs_bsize, I_IBLK); 1780 ib_undo = undo->next; 1781 kmem_free(undo, sizeof (struct allocsp_undo)); 1782 } 1783 1784 ip->i_size = osz; 1785 TRANS_INODE(ufsvfsp, ip); 1786 1787 rw_exit(&ip->i_contents); 1788 rw_exit(&ufsvfsp->vfs_dqrwlock); 1789 1790 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1791 1792 rw_exit(&ip->i_rwlock); 1793 return (berr); 1794 } 1795 1796 /* 1797 * Don't forget to free the undo chain :) 1798 */ 1799 while (ib_undo != NULL) { 1800 undo = ib_undo; 1801 ib_undo = undo->next; 1802 kmem_free(undo, sizeof (struct allocsp_undo)); 1803 } 1804 1805 rw_exit(&ip->i_rwlock); 1806 1807 out_allocsp: 1808 return (err); 1809 } 1810 1811 /* 1812 * Free storage space associated with the specified inode. The portion 1813 * to be freed is specified by lp->l_start and lp->l_len (already 1814 * normalized to a "whence" of 0). 1815 * 1816 * This is an experimental facility whose continued existence is not 1817 * guaranteed. Currently, we only support the special case 1818 * of l_len == 0, meaning free to end of file. 1819 * 1820 * Blocks are freed in reverse order. This FILO algorithm will tend to 1821 * maintain a contiguous free list much longer than FIFO. 1822 * See also ufs_itrunc() in ufs_inode.c. 1823 * 1824 * Bug: unused bytes in the last retained block are not cleared. 1825 * This may result in a "hole" in the file that does not read as zeroes. 1826 */ 1827 /* ARGSUSED */ 1828 int 1829 ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr) 1830 { 1831 int i; 1832 struct inode *ip = VTOI(vp); 1833 int error; 1834 1835 ASSERT(vp->v_type == VREG); 1836 ASSERT(lp->l_start >= 0); /* checked by convoff */ 1837 1838 if (lp->l_len != 0) 1839 return (EINVAL); 1840 1841 rw_enter(&ip->i_contents, RW_READER); 1842 if (ip->i_size == (u_offset_t)lp->l_start) { 1843 rw_exit(&ip->i_contents); 1844 return (0); 1845 } 1846 1847 /* 1848 * Check if there is any active mandatory lock on the 1849 * range that will be truncated/expanded. 1850 */ 1851 if (MANDLOCK(vp, ip->i_mode)) { 1852 offset_t save_start; 1853 1854 save_start = lp->l_start; 1855 1856 if (ip->i_size < lp->l_start) { 1857 /* 1858 * "Truncate up" case: need to make sure there 1859 * is no lock beyond current end-of-file. To 1860 * do so, we need to set l_start to the size 1861 * of the file temporarily. 1862 */ 1863 lp->l_start = ip->i_size; 1864 } 1865 lp->l_type = F_WRLCK; 1866 lp->l_sysid = 0; 1867 lp->l_pid = ttoproc(curthread)->p_pid; 1868 i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK; 1869 rw_exit(&ip->i_contents); 1870 if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 || 1871 lp->l_type != F_UNLCK) { 1872 return (i ? i : EAGAIN); 1873 } 1874 rw_enter(&ip->i_contents, RW_READER); 1875 1876 lp->l_start = save_start; 1877 } 1878 1879 /* 1880 * Make sure a write isn't in progress (allocating blocks) 1881 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't 1882 * truncate while it was allocating blocks). 1883 * Grab the locks in the right order. 1884 */ 1885 rw_exit(&ip->i_contents); 1886 rw_enter(&ip->i_rwlock, RW_WRITER); 1887 error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr); 1888 rw_exit(&ip->i_rwlock); 1889 return (error); 1890 } 1891 1892 /* 1893 * Find a cg with as close to nb contiguous bytes as possible 1894 * THIS MAY TAKE MANY DISK READS! 1895 * 1896 * Implemented in an attempt to allocate contiguous blocks for 1897 * writing the ufs log file to, minimizing future disk head seeking 1898 */ 1899 daddr_t 1900 contigpref(ufsvfs_t *ufsvfsp, size_t nb) 1901 { 1902 struct fs *fs = ufsvfsp->vfs_fs; 1903 daddr_t nblk = lblkno(fs, blkroundup(fs, nb)); 1904 daddr_t savebno, curbno, cgbno; 1905 int cg, cgblks, savecg, savenblk, curnblk; 1906 uchar_t *blksfree; 1907 buf_t *bp; 1908 struct cg *cgp; 1909 1910 savenblk = 0; 1911 savecg = 0; 1912 savebno = 0; 1913 for (cg = 0; cg < fs->fs_ncg; ++cg) { 1914 1915 /* not enough free blks for a contig check */ 1916 if (fs->fs_cs(fs, cg).cs_nbfree < nblk) 1917 continue; 1918 1919 /* 1920 * find the largest contiguous range in this cg 1921 */ 1922 bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, 1923 (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 1924 (int)fs->fs_cgsize); 1925 cgp = bp->b_un.b_cg; 1926 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 1927 brelse(bp); 1928 continue; 1929 } 1930 blksfree = cg_blksfree(cgp); /* free array */ 1931 cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */ 1932 cgbno = 0; 1933 while (cgbno < cgblks && savenblk < nblk) { 1934 /* find a free block */ 1935 for (; cgbno < cgblks; ++cgbno) 1936 if (isblock(fs, blksfree, cgbno)) 1937 break; 1938 curbno = cgbno; 1939 /* count the number of free blocks */ 1940 for (curnblk = 0; cgbno < cgblks; ++cgbno) { 1941 if (!isblock(fs, blksfree, cgbno)) 1942 break; 1943 if (++curnblk >= nblk) 1944 break; 1945 } 1946 if (curnblk > savenblk) { 1947 savecg = cg; 1948 savenblk = curnblk; 1949 savebno = curbno; 1950 } 1951 } 1952 brelse(bp); 1953 if (savenblk >= nblk) 1954 break; 1955 } 1956 1957 /* convert block offset in cg to frag offset in cg */ 1958 savebno = blkstofrags(fs, savebno); 1959 1960 /* convert frag offset in cg to frag offset in fs */ 1961 savebno += (savecg * fs->fs_fpg); 1962 1963 return (savebno); 1964 } 1965