/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/condvar_impl.h>
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/debug.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/acl.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/kmem.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <fs/fs_subr.h>
#include <sys/cmn_err.h>
#include <sys/policy.h>

static ino_t	hashalloc();
static daddr_t	fragextend();
static daddr_t	alloccg();
static daddr_t	alloccgblk();
static ino_t	ialloccg();
static daddr_t	mapsearch();

extern int	inside[], around[];
extern uchar_t	*fragtbl[];
void delay();

/*
 * Allocate a block in the file system.
 *
 * The size of the requested block is given, which must be some
 * multiple of fs_fsize and <= fs_bsize.
 * A preference may be optionally specified. If a preference is given
 * the following hierarchy is used to allocate a block:
 *	1) allocate the requested block.
 *	2) allocate a rotationally optimal block in the same cylinder.
 *	3) allocate a block in the same cylinder group.
 *	4) quadratically rehash into other cylinder groups, until an
 *	   available block is located.
 * If no block preference is given the following hierarchy is used
 * to allocate a block:
 *	1) allocate a block in the cylinder group that contains the
 *	   inode for the file.
 *	2) quadratically rehash into other cylinder groups, until an
 *	   available block is located.
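 *
 * For example, with an 8192-byte block size and a 1024-byte fragment
 * size (the common defaults), legal request sizes are 1024, 2048, ...,
 * 8192 bytes; anything else is rejected by the fragoff()/fs_bsize
 * check at the top of alloc().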
101 */ 102 int 103 alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr) 104 { 105 struct fs *fs; 106 struct ufsvfs *ufsvfsp; 107 daddr_t bno; 108 int cg; 109 int err; 110 char *errmsg = NULL; 111 size_t len; 112 113 ufsvfsp = ip->i_ufsvfs; 114 fs = ufsvfsp->vfs_fs; 115 if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) { 116 err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx," 117 " bsize = %d, size = %d, fs = %s\n", 118 ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); 119 return (err); 120 } 121 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 122 goto nospace; 123 if (freespace(fs, ufsvfsp) <= 0 && 124 secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0) 125 goto nospace; 126 err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len); 127 /* Note that may not have err, but may have errmsg */ 128 if (errmsg != NULL) { 129 uprintf(errmsg); 130 kmem_free(errmsg, len); 131 errmsg = NULL; 132 } 133 if (err) 134 return (err); 135 if (bpref >= fs->fs_size) 136 bpref = 0; 137 if (bpref == 0) 138 cg = (int)itog(fs, ip->i_number); 139 else 140 cg = dtog(fs, bpref); 141 142 bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size, 143 (ulong_t (*)())alloccg); 144 if (bno > 0) { 145 *bnp = bno; 146 return (0); 147 } 148 149 /* 150 * hashalloc() failed because some other thread grabbed 151 * the last block so unwind the quota operation. We can 152 * ignore the return because subtractions don't fail and 153 * size is guaranteed to be >= zero by our caller. 154 */ 155 (void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL, 156 (size_t *)NULL); 157 158 nospace: 159 mutex_enter(&ufsvfsp->vfs_lock); 160 if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) && 161 (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) { 162 ufsvfsp->vfs_lastwhinetime = lbolt; 163 cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt); 164 } 165 mutex_exit(&ufsvfsp->vfs_lock); 166 return (ENOSPC); 167 } 168 169 /* 170 * Reallocate a fragment to a bigger size 171 * 172 * The number and size of the old block is given, and a preference 173 * and new size is also specified. The allocator attempts to extend 174 * the original block. Failing that, the regular block allocator is 175 * invoked to get an appropriate block. 
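 *
 * For example, growing a 2048-byte (2-fragment, with 1024-byte
 * fragments) allocation to 3072 bytes first asks fragextend() for the
 * adjacent fragment of the original block; only if that fragment is in
 * use does the request fall through to hashalloc()/alloccg() for a new
 * run of fragments.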
176 */ 177 int 178 realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize, 179 int nsize, daddr_t *bnp, cred_t *cr) 180 { 181 daddr_t bno; 182 struct fs *fs; 183 struct ufsvfs *ufsvfsp; 184 int cg, request; 185 int err; 186 char *errmsg = NULL; 187 size_t len; 188 189 ufsvfsp = ip->i_ufsvfs; 190 fs = ufsvfsp->vfs_fs; 191 if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || 192 (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { 193 err = ufs_fault(ITOV(ip), 194 "realloccg: bad size, dev=0x%lx, bsize=%d, " 195 "osize=%d, nsize=%d, fs=%s\n", 196 ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt); 197 return (err); 198 } 199 if (freespace(fs, ufsvfsp) <= 0 && 200 secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0) 201 goto nospace; 202 if (bprev == 0) { 203 err = ufs_fault(ITOV(ip), 204 "realloccg: bad bprev, dev = 0x%lx, bsize = %d," 205 " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev, 206 fs->fs_fsmnt); 207 return (err); 208 } 209 err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len); 210 /* Note that may not have err, but may have errmsg */ 211 if (errmsg != NULL) { 212 uprintf(errmsg); 213 kmem_free(errmsg, len); 214 errmsg = NULL; 215 } 216 if (err) 217 return (err); 218 cg = dtog(fs, bprev); 219 bno = fragextend(ip, cg, (long)bprev, osize, nsize); 220 if (bno != 0) { 221 *bnp = bno; 222 return (0); 223 } 224 if (bpref >= fs->fs_size) 225 bpref = 0; 226 227 /* 228 * When optimizing for time we allocate a full block and 229 * then only use the upper portion for this request. When 230 * this file grows again it will grow into the unused portion 231 * of the block (See fragextend() above). This saves time 232 * because an extra disk write would be needed if the frags 233 * following the current allocation were not free. The extra 234 * disk write is needed to move the data from its current 235 * location into the newly allocated position. 236 * 237 * When optimizing for space we allocate a run of frags 238 * that is just the right size for this request. 239 */ 240 request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize; 241 bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request, 242 (ulong_t (*)())alloccg); 243 if (bno > 0) { 244 *bnp = bno; 245 if (nsize < request) 246 (void) free(ip, bno + numfrags(fs, nsize), 247 (off_t)(request - nsize), I_NOCANCEL); 248 return (0); 249 } 250 251 /* 252 * hashalloc() failed because some other thread grabbed 253 * the last block so unwind the quota operation. We can 254 * ignore the return because subtractions don't fail, and 255 * our caller guarantees nsize >= osize. 256 */ 257 (void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL, 258 (size_t *)NULL); 259 260 nospace: 261 mutex_enter(&ufsvfsp->vfs_lock); 262 if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) && 263 (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) { 264 ufsvfsp->vfs_lastwhinetime = lbolt; 265 cmn_err(CE_NOTE, 266 "realloccg %s: file system full", fs->fs_fsmnt); 267 } 268 mutex_exit(&ufsvfsp->vfs_lock); 269 return (ENOSPC); 270 } 271 272 /* 273 * Allocate an inode in the file system. 274 * 275 * A preference may be optionally specified. If a preference is given 276 * the following hierarchy is used to allocate an inode: 277 * 1) allocate the requested inode. 278 * 2) allocate an inode in the same cylinder group. 279 * 3) quadratically rehash into other cylinder groups, until an 280 * available inode is located. 
281 * If no inode preference is given the following hierarchy is used 282 * to allocate an inode: 283 * 1) allocate an inode in cylinder group 0. 284 * 2) quadratically rehash into other cylinder groups, until an 285 * available inode is located. 286 */ 287 int 288 ufs_ialloc(struct inode *pip, 289 ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr) 290 { 291 struct inode *ip; 292 struct fs *fs; 293 int cg; 294 ino_t ino; 295 int err; 296 int nifree; 297 struct ufsvfs *ufsvfsp = pip->i_ufsvfs; 298 char *errmsg = NULL; 299 size_t len; 300 301 ASSERT(RW_WRITE_HELD(&pip->i_rwlock)); 302 fs = pip->i_fs; 303 loop: 304 nifree = fs->fs_cstotal.cs_nifree; 305 306 if (nifree == 0) 307 goto noinodes; 308 /* 309 * Shadow inodes don't count against a user's inode allocation. 310 * They are an implementation method and not a resource. 311 */ 312 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 313 err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data, 314 /* change */ 1, (struct inode *)NULL, crgetuid(cr), 0, 315 cr, &errmsg, &len); 316 /* 317 * As we haven't acquired any locks yet, dump the message 318 * now. 319 */ 320 if (errmsg != NULL) { 321 uprintf(errmsg); 322 kmem_free(errmsg, len); 323 errmsg = NULL; 324 } 325 if (err) 326 return (err); 327 } 328 329 if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg)) 330 ipref = 0; 331 cg = (int)itog(fs, ipref); 332 ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode, 333 (ulong_t (*)())ialloccg); 334 if (ino == 0) { 335 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 336 /* 337 * We can safely ignore the return from chkiq() 338 * because deallocations can only fail if we 339 * can't get the user's quota info record off 340 * the disk due to an I/O error. In that case, 341 * the quota subsystem is already messed up. 342 */ 343 (void) chkiq(ufsvfsp, /* change */ -1, 344 (struct inode *)NULL, crgetuid(cr), 0, cr, 345 (char **)NULL, (size_t *)NULL); 346 } 347 goto noinodes; 348 } 349 err = ufs_iget(pip->i_vfs, ino, ipp, cr); 350 if (err) { 351 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 352 /* 353 * See above comment about why it is safe to ignore an 354 * error return here. 355 */ 356 (void) chkiq(ufsvfsp, /* change */ -1, 357 (struct inode *)NULL, crgetuid(cr), 0, cr, 358 (char **)NULL, (size_t *)NULL); 359 } 360 ufs_ifree(pip, ino, 0); 361 return (err); 362 } 363 ip = *ipp; 364 ASSERT(!ip->i_ufs_acl); 365 ASSERT(!ip->i_dquot); 366 rw_enter(&ip->i_contents, RW_WRITER); 367 368 /* 369 * Check if we really got a free inode, if not then complain 370 * and mark the inode ISTALE so that it will be freed by the 371 * ufs idle thread eventually and will not be sent to ufs_delete(). 372 */ 373 if (ip->i_mode || (ip->i_nlink > 0)) { 374 ip->i_flag |= ISTALE; 375 rw_exit(&ip->i_contents); 376 VN_RELE(ITOV(ip)); 377 cmn_err(CE_WARN, 378 "%s: unexpected allocated inode %d, run fsck(1M)%s", 379 fs->fs_fsmnt, (int)ino, 380 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); 381 goto loop; 382 } 383 384 /* 385 * Check the inode has no size or data blocks. 386 * This could have happened if the truncation failed when 387 * deleting the inode. It used to be possible for this to occur 388 * if a block allocation failed when iteratively truncating a 389 * large file using logging and with a full file system. 390 * This was fixed with bug fix 4348738. However, truncation may 391 * still fail on an IO error. So in all cases for safety and 392 * security we clear out the size; the blocks allocated; and 393 * pointers to the blocks. 
This will ultimately cause a fsck
 * error of unaccounted-for blocks, but it's a fairly benign error,
 * and possibly the correct thing to do anyway as accessing those
 * blocks again may lead to more IO errors.
 */
	if (ip->i_size || ip->i_blocks) {
		int i;

		if (ip->i_size) {
			cmn_err(CE_WARN,
			    "%s: free inode %d had size 0x%llx, run fsck(1M)%s",
			    fs->fs_fsmnt, (int)ino, ip->i_size,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
		}
		/*
		 * Clear any garbage left behind.
		 */
		ip->i_size = (u_offset_t)0;
		ip->i_blocks = 0;
		for (i = 0; i < NDADDR; i++)
			ip->i_db[i] = 0;
		for (i = 0; i < NIADDR; i++)
			ip->i_ib[i] = 0;
	}

	/*
	 * Initialize the link count
	 */
	ip->i_nlink = 0;

	/*
	 * Clear the old flags
	 */
	ip->i_flag &= IREF;

	/*
	 * Access times are not really defined if the fs is mounted
	 * with 'noatime'. But it can cause nfs clients to fail
	 * open() if the atime is not a legal value. Set a legal value
	 * here when the inode is allocated.
	 */
	if (ufsvfsp->vfs_noatime) {
		mutex_enter(&ufs_iuniqtime_lock);
		ip->i_atime = iuniqtime;
		mutex_exit(&ufs_iuniqtime_lock);
	}
	rw_exit(&ip->i_contents);
	return (0);
noinodes:
	if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET))
		cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt);
	return (ENOSPC);
}

/*
 * Find a cylinder group to place a directory.
 * Returns an inumber within the selected cylinder group.
 * Note, the vfs_lock is not needed as we don't require exact cg summary info.
 *
 * If the switch ufs_close_dirs is set, then the policy is to use
 * the current cg if it has more than 25% free inodes and more
 * than 25% free blocks. Otherwise the cgs are searched from
 * the beginning and the first cg with the same criteria is
 * used. If no such cg is found then we revert to the old algorithm.
 * This tends to cluster files at the beginning of the disk
 * until the disk gets full.
 *
 * Otherwise if ufs_close_dirs is not set then the original policy is
 * used which is to select from among those cylinder groups with
 * above the average number of free inodes, the one with the smallest
 * number of directories.
 */

int ufs_close_dirs = 1;	/* allocate directories as close as possible */

ino_t
dirpref(inode_t *dp)
{
	int cg, minndir, mincg, avgifree, mininode, minbpg, ifree;
	struct fs *fs = dp->i_fs;

	cg = itog(fs, dp->i_number);
	mininode = fs->fs_ipg >> 2;
	minbpg = fs->fs_maxbpg >> 2;
	if (ufs_close_dirs &&
	    (fs->fs_cs(fs, cg).cs_nifree > mininode) &&
	    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
		return (dp->i_number);
	}

	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
	minndir = fs->fs_ipg;
	mincg = 0;
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		ifree = fs->fs_cs(fs, cg).cs_nifree;
		if (ufs_close_dirs &&
		    (ifree > mininode) &&
		    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
			return ((ino_t)(fs->fs_ipg * cg));
		}
		if ((fs->fs_cs(fs, cg).cs_ndir < minndir) &&
		    (ifree >= avgifree)) {
			mincg = cg;
			minndir = fs->fs_cs(fs, cg).cs_ndir;
		}
	}
	return ((ino_t)(fs->fs_ipg * mincg));
}

/*
 * Select the desired position for the next block in a file.  The file is
 * logically divided into sections. The first section is composed of the
 * direct blocks. Each additional section contains fs_maxbpg blocks.
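 * (With 8192-byte blocks, for instance, the NDADDR == 12 direct blocks
 * cover the first 96 Kbytes of the file.)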
506 * 507 * If no blocks have been allocated in the first section, the policy is to 508 * request a block in the same cylinder group as the inode that describes 509 * the file. If no blocks have been allocated in any other section, the 510 * policy is to place the section in a cylinder group with a greater than 511 * average number of free blocks. An appropriate cylinder group is found 512 * by using a rotor that sweeps the cylinder groups. When a new group of 513 * blocks is needed, the sweep begins in the cylinder group following the 514 * cylinder group from which the previous allocation was made. The sweep 515 * continues until a cylinder group with greater than the average number 516 * of free blocks is found. If the allocation is for the first block in an 517 * indirect block, the information on the previous allocation is unavailable; 518 * here a best guess is made based upon the logical block number being 519 * allocated. 520 * 521 * If a section is already partially allocated, the policy is to 522 * contiguously allocate fs_maxcontig blocks. The end of one of these 523 * contiguous blocks and the beginning of the next is physically separated 524 * so that the disk head will be in transit between them for at least 525 * fs_rotdelay milliseconds. This is to allow time for the processor to 526 * schedule another I/O transfer. 527 */ 528 daddr_t 529 blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap) 530 { 531 struct fs *fs; 532 struct ufsvfs *ufsvfsp; 533 int cg; 534 int avgbfree, startcg; 535 daddr_t nextblk; 536 537 ufsvfsp = ip->i_ufsvfs; 538 fs = ip->i_fs; 539 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 540 if (lbn < NDADDR) { 541 cg = itog(fs, ip->i_number); 542 return (fs->fs_fpg * cg + fs->fs_frag); 543 } 544 /* 545 * Find a cylinder with greater than average 546 * number of unused data blocks. 547 */ 548 if (indx == 0 || bap[indx - 1] == 0) 549 startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg; 550 else 551 startcg = dtog(fs, bap[indx - 1]) + 1; 552 startcg %= fs->fs_ncg; 553 554 mutex_enter(&ufsvfsp->vfs_lock); 555 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 556 /* 557 * used for computing log space for writes/truncs 558 */ 559 ufsvfsp->vfs_avgbfree = avgbfree; 560 for (cg = startcg; cg < fs->fs_ncg; cg++) 561 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 562 fs->fs_cgrotor = cg; 563 mutex_exit(&ufsvfsp->vfs_lock); 564 return (fs->fs_fpg * cg + fs->fs_frag); 565 } 566 for (cg = 0; cg <= startcg; cg++) 567 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 568 fs->fs_cgrotor = cg; 569 mutex_exit(&ufsvfsp->vfs_lock); 570 return (fs->fs_fpg * cg + fs->fs_frag); 571 } 572 mutex_exit(&ufsvfsp->vfs_lock); 573 return (NULL); 574 } 575 /* 576 * One or more previous blocks have been laid out. If less 577 * than fs_maxcontig previous blocks are contiguous, the 578 * next block is requested contiguously, otherwise it is 579 * requested rotationally delayed by fs_rotdelay milliseconds. 
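 * (As an illustration of the conversion done below: with fs_rotdelay = 4ms,
 * fs_rps = 60, fs_nsect = 32 and two sectors per fragment,
 * 4 * 60 * 32 / (2 * 1000) comes to 3 fragments by integer division,
 * which roundup() then pads to a whole block.)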
580 */ 581 582 nextblk = bap[indx - 1]; 583 /* 584 * Provision for fallocate to return positive 585 * blk preference based on last allocation 586 */ 587 if (nextblk < 0 && nextblk != UFS_HOLE) { 588 nextblk = (-bap[indx - 1]) + fs->fs_frag; 589 } else { 590 nextblk = bap[indx - 1] + fs->fs_frag; 591 } 592 593 if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] + 594 blkstofrags(fs, fs->fs_maxcontig) != nextblk) { 595 return (nextblk); 596 } 597 if (fs->fs_rotdelay != 0) 598 /* 599 * Here we convert ms of delay to frags as: 600 * (frags) = (ms) * (rev/sec) * (sect/rev) / 601 * ((sect/frag) * (ms/sec)) 602 * then round up to the next block. 603 */ 604 nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect / 605 (NSPF(fs) * 1000), fs->fs_frag); 606 return (nextblk); 607 } 608 609 /* 610 * Free a block or fragment. 611 * 612 * The specified block or fragment is placed back in the 613 * free map. If a fragment is deallocated, a possible 614 * block reassembly is checked. 615 */ 616 void 617 free(struct inode *ip, daddr_t bno, off_t size, int flags) 618 { 619 struct fs *fs = ip->i_fs; 620 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 621 struct ufs_q *delq = &ufsvfsp->vfs_delete; 622 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 623 struct cg *cgp; 624 struct buf *bp; 625 int cg, bmap, bbase; 626 int i; 627 uchar_t *blksfree; 628 int *blktot; 629 short *blks; 630 daddr_t blkno, cylno, rpos; 631 632 /* 633 * fallocate'd files will have negative block address. 634 * So negate it again to get original block address. 635 */ 636 if (bno < 0 && (bno % fs->fs_frag == 0) && bno != UFS_HOLE) { 637 bno = -bno; 638 } 639 640 if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) { 641 (void) ufs_fault(ITOV(ip), 642 "free: bad size, dev = 0x%lx, bsize = %d, size = %d, " 643 "fs = %s\n", ip->i_dev, fs->fs_bsize, 644 (int)size, fs->fs_fsmnt); 645 return; 646 } 647 cg = dtog(fs, bno); 648 ASSERT(!ufs_badblock(ip, bno)); 649 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 650 (int)fs->fs_cgsize); 651 652 cgp = bp->b_un.b_cg; 653 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 654 brelse(bp); 655 return; 656 } 657 658 if (!(flags & I_NOCANCEL)) 659 TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags); 660 if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) { 661 TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size); 662 } 663 blksfree = cg_blksfree(cgp); 664 blktot = cg_blktot(cgp); 665 mutex_enter(&ufsvfsp->vfs_lock); 666 cgp->cg_time = gethrestime_sec(); 667 bno = dtogd(fs, bno); 668 if (size == fs->fs_bsize) { 669 blkno = fragstoblks(fs, bno); 670 cylno = cbtocylno(fs, bno); 671 rpos = cbtorpos(ufsvfsp, bno); 672 blks = cg_blks(ufsvfsp, cgp, cylno); 673 if (!isclrblock(fs, blksfree, blkno)) { 674 mutex_exit(&ufsvfsp->vfs_lock); 675 brelse(bp); 676 (void) ufs_fault(ITOV(ip), "free: freeing free block, " 677 "dev:0x%lx, block:%ld, ino:%lu, fs:%s", 678 ip->i_dev, bno, ip->i_number, fs->fs_fsmnt); 679 return; 680 } 681 setblock(fs, blksfree, blkno); 682 blks[rpos]++; 683 blktot[cylno]++; 684 cgp->cg_cs.cs_nbfree++; /* Log below */ 685 fs->fs_cstotal.cs_nbfree++; 686 fs->fs_cs(fs, cg).cs_nbfree++; 687 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) { 688 mutex_enter(&delq->uq_mutex); 689 delq_info->delq_unreclaimed_blocks -= 690 btodb(fs->fs_bsize); 691 mutex_exit(&delq->uq_mutex); 692 } 693 } else { 694 bbase = bno - fragnum(fs, bno); 695 /* 696 * Decrement the counts associated with the old frags 697 */ 698 bmap = blkmap(fs, blksfree, bbase); 699 
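		/*
		 * blkmap() returns the fs_frag-wide free-bit pattern for the
		 * block containing bbase; fragacct(..., -1) removes the
		 * fragment-run counts that pattern contributes to cg_frsum
		 * before the bits are changed, and the matching
		 * fragacct(..., 1) below adds the new counts back in.
		 */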
fragacct(fs, bmap, cgp->cg_frsum, -1); 700 /* 701 * Deallocate the fragment 702 */ 703 for (i = 0; i < numfrags(fs, size); i++) { 704 if (isset(blksfree, bno + i)) { 705 brelse(bp); 706 mutex_exit(&ufsvfsp->vfs_lock); 707 (void) ufs_fault(ITOV(ip), 708 "free: freeing free frag, " 709 "dev:0x%lx, blk:%ld, cg:%d, " 710 "ino:%lu, fs:%s", 711 ip->i_dev, 712 bno + i, 713 cgp->cg_cgx, 714 ip->i_number, 715 fs->fs_fsmnt); 716 return; 717 } 718 setbit(blksfree, bno + i); 719 } 720 cgp->cg_cs.cs_nffree += i; 721 fs->fs_cstotal.cs_nffree += i; 722 fs->fs_cs(fs, cg).cs_nffree += i; 723 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) { 724 mutex_enter(&delq->uq_mutex); 725 delq_info->delq_unreclaimed_blocks -= 726 btodb(i * fs->fs_fsize); 727 mutex_exit(&delq->uq_mutex); 728 } 729 /* 730 * Add back in counts associated with the new frags 731 */ 732 bmap = blkmap(fs, blksfree, bbase); 733 fragacct(fs, bmap, cgp->cg_frsum, 1); 734 /* 735 * If a complete block has been reassembled, account for it 736 */ 737 blkno = fragstoblks(fs, bbase); 738 if (isblock(fs, blksfree, blkno)) { 739 cylno = cbtocylno(fs, bbase); 740 rpos = cbtorpos(ufsvfsp, bbase); 741 blks = cg_blks(ufsvfsp, cgp, cylno); 742 blks[rpos]++; 743 blktot[cylno]++; 744 cgp->cg_cs.cs_nffree -= fs->fs_frag; 745 fs->fs_cstotal.cs_nffree -= fs->fs_frag; 746 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; 747 cgp->cg_cs.cs_nbfree++; 748 fs->fs_cstotal.cs_nbfree++; 749 fs->fs_cs(fs, cg).cs_nbfree++; 750 } 751 } 752 fs->fs_fmod = 1; 753 ufs_notclean(ufsvfsp); 754 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 755 TRANS_SI(ufsvfsp, fs, cg); 756 bdrwrite(bp); 757 } 758 759 /* 760 * Free an inode. 761 * 762 * The specified inode is placed back in the free map. 763 */ 764 void 765 ufs_ifree(struct inode *ip, ino_t ino, mode_t mode) 766 { 767 struct fs *fs = ip->i_fs; 768 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 769 struct cg *cgp; 770 struct buf *bp; 771 unsigned int inot; 772 int cg; 773 char *iused; 774 775 if (ip->i_number == ino && ip->i_mode != 0) { 776 (void) ufs_fault(ITOV(ip), 777 "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, " 778 "fs = %s\n", 779 ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt); 780 return; 781 } 782 if (ino >= fs->fs_ipg * fs->fs_ncg) { 783 (void) ufs_fault(ITOV(ip), 784 "ifree: range, dev = 0x%x, ino = %d, fs = %s\n", 785 (int)ip->i_dev, (int)ino, fs->fs_fsmnt); 786 return; 787 } 788 cg = (int)itog(fs, ino); 789 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 790 (int)fs->fs_cgsize); 791 792 cgp = bp->b_un.b_cg; 793 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 794 brelse(bp); 795 return; 796 } 797 mutex_enter(&ufsvfsp->vfs_lock); 798 cgp->cg_time = gethrestime_sec(); 799 iused = cg_inosused(cgp); 800 inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg); 801 if (isclr(iused, inot)) { 802 mutex_exit(&ufsvfsp->vfs_lock); 803 brelse(bp); 804 (void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, " 805 "mode: (imode) %o, (omode) %o, ino:%d, " 806 "fs:%s", 807 ip->i_mode, mode, (int)ino, fs->fs_fsmnt); 808 return; 809 } 810 clrbit(iused, inot); 811 812 if (inot < (ulong_t)cgp->cg_irotor) 813 cgp->cg_irotor = inot; 814 cgp->cg_cs.cs_nifree++; 815 fs->fs_cstotal.cs_nifree++; 816 fs->fs_cs(fs, cg).cs_nifree++; 817 if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) { 818 cgp->cg_cs.cs_ndir--; 819 fs->fs_cstotal.cs_ndir--; 820 fs->fs_cs(fs, cg).cs_ndir--; 821 } 822 fs->fs_fmod = 1; 823 ufs_notclean(ufsvfsp); 824 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 825 TRANS_SI(ufsvfsp, fs, 
cg); 826 bdrwrite(bp); 827 } 828 829 /* 830 * Implement the cylinder overflow algorithm. 831 * 832 * The policy implemented by this algorithm is: 833 * 1) allocate the block in its requested cylinder group. 834 * 2) quadratically rehash on the cylinder group number. 835 * 3) brute force search for a free block. 836 * The size parameter means size for data blocks, mode for inodes. 837 */ 838 static ino_t 839 hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)()) 840 { 841 struct fs *fs; 842 int i; 843 long result; 844 int icg = cg; 845 846 fs = ip->i_fs; 847 /* 848 * 1: preferred cylinder group 849 */ 850 result = (*allocator)(ip, cg, pref, size); 851 if (result) 852 return (result); 853 /* 854 * 2: quadratic rehash 855 */ 856 for (i = 1; i < fs->fs_ncg; i *= 2) { 857 cg += i; 858 if (cg >= fs->fs_ncg) 859 cg -= fs->fs_ncg; 860 result = (*allocator)(ip, cg, 0, size); 861 if (result) 862 return (result); 863 } 864 /* 865 * 3: brute force search 866 * Note that we start at i == 2, since 0 was checked initially, 867 * and 1 is always checked in the quadratic rehash. 868 */ 869 cg = (icg + 2) % fs->fs_ncg; 870 for (i = 2; i < fs->fs_ncg; i++) { 871 result = (*allocator)(ip, cg, 0, size); 872 if (result) 873 return (result); 874 cg++; 875 if (cg == fs->fs_ncg) 876 cg = 0; 877 } 878 return (NULL); 879 } 880 881 /* 882 * Determine whether a fragment can be extended. 883 * 884 * Check to see if the necessary fragments are available, and 885 * if they are, allocate them. 886 */ 887 static daddr_t 888 fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize) 889 { 890 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 891 struct fs *fs = ip->i_fs; 892 struct buf *bp; 893 struct cg *cgp; 894 uchar_t *blksfree; 895 long bno; 896 int frags, bbase; 897 int i, j; 898 899 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) 900 return (NULL); 901 frags = numfrags(fs, nsize); 902 bbase = (int)fragnum(fs, bprev); 903 if (bbase > fragnum(fs, (bprev + frags - 1))) { 904 /* cannot extend across a block boundary */ 905 return (NULL); 906 } 907 908 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 909 (int)fs->fs_cgsize); 910 cgp = bp->b_un.b_cg; 911 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 912 brelse(bp); 913 return (NULL); 914 } 915 916 blksfree = cg_blksfree(cgp); 917 mutex_enter(&ufsvfsp->vfs_lock); 918 bno = dtogd(fs, bprev); 919 for (i = numfrags(fs, osize); i < frags; i++) { 920 if (isclr(blksfree, bno + i)) { 921 mutex_exit(&ufsvfsp->vfs_lock); 922 brelse(bp); 923 return (NULL); 924 } 925 if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)), 926 fs->fs_fsize))) { 927 mutex_exit(&ufsvfsp->vfs_lock); 928 brelse(bp); 929 return (NULL); 930 } 931 } 932 933 cgp->cg_time = gethrestime_sec(); 934 /* 935 * The current fragment can be extended, 936 * deduct the count on fragment being extended into 937 * increase the count on the remaining fragment (if any) 938 * allocate the extended piece. 
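 * For example, extending a 2-fragment piece (sitting at the start of its
 * block) to 3 fragments when the remaining 6 fragments of the block are
 * free: the run of 6 free fragments after the old piece becomes a run of
 * 5, so cg_frsum[6] is decremented, cg_frsum[5] is incremented, and one
 * more bit is cleared in blksfree.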
 */
	for (i = frags; i < fs->fs_frag - bbase; i++)
		if (isclr(blksfree, bno + i))
			break;
	j = i - numfrags(fs, osize);
	cgp->cg_frsum[j]--;
	ASSERT(cgp->cg_frsum[j] >= 0);
	if (i != frags)
		cgp->cg_frsum[i - frags]++;
	for (i = numfrags(fs, osize); i < frags; i++) {
		clrbit(blksfree, bno + i);
		cgp->cg_cs.cs_nffree--;
		fs->fs_cs(fs, cg).cs_nffree--;
		fs->fs_cstotal.cs_nffree--;
	}
	fs->fs_fmod = 1;
	ufs_notclean(ufsvfsp);
	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
	TRANS_SI(ufsvfsp, fs, cg);
	bdrwrite(bp);
	return ((daddr_t)bprev);
}

/*
 * Determine whether a block can be allocated.
 *
 * Check to see if a block of the appropriate size
 * is available, and if it is, allocate it.
 */
static daddr_t
alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
{
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	struct cg *cgp;
	uchar_t *blksfree;
	int bno, frags;
	int allocsiz;
	int i;

	/*
	 * Searching for space could be time consuming so do some
	 * up front checking to verify that there is actually space
	 * available (free blocks or free frags).
	 */
	if (fs->fs_cs(fs, cg).cs_nbfree == 0) {
		if (size == fs->fs_bsize)
			return (0);

		/*
		 * If there are not enough free frags then return.
		 */
		if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, size))
			return (0);
	}

	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize);

	cgp = bp->b_un.b_cg;
	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
		brelse(bp);
		return (0);
	}
	blksfree = cg_blksfree(cgp);
	mutex_enter(&ufsvfsp->vfs_lock);
	cgp->cg_time = gethrestime_sec();
	if (size == fs->fs_bsize) {
		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
			goto errout;
		fs->fs_fmod = 1;
		ufs_notclean(ufsvfsp);
		TRANS_SI(ufsvfsp, fs, cg);
		bdrwrite(bp);
		return (bno);
	}
	/*
	 * Check fragment bitmap to see if any fragments are already available.
	 * mapsearch() may fail because the fragment that fits this request
	 * might still be on the cancel list and not available for re-use yet.
	 * Look for a bigger sized fragment to allocate first before we have
	 * to give up and fragment a whole new block eventually.
	 */
	frags = numfrags(fs, size);
	allocsiz = frags;
next_size:
	for (; allocsiz < fs->fs_frag; allocsiz++)
		if (cgp->cg_frsum[allocsiz] != 0)
			break;

	if (allocsiz != fs->fs_frag) {
		bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz);
		if (bno < 0 && allocsiz < (fs->fs_frag - 1)) {
			allocsiz++;
			goto next_size;
		}
	}

	if (allocsiz == fs->fs_frag || bno < 0) {
		/*
		 * No fragments were available, so a block
		 * will be allocated and hacked up.
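		 * For example, a 3-fragment request carved from a fresh
		 * 8-fragment block leaves 5 fragments free, so cs_nffree
		 * is credited with 5 and cg_frsum[5] gains a new entry.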
1043 */ 1044 if (cgp->cg_cs.cs_nbfree == 0) 1045 goto errout; 1046 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0) 1047 goto errout; 1048 bpref = dtogd(fs, bno); 1049 for (i = frags; i < fs->fs_frag; i++) 1050 setbit(blksfree, bpref + i); 1051 i = fs->fs_frag - frags; 1052 cgp->cg_cs.cs_nffree += i; 1053 fs->fs_cstotal.cs_nffree += i; 1054 fs->fs_cs(fs, cg).cs_nffree += i; 1055 cgp->cg_frsum[i]++; 1056 fs->fs_fmod = 1; 1057 ufs_notclean(ufsvfsp); 1058 TRANS_SI(ufsvfsp, fs, cg); 1059 bdrwrite(bp); 1060 return (bno); 1061 } 1062 1063 for (i = 0; i < frags; i++) 1064 clrbit(blksfree, bno + i); 1065 cgp->cg_cs.cs_nffree -= frags; 1066 fs->fs_cstotal.cs_nffree -= frags; 1067 fs->fs_cs(fs, cg).cs_nffree -= frags; 1068 cgp->cg_frsum[allocsiz]--; 1069 ASSERT(cgp->cg_frsum[allocsiz] >= 0); 1070 if (frags != allocsiz) { 1071 cgp->cg_frsum[allocsiz - frags]++; 1072 } 1073 fs->fs_fmod = 1; 1074 ufs_notclean(ufsvfsp); 1075 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1076 TRANS_SI(ufsvfsp, fs, cg); 1077 bdrwrite(bp); 1078 return (cg * fs->fs_fpg + bno); 1079 errout: 1080 mutex_exit(&ufsvfsp->vfs_lock); 1081 brelse(bp); 1082 return (0); 1083 } 1084 1085 /* 1086 * Allocate a block in a cylinder group. 1087 * 1088 * This algorithm implements the following policy: 1089 * 1) allocate the requested block. 1090 * 2) allocate a rotationally optimal block in the same cylinder. 1091 * 3) allocate the next available block on the block rotor for the 1092 * specified cylinder group. 1093 * Note that this routine only allocates fs_bsize blocks; these 1094 * blocks may be fragmented by the routine that allocates them. 1095 */ 1096 static daddr_t 1097 alloccgblk( 1098 struct ufsvfs *ufsvfsp, 1099 struct cg *cgp, 1100 daddr_t bpref, 1101 struct buf *bp) 1102 { 1103 daddr_t bno; 1104 int cylno, pos, delta, rotbl_size; 1105 short *cylbp; 1106 int i; 1107 struct fs *fs; 1108 uchar_t *blksfree; 1109 daddr_t blkno, rpos, frag; 1110 short *blks; 1111 int32_t *blktot; 1112 1113 ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock)); 1114 fs = ufsvfsp->vfs_fs; 1115 blksfree = cg_blksfree(cgp); 1116 if (bpref == 0) { 1117 bpref = cgp->cg_rotor; 1118 goto norot; 1119 } 1120 bpref = blknum(fs, bpref); 1121 bpref = dtogd(fs, bpref); 1122 /* 1123 * If the requested block is available, use it. 1124 */ 1125 if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) { 1126 bno = bpref; 1127 goto gotit; 1128 } 1129 /* 1130 * Check for a block available on the same cylinder. 1131 */ 1132 cylno = cbtocylno(fs, bpref); 1133 if (cg_blktot(cgp)[cylno] == 0) 1134 goto norot; 1135 if (fs->fs_cpc == 0) { 1136 /* 1137 * Block layout info is not available, so just 1138 * have to take any block in this cylinder. 1139 */ 1140 bpref = howmany(fs->fs_spc * cylno, NSPF(fs)); 1141 goto norot; 1142 } 1143 /* 1144 * Check the summary information to see if a block is 1145 * available in the requested cylinder starting at the 1146 * requested rotational position and proceeding around. 1147 */ 1148 cylbp = cg_blks(ufsvfsp, cgp, cylno); 1149 pos = cbtorpos(ufsvfsp, bpref); 1150 for (i = pos; i < ufsvfsp->vfs_nrpos; i++) 1151 if (cylbp[i] > 0) 1152 break; 1153 if (i == ufsvfsp->vfs_nrpos) 1154 for (i = 0; i < pos; i++) 1155 if (cylbp[i] > 0) 1156 break; 1157 if (cylbp[i] > 0) { 1158 /* 1159 * Found a rotational position, now find the actual 1160 * block. A "panic" if none is actually there. 1161 */ 1162 1163 /* 1164 * Up to this point, "pos" has referred to the rotational 1165 * position of the desired block. 
From now on, it holds 1166 * the offset of the current cylinder within a cylinder 1167 * cycle. (A cylinder cycle refers to a set of cylinders 1168 * which are described by a single rotational table; the 1169 * size of the cycle is fs_cpc.) 1170 * 1171 * bno is set to the block number of the first block within 1172 * the current cylinder cycle. 1173 */ 1174 1175 pos = cylno % fs->fs_cpc; 1176 bno = (cylno - pos) * fs->fs_spc / NSPB(fs); 1177 1178 /* 1179 * The blocks within a cylinder are grouped into equivalence 1180 * classes according to their "rotational position." There 1181 * are two tables used to determine these classes. 1182 * 1183 * The positional offset table (fs_postbl) has an entry for 1184 * each rotational position of each cylinder in a cylinder 1185 * cycle. This entry contains the relative block number 1186 * (counting from the start of the cylinder cycle) of the 1187 * first block in the equivalence class for that position 1188 * and that cylinder. Positions for which no blocks exist 1189 * are indicated by a -1. 1190 * 1191 * The rotational delta table (fs_rotbl) has an entry for 1192 * each block in a cylinder cycle. This entry contains 1193 * the offset from that block to the next block in the 1194 * same equivalence class. The last block in the class 1195 * is indicated by a zero in the table. 1196 * 1197 * The following code, then, walks through all of the blocks 1198 * in the cylinder (cylno) which we're allocating within 1199 * which are in the equivalence class for the rotational 1200 * position (i) which we're allocating within. 1201 */ 1202 1203 if (fs_postbl(ufsvfsp, pos)[i] == -1) { 1204 (void) ufs_fault(ufsvfsp->vfs_root, 1205 "alloccgblk: cyl groups corrupted, pos = %d, " 1206 "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt); 1207 return (0); 1208 } 1209 1210 /* 1211 * There is one entry in the rotational table for each block 1212 * in the cylinder cycle. These are whole blocks, not frags. 1213 */ 1214 1215 rotbl_size = (fs->fs_cpc * fs->fs_spc) >> 1216 (fs->fs_fragshift + fs->fs_fsbtodb); 1217 1218 /* 1219 * As we start, "i" is the rotational position within which 1220 * we're searching. After the next line, it will be a block 1221 * number (relative to the start of the cylinder cycle) 1222 * within the equivalence class of that rotational position. 1223 */ 1224 1225 i = fs_postbl(ufsvfsp, pos)[i]; 1226 1227 for (;;) { 1228 if (isblock(fs, blksfree, (daddr_t)(bno + i))) { 1229 bno = blkstofrags(fs, (bno + i)); 1230 goto gotit; 1231 } 1232 delta = fs_rotbl(fs)[i]; 1233 if (delta <= 0 || /* End of chain, or */ 1234 delta + i > rotbl_size) /* end of table? */ 1235 break; /* If so, panic. */ 1236 i += delta; 1237 } 1238 (void) ufs_fault(ufsvfsp->vfs_root, 1239 "alloccgblk: can't find blk in cyl, pos:%d, i:%d, " 1240 "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno); 1241 return (0); 1242 } 1243 norot: 1244 /* 1245 * No blocks in the requested cylinder, so take 1246 * next available one in this cylinder group. 
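 * mapsearch() is asked for a run of fs_frag free fragments (a whole
 * block) starting at the preferred position (the rotor when no
 * preference was given), and cg_rotor is then advanced to the block
 * found so the next unpreferenced allocation resumes the sweep there.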
1247 */ 1248 bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag); 1249 if (bno < 0) 1250 return (0); 1251 cgp->cg_rotor = bno; 1252 gotit: 1253 blkno = fragstoblks(fs, bno); 1254 frag = (cgp->cg_cgx * fs->fs_fpg) + bno; 1255 if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize)) 1256 goto norot; 1257 clrblock(fs, blksfree, (long)blkno); 1258 /* 1259 * the other cg/sb/si fields are TRANS'ed by the caller 1260 */ 1261 cgp->cg_cs.cs_nbfree--; 1262 fs->fs_cstotal.cs_nbfree--; 1263 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; 1264 cylno = cbtocylno(fs, bno); 1265 blks = cg_blks(ufsvfsp, cgp, cylno); 1266 rpos = cbtorpos(ufsvfsp, bno); 1267 blktot = cg_blktot(cgp); 1268 blks[rpos]--; 1269 blktot[cylno]--; 1270 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1271 fs->fs_fmod = 1; 1272 return (frag); 1273 } 1274 1275 /* 1276 * Determine whether an inode can be allocated. 1277 * 1278 * Check to see if an inode is available, and if it is, 1279 * allocate it using the following policy: 1280 * 1) allocate the requested inode. 1281 * 2) allocate the next available inode after the requested 1282 * inode in the specified cylinder group. 1283 */ 1284 static ino_t 1285 ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode) 1286 { 1287 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 1288 struct fs *fs = ip->i_fs; 1289 struct cg *cgp; 1290 struct buf *bp; 1291 int start, len, loc, map, i; 1292 char *iused; 1293 1294 if (fs->fs_cs(fs, cg).cs_nifree == 0) 1295 return (0); 1296 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 1297 (int)fs->fs_cgsize); 1298 1299 cgp = bp->b_un.b_cg; 1300 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) || 1301 cgp->cg_cs.cs_nifree == 0) { 1302 brelse(bp); 1303 return (0); 1304 } 1305 iused = cg_inosused(cgp); 1306 mutex_enter(&ufsvfsp->vfs_lock); 1307 /* 1308 * While we are waiting for the mutex, someone may have taken 1309 * the last available inode. Need to recheck. 
1310 */ 1311 if (cgp->cg_cs.cs_nifree == 0) { 1312 mutex_exit(&ufsvfsp->vfs_lock); 1313 brelse(bp); 1314 return (0); 1315 } 1316 1317 cgp->cg_time = gethrestime_sec(); 1318 if (ipref) { 1319 ipref %= fs->fs_ipg; 1320 if (isclr(iused, ipref)) 1321 goto gotit; 1322 } 1323 start = cgp->cg_irotor / NBBY; 1324 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); 1325 loc = skpc(0xff, (uint_t)len, &iused[start]); 1326 if (loc == 0) { 1327 len = start + 1; 1328 start = 0; 1329 loc = skpc(0xff, (uint_t)len, &iused[0]); 1330 if (loc == 0) { 1331 mutex_exit(&ufsvfsp->vfs_lock); 1332 (void) ufs_fault(ITOV(ip), 1333 "ialloccg: map corrupted, cg = %d, irotor = %d, " 1334 "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt); 1335 return (0); 1336 } 1337 } 1338 i = start + len - loc; 1339 map = iused[i]; 1340 ipref = i * NBBY; 1341 for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) { 1342 if ((map & i) == 0) { 1343 cgp->cg_irotor = ipref; 1344 goto gotit; 1345 } 1346 } 1347 1348 mutex_exit(&ufsvfsp->vfs_lock); 1349 (void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s", 1350 fs->fs_fsmnt); 1351 return (0); 1352 gotit: 1353 setbit(iused, ipref); 1354 cgp->cg_cs.cs_nifree--; 1355 fs->fs_cstotal.cs_nifree--; 1356 fs->fs_cs(fs, cg).cs_nifree--; 1357 if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) { 1358 cgp->cg_cs.cs_ndir++; 1359 fs->fs_cstotal.cs_ndir++; 1360 fs->fs_cs(fs, cg).cs_ndir++; 1361 } 1362 fs->fs_fmod = 1; 1363 ufs_notclean(ufsvfsp); 1364 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1365 TRANS_SI(ufsvfsp, fs, cg); 1366 bdrwrite(bp); 1367 return (cg * fs->fs_ipg + ipref); 1368 } 1369 1370 /* 1371 * Find a block of the specified size in the specified cylinder group. 1372 * 1373 * It is a panic if a request is made to find a block if none are 1374 * available. 1375 */ 1376 static daddr_t 1377 mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref, 1378 int allocsiz) 1379 { 1380 struct fs *fs = ufsvfsp->vfs_fs; 1381 daddr_t bno, cfrag; 1382 int start, len, loc, i, last, first, secondtime; 1383 int blk, field, subfield, pos; 1384 int gotit; 1385 1386 /* 1387 * ufsvfs->vfs_lock is held when calling this. 1388 */ 1389 /* 1390 * Find the fragment by searching through the 1391 * free block map for an appropriate bit pattern. 1392 */ 1393 if (bpref) 1394 start = dtogd(fs, bpref) / NBBY; 1395 else 1396 start = cgp->cg_frotor / NBBY; 1397 /* 1398 * the following loop performs two scans -- the first scan 1399 * searches the bottom half of the array for a match and the 1400 * second scan searches the top half of the array. The loops 1401 * have been merged just to make things difficult. 
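 * Concretely, the first pass runs from the starting byte (derived from
 * bpref or cg_frotor) to the end of the map, and the second pass wraps
 * around and covers the bytes from the start of the map up to the
 * starting point.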
1402 */ 1403 first = start; 1404 last = howmany(fs->fs_fpg, NBBY); 1405 secondtime = 0; 1406 cfrag = cgp->cg_cgx * fs->fs_fpg; 1407 while (first < last) { 1408 len = last - first; 1409 /* 1410 * search the array for a match 1411 */ 1412 loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first], 1413 (uchar_t *)fragtbl[fs->fs_frag], 1414 (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 1415 /* 1416 * match found 1417 */ 1418 if (loc) { 1419 bno = (last - loc) * NBBY; 1420 1421 /* 1422 * Found the byte in the map, sift 1423 * through the bits to find the selected frag 1424 */ 1425 cgp->cg_frotor = bno; 1426 gotit = 0; 1427 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { 1428 blk = blkmap(fs, cg_blksfree(cgp), bno); 1429 blk <<= 1; 1430 field = around[allocsiz]; 1431 subfield = inside[allocsiz]; 1432 for (pos = 0; 1433 pos <= fs->fs_frag - allocsiz; 1434 pos++) { 1435 if ((blk & field) == subfield) { 1436 gotit++; 1437 break; 1438 } 1439 field <<= 1; 1440 subfield <<= 1; 1441 } 1442 if (gotit) 1443 break; 1444 } 1445 bno += pos; 1446 1447 /* 1448 * success if block is *not* being converted from 1449 * metadata into userdata (harpy). If so, ignore. 1450 */ 1451 if (!TRANS_ISCANCEL(ufsvfsp, 1452 ldbtob(fsbtodb(fs, (cfrag+bno))), 1453 allocsiz * fs->fs_fsize)) 1454 return (bno); 1455 1456 /* 1457 * keep looking -- this block is being converted 1458 */ 1459 first = (last - loc) + 1; 1460 loc = 0; 1461 if (first < last) 1462 continue; 1463 } 1464 /* 1465 * no usable matches in bottom half -- now search the top half 1466 */ 1467 if (secondtime) 1468 /* 1469 * no usable matches in top half -- all done 1470 */ 1471 break; 1472 secondtime = 1; 1473 last = start + 1; 1474 first = 0; 1475 } 1476 /* 1477 * no usable matches 1478 */ 1479 return ((daddr_t)-1); 1480 } 1481 1482 #define UFSNADDR (NDADDR + NIADDR) /* NADDR applies to (obsolete) S5FS */ 1483 #define IB(i) (NDADDR + (i)) /* index of i'th indirect block ptr */ 1484 #define SINGLE 0 /* single indirect block ptr */ 1485 #define DOUBLE 1 /* double indirect block ptr */ 1486 #define TRIPLE 2 /* triple indirect block ptr */ 1487 1488 /* 1489 * Acquire a write lock, and keep trying till we get it 1490 */ 1491 static int 1492 allocsp_wlockfs(struct vnode *vp, struct lockfs *lf) 1493 { 1494 int err = 0; 1495 1496 lockagain: 1497 do { 1498 err = ufs_fiolfss(vp, lf); 1499 if (err) 1500 return (err); 1501 } while (!LOCKFS_IS_ULOCK(lf)); 1502 1503 lf->lf_lock = LOCKFS_WLOCK; 1504 lf->lf_flags = 0; 1505 lf->lf_comment = NULL; 1506 err = ufs__fiolfs(vp, lf, 1, 0); 1507 1508 if (err == EBUSY || err == EINVAL) 1509 goto lockagain; 1510 1511 return (err); 1512 } 1513 1514 /* 1515 * Release the write lock 1516 */ 1517 static int 1518 allocsp_unlockfs(struct vnode *vp, struct lockfs *lf) 1519 { 1520 int err = 0; 1521 1522 lf->lf_lock = LOCKFS_ULOCK; 1523 lf->lf_flags = 0; 1524 err = ufs__fiolfs(vp, lf, 1, 0); 1525 return (err); 1526 } 1527 1528 struct allocsp_undo { 1529 daddr_t offset; 1530 daddr_t blk; 1531 struct allocsp_undo *next; 1532 }; 1533 1534 /* 1535 * ufs_allocsp() can be used to pre-allocate blocks for a file on a given 1536 * file system. For direct blocks, the blocks are allocated from the offset 1537 * requested to the block boundary, then any full blocks are allocated, 1538 * and finally any remainder. 1539 * For indirect blocks the blocks are not initialized and are 1540 * only marked as allocated. These addresses are then stored as negative 1541 * block numbers in the inode to imply special handling. 
UFS has been modified 1542 * where necessary to understand this new notion. 1543 * Successfully fallocated files will have IFALLOCATE cflag set in the inode. 1544 */ 1545 int 1546 ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr) 1547 { 1548 struct lockfs lf; 1549 int berr, err, resv, issync; 1550 off_t istart, len; /* istart, special for idb */ 1551 struct inode *ip; 1552 struct fs *fs; 1553 struct ufsvfs *ufsvfsp; 1554 u_offset_t resid, i, uoff; 1555 daddr32_t db_undo[NDADDR]; /* old direct blocks */ 1556 struct allocsp_undo *ib_undo = NULL; /* ib undo */ 1557 struct allocsp_undo *undo = NULL; 1558 u_offset_t osz; /* old file size */ 1559 int chunkblks = 0; /* # of blocks in 1 allocation */ 1560 int cnt = 0; 1561 daddr_t allocblk; 1562 daddr_t totblks = 0; 1563 struct ulockfs *ulp; 1564 size_t done_len; 1565 int nbytes, offsetn; 1566 1567 1568 ASSERT(vp->v_type == VREG); 1569 1570 ip = VTOI(vp); 1571 fs = ip->i_fs; 1572 if ((ufsvfsp = ip->i_ufsvfs) == NULL) { 1573 err = EIO; 1574 goto out_allocsp; 1575 } 1576 1577 istart = blkroundup(fs, (lp->l_start)); 1578 len = blkroundup(fs, (lp->l_len)); 1579 chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize; 1580 ulp = &ufsvfsp->vfs_ulockfs; 1581 1582 if (lp->l_start < 0 || lp->l_len <= 0) 1583 return (EINVAL); 1584 1585 /* Quickly check to make sure we have space before we proceed */ 1586 if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) { 1587 if (TRANS_ISTRANS(ufsvfsp)) { 1588 ufs_delete_drain_wait(ufsvfsp, 1); 1589 if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) 1590 return (ENOSPC); 1591 } else 1592 return (ENOSPC); 1593 } 1594 1595 /* 1596 * We will keep i_rwlock locked as WRITER through out the function 1597 * since we don't want anyone else reading or writing to the inode 1598 * while we are in the middle of fallocating the file. 1599 */ 1600 rw_enter(&ip->i_rwlock, RW_WRITER); 1601 1602 /* Back up the direct block list, used for undo later if necessary */ 1603 rw_enter(&ip->i_contents, RW_READER); 1604 for (i = 0; i < NDADDR; i++) 1605 db_undo[i] = ip->i_db[i]; 1606 osz = ip->i_size; 1607 rw_exit(&ip->i_contents); 1608 1609 /* Write lock the file system */ 1610 if (err = allocsp_wlockfs(vp, &lf)) 1611 goto exit; 1612 1613 /* 1614 * Allocate any direct blocks now. 1615 * Blocks are allocated from the offset requested to the block 1616 * boundary, then any full blocks are allocated, and finally any 1617 * remainder. 
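	 *
	 * For example, with 8192-byte blocks, l_start = 1000 and
	 * l_len = 20000 are satisfied by three bmap_write() calls
	 * covering 7192, 8192 and 4616 bytes respectively.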
1618 */ 1619 if (lblkno(fs, lp->l_start) < NDADDR) { 1620 ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize), 1621 &resv, &resid); 1622 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1623 1624 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1625 rw_enter(&ip->i_contents, RW_WRITER); 1626 1627 done_len = 0; 1628 while ((done_len < lp->l_len) && 1629 (lblkno(fs, lp->l_start + done_len) < NDADDR)) { 1630 uoff = (offset_t)(lp->l_start + done_len); 1631 offsetn = (int)blkoff(fs, uoff); 1632 nbytes = (int)MIN(fs->fs_bsize - offsetn, 1633 lp->l_len - done_len); 1634 1635 berr = bmap_write(ip, uoff, offsetn + nbytes, 1636 BI_FALLOCATE, &allocblk, cr); 1637 /* Yikes error, quit */ 1638 if (berr) { 1639 TRANS_INODE(ufsvfsp, ip); 1640 rw_exit(&ip->i_contents); 1641 rw_exit(&ufsvfsp->vfs_dqrwlock); 1642 TRANS_END_CSYNC(ufsvfsp, err, issync, 1643 TOP_ALLOCSP, resv); 1644 err = allocsp_unlockfs(vp, &lf); 1645 goto exit; 1646 } 1647 1648 if (allocblk) { 1649 totblks++; 1650 if ((uoff + nbytes) > ip->i_size) 1651 ip->i_size = (uoff + nbytes); 1652 } 1653 done_len += nbytes; 1654 } 1655 1656 TRANS_INODE(ufsvfsp, ip); 1657 rw_exit(&ip->i_contents); 1658 rw_exit(&ufsvfsp->vfs_dqrwlock); 1659 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1660 1661 /* start offset for indirect allocation */ 1662 istart = (uoff + nbytes); 1663 } 1664 1665 /* Break the transactions into vfs_iotransz units */ 1666 ufs_trans_trunc_resv(ip, ip->i_size + 1667 blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid); 1668 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1669 1670 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1671 rw_enter(&ip->i_contents, RW_WRITER); 1672 1673 /* Now go about fallocating necessary indirect blocks */ 1674 for (i = istart; i < (lp->l_start + lp->l_len); i += fs->fs_bsize) { 1675 berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE, 1676 &allocblk, cr); 1677 if (berr) { 1678 TRANS_INODE(ufsvfsp, ip); 1679 rw_exit(&ip->i_contents); 1680 rw_exit(&ufsvfsp->vfs_dqrwlock); 1681 TRANS_END_CSYNC(ufsvfsp, err, issync, 1682 TOP_ALLOCSP, resv); 1683 err = allocsp_unlockfs(vp, &lf); 1684 goto exit; 1685 } 1686 1687 /* Update the blk counter only if new block was added */ 1688 if (allocblk) { 1689 /* Save undo information */ 1690 undo = kmem_alloc(sizeof (struct allocsp_undo), 1691 KM_SLEEP); 1692 undo->offset = i; 1693 undo->blk = allocblk; 1694 undo->next = ib_undo; 1695 ib_undo = undo; 1696 totblks++; 1697 1698 if (i >= ip->i_size) 1699 ip->i_size += fs->fs_bsize; 1700 } 1701 cnt++; 1702 1703 /* Being a good UFS citizen, let others get a share */ 1704 if (cnt == chunkblks) { 1705 /* 1706 * If there are waiters or the fs is hard locked, 1707 * error locked, or read-only error locked, 1708 * quit with EIO 1709 */ 1710 if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) || 1711 ULOCKFS_IS_ROELOCK(ulp)) { 1712 ip->i_cflags |= IFALLOCATE; 1713 TRANS_INODE(ufsvfsp, ip); 1714 rw_exit(&ip->i_contents); 1715 rw_exit(&ufsvfsp->vfs_dqrwlock); 1716 1717 TRANS_END_CSYNC(ufsvfsp, err, issync, 1718 TOP_ALLOCSP, resv); 1719 rw_exit(&ip->i_rwlock); 1720 (void) allocsp_unlockfs(vp, &lf); 1721 return (EIO); 1722 } 1723 1724 TRANS_INODE(ufsvfsp, ip); 1725 rw_exit(&ip->i_contents); 1726 rw_exit(&ufsvfsp->vfs_dqrwlock); 1727 1728 /* End the current transaction */ 1729 TRANS_END_CSYNC(ufsvfsp, err, issync, 1730 TOP_ALLOCSP, resv); 1731 1732 if (CV_HAS_WAITERS(&ulp->ul_cv)) { 1733 /* Release the write lock */ 1734 if (err = allocsp_unlockfs(vp, &lf)) 1735 goto exit; 1736 1737 /* Wake up others waiting to do 
operations */ 1738 mutex_enter(&ulp->ul_lock); 1739 cv_broadcast(&ulp->ul_cv); 1740 mutex_exit(&ulp->ul_lock); 1741 1742 /* Grab the write lock again */ 1743 if (err = allocsp_wlockfs(vp, &lf)) 1744 goto exit; 1745 } /* end of CV_HAS_WAITERS(&ulp->ul_cv) */ 1746 1747 /* Reserve more space in log for this file */ 1748 ufs_trans_trunc_resv(ip, 1749 ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz), 1750 &resv, &resid); 1751 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1752 1753 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1754 rw_enter(&ip->i_contents, RW_WRITER); 1755 1756 cnt = 0; /* reset cnt b/c of new transaction */ 1757 } 1758 } 1759 1760 if (!err && !berr) 1761 ip->i_cflags |= IFALLOCATE; 1762 1763 /* If the file has grown then correct the file size */ 1764 if (osz < (lp->l_start + lp->l_len)) 1765 ip->i_size = (lp->l_start + lp->l_len); 1766 1767 /* Release locks, end log transaction and unlock fs */ 1768 TRANS_INODE(ufsvfsp, ip); 1769 rw_exit(&ip->i_contents); 1770 rw_exit(&ufsvfsp->vfs_dqrwlock); 1771 1772 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1773 err = allocsp_unlockfs(vp, &lf); 1774 1775 /* 1776 * @ exit label, we should no longer be holding the fs write lock, and 1777 * all logging transactions should have been ended. We still hold 1778 * ip->i_rwlock. 1779 */ 1780 exit: 1781 /* 1782 * File has grown larger than 2GB. Set flag 1783 * in superblock to indicate this, if it 1784 * is not already set. 1785 */ 1786 if ((ip->i_size > MAXOFF32_T) && 1787 !(fs->fs_flags & FSLARGEFILES)) { 1788 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); 1789 mutex_enter(&ufsvfsp->vfs_lock); 1790 fs->fs_flags |= FSLARGEFILES; 1791 ufs_sbwrite(ufsvfsp); 1792 mutex_exit(&ufsvfsp->vfs_lock); 1793 } 1794 1795 /* 1796 * Since we couldn't allocate completely, we will undo the allocations. 1797 */ 1798 if (berr) { 1799 ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid); 1800 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1801 1802 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1803 rw_enter(&ip->i_contents, RW_WRITER); 1804 1805 /* Direct blocks */ 1806 for (i = 0; i < NDADDR; i++) { 1807 /* 1808 * Only free the block if they are not same, and 1809 * the old one isn't zero (the fragment was 1810 * re-allocated). 1811 */ 1812 if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) { 1813 free(ip, ip->i_db[i], fs->fs_bsize, 0); 1814 ip->i_db[i] = 0; 1815 } 1816 } 1817 1818 /* Undo the indirect blocks */ 1819 while (ib_undo != NULL) { 1820 undo = ib_undo; 1821 err = bmap_set_bn(vp, undo->offset, 0); 1822 if (err) 1823 cmn_err(CE_PANIC, "ufs_allocsp(): failed to " 1824 "undo allocation of block %ld", 1825 undo->offset); 1826 free(ip, undo->blk, fs->fs_bsize, I_IBLK); 1827 ib_undo = undo->next; 1828 kmem_free(undo, sizeof (struct allocsp_undo)); 1829 } 1830 1831 ip->i_size = osz; 1832 TRANS_INODE(ufsvfsp, ip); 1833 1834 rw_exit(&ip->i_contents); 1835 rw_exit(&ufsvfsp->vfs_dqrwlock); 1836 1837 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1838 1839 rw_exit(&ip->i_rwlock); 1840 return (berr); 1841 } 1842 1843 /* 1844 * Don't forget to free the undo chain :) 1845 */ 1846 while (ib_undo != NULL) { 1847 undo = ib_undo; 1848 ib_undo = undo->next; 1849 kmem_free(undo, sizeof (struct allocsp_undo)); 1850 } 1851 1852 rw_exit(&ip->i_rwlock); 1853 1854 out_allocsp: 1855 return (err); 1856 } 1857 1858 /* 1859 * Free storage space associated with the specified inode. 
The portion 1860 * to be freed is specified by lp->l_start and lp->l_len (already 1861 * normalized to a "whence" of 0). 1862 * 1863 * This is an experimental facility whose continued existence is not 1864 * guaranteed. Currently, we only support the special case 1865 * of l_len == 0, meaning free to end of file. 1866 * 1867 * Blocks are freed in reverse order. This FILO algorithm will tend to 1868 * maintain a contiguous free list much longer than FIFO. 1869 * See also ufs_itrunc() in ufs_inode.c. 1870 * 1871 * Bug: unused bytes in the last retained block are not cleared. 1872 * This may result in a "hole" in the file that does not read as zeroes. 1873 */ 1874 /* ARGSUSED */ 1875 int 1876 ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr) 1877 { 1878 int i; 1879 struct inode *ip = VTOI(vp); 1880 int error; 1881 1882 ASSERT(vp->v_type == VREG); 1883 ASSERT(lp->l_start >= 0); /* checked by convoff */ 1884 1885 if (lp->l_len != 0) 1886 return (EINVAL); 1887 1888 rw_enter(&ip->i_contents, RW_READER); 1889 if (ip->i_size == (u_offset_t)lp->l_start) { 1890 rw_exit(&ip->i_contents); 1891 return (0); 1892 } 1893 1894 /* 1895 * Check if there is any active mandatory lock on the 1896 * range that will be truncated/expanded. 1897 */ 1898 if (MANDLOCK(vp, ip->i_mode)) { 1899 offset_t save_start; 1900 1901 save_start = lp->l_start; 1902 1903 if (ip->i_size < lp->l_start) { 1904 /* 1905 * "Truncate up" case: need to make sure there 1906 * is no lock beyond current end-of-file. To 1907 * do so, we need to set l_start to the size 1908 * of the file temporarily. 1909 */ 1910 lp->l_start = ip->i_size; 1911 } 1912 lp->l_type = F_WRLCK; 1913 lp->l_sysid = 0; 1914 lp->l_pid = ttoproc(curthread)->p_pid; 1915 i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK; 1916 rw_exit(&ip->i_contents); 1917 if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 || 1918 lp->l_type != F_UNLCK) { 1919 return (i ? i : EAGAIN); 1920 } 1921 rw_enter(&ip->i_contents, RW_READER); 1922 1923 lp->l_start = save_start; 1924 } 1925 1926 /* 1927 * Make sure a write isn't in progress (allocating blocks) 1928 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't 1929 * truncate while it was allocating blocks). 1930 * Grab the locks in the right order. 1931 */ 1932 rw_exit(&ip->i_contents); 1933 rw_enter(&ip->i_rwlock, RW_WRITER); 1934 error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr); 1935 rw_exit(&ip->i_rwlock); 1936 return (error); 1937 } 1938 1939 /* 1940 * Find a cg with as close to nb contiguous bytes as possible 1941 * THIS MAY TAKE MANY DISK READS! 
1942 * 1943 * Implemented in an attempt to allocate contiguous blocks for 1944 * writing the ufs log file to, minimizing future disk head seeking 1945 */ 1946 daddr_t 1947 contigpref(ufsvfs_t *ufsvfsp, size_t nb) 1948 { 1949 struct fs *fs = ufsvfsp->vfs_fs; 1950 daddr_t nblk = lblkno(fs, blkroundup(fs, nb)); 1951 daddr_t savebno, curbno, cgbno; 1952 int cg, cgblks, savecg, savenblk, curnblk; 1953 uchar_t *blksfree; 1954 buf_t *bp; 1955 struct cg *cgp; 1956 1957 savenblk = 0; 1958 savecg = 0; 1959 savebno = 0; 1960 for (cg = 0; cg < fs->fs_ncg; ++cg) { 1961 1962 /* not enough free blks for a contig check */ 1963 if (fs->fs_cs(fs, cg).cs_nbfree < nblk) 1964 continue; 1965 1966 /* 1967 * find the largest contiguous range in this cg 1968 */ 1969 bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, 1970 (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 1971 (int)fs->fs_cgsize); 1972 cgp = bp->b_un.b_cg; 1973 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 1974 brelse(bp); 1975 continue; 1976 } 1977 blksfree = cg_blksfree(cgp); /* free array */ 1978 cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */ 1979 cgbno = 0; 1980 while (cgbno < cgblks && savenblk < nblk) { 1981 /* find a free block */ 1982 for (; cgbno < cgblks; ++cgbno) 1983 if (isblock(fs, blksfree, cgbno)) 1984 break; 1985 curbno = cgbno; 1986 /* count the number of free blocks */ 1987 for (curnblk = 0; cgbno < cgblks; ++cgbno) { 1988 if (!isblock(fs, blksfree, cgbno)) 1989 break; 1990 if (++curnblk >= nblk) 1991 break; 1992 } 1993 if (curnblk > savenblk) { 1994 savecg = cg; 1995 savenblk = curnblk; 1996 savebno = curbno; 1997 } 1998 } 1999 brelse(bp); 2000 if (savenblk >= nblk) 2001 break; 2002 } 2003 2004 /* convert block offset in cg to frag offset in cg */ 2005 savebno = blkstofrags(fs, savebno); 2006 2007 /* convert frag offset in cg to frag offset in fs */ 2008 savebno += (savecg * fs->fs_fpg); 2009 2010 return (savebno); 2011 } 2012
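
/*
 * Illustrative sketch (not part of this driver): the cylinder-group probe
 * order used by hashalloc() above, pulled out into a standalone function so
 * the search policy can be read in isolation.  The names UFS_ALLOC_EXAMPLE,
 * probe_order() and visit() are hypothetical; the real code calls the
 * supplied allocator at each step and stops at the first success.
 */
#ifdef UFS_ALLOC_EXAMPLE
#include <stdio.h>

static void
visit(int cg)
{
	(void) printf("probe cg %d\n", cg);
}

/*
 * Emit the cylinder groups in the order hashalloc() would try them:
 * 1) the preferred group, 2) quadratic rehash (offsets 1, 2, 4, 8, ...),
 * 3) brute-force scan of the remaining groups starting at icg + 2.
 */
static void
probe_order(int icg, int ncg)
{
	int cg = icg;
	int i;

	visit(cg);				/* 1: preferred group */

	for (i = 1; i < ncg; i *= 2) {		/* 2: quadratic rehash */
		cg += i;
		if (cg >= ncg)
			cg -= ncg;
		visit(cg);
	}

	cg = (icg + 2) % ncg;			/* 3: brute force */
	for (i = 2; i < ncg; i++) {
		visit(cg);
		cg++;
		if (cg == ncg)
			cg = 0;
	}
}

int
main(void)
{
	probe_order(3, 16);	/* e.g., preference in cg 3 of a 16-cg fs */
	return (0);
}
#endif	/* UFS_ALLOC_EXAMPLE */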