1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #include <sys/condvar_impl.h> 40 #include <sys/types.h> 41 #include <sys/t_lock.h> 42 #include <sys/debug.h> 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/signal.h> 46 #include <sys/cred.h> 47 #include <sys/proc.h> 48 #include <sys/disp.h> 49 #include <sys/user.h> 50 #include <sys/buf.h> 51 #include <sys/vfs.h> 52 #include <sys/vnode.h> 53 #include <sys/acl.h> 54 #include <sys/fs/ufs_fs.h> 55 #include <sys/fs/ufs_inode.h> 56 #include <sys/fs/ufs_acl.h> 57 #include <sys/fs/ufs_bio.h> 58 #include <sys/fs/ufs_quota.h> 59 #include <sys/kmem.h> 60 #include <sys/fs/ufs_trans.h> 61 #include <sys/fs/ufs_panic.h> 62 #include <sys/errno.h> 63 #include <sys/time.h> 64 #include <sys/sysmacros.h> 65 #include <sys/file.h> 66 #include <sys/fcntl.h> 67 #include <sys/flock.h> 68 #include <fs/fs_subr.h> 69 #include <sys/cmn_err.h> 70 #include <sys/policy.h> 71 #include <sys/fs/ufs_log.h> 72 73 static ino_t hashalloc(); 74 static daddr_t fragextend(); 75 static daddr_t alloccg(); 76 static daddr_t alloccgblk(); 77 static ino_t ialloccg(); 78 static daddr_t mapsearch(); 79 static int findlogstartcg(); 80 81 extern int inside[], around[]; 82 extern uchar_t *fragtbl[]; 83 void delay(); 84 85 /* 86 * Allocate a block in the file system. 87 * 88 * The size of the requested block is given, which must be some 89 * multiple of fs_fsize and <= fs_bsize. 90 * A preference may be optionally specified. If a preference is given 91 * the following hierarchy is used to allocate a block: 92 * 1) allocate the requested block. 93 * 2) allocate a rotationally optimal block in the same cylinder. 94 * 3) allocate a block in the same cylinder group. 95 * 4) quadratically rehash into other cylinder groups, until an 96 * available block is located. 97 * If no block preference is given the following hierarchy is used 98 * to allocate a block: 99 * 1) allocate a block in the cylinder group that contains the 100 * inode for the file. 101 * 2) quadratically rehash into other cylinder groups, until an 102 * available block is located. 103 */ 104 int 105 alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr) 106 { 107 struct fs *fs; 108 struct ufsvfs *ufsvfsp; 109 daddr_t bno; 110 int cg; 111 int err; 112 char *errmsg = NULL; 113 size_t len; 114 clock_t now; 115 116 ufsvfsp = ip->i_ufsvfs; 117 fs = ufsvfsp->vfs_fs; 118 if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) { 119 err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx," 120 " bsize = %d, size = %d, fs = %s\n", 121 ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); 122 return (err); 123 } 124 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 125 goto nospace; 126 if (freespace(fs, ufsvfsp) <= 0 && 127 secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0) 128 goto nospace; 129 err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len); 130 /* Note that may not have err, but may have errmsg */ 131 if (errmsg != NULL) { 132 uprintf(errmsg); 133 kmem_free(errmsg, len); 134 errmsg = NULL; 135 } 136 if (err) 137 return (err); 138 if (bpref >= fs->fs_size) 139 bpref = 0; 140 if (bpref == 0) 141 cg = (int)itog(fs, ip->i_number); 142 else 143 cg = dtog(fs, bpref); 144 145 bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size, 146 (ulong_t (*)())alloccg); 147 if (bno > 0) { 148 *bnp = bno; 149 return (0); 150 } 151 152 /* 153 * hashalloc() failed because some other thread grabbed 154 * the last block so unwind the quota operation. We can 155 * ignore the return because subtractions don't fail and 156 * size is guaranteed to be >= zero by our caller. 157 */ 158 (void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL, 159 (size_t *)NULL); 160 161 nospace: 162 now = ddi_get_lbolt(); 163 mutex_enter(&ufsvfsp->vfs_lock); 164 if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) && 165 (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) { 166 ufsvfsp->vfs_lastwhinetime = now; 167 cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt); 168 } 169 mutex_exit(&ufsvfsp->vfs_lock); 170 return (ENOSPC); 171 } 172 173 /* 174 * Reallocate a fragment to a bigger size 175 * 176 * The number and size of the old block is given, and a preference 177 * and new size is also specified. The allocator attempts to extend 178 * the original block. Failing that, the regular block allocator is 179 * invoked to get an appropriate block. 180 */ 181 int 182 realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize, 183 int nsize, daddr_t *bnp, cred_t *cr) 184 { 185 daddr_t bno; 186 struct fs *fs; 187 struct ufsvfs *ufsvfsp; 188 int cg, request; 189 int err; 190 char *errmsg = NULL; 191 size_t len; 192 clock_t now; 193 194 ufsvfsp = ip->i_ufsvfs; 195 fs = ufsvfsp->vfs_fs; 196 if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || 197 (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { 198 err = ufs_fault(ITOV(ip), 199 "realloccg: bad size, dev=0x%lx, bsize=%d, " 200 "osize=%d, nsize=%d, fs=%s\n", 201 ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt); 202 return (err); 203 } 204 if (freespace(fs, ufsvfsp) <= 0 && 205 secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0) 206 goto nospace; 207 if (bprev == 0) { 208 err = ufs_fault(ITOV(ip), 209 "realloccg: bad bprev, dev = 0x%lx, bsize = %d," 210 " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev, 211 fs->fs_fsmnt); 212 return (err); 213 } 214 err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len); 215 /* Note that may not have err, but may have errmsg */ 216 if (errmsg != NULL) { 217 uprintf(errmsg); 218 kmem_free(errmsg, len); 219 errmsg = NULL; 220 } 221 if (err) 222 return (err); 223 cg = dtog(fs, bprev); 224 bno = fragextend(ip, cg, (long)bprev, osize, nsize); 225 if (bno != 0) { 226 *bnp = bno; 227 return (0); 228 } 229 if (bpref >= fs->fs_size) 230 bpref = 0; 231 232 /* 233 * When optimizing for time we allocate a full block and 234 * then only use the upper portion for this request. When 235 * this file grows again it will grow into the unused portion 236 * of the block (See fragextend() above). This saves time 237 * because an extra disk write would be needed if the frags 238 * following the current allocation were not free. The extra 239 * disk write is needed to move the data from its current 240 * location into the newly allocated position. 241 * 242 * When optimizing for space we allocate a run of frags 243 * that is just the right size for this request. 244 */ 245 request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize; 246 bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request, 247 (ulong_t (*)())alloccg); 248 if (bno > 0) { 249 *bnp = bno; 250 if (nsize < request) 251 (void) free(ip, bno + numfrags(fs, nsize), 252 (off_t)(request - nsize), I_NOCANCEL); 253 return (0); 254 } 255 256 /* 257 * hashalloc() failed because some other thread grabbed 258 * the last block so unwind the quota operation. We can 259 * ignore the return because subtractions don't fail, and 260 * our caller guarantees nsize >= osize. 261 */ 262 (void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL, 263 (size_t *)NULL); 264 265 nospace: 266 now = ddi_get_lbolt(); 267 mutex_enter(&ufsvfsp->vfs_lock); 268 if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) && 269 (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) { 270 ufsvfsp->vfs_lastwhinetime = now; 271 cmn_err(CE_NOTE, 272 "realloccg %s: file system full", fs->fs_fsmnt); 273 } 274 mutex_exit(&ufsvfsp->vfs_lock); 275 return (ENOSPC); 276 } 277 278 /* 279 * Allocate an inode in the file system. 280 * 281 * A preference may be optionally specified. If a preference is given 282 * the following hierarchy is used to allocate an inode: 283 * 1) allocate the requested inode. 284 * 2) allocate an inode in the same cylinder group. 285 * 3) quadratically rehash into other cylinder groups, until an 286 * available inode is located. 287 * If no inode preference is given the following hierarchy is used 288 * to allocate an inode: 289 * 1) allocate an inode in cylinder group 0. 290 * 2) quadratically rehash into other cylinder groups, until an 291 * available inode is located. 292 */ 293 int 294 ufs_ialloc(struct inode *pip, 295 ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr) 296 { 297 struct inode *ip; 298 struct fs *fs; 299 int cg; 300 ino_t ino; 301 int err; 302 int nifree; 303 struct ufsvfs *ufsvfsp = pip->i_ufsvfs; 304 char *errmsg = NULL; 305 size_t len; 306 307 ASSERT(RW_WRITE_HELD(&pip->i_rwlock)); 308 fs = pip->i_fs; 309 loop: 310 nifree = fs->fs_cstotal.cs_nifree; 311 312 if (nifree == 0) 313 goto noinodes; 314 /* 315 * Shadow inodes don't count against a user's inode allocation. 316 * They are an implementation method and not a resource. 317 */ 318 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 319 err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data, 320 /* change */ 1, (struct inode *)NULL, crgetuid(cr), 0, 321 cr, &errmsg, &len); 322 /* 323 * As we haven't acquired any locks yet, dump the message 324 * now. 325 */ 326 if (errmsg != NULL) { 327 uprintf(errmsg); 328 kmem_free(errmsg, len); 329 errmsg = NULL; 330 } 331 if (err) 332 return (err); 333 } 334 335 if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg)) 336 ipref = 0; 337 cg = (int)itog(fs, ipref); 338 ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode, 339 (ulong_t (*)())ialloccg); 340 if (ino == 0) { 341 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 342 /* 343 * We can safely ignore the return from chkiq() 344 * because deallocations can only fail if we 345 * can't get the user's quota info record off 346 * the disk due to an I/O error. In that case, 347 * the quota subsystem is already messed up. 348 */ 349 (void) chkiq(ufsvfsp, /* change */ -1, 350 (struct inode *)NULL, crgetuid(cr), 0, cr, 351 (char **)NULL, (size_t *)NULL); 352 } 353 goto noinodes; 354 } 355 err = ufs_iget(pip->i_vfs, ino, ipp, cr); 356 if (err) { 357 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 358 /* 359 * See above comment about why it is safe to ignore an 360 * error return here. 361 */ 362 (void) chkiq(ufsvfsp, /* change */ -1, 363 (struct inode *)NULL, crgetuid(cr), 0, cr, 364 (char **)NULL, (size_t *)NULL); 365 } 366 ufs_ifree(pip, ino, 0); 367 return (err); 368 } 369 ip = *ipp; 370 ASSERT(!ip->i_ufs_acl); 371 ASSERT(!ip->i_dquot); 372 rw_enter(&ip->i_contents, RW_WRITER); 373 374 /* 375 * Check if we really got a free inode, if not then complain 376 * and mark the inode ISTALE so that it will be freed by the 377 * ufs idle thread eventually and will not be sent to ufs_delete(). 378 */ 379 if (ip->i_mode || (ip->i_nlink > 0)) { 380 ip->i_flag |= ISTALE; 381 rw_exit(&ip->i_contents); 382 VN_RELE(ITOV(ip)); 383 cmn_err(CE_WARN, 384 "%s: unexpected allocated inode %d, run fsck(1M)%s", 385 fs->fs_fsmnt, (int)ino, 386 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); 387 goto loop; 388 } 389 390 /* 391 * Check the inode has no size or data blocks. 392 * This could have happened if the truncation failed when 393 * deleting the inode. It used to be possible for this to occur 394 * if a block allocation failed when iteratively truncating a 395 * large file using logging and with a full file system. 396 * This was fixed with bug fix 4348738. However, truncation may 397 * still fail on an IO error. So in all cases for safety and 398 * security we clear out the size; the blocks allocated; and 399 * pointers to the blocks. This will ultimately cause a fsck 400 * error of un-accounted for blocks, but its a fairly benign error, 401 * and possibly the correct thing to do anyway as accesssing those 402 * blocks agains may lead to more IO errors. 403 */ 404 if (ip->i_size || ip->i_blocks) { 405 int i; 406 407 if (ip->i_size) { 408 cmn_err(CE_WARN, 409 "%s: free inode %d had size 0x%llx, run fsck(1M)%s", 410 fs->fs_fsmnt, (int)ino, ip->i_size, 411 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); 412 } 413 /* 414 * Clear any garbage left behind. 415 */ 416 ip->i_size = (u_offset_t)0; 417 ip->i_blocks = 0; 418 for (i = 0; i < NDADDR; i++) 419 ip->i_db[i] = 0; 420 for (i = 0; i < NIADDR; i++) 421 ip->i_ib[i] = 0; 422 } 423 424 /* 425 * Initialize the link count 426 */ 427 ip->i_nlink = 0; 428 429 /* 430 * Clear the old flags 431 */ 432 ip->i_flag &= IREF; 433 434 /* 435 * Access times are not really defined if the fs is mounted 436 * with 'noatime'. But it can cause nfs clients to fail 437 * open() if the atime is not a legal value. Set a legal value 438 * here when the inode is allocated. 439 */ 440 if (ufsvfsp->vfs_noatime) { 441 mutex_enter(&ufs_iuniqtime_lock); 442 ip->i_atime = iuniqtime; 443 mutex_exit(&ufs_iuniqtime_lock); 444 } 445 rw_exit(&ip->i_contents); 446 return (0); 447 noinodes: 448 if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET)) 449 cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt); 450 return (ENOSPC); 451 } 452 453 /* 454 * Find a cylinder group to place a directory. 455 * Returns an inumber within the selected cylinder group. 456 * Note, the vfs_lock is not needed as we don't require exact cg summary info. 457 * 458 * If the switch ufs_close_dirs is set, then the policy is to use 459 * the current cg if it has more than 25% free inodes and more 460 * than 25% free blocks. Otherwise the cgs are searched from 461 * the beginning and the first cg with the same criteria is 462 * used. If that is also null then we revert to the old algorithm. 463 * This tends to cluster files at the beginning of the disk 464 * until the disk gets full. 465 * 466 * Otherwise if ufs_close_dirs is not set then the original policy is 467 * used which is to select from among those cylinder groups with 468 * above the average number of free inodes, the one with the smallest 469 * number of directories. 470 */ 471 472 int ufs_close_dirs = 1; /* allocate directories close as possible */ 473 474 ino_t 475 dirpref(inode_t *dp) 476 { 477 int cg, minndir, mincg, avgifree, mininode, minbpg, ifree; 478 struct fs *fs = dp->i_fs; 479 480 cg = itog(fs, dp->i_number); 481 mininode = fs->fs_ipg >> 2; 482 minbpg = fs->fs_maxbpg >> 2; 483 if (ufs_close_dirs && 484 (fs->fs_cs(fs, cg).cs_nifree > mininode) && 485 (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) { 486 return (dp->i_number); 487 } 488 489 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 490 minndir = fs->fs_ipg; 491 mincg = 0; 492 for (cg = 0; cg < fs->fs_ncg; cg++) { 493 ifree = fs->fs_cs(fs, cg).cs_nifree; 494 if (ufs_close_dirs && 495 (ifree > mininode) && 496 (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) { 497 return ((ino_t)(fs->fs_ipg * cg)); 498 } 499 if ((fs->fs_cs(fs, cg).cs_ndir < minndir) && 500 (ifree >= avgifree)) { 501 mincg = cg; 502 minndir = fs->fs_cs(fs, cg).cs_ndir; 503 } 504 } 505 return ((ino_t)(fs->fs_ipg * mincg)); 506 } 507 508 /* 509 * Select the desired position for the next block in a file. The file is 510 * logically divided into sections. The first section is composed of the 511 * direct blocks. Each additional section contains fs_maxbpg blocks. 512 * 513 * If no blocks have been allocated in the first section, the policy is to 514 * request a block in the same cylinder group as the inode that describes 515 * the file. If no blocks have been allocated in any other section, the 516 * policy is to place the section in a cylinder group with a greater than 517 * average number of free blocks. An appropriate cylinder group is found 518 * by using a rotor that sweeps the cylinder groups. When a new group of 519 * blocks is needed, the sweep begins in the cylinder group following the 520 * cylinder group from which the previous allocation was made. The sweep 521 * continues until a cylinder group with greater than the average number 522 * of free blocks is found. If the allocation is for the first block in an 523 * indirect block, the information on the previous allocation is unavailable; 524 * here a best guess is made based upon the logical block number being 525 * allocated. 526 * 527 * If a section is already partially allocated, the policy is to 528 * contiguously allocate fs_maxcontig blocks. The end of one of these 529 * contiguous blocks and the beginning of the next is physically separated 530 * so that the disk head will be in transit between them for at least 531 * fs_rotdelay milliseconds. This is to allow time for the processor to 532 * schedule another I/O transfer. 533 */ 534 daddr_t 535 blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap) 536 { 537 struct fs *fs; 538 struct ufsvfs *ufsvfsp; 539 int cg; 540 int avgbfree, startcg; 541 daddr_t nextblk; 542 543 ufsvfsp = ip->i_ufsvfs; 544 fs = ip->i_fs; 545 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 546 if (lbn < NDADDR) { 547 cg = itog(fs, ip->i_number); 548 return (fs->fs_fpg * cg + fs->fs_frag); 549 } 550 /* 551 * Find a cylinder with greater than average 552 * number of unused data blocks. 553 */ 554 if (indx == 0 || bap[indx - 1] == 0) 555 startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg; 556 else 557 startcg = dtog(fs, bap[indx - 1]) + 1; 558 startcg %= fs->fs_ncg; 559 560 mutex_enter(&ufsvfsp->vfs_lock); 561 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 562 /* 563 * used for computing log space for writes/truncs 564 */ 565 ufsvfsp->vfs_avgbfree = avgbfree; 566 for (cg = startcg; cg < fs->fs_ncg; cg++) 567 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 568 fs->fs_cgrotor = cg; 569 mutex_exit(&ufsvfsp->vfs_lock); 570 return (fs->fs_fpg * cg + fs->fs_frag); 571 } 572 for (cg = 0; cg <= startcg; cg++) 573 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 574 fs->fs_cgrotor = cg; 575 mutex_exit(&ufsvfsp->vfs_lock); 576 return (fs->fs_fpg * cg + fs->fs_frag); 577 } 578 mutex_exit(&ufsvfsp->vfs_lock); 579 return (0); 580 } 581 /* 582 * One or more previous blocks have been laid out. If less 583 * than fs_maxcontig previous blocks are contiguous, the 584 * next block is requested contiguously, otherwise it is 585 * requested rotationally delayed by fs_rotdelay milliseconds. 586 */ 587 588 nextblk = bap[indx - 1]; 589 /* 590 * Provision for fallocate to return positive 591 * blk preference based on last allocation 592 */ 593 if (nextblk < 0 && nextblk != UFS_HOLE) { 594 nextblk = (-bap[indx - 1]) + fs->fs_frag; 595 } else { 596 nextblk = bap[indx - 1] + fs->fs_frag; 597 } 598 599 if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] + 600 blkstofrags(fs, fs->fs_maxcontig) != nextblk) { 601 return (nextblk); 602 } 603 if (fs->fs_rotdelay != 0) 604 /* 605 * Here we convert ms of delay to frags as: 606 * (frags) = (ms) * (rev/sec) * (sect/rev) / 607 * ((sect/frag) * (ms/sec)) 608 * then round up to the next block. 609 */ 610 nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect / 611 (NSPF(fs) * 1000), fs->fs_frag); 612 return (nextblk); 613 } 614 615 /* 616 * Free a block or fragment. 617 * 618 * The specified block or fragment is placed back in the 619 * free map. If a fragment is deallocated, a possible 620 * block reassembly is checked. 621 */ 622 void 623 free(struct inode *ip, daddr_t bno, off_t size, int flags) 624 { 625 struct fs *fs = ip->i_fs; 626 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 627 struct ufs_q *delq = &ufsvfsp->vfs_delete; 628 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 629 struct cg *cgp; 630 struct buf *bp; 631 int cg, bmap, bbase; 632 int i; 633 uchar_t *blksfree; 634 int *blktot; 635 short *blks; 636 daddr_t blkno, cylno, rpos; 637 638 /* 639 * fallocate'd files will have negative block address. 640 * So negate it again to get original block address. 641 */ 642 if (bno < 0 && (bno % fs->fs_frag == 0) && bno != UFS_HOLE) { 643 bno = -bno; 644 } 645 646 if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) { 647 (void) ufs_fault(ITOV(ip), 648 "free: bad size, dev = 0x%lx, bsize = %d, size = %d, " 649 "fs = %s\n", ip->i_dev, fs->fs_bsize, 650 (int)size, fs->fs_fsmnt); 651 return; 652 } 653 cg = dtog(fs, bno); 654 ASSERT(!ufs_badblock(ip, bno)); 655 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 656 (int)fs->fs_cgsize); 657 658 cgp = bp->b_un.b_cg; 659 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 660 brelse(bp); 661 return; 662 } 663 664 if (!(flags & I_NOCANCEL)) 665 TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags); 666 if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) { 667 TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size); 668 } 669 blksfree = cg_blksfree(cgp); 670 blktot = cg_blktot(cgp); 671 mutex_enter(&ufsvfsp->vfs_lock); 672 cgp->cg_time = gethrestime_sec(); 673 bno = dtogd(fs, bno); 674 if (size == fs->fs_bsize) { 675 blkno = fragstoblks(fs, bno); 676 cylno = cbtocylno(fs, bno); 677 rpos = cbtorpos(ufsvfsp, bno); 678 blks = cg_blks(ufsvfsp, cgp, cylno); 679 if (!isclrblock(fs, blksfree, blkno)) { 680 mutex_exit(&ufsvfsp->vfs_lock); 681 brelse(bp); 682 (void) ufs_fault(ITOV(ip), "free: freeing free block, " 683 "dev:0x%lx, block:%ld, ino:%lu, fs:%s", 684 ip->i_dev, bno, ip->i_number, fs->fs_fsmnt); 685 return; 686 } 687 setblock(fs, blksfree, blkno); 688 blks[rpos]++; 689 blktot[cylno]++; 690 cgp->cg_cs.cs_nbfree++; /* Log below */ 691 fs->fs_cstotal.cs_nbfree++; 692 fs->fs_cs(fs, cg).cs_nbfree++; 693 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) { 694 mutex_enter(&delq->uq_mutex); 695 delq_info->delq_unreclaimed_blocks -= 696 btodb(fs->fs_bsize); 697 mutex_exit(&delq->uq_mutex); 698 } 699 } else { 700 bbase = bno - fragnum(fs, bno); 701 /* 702 * Decrement the counts associated with the old frags 703 */ 704 bmap = blkmap(fs, blksfree, bbase); 705 fragacct(fs, bmap, cgp->cg_frsum, -1); 706 /* 707 * Deallocate the fragment 708 */ 709 for (i = 0; i < numfrags(fs, size); i++) { 710 if (isset(blksfree, bno + i)) { 711 brelse(bp); 712 mutex_exit(&ufsvfsp->vfs_lock); 713 (void) ufs_fault(ITOV(ip), 714 "free: freeing free frag, " 715 "dev:0x%lx, blk:%ld, cg:%d, " 716 "ino:%lu, fs:%s", 717 ip->i_dev, 718 bno + i, 719 cgp->cg_cgx, 720 ip->i_number, 721 fs->fs_fsmnt); 722 return; 723 } 724 setbit(blksfree, bno + i); 725 } 726 cgp->cg_cs.cs_nffree += i; 727 fs->fs_cstotal.cs_nffree += i; 728 fs->fs_cs(fs, cg).cs_nffree += i; 729 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) { 730 mutex_enter(&delq->uq_mutex); 731 delq_info->delq_unreclaimed_blocks -= 732 btodb(i * fs->fs_fsize); 733 mutex_exit(&delq->uq_mutex); 734 } 735 /* 736 * Add back in counts associated with the new frags 737 */ 738 bmap = blkmap(fs, blksfree, bbase); 739 fragacct(fs, bmap, cgp->cg_frsum, 1); 740 /* 741 * If a complete block has been reassembled, account for it 742 */ 743 blkno = fragstoblks(fs, bbase); 744 if (isblock(fs, blksfree, blkno)) { 745 cylno = cbtocylno(fs, bbase); 746 rpos = cbtorpos(ufsvfsp, bbase); 747 blks = cg_blks(ufsvfsp, cgp, cylno); 748 blks[rpos]++; 749 blktot[cylno]++; 750 cgp->cg_cs.cs_nffree -= fs->fs_frag; 751 fs->fs_cstotal.cs_nffree -= fs->fs_frag; 752 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; 753 cgp->cg_cs.cs_nbfree++; 754 fs->fs_cstotal.cs_nbfree++; 755 fs->fs_cs(fs, cg).cs_nbfree++; 756 } 757 } 758 fs->fs_fmod = 1; 759 ufs_notclean(ufsvfsp); 760 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 761 TRANS_SI(ufsvfsp, fs, cg); 762 bdrwrite(bp); 763 } 764 765 /* 766 * Free an inode. 767 * 768 * The specified inode is placed back in the free map. 769 */ 770 void 771 ufs_ifree(struct inode *ip, ino_t ino, mode_t mode) 772 { 773 struct fs *fs = ip->i_fs; 774 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 775 struct cg *cgp; 776 struct buf *bp; 777 unsigned int inot; 778 int cg; 779 char *iused; 780 781 if (ip->i_number == ino && ip->i_mode != 0) { 782 (void) ufs_fault(ITOV(ip), 783 "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, " 784 "fs = %s\n", 785 ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt); 786 return; 787 } 788 if (ino >= fs->fs_ipg * fs->fs_ncg) { 789 (void) ufs_fault(ITOV(ip), 790 "ifree: range, dev = 0x%x, ino = %d, fs = %s\n", 791 (int)ip->i_dev, (int)ino, fs->fs_fsmnt); 792 return; 793 } 794 cg = (int)itog(fs, ino); 795 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 796 (int)fs->fs_cgsize); 797 798 cgp = bp->b_un.b_cg; 799 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 800 brelse(bp); 801 return; 802 } 803 mutex_enter(&ufsvfsp->vfs_lock); 804 cgp->cg_time = gethrestime_sec(); 805 iused = cg_inosused(cgp); 806 inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg); 807 if (isclr(iused, inot)) { 808 mutex_exit(&ufsvfsp->vfs_lock); 809 brelse(bp); 810 (void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, " 811 "mode: (imode) %o, (omode) %o, ino:%d, " 812 "fs:%s", 813 ip->i_mode, mode, (int)ino, fs->fs_fsmnt); 814 return; 815 } 816 clrbit(iused, inot); 817 818 if (inot < (ulong_t)cgp->cg_irotor) 819 cgp->cg_irotor = inot; 820 cgp->cg_cs.cs_nifree++; 821 fs->fs_cstotal.cs_nifree++; 822 fs->fs_cs(fs, cg).cs_nifree++; 823 if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) { 824 cgp->cg_cs.cs_ndir--; 825 fs->fs_cstotal.cs_ndir--; 826 fs->fs_cs(fs, cg).cs_ndir--; 827 } 828 fs->fs_fmod = 1; 829 ufs_notclean(ufsvfsp); 830 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 831 TRANS_SI(ufsvfsp, fs, cg); 832 bdrwrite(bp); 833 } 834 835 /* 836 * Implement the cylinder overflow algorithm. 837 * 838 * The policy implemented by this algorithm is: 839 * 1) allocate the block in its requested cylinder group. 840 * 2) quadratically rehash on the cylinder group number. 841 * 3) brute force search for a free block. 842 * The size parameter means size for data blocks, mode for inodes. 843 */ 844 static ino_t 845 hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)()) 846 { 847 struct fs *fs; 848 int i; 849 long result; 850 int icg = cg; 851 852 fs = ip->i_fs; 853 /* 854 * 1: preferred cylinder group 855 */ 856 result = (*allocator)(ip, cg, pref, size); 857 if (result) 858 return (result); 859 /* 860 * 2: quadratic rehash 861 */ 862 for (i = 1; i < fs->fs_ncg; i *= 2) { 863 cg += i; 864 if (cg >= fs->fs_ncg) 865 cg -= fs->fs_ncg; 866 result = (*allocator)(ip, cg, 0, size); 867 if (result) 868 return (result); 869 } 870 /* 871 * 3: brute force search 872 * Note that we start at i == 2, since 0 was checked initially, 873 * and 1 is always checked in the quadratic rehash. 874 */ 875 cg = (icg + 2) % fs->fs_ncg; 876 for (i = 2; i < fs->fs_ncg; i++) { 877 result = (*allocator)(ip, cg, 0, size); 878 if (result) 879 return (result); 880 cg++; 881 if (cg == fs->fs_ncg) 882 cg = 0; 883 } 884 return (0); 885 } 886 887 /* 888 * Determine whether a fragment can be extended. 889 * 890 * Check to see if the necessary fragments are available, and 891 * if they are, allocate them. 892 */ 893 static daddr_t 894 fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize) 895 { 896 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 897 struct fs *fs = ip->i_fs; 898 struct buf *bp; 899 struct cg *cgp; 900 uchar_t *blksfree; 901 long bno; 902 int frags, bbase; 903 int i, j; 904 905 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) 906 return (0); 907 frags = numfrags(fs, nsize); 908 bbase = (int)fragnum(fs, bprev); 909 if (bbase > fragnum(fs, (bprev + frags - 1))) { 910 /* cannot extend across a block boundary */ 911 return (0); 912 } 913 914 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 915 (int)fs->fs_cgsize); 916 cgp = bp->b_un.b_cg; 917 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 918 brelse(bp); 919 return (0); 920 } 921 922 blksfree = cg_blksfree(cgp); 923 mutex_enter(&ufsvfsp->vfs_lock); 924 bno = dtogd(fs, bprev); 925 for (i = numfrags(fs, osize); i < frags; i++) { 926 if (isclr(blksfree, bno + i)) { 927 mutex_exit(&ufsvfsp->vfs_lock); 928 brelse(bp); 929 return (0); 930 } 931 if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)), 932 fs->fs_fsize))) { 933 mutex_exit(&ufsvfsp->vfs_lock); 934 brelse(bp); 935 return (0); 936 } 937 } 938 939 cgp->cg_time = gethrestime_sec(); 940 /* 941 * The current fragment can be extended, 942 * deduct the count on fragment being extended into 943 * increase the count on the remaining fragment (if any) 944 * allocate the extended piece. 945 */ 946 for (i = frags; i < fs->fs_frag - bbase; i++) 947 if (isclr(blksfree, bno + i)) 948 break; 949 j = i - numfrags(fs, osize); 950 cgp->cg_frsum[j]--; 951 ASSERT(cgp->cg_frsum[j] >= 0); 952 if (i != frags) 953 cgp->cg_frsum[i - frags]++; 954 for (i = numfrags(fs, osize); i < frags; i++) { 955 clrbit(blksfree, bno + i); 956 cgp->cg_cs.cs_nffree--; 957 fs->fs_cs(fs, cg).cs_nffree--; 958 fs->fs_cstotal.cs_nffree--; 959 } 960 fs->fs_fmod = 1; 961 ufs_notclean(ufsvfsp); 962 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 963 TRANS_SI(ufsvfsp, fs, cg); 964 bdrwrite(bp); 965 return ((daddr_t)bprev); 966 } 967 968 /* 969 * Determine whether a block can be allocated. 970 * 971 * Check to see if a block of the apprpriate size 972 * is available, and if it is, allocate it. 973 */ 974 static daddr_t 975 alloccg(struct inode *ip, int cg, daddr_t bpref, int size) 976 { 977 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 978 struct fs *fs = ip->i_fs; 979 struct buf *bp; 980 struct cg *cgp; 981 uchar_t *blksfree; 982 int bno, frags; 983 int allocsiz; 984 int i; 985 986 /* 987 * Searching for space could be time expensive so do some 988 * up front checking to verify that there is actually space 989 * available (free blocks or free frags). 990 */ 991 if (fs->fs_cs(fs, cg).cs_nbfree == 0) { 992 if (size == fs->fs_bsize) 993 return (0); 994 995 /* 996 * If there are not enough free frags then return. 997 */ 998 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, size)) 999 return (0); 1000 } 1001 1002 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 1003 (int)fs->fs_cgsize); 1004 1005 cgp = bp->b_un.b_cg; 1006 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) || 1007 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { 1008 brelse(bp); 1009 return (0); 1010 } 1011 blksfree = cg_blksfree(cgp); 1012 mutex_enter(&ufsvfsp->vfs_lock); 1013 cgp->cg_time = gethrestime_sec(); 1014 if (size == fs->fs_bsize) { 1015 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0) 1016 goto errout; 1017 fs->fs_fmod = 1; 1018 ufs_notclean(ufsvfsp); 1019 TRANS_SI(ufsvfsp, fs, cg); 1020 bdrwrite(bp); 1021 return (bno); 1022 } 1023 /* 1024 * Check fragment bitmap to see if any fragments are already available. 1025 * mapsearch() may fail because the fragment that fits this request 1026 * might still be on the cancel list and not available for re-use yet. 1027 * Look for a bigger sized fragment to allocate first before we have 1028 * to give up and fragment a whole new block eventually. 1029 */ 1030 frags = numfrags(fs, size); 1031 allocsiz = frags; 1032 next_size: 1033 for (; allocsiz < fs->fs_frag; allocsiz++) 1034 if (cgp->cg_frsum[allocsiz] != 0) 1035 break; 1036 1037 if (allocsiz != fs->fs_frag) { 1038 bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz); 1039 if (bno < 0 && allocsiz < (fs->fs_frag - 1)) { 1040 allocsiz++; 1041 goto next_size; 1042 } 1043 } 1044 1045 if (allocsiz == fs->fs_frag || bno < 0) { 1046 /* 1047 * No fragments were available, so a block 1048 * will be allocated and hacked up. 1049 */ 1050 if (cgp->cg_cs.cs_nbfree == 0) 1051 goto errout; 1052 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0) 1053 goto errout; 1054 bpref = dtogd(fs, bno); 1055 for (i = frags; i < fs->fs_frag; i++) 1056 setbit(blksfree, bpref + i); 1057 i = fs->fs_frag - frags; 1058 cgp->cg_cs.cs_nffree += i; 1059 fs->fs_cstotal.cs_nffree += i; 1060 fs->fs_cs(fs, cg).cs_nffree += i; 1061 cgp->cg_frsum[i]++; 1062 fs->fs_fmod = 1; 1063 ufs_notclean(ufsvfsp); 1064 TRANS_SI(ufsvfsp, fs, cg); 1065 bdrwrite(bp); 1066 return (bno); 1067 } 1068 1069 for (i = 0; i < frags; i++) 1070 clrbit(blksfree, bno + i); 1071 cgp->cg_cs.cs_nffree -= frags; 1072 fs->fs_cstotal.cs_nffree -= frags; 1073 fs->fs_cs(fs, cg).cs_nffree -= frags; 1074 cgp->cg_frsum[allocsiz]--; 1075 ASSERT(cgp->cg_frsum[allocsiz] >= 0); 1076 if (frags != allocsiz) { 1077 cgp->cg_frsum[allocsiz - frags]++; 1078 } 1079 fs->fs_fmod = 1; 1080 ufs_notclean(ufsvfsp); 1081 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1082 TRANS_SI(ufsvfsp, fs, cg); 1083 bdrwrite(bp); 1084 return (cg * fs->fs_fpg + bno); 1085 errout: 1086 mutex_exit(&ufsvfsp->vfs_lock); 1087 brelse(bp); 1088 return (0); 1089 } 1090 1091 /* 1092 * Allocate a block in a cylinder group. 1093 * 1094 * This algorithm implements the following policy: 1095 * 1) allocate the requested block. 1096 * 2) allocate a rotationally optimal block in the same cylinder. 1097 * 3) allocate the next available block on the block rotor for the 1098 * specified cylinder group. 1099 * Note that this routine only allocates fs_bsize blocks; these 1100 * blocks may be fragmented by the routine that allocates them. 1101 */ 1102 static daddr_t 1103 alloccgblk( 1104 struct ufsvfs *ufsvfsp, 1105 struct cg *cgp, 1106 daddr_t bpref, 1107 struct buf *bp) 1108 { 1109 daddr_t bno; 1110 int cylno, pos, delta, rotbl_size; 1111 short *cylbp; 1112 int i; 1113 struct fs *fs; 1114 uchar_t *blksfree; 1115 daddr_t blkno, rpos, frag; 1116 short *blks; 1117 int32_t *blktot; 1118 1119 ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock)); 1120 fs = ufsvfsp->vfs_fs; 1121 blksfree = cg_blksfree(cgp); 1122 if (bpref == 0) { 1123 bpref = cgp->cg_rotor; 1124 goto norot; 1125 } 1126 bpref = blknum(fs, bpref); 1127 bpref = dtogd(fs, bpref); 1128 /* 1129 * If the requested block is available, use it. 1130 */ 1131 if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) { 1132 bno = bpref; 1133 goto gotit; 1134 } 1135 /* 1136 * Check for a block available on the same cylinder. 1137 */ 1138 cylno = cbtocylno(fs, bpref); 1139 if (cg_blktot(cgp)[cylno] == 0) 1140 goto norot; 1141 if (fs->fs_cpc == 0) { 1142 /* 1143 * Block layout info is not available, so just 1144 * have to take any block in this cylinder. 1145 */ 1146 bpref = howmany(fs->fs_spc * cylno, NSPF(fs)); 1147 goto norot; 1148 } 1149 /* 1150 * Check the summary information to see if a block is 1151 * available in the requested cylinder starting at the 1152 * requested rotational position and proceeding around. 1153 */ 1154 cylbp = cg_blks(ufsvfsp, cgp, cylno); 1155 pos = cbtorpos(ufsvfsp, bpref); 1156 for (i = pos; i < ufsvfsp->vfs_nrpos; i++) 1157 if (cylbp[i] > 0) 1158 break; 1159 if (i == ufsvfsp->vfs_nrpos) 1160 for (i = 0; i < pos; i++) 1161 if (cylbp[i] > 0) 1162 break; 1163 if (cylbp[i] > 0) { 1164 /* 1165 * Found a rotational position, now find the actual 1166 * block. A "panic" if none is actually there. 1167 */ 1168 1169 /* 1170 * Up to this point, "pos" has referred to the rotational 1171 * position of the desired block. From now on, it holds 1172 * the offset of the current cylinder within a cylinder 1173 * cycle. (A cylinder cycle refers to a set of cylinders 1174 * which are described by a single rotational table; the 1175 * size of the cycle is fs_cpc.) 1176 * 1177 * bno is set to the block number of the first block within 1178 * the current cylinder cycle. 1179 */ 1180 1181 pos = cylno % fs->fs_cpc; 1182 bno = (cylno - pos) * fs->fs_spc / NSPB(fs); 1183 1184 /* 1185 * The blocks within a cylinder are grouped into equivalence 1186 * classes according to their "rotational position." There 1187 * are two tables used to determine these classes. 1188 * 1189 * The positional offset table (fs_postbl) has an entry for 1190 * each rotational position of each cylinder in a cylinder 1191 * cycle. This entry contains the relative block number 1192 * (counting from the start of the cylinder cycle) of the 1193 * first block in the equivalence class for that position 1194 * and that cylinder. Positions for which no blocks exist 1195 * are indicated by a -1. 1196 * 1197 * The rotational delta table (fs_rotbl) has an entry for 1198 * each block in a cylinder cycle. This entry contains 1199 * the offset from that block to the next block in the 1200 * same equivalence class. The last block in the class 1201 * is indicated by a zero in the table. 1202 * 1203 * The following code, then, walks through all of the blocks 1204 * in the cylinder (cylno) which we're allocating within 1205 * which are in the equivalence class for the rotational 1206 * position (i) which we're allocating within. 1207 */ 1208 1209 if (fs_postbl(ufsvfsp, pos)[i] == -1) { 1210 (void) ufs_fault(ufsvfsp->vfs_root, 1211 "alloccgblk: cyl groups corrupted, pos = %d, " 1212 "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt); 1213 return (0); 1214 } 1215 1216 /* 1217 * There is one entry in the rotational table for each block 1218 * in the cylinder cycle. These are whole blocks, not frags. 1219 */ 1220 1221 rotbl_size = (fs->fs_cpc * fs->fs_spc) >> 1222 (fs->fs_fragshift + fs->fs_fsbtodb); 1223 1224 /* 1225 * As we start, "i" is the rotational position within which 1226 * we're searching. After the next line, it will be a block 1227 * number (relative to the start of the cylinder cycle) 1228 * within the equivalence class of that rotational position. 1229 */ 1230 1231 i = fs_postbl(ufsvfsp, pos)[i]; 1232 1233 for (;;) { 1234 if (isblock(fs, blksfree, (daddr_t)(bno + i))) { 1235 bno = blkstofrags(fs, (bno + i)); 1236 goto gotit; 1237 } 1238 delta = fs_rotbl(fs)[i]; 1239 if (delta <= 0 || /* End of chain, or */ 1240 delta + i > rotbl_size) /* end of table? */ 1241 break; /* If so, panic. */ 1242 i += delta; 1243 } 1244 (void) ufs_fault(ufsvfsp->vfs_root, 1245 "alloccgblk: can't find blk in cyl, pos:%d, i:%d, " 1246 "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno); 1247 return (0); 1248 } 1249 norot: 1250 /* 1251 * No blocks in the requested cylinder, so take 1252 * next available one in this cylinder group. 1253 */ 1254 bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag); 1255 if (bno < 0) 1256 return (0); 1257 cgp->cg_rotor = bno; 1258 gotit: 1259 blkno = fragstoblks(fs, bno); 1260 frag = (cgp->cg_cgx * fs->fs_fpg) + bno; 1261 if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize)) 1262 goto norot; 1263 clrblock(fs, blksfree, (long)blkno); 1264 /* 1265 * the other cg/sb/si fields are TRANS'ed by the caller 1266 */ 1267 cgp->cg_cs.cs_nbfree--; 1268 fs->fs_cstotal.cs_nbfree--; 1269 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; 1270 cylno = cbtocylno(fs, bno); 1271 blks = cg_blks(ufsvfsp, cgp, cylno); 1272 rpos = cbtorpos(ufsvfsp, bno); 1273 blktot = cg_blktot(cgp); 1274 blks[rpos]--; 1275 blktot[cylno]--; 1276 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1277 fs->fs_fmod = 1; 1278 return (frag); 1279 } 1280 1281 /* 1282 * Determine whether an inode can be allocated. 1283 * 1284 * Check to see if an inode is available, and if it is, 1285 * allocate it using the following policy: 1286 * 1) allocate the requested inode. 1287 * 2) allocate the next available inode after the requested 1288 * inode in the specified cylinder group. 1289 */ 1290 static ino_t 1291 ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode) 1292 { 1293 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 1294 struct fs *fs = ip->i_fs; 1295 struct cg *cgp; 1296 struct buf *bp; 1297 int start, len, loc, map, i; 1298 char *iused; 1299 1300 if (fs->fs_cs(fs, cg).cs_nifree == 0) 1301 return (0); 1302 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 1303 (int)fs->fs_cgsize); 1304 1305 cgp = bp->b_un.b_cg; 1306 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) || 1307 cgp->cg_cs.cs_nifree == 0) { 1308 brelse(bp); 1309 return (0); 1310 } 1311 iused = cg_inosused(cgp); 1312 mutex_enter(&ufsvfsp->vfs_lock); 1313 /* 1314 * While we are waiting for the mutex, someone may have taken 1315 * the last available inode. Need to recheck. 1316 */ 1317 if (cgp->cg_cs.cs_nifree == 0) { 1318 mutex_exit(&ufsvfsp->vfs_lock); 1319 brelse(bp); 1320 return (0); 1321 } 1322 1323 cgp->cg_time = gethrestime_sec(); 1324 if (ipref) { 1325 ipref %= fs->fs_ipg; 1326 if (isclr(iused, ipref)) 1327 goto gotit; 1328 } 1329 start = cgp->cg_irotor / NBBY; 1330 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); 1331 loc = skpc(0xff, (uint_t)len, &iused[start]); 1332 if (loc == 0) { 1333 len = start + 1; 1334 start = 0; 1335 loc = skpc(0xff, (uint_t)len, &iused[0]); 1336 if (loc == 0) { 1337 mutex_exit(&ufsvfsp->vfs_lock); 1338 (void) ufs_fault(ITOV(ip), 1339 "ialloccg: map corrupted, cg = %d, irotor = %d, " 1340 "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt); 1341 return (0); 1342 } 1343 } 1344 i = start + len - loc; 1345 map = iused[i]; 1346 ipref = i * NBBY; 1347 for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) { 1348 if ((map & i) == 0) { 1349 cgp->cg_irotor = ipref; 1350 goto gotit; 1351 } 1352 } 1353 1354 mutex_exit(&ufsvfsp->vfs_lock); 1355 (void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s", 1356 fs->fs_fsmnt); 1357 return (0); 1358 gotit: 1359 setbit(iused, ipref); 1360 cgp->cg_cs.cs_nifree--; 1361 fs->fs_cstotal.cs_nifree--; 1362 fs->fs_cs(fs, cg).cs_nifree--; 1363 if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) { 1364 cgp->cg_cs.cs_ndir++; 1365 fs->fs_cstotal.cs_ndir++; 1366 fs->fs_cs(fs, cg).cs_ndir++; 1367 } 1368 fs->fs_fmod = 1; 1369 ufs_notclean(ufsvfsp); 1370 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1371 TRANS_SI(ufsvfsp, fs, cg); 1372 bdrwrite(bp); 1373 return (cg * fs->fs_ipg + ipref); 1374 } 1375 1376 /* 1377 * Find a block of the specified size in the specified cylinder group. 1378 * 1379 * It is a panic if a request is made to find a block if none are 1380 * available. 1381 */ 1382 static daddr_t 1383 mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref, int allocsiz) 1384 { 1385 struct fs *fs = ufsvfsp->vfs_fs; 1386 daddr_t bno, cfrag; 1387 int start, len, loc, i, last, first, secondtime; 1388 int blk, field, subfield, pos; 1389 int gotit; 1390 1391 /* 1392 * ufsvfs->vfs_lock is held when calling this. 1393 */ 1394 /* 1395 * Find the fragment by searching through the 1396 * free block map for an appropriate bit pattern. 1397 */ 1398 if (bpref) 1399 start = dtogd(fs, bpref) / NBBY; 1400 else 1401 start = cgp->cg_frotor / NBBY; 1402 /* 1403 * the following loop performs two scans -- the first scan 1404 * searches the bottom half of the array for a match and the 1405 * second scan searches the top half of the array. The loops 1406 * have been merged just to make things difficult. 1407 */ 1408 first = start; 1409 last = howmany(fs->fs_fpg, NBBY); 1410 secondtime = 0; 1411 cfrag = cgp->cg_cgx * fs->fs_fpg; 1412 while (first < last) { 1413 len = last - first; 1414 /* 1415 * search the array for a match 1416 */ 1417 loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first], 1418 (uchar_t *)fragtbl[fs->fs_frag], 1419 (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 1420 /* 1421 * match found 1422 */ 1423 if (loc) { 1424 bno = (last - loc) * NBBY; 1425 1426 /* 1427 * Found the byte in the map, sift 1428 * through the bits to find the selected frag 1429 */ 1430 cgp->cg_frotor = bno; 1431 gotit = 0; 1432 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { 1433 blk = blkmap(fs, cg_blksfree(cgp), bno); 1434 blk <<= 1; 1435 field = around[allocsiz]; 1436 subfield = inside[allocsiz]; 1437 for (pos = 0; 1438 pos <= fs->fs_frag - allocsiz; 1439 pos++) { 1440 if ((blk & field) == subfield) { 1441 gotit++; 1442 break; 1443 } 1444 field <<= 1; 1445 subfield <<= 1; 1446 } 1447 if (gotit) 1448 break; 1449 } 1450 bno += pos; 1451 1452 /* 1453 * success if block is *not* being converted from 1454 * metadata into userdata (harpy). If so, ignore. 1455 */ 1456 if (!TRANS_ISCANCEL(ufsvfsp, 1457 ldbtob(fsbtodb(fs, (cfrag+bno))), 1458 allocsiz * fs->fs_fsize)) 1459 return (bno); 1460 1461 /* 1462 * keep looking -- this block is being converted 1463 */ 1464 first = (last - loc) + 1; 1465 loc = 0; 1466 if (first < last) 1467 continue; 1468 } 1469 /* 1470 * no usable matches in bottom half -- now search the top half 1471 */ 1472 if (secondtime) 1473 /* 1474 * no usable matches in top half -- all done 1475 */ 1476 break; 1477 secondtime = 1; 1478 last = start + 1; 1479 first = 0; 1480 } 1481 /* 1482 * no usable matches 1483 */ 1484 return ((daddr_t)-1); 1485 } 1486 1487 #define UFSNADDR (NDADDR + NIADDR) /* NADDR applies to (obsolete) S5FS */ 1488 #define IB(i) (NDADDR + (i)) /* index of i'th indirect block ptr */ 1489 #define SINGLE 0 /* single indirect block ptr */ 1490 #define DOUBLE 1 /* double indirect block ptr */ 1491 #define TRIPLE 2 /* triple indirect block ptr */ 1492 1493 /* 1494 * Acquire a write lock, and keep trying till we get it 1495 */ 1496 static int 1497 allocsp_wlockfs(struct vnode *vp, struct lockfs *lf) 1498 { 1499 int err = 0; 1500 1501 lockagain: 1502 do { 1503 err = ufs_fiolfss(vp, lf); 1504 if (err) 1505 return (err); 1506 } while (!LOCKFS_IS_ULOCK(lf)); 1507 1508 lf->lf_lock = LOCKFS_WLOCK; 1509 lf->lf_flags = 0; 1510 lf->lf_comment = NULL; 1511 err = ufs__fiolfs(vp, lf, 1, 0); 1512 1513 if (err == EBUSY || err == EINVAL) 1514 goto lockagain; 1515 1516 return (err); 1517 } 1518 1519 /* 1520 * Release the write lock 1521 */ 1522 static int 1523 allocsp_unlockfs(struct vnode *vp, struct lockfs *lf) 1524 { 1525 int err = 0; 1526 1527 lf->lf_lock = LOCKFS_ULOCK; 1528 lf->lf_flags = 0; 1529 err = ufs__fiolfs(vp, lf, 1, 0); 1530 return (err); 1531 } 1532 1533 struct allocsp_undo { 1534 daddr_t offset; 1535 daddr_t blk; 1536 struct allocsp_undo *next; 1537 }; 1538 1539 /* 1540 * ufs_allocsp() can be used to pre-allocate blocks for a file on a given 1541 * file system. For direct blocks, the blocks are allocated from the offset 1542 * requested to the block boundary, then any full blocks are allocated, 1543 * and finally any remainder. 1544 * For indirect blocks the blocks are not initialized and are 1545 * only marked as allocated. These addresses are then stored as negative 1546 * block numbers in the inode to imply special handling. UFS has been modified 1547 * where necessary to understand this new notion. 1548 * Successfully fallocated files will have IFALLOCATE cflag set in the inode. 1549 */ 1550 int 1551 ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr) 1552 { 1553 struct lockfs lf; 1554 int berr, err, resv, issync; 1555 off_t istart, len; /* istart, special for idb */ 1556 struct inode *ip; 1557 struct fs *fs; 1558 struct ufsvfs *ufsvfsp; 1559 u_offset_t resid, i, uoff; 1560 daddr32_t db_undo[NDADDR]; /* old direct blocks */ 1561 struct allocsp_undo *ib_undo = NULL; /* ib undo */ 1562 struct allocsp_undo *undo = NULL; 1563 u_offset_t osz; /* old file size */ 1564 int chunkblks = 0; /* # of blocks in 1 allocation */ 1565 int cnt = 0; 1566 daddr_t allocblk; 1567 daddr_t totblks = 0; 1568 struct ulockfs *ulp; 1569 size_t done_len; 1570 int nbytes, offsetn; 1571 1572 1573 ASSERT(vp->v_type == VREG); 1574 1575 ip = VTOI(vp); 1576 fs = ip->i_fs; 1577 if ((ufsvfsp = ip->i_ufsvfs) == NULL) { 1578 err = EIO; 1579 goto out_allocsp; 1580 } 1581 1582 istart = blkroundup(fs, (lp->l_start)); 1583 len = blkroundup(fs, (lp->l_len)); 1584 chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize; 1585 ulp = &ufsvfsp->vfs_ulockfs; 1586 1587 if (lp->l_start < 0 || lp->l_len <= 0) 1588 return (EINVAL); 1589 1590 /* Quickly check to make sure we have space before we proceed */ 1591 if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) { 1592 if (TRANS_ISTRANS(ufsvfsp)) { 1593 ufs_delete_drain_wait(ufsvfsp, 1); 1594 if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) 1595 return (ENOSPC); 1596 } else 1597 return (ENOSPC); 1598 } 1599 1600 /* 1601 * We will keep i_rwlock locked as WRITER through out the function 1602 * since we don't want anyone else reading or writing to the inode 1603 * while we are in the middle of fallocating the file. 1604 */ 1605 rw_enter(&ip->i_rwlock, RW_WRITER); 1606 1607 /* Back up the direct block list, used for undo later if necessary */ 1608 rw_enter(&ip->i_contents, RW_READER); 1609 for (i = 0; i < NDADDR; i++) 1610 db_undo[i] = ip->i_db[i]; 1611 osz = ip->i_size; 1612 rw_exit(&ip->i_contents); 1613 1614 /* Write lock the file system */ 1615 if (err = allocsp_wlockfs(vp, &lf)) 1616 goto exit; 1617 1618 /* 1619 * Allocate any direct blocks now. 1620 * Blocks are allocated from the offset requested to the block 1621 * boundary, then any full blocks are allocated, and finally any 1622 * remainder. 1623 */ 1624 if (lblkno(fs, lp->l_start) < NDADDR) { 1625 ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize), 1626 &resv, &resid); 1627 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1628 1629 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1630 rw_enter(&ip->i_contents, RW_WRITER); 1631 1632 done_len = 0; 1633 while ((done_len < lp->l_len) && 1634 (lblkno(fs, lp->l_start + done_len) < NDADDR)) { 1635 uoff = (offset_t)(lp->l_start + done_len); 1636 offsetn = (int)blkoff(fs, uoff); 1637 nbytes = (int)MIN(fs->fs_bsize - offsetn, 1638 lp->l_len - done_len); 1639 1640 berr = bmap_write(ip, uoff, offsetn + nbytes, 1641 BI_FALLOCATE, &allocblk, cr); 1642 /* Yikes error, quit */ 1643 if (berr) { 1644 TRANS_INODE(ufsvfsp, ip); 1645 rw_exit(&ip->i_contents); 1646 rw_exit(&ufsvfsp->vfs_dqrwlock); 1647 TRANS_END_CSYNC(ufsvfsp, err, issync, 1648 TOP_ALLOCSP, resv); 1649 err = allocsp_unlockfs(vp, &lf); 1650 goto exit; 1651 } 1652 1653 if (allocblk) { 1654 totblks++; 1655 if ((uoff + nbytes) > ip->i_size) 1656 ip->i_size = (uoff + nbytes); 1657 } 1658 done_len += nbytes; 1659 } 1660 1661 TRANS_INODE(ufsvfsp, ip); 1662 rw_exit(&ip->i_contents); 1663 rw_exit(&ufsvfsp->vfs_dqrwlock); 1664 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1665 1666 /* start offset for indirect allocation */ 1667 istart = (uoff + nbytes); 1668 } 1669 1670 /* Break the transactions into vfs_iotransz units */ 1671 ufs_trans_trunc_resv(ip, ip->i_size + 1672 blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid); 1673 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1674 1675 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1676 rw_enter(&ip->i_contents, RW_WRITER); 1677 1678 /* Now go about fallocating necessary indirect blocks */ 1679 for (i = istart; i < (lp->l_start + lp->l_len); i += fs->fs_bsize) { 1680 berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE, 1681 &allocblk, cr); 1682 if (berr) { 1683 TRANS_INODE(ufsvfsp, ip); 1684 rw_exit(&ip->i_contents); 1685 rw_exit(&ufsvfsp->vfs_dqrwlock); 1686 TRANS_END_CSYNC(ufsvfsp, err, issync, 1687 TOP_ALLOCSP, resv); 1688 err = allocsp_unlockfs(vp, &lf); 1689 goto exit; 1690 } 1691 1692 /* Update the blk counter only if new block was added */ 1693 if (allocblk) { 1694 /* Save undo information */ 1695 undo = kmem_alloc(sizeof (struct allocsp_undo), 1696 KM_SLEEP); 1697 undo->offset = i; 1698 undo->blk = allocblk; 1699 undo->next = ib_undo; 1700 ib_undo = undo; 1701 totblks++; 1702 1703 if (i >= ip->i_size) 1704 ip->i_size += fs->fs_bsize; 1705 } 1706 cnt++; 1707 1708 /* Being a good UFS citizen, let others get a share */ 1709 if (cnt == chunkblks) { 1710 /* 1711 * If there are waiters or the fs is hard locked, 1712 * error locked, or read-only error locked, 1713 * quit with EIO 1714 */ 1715 if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) || 1716 ULOCKFS_IS_ROELOCK(ulp)) { 1717 ip->i_cflags |= IFALLOCATE; 1718 TRANS_INODE(ufsvfsp, ip); 1719 rw_exit(&ip->i_contents); 1720 rw_exit(&ufsvfsp->vfs_dqrwlock); 1721 1722 TRANS_END_CSYNC(ufsvfsp, err, issync, 1723 TOP_ALLOCSP, resv); 1724 rw_exit(&ip->i_rwlock); 1725 (void) allocsp_unlockfs(vp, &lf); 1726 return (EIO); 1727 } 1728 1729 TRANS_INODE(ufsvfsp, ip); 1730 rw_exit(&ip->i_contents); 1731 rw_exit(&ufsvfsp->vfs_dqrwlock); 1732 1733 /* End the current transaction */ 1734 TRANS_END_CSYNC(ufsvfsp, err, issync, 1735 TOP_ALLOCSP, resv); 1736 1737 if (CV_HAS_WAITERS(&ulp->ul_cv)) { 1738 /* Release the write lock */ 1739 if (err = allocsp_unlockfs(vp, &lf)) 1740 goto exit; 1741 1742 /* Wake up others waiting to do operations */ 1743 mutex_enter(&ulp->ul_lock); 1744 cv_broadcast(&ulp->ul_cv); 1745 mutex_exit(&ulp->ul_lock); 1746 1747 /* Grab the write lock again */ 1748 if (err = allocsp_wlockfs(vp, &lf)) 1749 goto exit; 1750 } /* end of CV_HAS_WAITERS(&ulp->ul_cv) */ 1751 1752 /* Reserve more space in log for this file */ 1753 ufs_trans_trunc_resv(ip, 1754 ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz), 1755 &resv, &resid); 1756 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1757 1758 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1759 rw_enter(&ip->i_contents, RW_WRITER); 1760 1761 cnt = 0; /* reset cnt b/c of new transaction */ 1762 } 1763 } 1764 1765 if (!err && !berr) 1766 ip->i_cflags |= IFALLOCATE; 1767 1768 /* If the file has grown then correct the file size */ 1769 if (osz < (lp->l_start + lp->l_len)) 1770 ip->i_size = (lp->l_start + lp->l_len); 1771 1772 /* Release locks, end log transaction and unlock fs */ 1773 TRANS_INODE(ufsvfsp, ip); 1774 rw_exit(&ip->i_contents); 1775 rw_exit(&ufsvfsp->vfs_dqrwlock); 1776 1777 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1778 err = allocsp_unlockfs(vp, &lf); 1779 1780 /* 1781 * @ exit label, we should no longer be holding the fs write lock, and 1782 * all logging transactions should have been ended. We still hold 1783 * ip->i_rwlock. 1784 */ 1785 exit: 1786 /* 1787 * File has grown larger than 2GB. Set flag 1788 * in superblock to indicate this, if it 1789 * is not already set. 1790 */ 1791 if ((ip->i_size > MAXOFF32_T) && 1792 !(fs->fs_flags & FSLARGEFILES)) { 1793 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); 1794 mutex_enter(&ufsvfsp->vfs_lock); 1795 fs->fs_flags |= FSLARGEFILES; 1796 ufs_sbwrite(ufsvfsp); 1797 mutex_exit(&ufsvfsp->vfs_lock); 1798 } 1799 1800 /* 1801 * Since we couldn't allocate completely, we will undo the allocations. 1802 */ 1803 if (berr) { 1804 ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid); 1805 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1806 1807 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1808 rw_enter(&ip->i_contents, RW_WRITER); 1809 1810 /* Direct blocks */ 1811 for (i = 0; i < NDADDR; i++) { 1812 /* 1813 * Only free the block if they are not same, and 1814 * the old one isn't zero (the fragment was 1815 * re-allocated). 1816 */ 1817 if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) { 1818 free(ip, ip->i_db[i], fs->fs_bsize, 0); 1819 ip->i_db[i] = 0; 1820 } 1821 } 1822 1823 /* Undo the indirect blocks */ 1824 while (ib_undo != NULL) { 1825 undo = ib_undo; 1826 err = bmap_set_bn(vp, undo->offset, 0); 1827 if (err) 1828 cmn_err(CE_PANIC, "ufs_allocsp(): failed to " 1829 "undo allocation of block %ld", 1830 undo->offset); 1831 free(ip, undo->blk, fs->fs_bsize, I_IBLK); 1832 ib_undo = undo->next; 1833 kmem_free(undo, sizeof (struct allocsp_undo)); 1834 } 1835 1836 ip->i_size = osz; 1837 TRANS_INODE(ufsvfsp, ip); 1838 1839 rw_exit(&ip->i_contents); 1840 rw_exit(&ufsvfsp->vfs_dqrwlock); 1841 1842 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1843 1844 rw_exit(&ip->i_rwlock); 1845 return (berr); 1846 } 1847 1848 /* 1849 * Don't forget to free the undo chain :) 1850 */ 1851 while (ib_undo != NULL) { 1852 undo = ib_undo; 1853 ib_undo = undo->next; 1854 kmem_free(undo, sizeof (struct allocsp_undo)); 1855 } 1856 1857 rw_exit(&ip->i_rwlock); 1858 1859 out_allocsp: 1860 return (err); 1861 } 1862 1863 /* 1864 * Free storage space associated with the specified inode. The portion 1865 * to be freed is specified by lp->l_start and lp->l_len (already 1866 * normalized to a "whence" of 0). 1867 * 1868 * This is an experimental facility whose continued existence is not 1869 * guaranteed. Currently, we only support the special case 1870 * of l_len == 0, meaning free to end of file. 1871 * 1872 * Blocks are freed in reverse order. This FILO algorithm will tend to 1873 * maintain a contiguous free list much longer than FIFO. 1874 * See also ufs_itrunc() in ufs_inode.c. 1875 * 1876 * Bug: unused bytes in the last retained block are not cleared. 1877 * This may result in a "hole" in the file that does not read as zeroes. 1878 */ 1879 /* ARGSUSED */ 1880 int 1881 ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr) 1882 { 1883 int i; 1884 struct inode *ip = VTOI(vp); 1885 int error; 1886 1887 ASSERT(vp->v_type == VREG); 1888 ASSERT(lp->l_start >= 0); /* checked by convoff */ 1889 1890 if (lp->l_len != 0) 1891 return (EINVAL); 1892 1893 rw_enter(&ip->i_contents, RW_READER); 1894 if (ip->i_size == (u_offset_t)lp->l_start) { 1895 rw_exit(&ip->i_contents); 1896 return (0); 1897 } 1898 1899 /* 1900 * Check if there is any active mandatory lock on the 1901 * range that will be truncated/expanded. 1902 */ 1903 if (MANDLOCK(vp, ip->i_mode)) { 1904 offset_t save_start; 1905 1906 save_start = lp->l_start; 1907 1908 if (ip->i_size < lp->l_start) { 1909 /* 1910 * "Truncate up" case: need to make sure there 1911 * is no lock beyond current end-of-file. To 1912 * do so, we need to set l_start to the size 1913 * of the file temporarily. 1914 */ 1915 lp->l_start = ip->i_size; 1916 } 1917 lp->l_type = F_WRLCK; 1918 lp->l_sysid = 0; 1919 lp->l_pid = ttoproc(curthread)->p_pid; 1920 i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK; 1921 rw_exit(&ip->i_contents); 1922 if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 || 1923 lp->l_type != F_UNLCK) { 1924 return (i ? i : EAGAIN); 1925 } 1926 rw_enter(&ip->i_contents, RW_READER); 1927 1928 lp->l_start = save_start; 1929 } 1930 1931 /* 1932 * Make sure a write isn't in progress (allocating blocks) 1933 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't 1934 * truncate while it was allocating blocks). 1935 * Grab the locks in the right order. 1936 */ 1937 rw_exit(&ip->i_contents); 1938 rw_enter(&ip->i_rwlock, RW_WRITER); 1939 error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr); 1940 rw_exit(&ip->i_rwlock); 1941 return (error); 1942 } 1943 1944 /* 1945 * Find a cg with as close to nb contiguous bytes as possible 1946 * THIS MAY TAKE MANY DISK READS! 1947 * 1948 * Implemented in an attempt to allocate contiguous blocks for 1949 * writing the ufs log file to, minimizing future disk head seeking 1950 */ 1951 daddr_t 1952 contigpref(ufsvfs_t *ufsvfsp, size_t nb, size_t minb) 1953 { 1954 struct fs *fs = ufsvfsp->vfs_fs; 1955 daddr_t nblk = lblkno(fs, blkroundup(fs, nb)); 1956 daddr_t minblk = lblkno(fs, blkroundup(fs, minb)); 1957 daddr_t savebno, curbno, cgbno; 1958 int cg, cgblks, savecg, savenblk, curnblk, startcg; 1959 uchar_t *blksfree; 1960 buf_t *bp; 1961 struct cg *cgp; 1962 1963 savenblk = 0; 1964 savecg = 0; 1965 savebno = 0; 1966 1967 if ((startcg = findlogstartcg(fs, nblk, minblk)) == -1) 1968 cg = 0; /* Nothing suitable found */ 1969 else 1970 cg = startcg; 1971 1972 for (; cg < fs->fs_ncg; ++cg) { 1973 /* 1974 * find the largest contiguous range in this cg 1975 */ 1976 bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, 1977 (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 1978 (int)fs->fs_cgsize); 1979 cgp = bp->b_un.b_cg; 1980 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 1981 brelse(bp); 1982 continue; 1983 } 1984 blksfree = cg_blksfree(cgp); /* free array */ 1985 cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */ 1986 cgbno = 0; 1987 while (cgbno < cgblks && savenblk < nblk) { 1988 /* find a free block */ 1989 for (; cgbno < cgblks; ++cgbno) { 1990 if (isblock(fs, blksfree, cgbno)) { 1991 if (startcg != -1) { 1992 brelse(bp); 1993 savecg = startcg; 1994 savebno = cgbno; 1995 goto done; 1996 } else 1997 break; 1998 } 1999 } 2000 curbno = cgbno; 2001 /* count the number of free blocks */ 2002 for (curnblk = 0; cgbno < cgblks; ++cgbno) { 2003 if (!isblock(fs, blksfree, cgbno)) 2004 break; 2005 if (++curnblk >= nblk) 2006 break; 2007 } 2008 if (curnblk > savenblk) { 2009 savecg = cg; 2010 savenblk = curnblk; 2011 savebno = curbno; 2012 } 2013 } 2014 brelse(bp); 2015 if (savenblk >= nblk) 2016 break; 2017 } 2018 2019 done: 2020 2021 /* convert block offset in cg to frag offset in cg */ 2022 savebno = blkstofrags(fs, savebno); 2023 2024 /* convert frag offset in cg to frag offset in fs */ 2025 savebno += (savecg * fs->fs_fpg); 2026 2027 return (savebno); 2028 } 2029 2030 /* 2031 * The object of this routine is to find a start point for the UFS log. 2032 * Ideally the space should be allocated from the smallest possible number 2033 * of contiguous cylinder groups. This is found by using a sliding window 2034 * technique. The smallest window of contiguous cylinder groups, which is 2035 * still able to accommodate the target, is found by moving the window 2036 * through the cylinder groups in a single pass. The end of the window is 2037 * advanced until the space is accommodated, then the start is advanced until 2038 * it no longer fits, the end is then advanced again and so on until the 2039 * final cylinder group is reached. The first suitable instance is recorded 2040 * and its starting cg number is returned. 2041 * 2042 * If we are not able to find a minimum amount of space, represented by 2043 * minblk, or to do so uses more than the available extents, then return -1. 2044 */ 2045 2046 int 2047 findlogstartcg(struct fs *fs, daddr_t requested, daddr_t minblk) 2048 { 2049 int ncgs; /* number of cylinder groups */ 2050 daddr_t target; /* amount of space sought */ 2051 int cwidth, ctotal; /* current window width and total */ 2052 int bwidth, btotal; /* best window width and total so far */ 2053 int s; /* index of the first element in the current window */ 2054 int e; /* index of the first element + the width */ 2055 /* (i.e. 1 + index of last element) */ 2056 int bs; /* index of the first element in the best window so far */ 2057 int header, max_extents; 2058 2059 target = requested; 2060 ncgs = fs->fs_ncg; 2061 2062 header = sizeof (extent_block_t) - sizeof (extent_t); 2063 max_extents = ((fs->fs_bsize)-header) / sizeof (extent_t); 2064 cwidth = ctotal = 0; 2065 btotal = -1; 2066 bwidth = ncgs; 2067 s = e = 0; 2068 while (e < ncgs) { 2069 /* Advance the end of the window until it accommodates the target. */ 2070 while (ctotal < target && e < ncgs) { 2071 ctotal += fs->fs_cs(fs, e).cs_nbfree; 2072 e++; 2073 } 2074 2075 /* 2076 * Advance the start of the window until it no longer 2077 * accommodates the target. 2078 */ 2079 while (ctotal >= target && s < e) { 2080 /* See if this is the smallest window so far. */ 2081 cwidth = e - s; 2082 if (cwidth <= bwidth) { 2083 if (cwidth == bwidth && ctotal <= btotal) 2084 goto more; 2085 bwidth = cwidth; 2086 btotal = ctotal; 2087 bs = s; 2088 } 2089 more: 2090 ctotal -= fs->fs_cs(fs, s).cs_nbfree; 2091 s++; 2092 } 2093 } 2094 2095 /* 2096 * If we cannot allocate the minimum required or we use too many 2097 * extents to do so, return -1. 2098 */ 2099 if (btotal < minblk || bwidth > max_extents) 2100 bs = -1; 2101 2102 return (bs); 2103 } 2104