1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #include <sys/condvar_impl.h> 40 #include <sys/types.h> 41 #include <sys/t_lock.h> 42 #include <sys/debug.h> 43 #include <sys/param.h> 44 #include <sys/systm.h> 45 #include <sys/signal.h> 46 #include <sys/cred.h> 47 #include <sys/proc.h> 48 #include <sys/disp.h> 49 #include <sys/user.h> 50 #include <sys/buf.h> 51 #include <sys/vfs.h> 52 #include <sys/vnode.h> 53 #include <sys/acl.h> 54 #include <sys/fs/ufs_fs.h> 55 #include <sys/fs/ufs_inode.h> 56 #include <sys/fs/ufs_acl.h> 57 #include <sys/fs/ufs_bio.h> 58 #include <sys/fs/ufs_quota.h> 59 #include <sys/kmem.h> 60 #include <sys/fs/ufs_trans.h> 61 #include <sys/fs/ufs_panic.h> 62 #include <sys/errno.h> 63 #include <sys/time.h> 64 #include <sys/sysmacros.h> 65 #include <sys/file.h> 66 #include <sys/fcntl.h> 67 #include <sys/flock.h> 68 #include <fs/fs_subr.h> 69 #include <sys/cmn_err.h> 70 #include <sys/policy.h> 71 #include <sys/fs/ufs_log.h> 72 73 static ino_t hashalloc(); 74 static daddr_t fragextend(); 75 static daddr_t alloccg(); 76 static daddr_t alloccgblk(); 77 static ino_t ialloccg(); 78 static daddr_t mapsearch(); 79 static int findlogstartcg(); 80 81 extern int inside[], around[]; 82 extern uchar_t *fragtbl[]; 83 void delay(); 84 85 /* 86 * Allocate a block in the file system. 87 * 88 * The size of the requested block is given, which must be some 89 * multiple of fs_fsize and <= fs_bsize. 90 * A preference may be optionally specified. If a preference is given 91 * the following hierarchy is used to allocate a block: 92 * 1) allocate the requested block. 93 * 2) allocate a rotationally optimal block in the same cylinder. 94 * 3) allocate a block in the same cylinder group. 95 * 4) quadratically rehash into other cylinder groups, until an 96 * available block is located. 97 * If no block preference is given the following hierarchy is used 98 * to allocate a block: 99 * 1) allocate a block in the cylinder group that contains the 100 * inode for the file. 101 * 2) quadratically rehash into other cylinder groups, until an 102 * available block is located. 
103 */ 104 int 105 alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr) 106 { 107 struct fs *fs; 108 struct ufsvfs *ufsvfsp; 109 daddr_t bno; 110 int cg; 111 int err; 112 char *errmsg = NULL; 113 size_t len; 114 115 ufsvfsp = ip->i_ufsvfs; 116 fs = ufsvfsp->vfs_fs; 117 if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) { 118 err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx," 119 " bsize = %d, size = %d, fs = %s\n", 120 ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); 121 return (err); 122 } 123 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 124 goto nospace; 125 if (freespace(fs, ufsvfsp) <= 0 && 126 secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0) 127 goto nospace; 128 err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len); 129 /* Note that may not have err, but may have errmsg */ 130 if (errmsg != NULL) { 131 uprintf(errmsg); 132 kmem_free(errmsg, len); 133 errmsg = NULL; 134 } 135 if (err) 136 return (err); 137 if (bpref >= fs->fs_size) 138 bpref = 0; 139 if (bpref == 0) 140 cg = (int)itog(fs, ip->i_number); 141 else 142 cg = dtog(fs, bpref); 143 144 bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size, 145 (ulong_t (*)())alloccg); 146 if (bno > 0) { 147 *bnp = bno; 148 return (0); 149 } 150 151 /* 152 * hashalloc() failed because some other thread grabbed 153 * the last block so unwind the quota operation. We can 154 * ignore the return because subtractions don't fail and 155 * size is guaranteed to be >= zero by our caller. 156 */ 157 (void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL, 158 (size_t *)NULL); 159 160 nospace: 161 mutex_enter(&ufsvfsp->vfs_lock); 162 if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) && 163 (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) { 164 ufsvfsp->vfs_lastwhinetime = lbolt; 165 cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt); 166 } 167 mutex_exit(&ufsvfsp->vfs_lock); 168 return (ENOSPC); 169 } 170 171 /* 172 * Reallocate a fragment to a bigger size 173 * 174 * The number and size of the old block is given, and a preference 175 * and new size is also specified. The allocator attempts to extend 176 * the original block. Failing that, the regular block allocator is 177 * invoked to get an appropriate block. 
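 *
 * (Illustrative example, not part of the original comment, using the common
 * 8 KB block / 1 KB fragment geometry: growing a file's 2 KB tail fragment
 * to 3 KB first asks fragextend() for the adjacent free 1 KB frag; only if
 * that frag is unavailable does hashalloc()/alloccg() supply a new run of
 * frags elsewhere, after which the data must be rewritten at the new
 * location.)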
178 */ 179 int 180 realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize, 181 int nsize, daddr_t *bnp, cred_t *cr) 182 { 183 daddr_t bno; 184 struct fs *fs; 185 struct ufsvfs *ufsvfsp; 186 int cg, request; 187 int err; 188 char *errmsg = NULL; 189 size_t len; 190 191 ufsvfsp = ip->i_ufsvfs; 192 fs = ufsvfsp->vfs_fs; 193 if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || 194 (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { 195 err = ufs_fault(ITOV(ip), 196 "realloccg: bad size, dev=0x%lx, bsize=%d, " 197 "osize=%d, nsize=%d, fs=%s\n", 198 ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt); 199 return (err); 200 } 201 if (freespace(fs, ufsvfsp) <= 0 && 202 secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0) 203 goto nospace; 204 if (bprev == 0) { 205 err = ufs_fault(ITOV(ip), 206 "realloccg: bad bprev, dev = 0x%lx, bsize = %d," 207 " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev, 208 fs->fs_fsmnt); 209 return (err); 210 } 211 err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len); 212 /* Note that may not have err, but may have errmsg */ 213 if (errmsg != NULL) { 214 uprintf(errmsg); 215 kmem_free(errmsg, len); 216 errmsg = NULL; 217 } 218 if (err) 219 return (err); 220 cg = dtog(fs, bprev); 221 bno = fragextend(ip, cg, (long)bprev, osize, nsize); 222 if (bno != 0) { 223 *bnp = bno; 224 return (0); 225 } 226 if (bpref >= fs->fs_size) 227 bpref = 0; 228 229 /* 230 * When optimizing for time we allocate a full block and 231 * then only use the upper portion for this request. When 232 * this file grows again it will grow into the unused portion 233 * of the block (See fragextend() above). This saves time 234 * because an extra disk write would be needed if the frags 235 * following the current allocation were not free. The extra 236 * disk write is needed to move the data from its current 237 * location into the newly allocated position. 238 * 239 * When optimizing for space we allocate a run of frags 240 * that is just the right size for this request. 241 */ 242 request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize; 243 bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request, 244 (ulong_t (*)())alloccg); 245 if (bno > 0) { 246 *bnp = bno; 247 if (nsize < request) 248 (void) free(ip, bno + numfrags(fs, nsize), 249 (off_t)(request - nsize), I_NOCANCEL); 250 return (0); 251 } 252 253 /* 254 * hashalloc() failed because some other thread grabbed 255 * the last block so unwind the quota operation. We can 256 * ignore the return because subtractions don't fail, and 257 * our caller guarantees nsize >= osize. 258 */ 259 (void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL, 260 (size_t *)NULL); 261 262 nospace: 263 mutex_enter(&ufsvfsp->vfs_lock); 264 if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) && 265 (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) { 266 ufsvfsp->vfs_lastwhinetime = lbolt; 267 cmn_err(CE_NOTE, 268 "realloccg %s: file system full", fs->fs_fsmnt); 269 } 270 mutex_exit(&ufsvfsp->vfs_lock); 271 return (ENOSPC); 272 } 273 274 /* 275 * Allocate an inode in the file system. 276 * 277 * A preference may be optionally specified. If a preference is given 278 * the following hierarchy is used to allocate an inode: 279 * 1) allocate the requested inode. 280 * 2) allocate an inode in the same cylinder group. 281 * 3) quadratically rehash into other cylinder groups, until an 282 * available inode is located. 
283 * If no inode preference is given the following hierarchy is used 284 * to allocate an inode: 285 * 1) allocate an inode in cylinder group 0. 286 * 2) quadratically rehash into other cylinder groups, until an 287 * available inode is located. 288 */ 289 int 290 ufs_ialloc(struct inode *pip, 291 ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr) 292 { 293 struct inode *ip; 294 struct fs *fs; 295 int cg; 296 ino_t ino; 297 int err; 298 int nifree; 299 struct ufsvfs *ufsvfsp = pip->i_ufsvfs; 300 char *errmsg = NULL; 301 size_t len; 302 303 ASSERT(RW_WRITE_HELD(&pip->i_rwlock)); 304 fs = pip->i_fs; 305 loop: 306 nifree = fs->fs_cstotal.cs_nifree; 307 308 if (nifree == 0) 309 goto noinodes; 310 /* 311 * Shadow inodes don't count against a user's inode allocation. 312 * They are an implementation method and not a resource. 313 */ 314 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 315 err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data, 316 /* change */ 1, (struct inode *)NULL, crgetuid(cr), 0, 317 cr, &errmsg, &len); 318 /* 319 * As we haven't acquired any locks yet, dump the message 320 * now. 321 */ 322 if (errmsg != NULL) { 323 uprintf(errmsg); 324 kmem_free(errmsg, len); 325 errmsg = NULL; 326 } 327 if (err) 328 return (err); 329 } 330 331 if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg)) 332 ipref = 0; 333 cg = (int)itog(fs, ipref); 334 ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode, 335 (ulong_t (*)())ialloccg); 336 if (ino == 0) { 337 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 338 /* 339 * We can safely ignore the return from chkiq() 340 * because deallocations can only fail if we 341 * can't get the user's quota info record off 342 * the disk due to an I/O error. In that case, 343 * the quota subsystem is already messed up. 344 */ 345 (void) chkiq(ufsvfsp, /* change */ -1, 346 (struct inode *)NULL, crgetuid(cr), 0, cr, 347 (char **)NULL, (size_t *)NULL); 348 } 349 goto noinodes; 350 } 351 err = ufs_iget(pip->i_vfs, ino, ipp, cr); 352 if (err) { 353 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 354 /* 355 * See above comment about why it is safe to ignore an 356 * error return here. 357 */ 358 (void) chkiq(ufsvfsp, /* change */ -1, 359 (struct inode *)NULL, crgetuid(cr), 0, cr, 360 (char **)NULL, (size_t *)NULL); 361 } 362 ufs_ifree(pip, ino, 0); 363 return (err); 364 } 365 ip = *ipp; 366 ASSERT(!ip->i_ufs_acl); 367 ASSERT(!ip->i_dquot); 368 rw_enter(&ip->i_contents, RW_WRITER); 369 370 /* 371 * Check if we really got a free inode, if not then complain 372 * and mark the inode ISTALE so that it will be freed by the 373 * ufs idle thread eventually and will not be sent to ufs_delete(). 374 */ 375 if (ip->i_mode || (ip->i_nlink > 0)) { 376 ip->i_flag |= ISTALE; 377 rw_exit(&ip->i_contents); 378 VN_RELE(ITOV(ip)); 379 cmn_err(CE_WARN, 380 "%s: unexpected allocated inode %d, run fsck(1M)%s", 381 fs->fs_fsmnt, (int)ino, 382 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); 383 goto loop; 384 } 385 386 /* 387 * Check the inode has no size or data blocks. 388 * This could have happened if the truncation failed when 389 * deleting the inode. It used to be possible for this to occur 390 * if a block allocation failed when iteratively truncating a 391 * large file using logging and with a full file system. 392 * This was fixed with bug fix 4348738. However, truncation may 393 * still fail on an IO error. So in all cases for safety and 394 * security we clear out the size; the blocks allocated; and 395 * pointers to the blocks. 
This will ultimately cause a fsck
	 * error of unaccounted-for blocks, but it's a fairly benign error,
	 * and possibly the correct thing to do anyway as accessing those
	 * blocks again may lead to more IO errors.
	 */
	if (ip->i_size || ip->i_blocks) {
		int i;

		if (ip->i_size) {
			cmn_err(CE_WARN,
			    "%s: free inode %d had size 0x%llx, run fsck(1M)%s",
			    fs->fs_fsmnt, (int)ino, ip->i_size,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
		}
		/*
		 * Clear any garbage left behind.
		 */
		ip->i_size = (u_offset_t)0;
		ip->i_blocks = 0;
		for (i = 0; i < NDADDR; i++)
			ip->i_db[i] = 0;
		for (i = 0; i < NIADDR; i++)
			ip->i_ib[i] = 0;
	}

	/*
	 * Initialize the link count
	 */
	ip->i_nlink = 0;

	/*
	 * Clear the old flags
	 */
	ip->i_flag &= IREF;

	/*
	 * Access times are not really defined if the fs is mounted
	 * with 'noatime'. But it can cause nfs clients to fail
	 * open() if the atime is not a legal value. Set a legal value
	 * here when the inode is allocated.
	 */
	if (ufsvfsp->vfs_noatime) {
		mutex_enter(&ufs_iuniqtime_lock);
		ip->i_atime = iuniqtime;
		mutex_exit(&ufs_iuniqtime_lock);
	}
	rw_exit(&ip->i_contents);
	return (0);
noinodes:
	if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET))
		cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt);
	return (ENOSPC);
}

/*
 * Find a cylinder group to place a directory.
 * Returns an inumber within the selected cylinder group.
 * Note, the vfs_lock is not needed as we don't require exact cg summary info.
 *
 * If the switch ufs_close_dirs is set, then the policy is to use
 * the current cg if it has more than 25% free inodes and more
 * than 25% free blocks. Otherwise the cgs are searched from
 * the beginning and the first cg with the same criteria is
 * used. If no such cg is found, we revert to the old algorithm.
 * This tends to cluster files at the beginning of the disk
 * until the disk gets full.
 *
 * Otherwise if ufs_close_dirs is not set then the original policy is
 * used which is to select from among those cylinder groups with
 * above the average number of free inodes, the one with the smallest
 * number of directories.
 */

int ufs_close_dirs = 1;	/* allocate directories as close as possible */

ino_t
dirpref(inode_t *dp)
{
	int cg, minndir, mincg, avgifree, mininode, minbpg, ifree;
	struct fs *fs = dp->i_fs;

	cg = itog(fs, dp->i_number);
	mininode = fs->fs_ipg >> 2;
	minbpg = fs->fs_maxbpg >> 2;
	if (ufs_close_dirs &&
	    (fs->fs_cs(fs, cg).cs_nifree > mininode) &&
	    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
		return (dp->i_number);
	}

	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
	minndir = fs->fs_ipg;
	mincg = 0;
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		ifree = fs->fs_cs(fs, cg).cs_nifree;
		if (ufs_close_dirs &&
		    (ifree > mininode) &&
		    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
			return ((ino_t)(fs->fs_ipg * cg));
		}
		if ((fs->fs_cs(fs, cg).cs_ndir < minndir) &&
		    (ifree >= avgifree)) {
			mincg = cg;
			minndir = fs->fs_cs(fs, cg).cs_ndir;
		}
	}
	return ((ino_t)(fs->fs_ipg * mincg));
}

/*
 * Select the desired position for the next block in a file. The file is
 * logically divided into sections. The first section is composed of the
 * direct blocks. Each additional section contains fs_maxbpg blocks.
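 * (Illustrative numbers, assumed rather than taken from this source: with
 * 8 KB blocks, NDADDR = 12 direct blocks and fs_maxbpg = 2048, the first
 * section is the 96 KB covered by the direct blocks and every later
 * section spans 2048 * 8 KB = 16 MB of the file.)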
508 * 509 * If no blocks have been allocated in the first section, the policy is to 510 * request a block in the same cylinder group as the inode that describes 511 * the file. If no blocks have been allocated in any other section, the 512 * policy is to place the section in a cylinder group with a greater than 513 * average number of free blocks. An appropriate cylinder group is found 514 * by using a rotor that sweeps the cylinder groups. When a new group of 515 * blocks is needed, the sweep begins in the cylinder group following the 516 * cylinder group from which the previous allocation was made. The sweep 517 * continues until a cylinder group with greater than the average number 518 * of free blocks is found. If the allocation is for the first block in an 519 * indirect block, the information on the previous allocation is unavailable; 520 * here a best guess is made based upon the logical block number being 521 * allocated. 522 * 523 * If a section is already partially allocated, the policy is to 524 * contiguously allocate fs_maxcontig blocks. The end of one of these 525 * contiguous blocks and the beginning of the next is physically separated 526 * so that the disk head will be in transit between them for at least 527 * fs_rotdelay milliseconds. This is to allow time for the processor to 528 * schedule another I/O transfer. 529 */ 530 daddr_t 531 blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap) 532 { 533 struct fs *fs; 534 struct ufsvfs *ufsvfsp; 535 int cg; 536 int avgbfree, startcg; 537 daddr_t nextblk; 538 539 ufsvfsp = ip->i_ufsvfs; 540 fs = ip->i_fs; 541 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 542 if (lbn < NDADDR) { 543 cg = itog(fs, ip->i_number); 544 return (fs->fs_fpg * cg + fs->fs_frag); 545 } 546 /* 547 * Find a cylinder with greater than average 548 * number of unused data blocks. 549 */ 550 if (indx == 0 || bap[indx - 1] == 0) 551 startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg; 552 else 553 startcg = dtog(fs, bap[indx - 1]) + 1; 554 startcg %= fs->fs_ncg; 555 556 mutex_enter(&ufsvfsp->vfs_lock); 557 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 558 /* 559 * used for computing log space for writes/truncs 560 */ 561 ufsvfsp->vfs_avgbfree = avgbfree; 562 for (cg = startcg; cg < fs->fs_ncg; cg++) 563 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 564 fs->fs_cgrotor = cg; 565 mutex_exit(&ufsvfsp->vfs_lock); 566 return (fs->fs_fpg * cg + fs->fs_frag); 567 } 568 for (cg = 0; cg <= startcg; cg++) 569 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 570 fs->fs_cgrotor = cg; 571 mutex_exit(&ufsvfsp->vfs_lock); 572 return (fs->fs_fpg * cg + fs->fs_frag); 573 } 574 mutex_exit(&ufsvfsp->vfs_lock); 575 return (NULL); 576 } 577 /* 578 * One or more previous blocks have been laid out. If less 579 * than fs_maxcontig previous blocks are contiguous, the 580 * next block is requested contiguously, otherwise it is 581 * requested rotationally delayed by fs_rotdelay milliseconds. 
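 *
 * (Worked example with assumed drive geometry: for fs_rotdelay = 4 ms,
 * fs_rps = 60, fs_nsect = 32 and NSPF(fs) = 2, the conversion below gives
 * 4 * 60 * 32 / (2 * 1000) = 3 frags after integer division, which
 * roundup(..., fs_frag) then raises to one full block.)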
582 */ 583 584 nextblk = bap[indx - 1]; 585 /* 586 * Provision for fallocate to return positive 587 * blk preference based on last allocation 588 */ 589 if (nextblk < 0 && nextblk != UFS_HOLE) { 590 nextblk = (-bap[indx - 1]) + fs->fs_frag; 591 } else { 592 nextblk = bap[indx - 1] + fs->fs_frag; 593 } 594 595 if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] + 596 blkstofrags(fs, fs->fs_maxcontig) != nextblk) { 597 return (nextblk); 598 } 599 if (fs->fs_rotdelay != 0) 600 /* 601 * Here we convert ms of delay to frags as: 602 * (frags) = (ms) * (rev/sec) * (sect/rev) / 603 * ((sect/frag) * (ms/sec)) 604 * then round up to the next block. 605 */ 606 nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect / 607 (NSPF(fs) * 1000), fs->fs_frag); 608 return (nextblk); 609 } 610 611 /* 612 * Free a block or fragment. 613 * 614 * The specified block or fragment is placed back in the 615 * free map. If a fragment is deallocated, a possible 616 * block reassembly is checked. 617 */ 618 void 619 free(struct inode *ip, daddr_t bno, off_t size, int flags) 620 { 621 struct fs *fs = ip->i_fs; 622 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 623 struct ufs_q *delq = &ufsvfsp->vfs_delete; 624 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 625 struct cg *cgp; 626 struct buf *bp; 627 int cg, bmap, bbase; 628 int i; 629 uchar_t *blksfree; 630 int *blktot; 631 short *blks; 632 daddr_t blkno, cylno, rpos; 633 634 /* 635 * fallocate'd files will have negative block address. 636 * So negate it again to get original block address. 637 */ 638 if (bno < 0 && (bno % fs->fs_frag == 0) && bno != UFS_HOLE) { 639 bno = -bno; 640 } 641 642 if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) { 643 (void) ufs_fault(ITOV(ip), 644 "free: bad size, dev = 0x%lx, bsize = %d, size = %d, " 645 "fs = %s\n", ip->i_dev, fs->fs_bsize, 646 (int)size, fs->fs_fsmnt); 647 return; 648 } 649 cg = dtog(fs, bno); 650 ASSERT(!ufs_badblock(ip, bno)); 651 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 652 (int)fs->fs_cgsize); 653 654 cgp = bp->b_un.b_cg; 655 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 656 brelse(bp); 657 return; 658 } 659 660 if (!(flags & I_NOCANCEL)) 661 TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags); 662 if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) { 663 TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size); 664 } 665 blksfree = cg_blksfree(cgp); 666 blktot = cg_blktot(cgp); 667 mutex_enter(&ufsvfsp->vfs_lock); 668 cgp->cg_time = gethrestime_sec(); 669 bno = dtogd(fs, bno); 670 if (size == fs->fs_bsize) { 671 blkno = fragstoblks(fs, bno); 672 cylno = cbtocylno(fs, bno); 673 rpos = cbtorpos(ufsvfsp, bno); 674 blks = cg_blks(ufsvfsp, cgp, cylno); 675 if (!isclrblock(fs, blksfree, blkno)) { 676 mutex_exit(&ufsvfsp->vfs_lock); 677 brelse(bp); 678 (void) ufs_fault(ITOV(ip), "free: freeing free block, " 679 "dev:0x%lx, block:%ld, ino:%lu, fs:%s", 680 ip->i_dev, bno, ip->i_number, fs->fs_fsmnt); 681 return; 682 } 683 setblock(fs, blksfree, blkno); 684 blks[rpos]++; 685 blktot[cylno]++; 686 cgp->cg_cs.cs_nbfree++; /* Log below */ 687 fs->fs_cstotal.cs_nbfree++; 688 fs->fs_cs(fs, cg).cs_nbfree++; 689 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) { 690 mutex_enter(&delq->uq_mutex); 691 delq_info->delq_unreclaimed_blocks -= 692 btodb(fs->fs_bsize); 693 mutex_exit(&delq->uq_mutex); 694 } 695 } else { 696 bbase = bno - fragnum(fs, bno); 697 /* 698 * Decrement the counts associated with the old frags 699 */ 700 bmap = blkmap(fs, blksfree, bbase); 701 
fragacct(fs, bmap, cgp->cg_frsum, -1); 702 /* 703 * Deallocate the fragment 704 */ 705 for (i = 0; i < numfrags(fs, size); i++) { 706 if (isset(blksfree, bno + i)) { 707 brelse(bp); 708 mutex_exit(&ufsvfsp->vfs_lock); 709 (void) ufs_fault(ITOV(ip), 710 "free: freeing free frag, " 711 "dev:0x%lx, blk:%ld, cg:%d, " 712 "ino:%lu, fs:%s", 713 ip->i_dev, 714 bno + i, 715 cgp->cg_cgx, 716 ip->i_number, 717 fs->fs_fsmnt); 718 return; 719 } 720 setbit(blksfree, bno + i); 721 } 722 cgp->cg_cs.cs_nffree += i; 723 fs->fs_cstotal.cs_nffree += i; 724 fs->fs_cs(fs, cg).cs_nffree += i; 725 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) { 726 mutex_enter(&delq->uq_mutex); 727 delq_info->delq_unreclaimed_blocks -= 728 btodb(i * fs->fs_fsize); 729 mutex_exit(&delq->uq_mutex); 730 } 731 /* 732 * Add back in counts associated with the new frags 733 */ 734 bmap = blkmap(fs, blksfree, bbase); 735 fragacct(fs, bmap, cgp->cg_frsum, 1); 736 /* 737 * If a complete block has been reassembled, account for it 738 */ 739 blkno = fragstoblks(fs, bbase); 740 if (isblock(fs, blksfree, blkno)) { 741 cylno = cbtocylno(fs, bbase); 742 rpos = cbtorpos(ufsvfsp, bbase); 743 blks = cg_blks(ufsvfsp, cgp, cylno); 744 blks[rpos]++; 745 blktot[cylno]++; 746 cgp->cg_cs.cs_nffree -= fs->fs_frag; 747 fs->fs_cstotal.cs_nffree -= fs->fs_frag; 748 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; 749 cgp->cg_cs.cs_nbfree++; 750 fs->fs_cstotal.cs_nbfree++; 751 fs->fs_cs(fs, cg).cs_nbfree++; 752 } 753 } 754 fs->fs_fmod = 1; 755 ufs_notclean(ufsvfsp); 756 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 757 TRANS_SI(ufsvfsp, fs, cg); 758 bdrwrite(bp); 759 } 760 761 /* 762 * Free an inode. 763 * 764 * The specified inode is placed back in the free map. 765 */ 766 void 767 ufs_ifree(struct inode *ip, ino_t ino, mode_t mode) 768 { 769 struct fs *fs = ip->i_fs; 770 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 771 struct cg *cgp; 772 struct buf *bp; 773 unsigned int inot; 774 int cg; 775 char *iused; 776 777 if (ip->i_number == ino && ip->i_mode != 0) { 778 (void) ufs_fault(ITOV(ip), 779 "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, " 780 "fs = %s\n", 781 ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt); 782 return; 783 } 784 if (ino >= fs->fs_ipg * fs->fs_ncg) { 785 (void) ufs_fault(ITOV(ip), 786 "ifree: range, dev = 0x%x, ino = %d, fs = %s\n", 787 (int)ip->i_dev, (int)ino, fs->fs_fsmnt); 788 return; 789 } 790 cg = (int)itog(fs, ino); 791 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 792 (int)fs->fs_cgsize); 793 794 cgp = bp->b_un.b_cg; 795 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 796 brelse(bp); 797 return; 798 } 799 mutex_enter(&ufsvfsp->vfs_lock); 800 cgp->cg_time = gethrestime_sec(); 801 iused = cg_inosused(cgp); 802 inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg); 803 if (isclr(iused, inot)) { 804 mutex_exit(&ufsvfsp->vfs_lock); 805 brelse(bp); 806 (void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, " 807 "mode: (imode) %o, (omode) %o, ino:%d, " 808 "fs:%s", 809 ip->i_mode, mode, (int)ino, fs->fs_fsmnt); 810 return; 811 } 812 clrbit(iused, inot); 813 814 if (inot < (ulong_t)cgp->cg_irotor) 815 cgp->cg_irotor = inot; 816 cgp->cg_cs.cs_nifree++; 817 fs->fs_cstotal.cs_nifree++; 818 fs->fs_cs(fs, cg).cs_nifree++; 819 if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) { 820 cgp->cg_cs.cs_ndir--; 821 fs->fs_cstotal.cs_ndir--; 822 fs->fs_cs(fs, cg).cs_ndir--; 823 } 824 fs->fs_fmod = 1; 825 ufs_notclean(ufsvfsp); 826 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 827 TRANS_SI(ufsvfsp, fs, 
cg); 828 bdrwrite(bp); 829 } 830 831 /* 832 * Implement the cylinder overflow algorithm. 833 * 834 * The policy implemented by this algorithm is: 835 * 1) allocate the block in its requested cylinder group. 836 * 2) quadratically rehash on the cylinder group number. 837 * 3) brute force search for a free block. 838 * The size parameter means size for data blocks, mode for inodes. 839 */ 840 static ino_t 841 hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)()) 842 { 843 struct fs *fs; 844 int i; 845 long result; 846 int icg = cg; 847 848 fs = ip->i_fs; 849 /* 850 * 1: preferred cylinder group 851 */ 852 result = (*allocator)(ip, cg, pref, size); 853 if (result) 854 return (result); 855 /* 856 * 2: quadratic rehash 857 */ 858 for (i = 1; i < fs->fs_ncg; i *= 2) { 859 cg += i; 860 if (cg >= fs->fs_ncg) 861 cg -= fs->fs_ncg; 862 result = (*allocator)(ip, cg, 0, size); 863 if (result) 864 return (result); 865 } 866 /* 867 * 3: brute force search 868 * Note that we start at i == 2, since 0 was checked initially, 869 * and 1 is always checked in the quadratic rehash. 870 */ 871 cg = (icg + 2) % fs->fs_ncg; 872 for (i = 2; i < fs->fs_ncg; i++) { 873 result = (*allocator)(ip, cg, 0, size); 874 if (result) 875 return (result); 876 cg++; 877 if (cg == fs->fs_ncg) 878 cg = 0; 879 } 880 return (NULL); 881 } 882 883 /* 884 * Determine whether a fragment can be extended. 885 * 886 * Check to see if the necessary fragments are available, and 887 * if they are, allocate them. 888 */ 889 static daddr_t 890 fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize) 891 { 892 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 893 struct fs *fs = ip->i_fs; 894 struct buf *bp; 895 struct cg *cgp; 896 uchar_t *blksfree; 897 long bno; 898 int frags, bbase; 899 int i, j; 900 901 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) 902 return (NULL); 903 frags = numfrags(fs, nsize); 904 bbase = (int)fragnum(fs, bprev); 905 if (bbase > fragnum(fs, (bprev + frags - 1))) { 906 /* cannot extend across a block boundary */ 907 return (NULL); 908 } 909 910 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 911 (int)fs->fs_cgsize); 912 cgp = bp->b_un.b_cg; 913 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 914 brelse(bp); 915 return (NULL); 916 } 917 918 blksfree = cg_blksfree(cgp); 919 mutex_enter(&ufsvfsp->vfs_lock); 920 bno = dtogd(fs, bprev); 921 for (i = numfrags(fs, osize); i < frags; i++) { 922 if (isclr(blksfree, bno + i)) { 923 mutex_exit(&ufsvfsp->vfs_lock); 924 brelse(bp); 925 return (NULL); 926 } 927 if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)), 928 fs->fs_fsize))) { 929 mutex_exit(&ufsvfsp->vfs_lock); 930 brelse(bp); 931 return (NULL); 932 } 933 } 934 935 cgp->cg_time = gethrestime_sec(); 936 /* 937 * The current fragment can be extended, 938 * deduct the count on fragment being extended into 939 * increase the count on the remaining fragment (if any) 940 * allocate the extended piece. 
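	 *
	 * (Illustrative example, not from the original comment: extending a
	 * 2-frag fragment to 4 frags when 3 free frags follow it in the same
	 * block leaves j = 3, so cg_frsum[3] is decremented for the free run
	 * that disappears and cg_frsum[1] is incremented for the single free
	 * frag left over after the extension.)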
	 */
	for (i = frags; i < fs->fs_frag - bbase; i++)
		if (isclr(blksfree, bno + i))
			break;
	j = i - numfrags(fs, osize);
	cgp->cg_frsum[j]--;
	ASSERT(cgp->cg_frsum[j] >= 0);
	if (i != frags)
		cgp->cg_frsum[i - frags]++;
	for (i = numfrags(fs, osize); i < frags; i++) {
		clrbit(blksfree, bno + i);
		cgp->cg_cs.cs_nffree--;
		fs->fs_cs(fs, cg).cs_nffree--;
		fs->fs_cstotal.cs_nffree--;
	}
	fs->fs_fmod = 1;
	ufs_notclean(ufsvfsp);
	TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
	TRANS_SI(ufsvfsp, fs, cg);
	bdrwrite(bp);
	return ((daddr_t)bprev);
}

/*
 * Determine whether a block can be allocated.
 *
 * Check to see if a block of the appropriate size
 * is available, and if it is, allocate it.
 */
static daddr_t
alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
{
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	struct cg *cgp;
	uchar_t *blksfree;
	int bno, frags;
	int allocsiz;
	int i;

	/*
	 * Searching for space can be expensive, so do some
	 * up-front checking to verify that there is actually space
	 * available (free blocks or free frags).
	 */
	if (fs->fs_cs(fs, cg).cs_nbfree == 0) {
		if (size == fs->fs_bsize)
			return (0);

		/*
		 * If there are not enough free frags then return.
		 */
		if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, size))
			return (0);
	}

	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize);

	cgp = bp->b_un.b_cg;
	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
		brelse(bp);
		return (0);
	}
	blksfree = cg_blksfree(cgp);
	mutex_enter(&ufsvfsp->vfs_lock);
	cgp->cg_time = gethrestime_sec();
	if (size == fs->fs_bsize) {
		if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
			goto errout;
		fs->fs_fmod = 1;
		ufs_notclean(ufsvfsp);
		TRANS_SI(ufsvfsp, fs, cg);
		bdrwrite(bp);
		return (bno);
	}
	/*
	 * Check fragment bitmap to see if any fragments are already available.
	 * mapsearch() may fail because the fragment that fits this request
	 * might still be on the cancel list and not available for re-use yet.
	 * Look for a bigger sized fragment to allocate first before we have
	 * to give up and fragment a whole new block eventually.
	 */
	frags = numfrags(fs, size);
	allocsiz = frags;
next_size:
	for (; allocsiz < fs->fs_frag; allocsiz++)
		if (cgp->cg_frsum[allocsiz] != 0)
			break;

	if (allocsiz != fs->fs_frag) {
		bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz);
		if (bno < 0 && allocsiz < (fs->fs_frag - 1)) {
			allocsiz++;
			goto next_size;
		}
	}

	if (allocsiz == fs->fs_frag || bno < 0) {
		/*
		 * No fragments were available, so a block
		 * will be allocated and hacked up.
1045 */ 1046 if (cgp->cg_cs.cs_nbfree == 0) 1047 goto errout; 1048 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0) 1049 goto errout; 1050 bpref = dtogd(fs, bno); 1051 for (i = frags; i < fs->fs_frag; i++) 1052 setbit(blksfree, bpref + i); 1053 i = fs->fs_frag - frags; 1054 cgp->cg_cs.cs_nffree += i; 1055 fs->fs_cstotal.cs_nffree += i; 1056 fs->fs_cs(fs, cg).cs_nffree += i; 1057 cgp->cg_frsum[i]++; 1058 fs->fs_fmod = 1; 1059 ufs_notclean(ufsvfsp); 1060 TRANS_SI(ufsvfsp, fs, cg); 1061 bdrwrite(bp); 1062 return (bno); 1063 } 1064 1065 for (i = 0; i < frags; i++) 1066 clrbit(blksfree, bno + i); 1067 cgp->cg_cs.cs_nffree -= frags; 1068 fs->fs_cstotal.cs_nffree -= frags; 1069 fs->fs_cs(fs, cg).cs_nffree -= frags; 1070 cgp->cg_frsum[allocsiz]--; 1071 ASSERT(cgp->cg_frsum[allocsiz] >= 0); 1072 if (frags != allocsiz) { 1073 cgp->cg_frsum[allocsiz - frags]++; 1074 } 1075 fs->fs_fmod = 1; 1076 ufs_notclean(ufsvfsp); 1077 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1078 TRANS_SI(ufsvfsp, fs, cg); 1079 bdrwrite(bp); 1080 return (cg * fs->fs_fpg + bno); 1081 errout: 1082 mutex_exit(&ufsvfsp->vfs_lock); 1083 brelse(bp); 1084 return (0); 1085 } 1086 1087 /* 1088 * Allocate a block in a cylinder group. 1089 * 1090 * This algorithm implements the following policy: 1091 * 1) allocate the requested block. 1092 * 2) allocate a rotationally optimal block in the same cylinder. 1093 * 3) allocate the next available block on the block rotor for the 1094 * specified cylinder group. 1095 * Note that this routine only allocates fs_bsize blocks; these 1096 * blocks may be fragmented by the routine that allocates them. 1097 */ 1098 static daddr_t 1099 alloccgblk( 1100 struct ufsvfs *ufsvfsp, 1101 struct cg *cgp, 1102 daddr_t bpref, 1103 struct buf *bp) 1104 { 1105 daddr_t bno; 1106 int cylno, pos, delta, rotbl_size; 1107 short *cylbp; 1108 int i; 1109 struct fs *fs; 1110 uchar_t *blksfree; 1111 daddr_t blkno, rpos, frag; 1112 short *blks; 1113 int32_t *blktot; 1114 1115 ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock)); 1116 fs = ufsvfsp->vfs_fs; 1117 blksfree = cg_blksfree(cgp); 1118 if (bpref == 0) { 1119 bpref = cgp->cg_rotor; 1120 goto norot; 1121 } 1122 bpref = blknum(fs, bpref); 1123 bpref = dtogd(fs, bpref); 1124 /* 1125 * If the requested block is available, use it. 1126 */ 1127 if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) { 1128 bno = bpref; 1129 goto gotit; 1130 } 1131 /* 1132 * Check for a block available on the same cylinder. 1133 */ 1134 cylno = cbtocylno(fs, bpref); 1135 if (cg_blktot(cgp)[cylno] == 0) 1136 goto norot; 1137 if (fs->fs_cpc == 0) { 1138 /* 1139 * Block layout info is not available, so just 1140 * have to take any block in this cylinder. 1141 */ 1142 bpref = howmany(fs->fs_spc * cylno, NSPF(fs)); 1143 goto norot; 1144 } 1145 /* 1146 * Check the summary information to see if a block is 1147 * available in the requested cylinder starting at the 1148 * requested rotational position and proceeding around. 1149 */ 1150 cylbp = cg_blks(ufsvfsp, cgp, cylno); 1151 pos = cbtorpos(ufsvfsp, bpref); 1152 for (i = pos; i < ufsvfsp->vfs_nrpos; i++) 1153 if (cylbp[i] > 0) 1154 break; 1155 if (i == ufsvfsp->vfs_nrpos) 1156 for (i = 0; i < pos; i++) 1157 if (cylbp[i] > 0) 1158 break; 1159 if (cylbp[i] > 0) { 1160 /* 1161 * Found a rotational position, now find the actual 1162 * block. A "panic" if none is actually there. 1163 */ 1164 1165 /* 1166 * Up to this point, "pos" has referred to the rotational 1167 * position of the desired block. 
From now on, it holds 1168 * the offset of the current cylinder within a cylinder 1169 * cycle. (A cylinder cycle refers to a set of cylinders 1170 * which are described by a single rotational table; the 1171 * size of the cycle is fs_cpc.) 1172 * 1173 * bno is set to the block number of the first block within 1174 * the current cylinder cycle. 1175 */ 1176 1177 pos = cylno % fs->fs_cpc; 1178 bno = (cylno - pos) * fs->fs_spc / NSPB(fs); 1179 1180 /* 1181 * The blocks within a cylinder are grouped into equivalence 1182 * classes according to their "rotational position." There 1183 * are two tables used to determine these classes. 1184 * 1185 * The positional offset table (fs_postbl) has an entry for 1186 * each rotational position of each cylinder in a cylinder 1187 * cycle. This entry contains the relative block number 1188 * (counting from the start of the cylinder cycle) of the 1189 * first block in the equivalence class for that position 1190 * and that cylinder. Positions for which no blocks exist 1191 * are indicated by a -1. 1192 * 1193 * The rotational delta table (fs_rotbl) has an entry for 1194 * each block in a cylinder cycle. This entry contains 1195 * the offset from that block to the next block in the 1196 * same equivalence class. The last block in the class 1197 * is indicated by a zero in the table. 1198 * 1199 * The following code, then, walks through all of the blocks 1200 * in the cylinder (cylno) which we're allocating within 1201 * which are in the equivalence class for the rotational 1202 * position (i) which we're allocating within. 1203 */ 1204 1205 if (fs_postbl(ufsvfsp, pos)[i] == -1) { 1206 (void) ufs_fault(ufsvfsp->vfs_root, 1207 "alloccgblk: cyl groups corrupted, pos = %d, " 1208 "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt); 1209 return (0); 1210 } 1211 1212 /* 1213 * There is one entry in the rotational table for each block 1214 * in the cylinder cycle. These are whole blocks, not frags. 1215 */ 1216 1217 rotbl_size = (fs->fs_cpc * fs->fs_spc) >> 1218 (fs->fs_fragshift + fs->fs_fsbtodb); 1219 1220 /* 1221 * As we start, "i" is the rotational position within which 1222 * we're searching. After the next line, it will be a block 1223 * number (relative to the start of the cylinder cycle) 1224 * within the equivalence class of that rotational position. 1225 */ 1226 1227 i = fs_postbl(ufsvfsp, pos)[i]; 1228 1229 for (;;) { 1230 if (isblock(fs, blksfree, (daddr_t)(bno + i))) { 1231 bno = blkstofrags(fs, (bno + i)); 1232 goto gotit; 1233 } 1234 delta = fs_rotbl(fs)[i]; 1235 if (delta <= 0 || /* End of chain, or */ 1236 delta + i > rotbl_size) /* end of table? */ 1237 break; /* If so, panic. */ 1238 i += delta; 1239 } 1240 (void) ufs_fault(ufsvfsp->vfs_root, 1241 "alloccgblk: can't find blk in cyl, pos:%d, i:%d, " 1242 "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno); 1243 return (0); 1244 } 1245 norot: 1246 /* 1247 * No blocks in the requested cylinder, so take 1248 * next available one in this cylinder group. 
1249 */ 1250 bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag); 1251 if (bno < 0) 1252 return (0); 1253 cgp->cg_rotor = bno; 1254 gotit: 1255 blkno = fragstoblks(fs, bno); 1256 frag = (cgp->cg_cgx * fs->fs_fpg) + bno; 1257 if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize)) 1258 goto norot; 1259 clrblock(fs, blksfree, (long)blkno); 1260 /* 1261 * the other cg/sb/si fields are TRANS'ed by the caller 1262 */ 1263 cgp->cg_cs.cs_nbfree--; 1264 fs->fs_cstotal.cs_nbfree--; 1265 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; 1266 cylno = cbtocylno(fs, bno); 1267 blks = cg_blks(ufsvfsp, cgp, cylno); 1268 rpos = cbtorpos(ufsvfsp, bno); 1269 blktot = cg_blktot(cgp); 1270 blks[rpos]--; 1271 blktot[cylno]--; 1272 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1273 fs->fs_fmod = 1; 1274 return (frag); 1275 } 1276 1277 /* 1278 * Determine whether an inode can be allocated. 1279 * 1280 * Check to see if an inode is available, and if it is, 1281 * allocate it using the following policy: 1282 * 1) allocate the requested inode. 1283 * 2) allocate the next available inode after the requested 1284 * inode in the specified cylinder group. 1285 */ 1286 static ino_t 1287 ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode) 1288 { 1289 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 1290 struct fs *fs = ip->i_fs; 1291 struct cg *cgp; 1292 struct buf *bp; 1293 int start, len, loc, map, i; 1294 char *iused; 1295 1296 if (fs->fs_cs(fs, cg).cs_nifree == 0) 1297 return (0); 1298 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 1299 (int)fs->fs_cgsize); 1300 1301 cgp = bp->b_un.b_cg; 1302 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) || 1303 cgp->cg_cs.cs_nifree == 0) { 1304 brelse(bp); 1305 return (0); 1306 } 1307 iused = cg_inosused(cgp); 1308 mutex_enter(&ufsvfsp->vfs_lock); 1309 /* 1310 * While we are waiting for the mutex, someone may have taken 1311 * the last available inode. Need to recheck. 
1312 */ 1313 if (cgp->cg_cs.cs_nifree == 0) { 1314 mutex_exit(&ufsvfsp->vfs_lock); 1315 brelse(bp); 1316 return (0); 1317 } 1318 1319 cgp->cg_time = gethrestime_sec(); 1320 if (ipref) { 1321 ipref %= fs->fs_ipg; 1322 if (isclr(iused, ipref)) 1323 goto gotit; 1324 } 1325 start = cgp->cg_irotor / NBBY; 1326 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); 1327 loc = skpc(0xff, (uint_t)len, &iused[start]); 1328 if (loc == 0) { 1329 len = start + 1; 1330 start = 0; 1331 loc = skpc(0xff, (uint_t)len, &iused[0]); 1332 if (loc == 0) { 1333 mutex_exit(&ufsvfsp->vfs_lock); 1334 (void) ufs_fault(ITOV(ip), 1335 "ialloccg: map corrupted, cg = %d, irotor = %d, " 1336 "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt); 1337 return (0); 1338 } 1339 } 1340 i = start + len - loc; 1341 map = iused[i]; 1342 ipref = i * NBBY; 1343 for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) { 1344 if ((map & i) == 0) { 1345 cgp->cg_irotor = ipref; 1346 goto gotit; 1347 } 1348 } 1349 1350 mutex_exit(&ufsvfsp->vfs_lock); 1351 (void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s", 1352 fs->fs_fsmnt); 1353 return (0); 1354 gotit: 1355 setbit(iused, ipref); 1356 cgp->cg_cs.cs_nifree--; 1357 fs->fs_cstotal.cs_nifree--; 1358 fs->fs_cs(fs, cg).cs_nifree--; 1359 if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) { 1360 cgp->cg_cs.cs_ndir++; 1361 fs->fs_cstotal.cs_ndir++; 1362 fs->fs_cs(fs, cg).cs_ndir++; 1363 } 1364 fs->fs_fmod = 1; 1365 ufs_notclean(ufsvfsp); 1366 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1367 TRANS_SI(ufsvfsp, fs, cg); 1368 bdrwrite(bp); 1369 return (cg * fs->fs_ipg + ipref); 1370 } 1371 1372 /* 1373 * Find a block of the specified size in the specified cylinder group. 1374 * 1375 * It is a panic if a request is made to find a block if none are 1376 * available. 1377 */ 1378 static daddr_t 1379 mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref, 1380 int allocsiz) 1381 { 1382 struct fs *fs = ufsvfsp->vfs_fs; 1383 daddr_t bno, cfrag; 1384 int start, len, loc, i, last, first, secondtime; 1385 int blk, field, subfield, pos; 1386 int gotit; 1387 1388 /* 1389 * ufsvfs->vfs_lock is held when calling this. 1390 */ 1391 /* 1392 * Find the fragment by searching through the 1393 * free block map for an appropriate bit pattern. 1394 */ 1395 if (bpref) 1396 start = dtogd(fs, bpref) / NBBY; 1397 else 1398 start = cgp->cg_frotor / NBBY; 1399 /* 1400 * the following loop performs two scans -- the first scan 1401 * searches the bottom half of the array for a match and the 1402 * second scan searches the top half of the array. The loops 1403 * have been merged just to make things difficult. 
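	 * (Concretely: the first pass scans from the rotor/preference byte to
	 * the end of the free map; if nothing usable is found, the second
	 * pass wraps around and scans from the start of the map up to and
	 * including the byte where the search began.)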
1404 */ 1405 first = start; 1406 last = howmany(fs->fs_fpg, NBBY); 1407 secondtime = 0; 1408 cfrag = cgp->cg_cgx * fs->fs_fpg; 1409 while (first < last) { 1410 len = last - first; 1411 /* 1412 * search the array for a match 1413 */ 1414 loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first], 1415 (uchar_t *)fragtbl[fs->fs_frag], 1416 (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 1417 /* 1418 * match found 1419 */ 1420 if (loc) { 1421 bno = (last - loc) * NBBY; 1422 1423 /* 1424 * Found the byte in the map, sift 1425 * through the bits to find the selected frag 1426 */ 1427 cgp->cg_frotor = bno; 1428 gotit = 0; 1429 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { 1430 blk = blkmap(fs, cg_blksfree(cgp), bno); 1431 blk <<= 1; 1432 field = around[allocsiz]; 1433 subfield = inside[allocsiz]; 1434 for (pos = 0; 1435 pos <= fs->fs_frag - allocsiz; 1436 pos++) { 1437 if ((blk & field) == subfield) { 1438 gotit++; 1439 break; 1440 } 1441 field <<= 1; 1442 subfield <<= 1; 1443 } 1444 if (gotit) 1445 break; 1446 } 1447 bno += pos; 1448 1449 /* 1450 * success if block is *not* being converted from 1451 * metadata into userdata (harpy). If so, ignore. 1452 */ 1453 if (!TRANS_ISCANCEL(ufsvfsp, 1454 ldbtob(fsbtodb(fs, (cfrag+bno))), 1455 allocsiz * fs->fs_fsize)) 1456 return (bno); 1457 1458 /* 1459 * keep looking -- this block is being converted 1460 */ 1461 first = (last - loc) + 1; 1462 loc = 0; 1463 if (first < last) 1464 continue; 1465 } 1466 /* 1467 * no usable matches in bottom half -- now search the top half 1468 */ 1469 if (secondtime) 1470 /* 1471 * no usable matches in top half -- all done 1472 */ 1473 break; 1474 secondtime = 1; 1475 last = start + 1; 1476 first = 0; 1477 } 1478 /* 1479 * no usable matches 1480 */ 1481 return ((daddr_t)-1); 1482 } 1483 1484 #define UFSNADDR (NDADDR + NIADDR) /* NADDR applies to (obsolete) S5FS */ 1485 #define IB(i) (NDADDR + (i)) /* index of i'th indirect block ptr */ 1486 #define SINGLE 0 /* single indirect block ptr */ 1487 #define DOUBLE 1 /* double indirect block ptr */ 1488 #define TRIPLE 2 /* triple indirect block ptr */ 1489 1490 /* 1491 * Acquire a write lock, and keep trying till we get it 1492 */ 1493 static int 1494 allocsp_wlockfs(struct vnode *vp, struct lockfs *lf) 1495 { 1496 int err = 0; 1497 1498 lockagain: 1499 do { 1500 err = ufs_fiolfss(vp, lf); 1501 if (err) 1502 return (err); 1503 } while (!LOCKFS_IS_ULOCK(lf)); 1504 1505 lf->lf_lock = LOCKFS_WLOCK; 1506 lf->lf_flags = 0; 1507 lf->lf_comment = NULL; 1508 err = ufs__fiolfs(vp, lf, 1, 0); 1509 1510 if (err == EBUSY || err == EINVAL) 1511 goto lockagain; 1512 1513 return (err); 1514 } 1515 1516 /* 1517 * Release the write lock 1518 */ 1519 static int 1520 allocsp_unlockfs(struct vnode *vp, struct lockfs *lf) 1521 { 1522 int err = 0; 1523 1524 lf->lf_lock = LOCKFS_ULOCK; 1525 lf->lf_flags = 0; 1526 err = ufs__fiolfs(vp, lf, 1, 0); 1527 return (err); 1528 } 1529 1530 struct allocsp_undo { 1531 daddr_t offset; 1532 daddr_t blk; 1533 struct allocsp_undo *next; 1534 }; 1535 1536 /* 1537 * ufs_allocsp() can be used to pre-allocate blocks for a file on a given 1538 * file system. For direct blocks, the blocks are allocated from the offset 1539 * requested to the block boundary, then any full blocks are allocated, 1540 * and finally any remainder. 1541 * For indirect blocks the blocks are not initialized and are 1542 * only marked as allocated. These addresses are then stored as negative 1543 * block numbers in the inode to imply special handling. 
UFS has been modified
 * where necessary to understand this new notion.
 * Successfully fallocated files will have IFALLOCATE cflag set in the inode.
 */
int
ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr)
{
	struct lockfs lf;
	int berr, err, resv, issync;
	off_t istart, len;	/* istart, special for idb */
	struct inode *ip;
	struct fs *fs;
	struct ufsvfs *ufsvfsp;
	u_offset_t resid, i, uoff;
	daddr32_t db_undo[NDADDR];		/* old direct blocks */
	struct allocsp_undo *ib_undo = NULL;	/* ib undo */
	struct allocsp_undo *undo = NULL;
	u_offset_t osz;				/* old file size */
	int chunkblks = 0;		/* # of blocks in 1 allocation */
	int cnt = 0;
	daddr_t allocblk;
	daddr_t totblks = 0;
	struct ulockfs *ulp;
	size_t done_len;
	int nbytes, offsetn;


	ASSERT(vp->v_type == VREG);

	ip = VTOI(vp);
	fs = ip->i_fs;
	if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
		err = EIO;
		goto out_allocsp;
	}

	istart = blkroundup(fs, (lp->l_start));
	len = blkroundup(fs, (lp->l_len));
	chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize;
	ulp = &ufsvfsp->vfs_ulockfs;

	if (lp->l_start < 0 || lp->l_len <= 0)
		return (EINVAL);

	/* Quickly check to make sure we have space before we proceed */
	if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) {
		if (TRANS_ISTRANS(ufsvfsp)) {
			ufs_delete_drain_wait(ufsvfsp, 1);
			if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree)
				return (ENOSPC);
		} else
			return (ENOSPC);
	}

	/*
	 * We will keep i_rwlock locked as WRITER throughout the function
	 * since we don't want anyone else reading or writing to the inode
	 * while we are in the middle of fallocating the file.
	 */
	rw_enter(&ip->i_rwlock, RW_WRITER);

	/* Back up the direct block list, used for undo later if necessary */
	rw_enter(&ip->i_contents, RW_READER);
	for (i = 0; i < NDADDR; i++)
		db_undo[i] = ip->i_db[i];
	osz = ip->i_size;
	rw_exit(&ip->i_contents);

	/* Write lock the file system */
	if (err = allocsp_wlockfs(vp, &lf))
		goto exit;

	/*
	 * Allocate any direct blocks now.
	 * Blocks are allocated from the offset requested to the block
	 * boundary, then any full blocks are allocated, and finally any
	 * remainder.
1620 */ 1621 if (lblkno(fs, lp->l_start) < NDADDR) { 1622 ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize), 1623 &resv, &resid); 1624 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1625 1626 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1627 rw_enter(&ip->i_contents, RW_WRITER); 1628 1629 done_len = 0; 1630 while ((done_len < lp->l_len) && 1631 (lblkno(fs, lp->l_start + done_len) < NDADDR)) { 1632 uoff = (offset_t)(lp->l_start + done_len); 1633 offsetn = (int)blkoff(fs, uoff); 1634 nbytes = (int)MIN(fs->fs_bsize - offsetn, 1635 lp->l_len - done_len); 1636 1637 berr = bmap_write(ip, uoff, offsetn + nbytes, 1638 BI_FALLOCATE, &allocblk, cr); 1639 /* Yikes error, quit */ 1640 if (berr) { 1641 TRANS_INODE(ufsvfsp, ip); 1642 rw_exit(&ip->i_contents); 1643 rw_exit(&ufsvfsp->vfs_dqrwlock); 1644 TRANS_END_CSYNC(ufsvfsp, err, issync, 1645 TOP_ALLOCSP, resv); 1646 err = allocsp_unlockfs(vp, &lf); 1647 goto exit; 1648 } 1649 1650 if (allocblk) { 1651 totblks++; 1652 if ((uoff + nbytes) > ip->i_size) 1653 ip->i_size = (uoff + nbytes); 1654 } 1655 done_len += nbytes; 1656 } 1657 1658 TRANS_INODE(ufsvfsp, ip); 1659 rw_exit(&ip->i_contents); 1660 rw_exit(&ufsvfsp->vfs_dqrwlock); 1661 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1662 1663 /* start offset for indirect allocation */ 1664 istart = (uoff + nbytes); 1665 } 1666 1667 /* Break the transactions into vfs_iotransz units */ 1668 ufs_trans_trunc_resv(ip, ip->i_size + 1669 blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid); 1670 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1671 1672 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1673 rw_enter(&ip->i_contents, RW_WRITER); 1674 1675 /* Now go about fallocating necessary indirect blocks */ 1676 for (i = istart; i < (lp->l_start + lp->l_len); i += fs->fs_bsize) { 1677 berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE, 1678 &allocblk, cr); 1679 if (berr) { 1680 TRANS_INODE(ufsvfsp, ip); 1681 rw_exit(&ip->i_contents); 1682 rw_exit(&ufsvfsp->vfs_dqrwlock); 1683 TRANS_END_CSYNC(ufsvfsp, err, issync, 1684 TOP_ALLOCSP, resv); 1685 err = allocsp_unlockfs(vp, &lf); 1686 goto exit; 1687 } 1688 1689 /* Update the blk counter only if new block was added */ 1690 if (allocblk) { 1691 /* Save undo information */ 1692 undo = kmem_alloc(sizeof (struct allocsp_undo), 1693 KM_SLEEP); 1694 undo->offset = i; 1695 undo->blk = allocblk; 1696 undo->next = ib_undo; 1697 ib_undo = undo; 1698 totblks++; 1699 1700 if (i >= ip->i_size) 1701 ip->i_size += fs->fs_bsize; 1702 } 1703 cnt++; 1704 1705 /* Being a good UFS citizen, let others get a share */ 1706 if (cnt == chunkblks) { 1707 /* 1708 * If there are waiters or the fs is hard locked, 1709 * error locked, or read-only error locked, 1710 * quit with EIO 1711 */ 1712 if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) || 1713 ULOCKFS_IS_ROELOCK(ulp)) { 1714 ip->i_cflags |= IFALLOCATE; 1715 TRANS_INODE(ufsvfsp, ip); 1716 rw_exit(&ip->i_contents); 1717 rw_exit(&ufsvfsp->vfs_dqrwlock); 1718 1719 TRANS_END_CSYNC(ufsvfsp, err, issync, 1720 TOP_ALLOCSP, resv); 1721 rw_exit(&ip->i_rwlock); 1722 (void) allocsp_unlockfs(vp, &lf); 1723 return (EIO); 1724 } 1725 1726 TRANS_INODE(ufsvfsp, ip); 1727 rw_exit(&ip->i_contents); 1728 rw_exit(&ufsvfsp->vfs_dqrwlock); 1729 1730 /* End the current transaction */ 1731 TRANS_END_CSYNC(ufsvfsp, err, issync, 1732 TOP_ALLOCSP, resv); 1733 1734 if (CV_HAS_WAITERS(&ulp->ul_cv)) { 1735 /* Release the write lock */ 1736 if (err = allocsp_unlockfs(vp, &lf)) 1737 goto exit; 1738 1739 /* Wake up others waiting to do 
operations */ 1740 mutex_enter(&ulp->ul_lock); 1741 cv_broadcast(&ulp->ul_cv); 1742 mutex_exit(&ulp->ul_lock); 1743 1744 /* Grab the write lock again */ 1745 if (err = allocsp_wlockfs(vp, &lf)) 1746 goto exit; 1747 } /* end of CV_HAS_WAITERS(&ulp->ul_cv) */ 1748 1749 /* Reserve more space in log for this file */ 1750 ufs_trans_trunc_resv(ip, 1751 ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz), 1752 &resv, &resid); 1753 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1754 1755 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1756 rw_enter(&ip->i_contents, RW_WRITER); 1757 1758 cnt = 0; /* reset cnt b/c of new transaction */ 1759 } 1760 } 1761 1762 if (!err && !berr) 1763 ip->i_cflags |= IFALLOCATE; 1764 1765 /* If the file has grown then correct the file size */ 1766 if (osz < (lp->l_start + lp->l_len)) 1767 ip->i_size = (lp->l_start + lp->l_len); 1768 1769 /* Release locks, end log transaction and unlock fs */ 1770 TRANS_INODE(ufsvfsp, ip); 1771 rw_exit(&ip->i_contents); 1772 rw_exit(&ufsvfsp->vfs_dqrwlock); 1773 1774 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1775 err = allocsp_unlockfs(vp, &lf); 1776 1777 /* 1778 * @ exit label, we should no longer be holding the fs write lock, and 1779 * all logging transactions should have been ended. We still hold 1780 * ip->i_rwlock. 1781 */ 1782 exit: 1783 /* 1784 * File has grown larger than 2GB. Set flag 1785 * in superblock to indicate this, if it 1786 * is not already set. 1787 */ 1788 if ((ip->i_size > MAXOFF32_T) && 1789 !(fs->fs_flags & FSLARGEFILES)) { 1790 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); 1791 mutex_enter(&ufsvfsp->vfs_lock); 1792 fs->fs_flags |= FSLARGEFILES; 1793 ufs_sbwrite(ufsvfsp); 1794 mutex_exit(&ufsvfsp->vfs_lock); 1795 } 1796 1797 /* 1798 * Since we couldn't allocate completely, we will undo the allocations. 1799 */ 1800 if (berr) { 1801 ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid); 1802 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1803 1804 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1805 rw_enter(&ip->i_contents, RW_WRITER); 1806 1807 /* Direct blocks */ 1808 for (i = 0; i < NDADDR; i++) { 1809 /* 1810 * Only free the block if they are not same, and 1811 * the old one isn't zero (the fragment was 1812 * re-allocated). 1813 */ 1814 if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) { 1815 free(ip, ip->i_db[i], fs->fs_bsize, 0); 1816 ip->i_db[i] = 0; 1817 } 1818 } 1819 1820 /* Undo the indirect blocks */ 1821 while (ib_undo != NULL) { 1822 undo = ib_undo; 1823 err = bmap_set_bn(vp, undo->offset, 0); 1824 if (err) 1825 cmn_err(CE_PANIC, "ufs_allocsp(): failed to " 1826 "undo allocation of block %ld", 1827 undo->offset); 1828 free(ip, undo->blk, fs->fs_bsize, I_IBLK); 1829 ib_undo = undo->next; 1830 kmem_free(undo, sizeof (struct allocsp_undo)); 1831 } 1832 1833 ip->i_size = osz; 1834 TRANS_INODE(ufsvfsp, ip); 1835 1836 rw_exit(&ip->i_contents); 1837 rw_exit(&ufsvfsp->vfs_dqrwlock); 1838 1839 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1840 1841 rw_exit(&ip->i_rwlock); 1842 return (berr); 1843 } 1844 1845 /* 1846 * Don't forget to free the undo chain :) 1847 */ 1848 while (ib_undo != NULL) { 1849 undo = ib_undo; 1850 ib_undo = undo->next; 1851 kmem_free(undo, sizeof (struct allocsp_undo)); 1852 } 1853 1854 rw_exit(&ip->i_rwlock); 1855 1856 out_allocsp: 1857 return (err); 1858 } 1859 1860 /* 1861 * Free storage space associated with the specified inode. 
The portion 1862 * to be freed is specified by lp->l_start and lp->l_len (already 1863 * normalized to a "whence" of 0). 1864 * 1865 * This is an experimental facility whose continued existence is not 1866 * guaranteed. Currently, we only support the special case 1867 * of l_len == 0, meaning free to end of file. 1868 * 1869 * Blocks are freed in reverse order. This FILO algorithm will tend to 1870 * maintain a contiguous free list much longer than FIFO. 1871 * See also ufs_itrunc() in ufs_inode.c. 1872 * 1873 * Bug: unused bytes in the last retained block are not cleared. 1874 * This may result in a "hole" in the file that does not read as zeroes. 1875 */ 1876 /* ARGSUSED */ 1877 int 1878 ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr) 1879 { 1880 int i; 1881 struct inode *ip = VTOI(vp); 1882 int error; 1883 1884 ASSERT(vp->v_type == VREG); 1885 ASSERT(lp->l_start >= 0); /* checked by convoff */ 1886 1887 if (lp->l_len != 0) 1888 return (EINVAL); 1889 1890 rw_enter(&ip->i_contents, RW_READER); 1891 if (ip->i_size == (u_offset_t)lp->l_start) { 1892 rw_exit(&ip->i_contents); 1893 return (0); 1894 } 1895 1896 /* 1897 * Check if there is any active mandatory lock on the 1898 * range that will be truncated/expanded. 1899 */ 1900 if (MANDLOCK(vp, ip->i_mode)) { 1901 offset_t save_start; 1902 1903 save_start = lp->l_start; 1904 1905 if (ip->i_size < lp->l_start) { 1906 /* 1907 * "Truncate up" case: need to make sure there 1908 * is no lock beyond current end-of-file. To 1909 * do so, we need to set l_start to the size 1910 * of the file temporarily. 1911 */ 1912 lp->l_start = ip->i_size; 1913 } 1914 lp->l_type = F_WRLCK; 1915 lp->l_sysid = 0; 1916 lp->l_pid = ttoproc(curthread)->p_pid; 1917 i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK; 1918 rw_exit(&ip->i_contents); 1919 if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 || 1920 lp->l_type != F_UNLCK) { 1921 return (i ? i : EAGAIN); 1922 } 1923 rw_enter(&ip->i_contents, RW_READER); 1924 1925 lp->l_start = save_start; 1926 } 1927 1928 /* 1929 * Make sure a write isn't in progress (allocating blocks) 1930 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't 1931 * truncate while it was allocating blocks). 1932 * Grab the locks in the right order. 1933 */ 1934 rw_exit(&ip->i_contents); 1935 rw_enter(&ip->i_rwlock, RW_WRITER); 1936 error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr); 1937 rw_exit(&ip->i_rwlock); 1938 return (error); 1939 } 1940 1941 /* 1942 * Find a cg with as close to nb contiguous bytes as possible 1943 * THIS MAY TAKE MANY DISK READS! 
1944 * 1945 * Implemented in an attempt to allocate contiguous blocks for 1946 * writing the ufs log file to, minimizing future disk head seeking 1947 */ 1948 daddr_t 1949 contigpref(ufsvfs_t *ufsvfsp, size_t nb, size_t minb) 1950 { 1951 struct fs *fs = ufsvfsp->vfs_fs; 1952 daddr_t nblk = lblkno(fs, blkroundup(fs, nb)); 1953 daddr_t minblk = lblkno(fs, blkroundup(fs, minb)); 1954 daddr_t savebno, curbno, cgbno; 1955 int cg, cgblks, savecg, savenblk, curnblk, startcg; 1956 uchar_t *blksfree; 1957 buf_t *bp; 1958 struct cg *cgp; 1959 1960 savenblk = 0; 1961 savecg = 0; 1962 savebno = 0; 1963 1964 if ((startcg = findlogstartcg(fs, nblk, minblk)) == -1) 1965 cg = 0; /* Nothing suitable found */ 1966 else 1967 cg = startcg; 1968 1969 for (; cg < fs->fs_ncg; ++cg) { 1970 /* 1971 * find the largest contiguous range in this cg 1972 */ 1973 bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, 1974 (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 1975 (int)fs->fs_cgsize); 1976 cgp = bp->b_un.b_cg; 1977 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 1978 brelse(bp); 1979 continue; 1980 } 1981 blksfree = cg_blksfree(cgp); /* free array */ 1982 cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */ 1983 cgbno = 0; 1984 while (cgbno < cgblks && savenblk < nblk) { 1985 /* find a free block */ 1986 for (; cgbno < cgblks; ++cgbno) { 1987 if (isblock(fs, blksfree, cgbno)) { 1988 if (startcg != -1) { 1989 brelse(bp); 1990 savecg = startcg; 1991 savebno = cgbno; 1992 goto done; 1993 } else 1994 break; 1995 } 1996 } 1997 curbno = cgbno; 1998 /* count the number of free blocks */ 1999 for (curnblk = 0; cgbno < cgblks; ++cgbno) { 2000 if (!isblock(fs, blksfree, cgbno)) 2001 break; 2002 if (++curnblk >= nblk) 2003 break; 2004 } 2005 if (curnblk > savenblk) { 2006 savecg = cg; 2007 savenblk = curnblk; 2008 savebno = curbno; 2009 } 2010 } 2011 brelse(bp); 2012 if (savenblk >= nblk) 2013 break; 2014 } 2015 2016 done: 2017 2018 /* convert block offset in cg to frag offset in cg */ 2019 savebno = blkstofrags(fs, savebno); 2020 2021 /* convert frag offset in cg to frag offset in fs */ 2022 savebno += (savecg * fs->fs_fpg); 2023 2024 return (savebno); 2025 } 2026 2027 /* 2028 * The object of this routine is to find a start point for the UFS log. 2029 * Ideally the space should be allocated from the smallest possible number 2030 * of contiguous cylinder groups. This is found by using a sliding window 2031 * technique. The smallest window of contiguous cylinder groups, which is 2032 * still able to accommodate the target, is found by moving the window 2033 * through the cylinder groups in a single pass. The end of the window is 2034 * advanced until the space is accommodated, then the start is advanced until 2035 * it no longer fits, the end is then advanced again and so on until the 2036 * final cylinder group is reached. The first suitable instance is recorded 2037 * and its starting cg number is returned. 2038 * 2039 * If we are not able to find a minimum amount of space, represented by 2040 * minblk, or to do so uses more than the available extents, then return -1. 2041 */ 2042 2043 int 2044 findlogstartcg(struct fs *fs, daddr_t requested, daddr_t minblk) 2045 { 2046 int ncgs; /* number of cylinder groups */ 2047 daddr_t target; /* amount of space sought */ 2048 int cwidth, ctotal; /* current window width and total */ 2049 int bwidth, btotal; /* best window width and total so far */ 2050 int s; /* index of the first element in the current window */ 2051 int e; /* index of the first element + the width */ 2052 /* (i.e. 
1 + index of last element) */ 2053 int bs; /* index of the first element in the best window so far */ 2054 int header, max_extents; 2055 2056 target = requested; 2057 ncgs = fs->fs_ncg; 2058 2059 header = sizeof (extent_block_t) - sizeof (extent_t); 2060 max_extents = ((fs->fs_bsize)-header) / sizeof (extent_t); 2061 cwidth = ctotal = 0; 2062 btotal = -1; 2063 bwidth = ncgs; 2064 s = e = 0; 2065 while (e < ncgs) { 2066 /* Advance the end of the window until it accommodates the target. */ 2067 while (ctotal < target && e < ncgs) { 2068 ctotal += fs->fs_cs(fs, e).cs_nbfree; 2069 e++; 2070 } 2071 2072 /* 2073 * Advance the start of the window until it no longer 2074 * accommodates the target. 2075 */ 2076 while (ctotal >= target && s < e) { 2077 /* See if this is the smallest window so far. */ 2078 cwidth = e - s; 2079 if (cwidth <= bwidth) { 2080 if (cwidth == bwidth && ctotal <= btotal) 2081 goto more; 2082 bwidth = cwidth; 2083 btotal = ctotal; 2084 bs = s; 2085 } 2086 more: 2087 ctotal -= fs->fs_cs(fs, s).cs_nbfree; 2088 s++; 2089 } 2090 } 2091 2092 /* 2093 * If we cannot allocate the minimum required or we use too many 2094 * extents to do so, return -1. 2095 */ 2096 if (btotal < minblk || bwidth > max_extents) 2097 bs = -1; 2098 2099 return (bs); 2100 } 2101