1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 #include <sys/condvar_impl.h> 43 #include <sys/types.h> 44 #include <sys/t_lock.h> 45 #include <sys/debug.h> 46 #include <sys/param.h> 47 #include <sys/systm.h> 48 #include <sys/signal.h> 49 #include <sys/cred.h> 50 #include <sys/proc.h> 51 #include <sys/disp.h> 52 #include <sys/user.h> 53 #include <sys/buf.h> 54 #include <sys/vfs.h> 55 #include <sys/vnode.h> 56 #include <sys/acl.h> 57 #include <sys/fs/ufs_fs.h> 58 #include <sys/fs/ufs_inode.h> 59 #include <sys/fs/ufs_acl.h> 60 #include <sys/fs/ufs_bio.h> 61 #include <sys/fs/ufs_quota.h> 62 #include <sys/kmem.h> 63 #include <sys/fs/ufs_trans.h> 64 #include <sys/fs/ufs_panic.h> 65 #include <sys/errno.h> 66 #include <sys/time.h> 67 #include <sys/sysmacros.h> 68 #include <sys/file.h> 69 #include <sys/fcntl.h> 70 #include <sys/flock.h> 71 #include <fs/fs_subr.h> 72 #include <sys/cmn_err.h> 73 #include <sys/policy.h> 74 75 static ino_t hashalloc(); 76 static daddr_t fragextend(); 77 static daddr_t alloccg(); 78 static daddr_t alloccgblk(); 79 static ino_t ialloccg(); 80 static daddr_t mapsearch(); 81 82 extern int inside[], around[]; 83 extern uchar_t *fragtbl[]; 84 void delay(); 85 86 /* 87 * Allocate a block in the file system. 88 * 89 * The size of the requested block is given, which must be some 90 * multiple of fs_fsize and <= fs_bsize. 91 * A preference may be optionally specified. If a preference is given 92 * the following hierarchy is used to allocate a block: 93 * 1) allocate the requested block. 94 * 2) allocate a rotationally optimal block in the same cylinder. 95 * 3) allocate a block in the same cylinder group. 96 * 4) quadratically rehash into other cylinder groups, until an 97 * available block is located. 98 * If no block preference is given the following hierarchy is used 99 * to allocate a block: 100 * 1) allocate a block in the cylinder group that contains the 101 * inode for the file. 102 * 2) quadratically rehash into other cylinder groups, until an 103 * available block is located. 104 */ 105 int 106 alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr) 107 { 108 struct fs *fs; 109 struct ufsvfs *ufsvfsp; 110 daddr_t bno; 111 int cg; 112 int err; 113 char *errmsg = NULL; 114 size_t len; 115 116 ufsvfsp = ip->i_ufsvfs; 117 fs = ufsvfsp->vfs_fs; 118 if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) { 119 err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx," 120 " bsize = %d, size = %d, fs = %s\n", 121 ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); 122 return (err); 123 } 124 if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) 125 goto nospace; 126 if (freespace(fs, ufsvfsp) <= 0 && 127 secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0) 128 goto nospace; 129 err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len); 130 /* Note that may not have err, but may have errmsg */ 131 if (errmsg != NULL) { 132 uprintf(errmsg); 133 kmem_free(errmsg, len); 134 errmsg = NULL; 135 } 136 if (err) 137 return (err); 138 if (bpref >= fs->fs_size) 139 bpref = 0; 140 if (bpref == 0) 141 cg = (int)itog(fs, ip->i_number); 142 else 143 cg = dtog(fs, bpref); 144 145 bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size, 146 (ulong_t (*)())alloccg); 147 if (bno > 0) { 148 *bnp = bno; 149 return (0); 150 } 151 152 /* 153 * hashalloc() failed because some other thread grabbed 154 * the last block so unwind the quota operation. We can 155 * ignore the return because subtractions don't fail and 156 * size is guaranteed to be >= zero by our caller. 157 */ 158 (void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL, 159 (size_t *)NULL); 160 161 nospace: 162 mutex_enter(&ufsvfsp->vfs_lock); 163 if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) && 164 (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) { 165 ufsvfsp->vfs_lastwhinetime = lbolt; 166 cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt); 167 } 168 mutex_exit(&ufsvfsp->vfs_lock); 169 return (ENOSPC); 170 } 171 172 /* 173 * Reallocate a fragment to a bigger size 174 * 175 * The number and size of the old block is given, and a preference 176 * and new size is also specified. The allocator attempts to extend 177 * the original block. Failing that, the regular block allocator is 178 * invoked to get an appropriate block. 179 */ 180 int 181 realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize, 182 int nsize, daddr_t *bnp, cred_t *cr) 183 { 184 daddr_t bno; 185 struct fs *fs; 186 struct ufsvfs *ufsvfsp; 187 int cg, request; 188 int err; 189 char *errmsg = NULL; 190 size_t len; 191 192 ufsvfsp = ip->i_ufsvfs; 193 fs = ufsvfsp->vfs_fs; 194 if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || 195 (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { 196 err = ufs_fault(ITOV(ip), 197 "realloccg: bad size, dev=0x%lx, bsize=%d, " 198 "osize=%d, nsize=%d, fs=%s\n", 199 ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt); 200 return (err); 201 } 202 if (freespace(fs, ufsvfsp) <= 0 && 203 secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0) 204 goto nospace; 205 if (bprev == 0) { 206 err = ufs_fault(ITOV(ip), 207 "realloccg: bad bprev, dev = 0x%lx, bsize = %d," 208 " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev, 209 fs->fs_fsmnt); 210 return (err); 211 } 212 err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len); 213 /* Note that may not have err, but may have errmsg */ 214 if (errmsg != NULL) { 215 uprintf(errmsg); 216 kmem_free(errmsg, len); 217 errmsg = NULL; 218 } 219 if (err) 220 return (err); 221 cg = dtog(fs, bprev); 222 bno = fragextend(ip, cg, (long)bprev, osize, nsize); 223 if (bno != 0) { 224 *bnp = bno; 225 return (0); 226 } 227 if (bpref >= fs->fs_size) 228 bpref = 0; 229 230 /* 231 * When optimizing for time we allocate a full block and 232 * then only use the upper portion for this request. When 233 * this file grows again it will grow into the unused portion 234 * of the block (See fragextend() above). This saves time 235 * because an extra disk write would be needed if the frags 236 * following the current allocation were not free. The extra 237 * disk write is needed to move the data from its current 238 * location into the newly allocated position. 239 * 240 * When optimizing for space we allocate a run of frags 241 * that is just the right size for this request. 242 */ 243 request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize; 244 bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request, 245 (ulong_t (*)())alloccg); 246 if (bno > 0) { 247 *bnp = bno; 248 if (nsize < request) 249 (void) free(ip, bno + numfrags(fs, nsize), 250 (off_t)(request - nsize), I_NOCANCEL); 251 return (0); 252 } 253 254 /* 255 * hashalloc() failed because some other thread grabbed 256 * the last block so unwind the quota operation. We can 257 * ignore the return because subtractions don't fail, and 258 * our caller guarantees nsize >= osize. 259 */ 260 (void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL, 261 (size_t *)NULL); 262 263 nospace: 264 mutex_enter(&ufsvfsp->vfs_lock); 265 if ((lbolt - ufsvfsp->vfs_lastwhinetime) > (hz << 2) && 266 (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) { 267 ufsvfsp->vfs_lastwhinetime = lbolt; 268 cmn_err(CE_NOTE, 269 "realloccg %s: file system full", fs->fs_fsmnt); 270 } 271 mutex_exit(&ufsvfsp->vfs_lock); 272 return (ENOSPC); 273 } 274 275 /* 276 * Allocate an inode in the file system. 277 * 278 * A preference may be optionally specified. If a preference is given 279 * the following hierarchy is used to allocate an inode: 280 * 1) allocate the requested inode. 281 * 2) allocate an inode in the same cylinder group. 282 * 3) quadratically rehash into other cylinder groups, until an 283 * available inode is located. 284 * If no inode preference is given the following hierarchy is used 285 * to allocate an inode: 286 * 1) allocate an inode in cylinder group 0. 287 * 2) quadratically rehash into other cylinder groups, until an 288 * available inode is located. 289 */ 290 int 291 ufs_ialloc(struct inode *pip, 292 ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr) 293 { 294 struct inode *ip; 295 struct fs *fs; 296 int cg; 297 ino_t ino; 298 int err; 299 int nifree; 300 struct ufsvfs *ufsvfsp = pip->i_ufsvfs; 301 char *errmsg = NULL; 302 size_t len; 303 304 ASSERT(RW_WRITE_HELD(&pip->i_rwlock)); 305 fs = pip->i_fs; 306 loop: 307 nifree = fs->fs_cstotal.cs_nifree; 308 309 if (nifree == 0) 310 goto noinodes; 311 /* 312 * Shadow inodes don't count against a user's inode allocation. 313 * They are an implementation method and not a resource. 314 */ 315 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 316 err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data, 317 /* change */ 1, (struct inode *)NULL, crgetuid(cr), 0, 318 cr, &errmsg, &len); 319 /* 320 * As we haven't acquired any locks yet, dump the message 321 * now. 322 */ 323 if (errmsg != NULL) { 324 uprintf(errmsg); 325 kmem_free(errmsg, len); 326 errmsg = NULL; 327 } 328 if (err) 329 return (err); 330 } 331 332 if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg)) 333 ipref = 0; 334 cg = (int)itog(fs, ipref); 335 ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode, 336 (ulong_t (*)())ialloccg); 337 if (ino == 0) { 338 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 339 /* 340 * We can safely ignore the return from chkiq() 341 * because deallocations can only fail if we 342 * can't get the user's quota info record off 343 * the disk due to an I/O error. In that case, 344 * the quota subsystem is already messed up. 345 */ 346 (void) chkiq(ufsvfsp, /* change */ -1, 347 (struct inode *)NULL, crgetuid(cr), 0, cr, 348 (char **)NULL, (size_t *)NULL); 349 } 350 goto noinodes; 351 } 352 err = ufs_iget(pip->i_vfs, ino, ipp, cr); 353 if (err) { 354 if ((mode != IFSHAD) && (mode != IFATTRDIR)) { 355 /* 356 * See above comment about why it is safe to ignore an 357 * error return here. 358 */ 359 (void) chkiq(ufsvfsp, /* change */ -1, 360 (struct inode *)NULL, crgetuid(cr), 0, cr, 361 (char **)NULL, (size_t *)NULL); 362 } 363 ufs_ifree(pip, ino, 0); 364 return (err); 365 } 366 ip = *ipp; 367 ASSERT(!ip->i_ufs_acl); 368 ASSERT(!ip->i_dquot); 369 rw_enter(&ip->i_contents, RW_WRITER); 370 371 /* 372 * Check if we really got a free inode, if not then complain 373 * and mark the inode ISTALE so that it will be freed by the 374 * ufs idle thread eventually and will not be sent to ufs_delete(). 375 */ 376 if (ip->i_mode || (ip->i_nlink > 0)) { 377 ip->i_flag |= ISTALE; 378 rw_exit(&ip->i_contents); 379 VN_RELE(ITOV(ip)); 380 cmn_err(CE_WARN, 381 "%s: unexpected allocated inode %d, run fsck(1M)%s", 382 fs->fs_fsmnt, (int)ino, 383 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); 384 goto loop; 385 } 386 387 /* 388 * Check the inode has no size or data blocks. 389 * This could have happened if the truncation failed when 390 * deleting the inode. It used to be possible for this to occur 391 * if a block allocation failed when iteratively truncating a 392 * large file using logging and with a full file system. 393 * This was fixed with bug fix 4348738. However, truncation may 394 * still fail on an IO error. So in all cases for safety and 395 * security we clear out the size; the blocks allocated; and 396 * pointers to the blocks. This will ultimately cause a fsck 397 * error of un-accounted for blocks, but its a fairly benign error, 398 * and possibly the correct thing to do anyway as accesssing those 399 * blocks agains may lead to more IO errors. 400 */ 401 if (ip->i_size || ip->i_blocks) { 402 int i; 403 404 if (ip->i_size) { 405 cmn_err(CE_WARN, 406 "%s: free inode %d had size 0x%llx, run fsck(1M)%s", 407 fs->fs_fsmnt, (int)ino, ip->i_size, 408 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); 409 } 410 /* 411 * Clear any garbage left behind. 412 */ 413 ip->i_size = (u_offset_t)0; 414 ip->i_blocks = 0; 415 for (i = 0; i < NDADDR; i++) 416 ip->i_db[i] = 0; 417 for (i = 0; i < NIADDR; i++) 418 ip->i_ib[i] = 0; 419 } 420 421 /* 422 * Initialize the link count 423 */ 424 ip->i_nlink = 0; 425 426 /* 427 * Clear the old flags 428 */ 429 ip->i_flag &= IREF; 430 431 /* 432 * Access times are not really defined if the fs is mounted 433 * with 'noatime'. But it can cause nfs clients to fail 434 * open() if the atime is not a legal value. Set a legal value 435 * here when the inode is allocated. 436 */ 437 if (ufsvfsp->vfs_noatime) { 438 mutex_enter(&ufs_iuniqtime_lock); 439 ip->i_atime = iuniqtime; 440 mutex_exit(&ufs_iuniqtime_lock); 441 } 442 rw_exit(&ip->i_contents); 443 return (0); 444 noinodes: 445 if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET)) 446 cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt); 447 return (ENOSPC); 448 } 449 450 /* 451 * Find a cylinder group to place a directory. 452 * Returns an inumber within the selected cylinder group. 453 * Note, the vfs_lock is not needed as we don't require exact cg summary info. 454 * 455 * If the switch ufs_close_dirs is set, then the policy is to use 456 * the current cg if it has more than 25% free inodes and more 457 * than 25% free blocks. Otherwise the cgs are searched from 458 * the beginning and the first cg with the same criteria is 459 * used. If that is also null then we revert to the old algorithm. 460 * This tends to cluster files at the beginning of the disk 461 * until the disk gets full. 462 * 463 * Otherwise if ufs_close_dirs is not set then the original policy is 464 * used which is to select from among those cylinder groups with 465 * above the average number of free inodes, the one with the smallest 466 * number of directories. 467 */ 468 469 int ufs_close_dirs = 1; /* allocate directories close as possible */ 470 471 ino_t 472 dirpref(inode_t *dp) 473 { 474 int cg, minndir, mincg, avgifree, mininode, minbpg, ifree; 475 struct fs *fs = dp->i_fs; 476 477 cg = itog(fs, dp->i_number); 478 mininode = fs->fs_ipg >> 2; 479 minbpg = fs->fs_maxbpg >> 2; 480 if (ufs_close_dirs && 481 (fs->fs_cs(fs, cg).cs_nifree > mininode) && 482 (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) { 483 return (dp->i_number); 484 } 485 486 avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; 487 minndir = fs->fs_ipg; 488 mincg = 0; 489 for (cg = 0; cg < fs->fs_ncg; cg++) { 490 ifree = fs->fs_cs(fs, cg).cs_nifree; 491 if (ufs_close_dirs && 492 (ifree > mininode) && 493 (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) { 494 return ((ino_t)(fs->fs_ipg * cg)); 495 } 496 if ((fs->fs_cs(fs, cg).cs_ndir < minndir) && 497 (ifree >= avgifree)) { 498 mincg = cg; 499 minndir = fs->fs_cs(fs, cg).cs_ndir; 500 } 501 } 502 return ((ino_t)(fs->fs_ipg * mincg)); 503 } 504 505 /* 506 * Select the desired position for the next block in a file. The file is 507 * logically divided into sections. The first section is composed of the 508 * direct blocks. Each additional section contains fs_maxbpg blocks. 509 * 510 * If no blocks have been allocated in the first section, the policy is to 511 * request a block in the same cylinder group as the inode that describes 512 * the file. If no blocks have been allocated in any other section, the 513 * policy is to place the section in a cylinder group with a greater than 514 * average number of free blocks. An appropriate cylinder group is found 515 * by using a rotor that sweeps the cylinder groups. When a new group of 516 * blocks is needed, the sweep begins in the cylinder group following the 517 * cylinder group from which the previous allocation was made. The sweep 518 * continues until a cylinder group with greater than the average number 519 * of free blocks is found. If the allocation is for the first block in an 520 * indirect block, the information on the previous allocation is unavailable; 521 * here a best guess is made based upon the logical block number being 522 * allocated. 523 * 524 * If a section is already partially allocated, the policy is to 525 * contiguously allocate fs_maxcontig blocks. The end of one of these 526 * contiguous blocks and the beginning of the next is physically separated 527 * so that the disk head will be in transit between them for at least 528 * fs_rotdelay milliseconds. This is to allow time for the processor to 529 * schedule another I/O transfer. 530 */ 531 daddr_t 532 blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap) 533 { 534 struct fs *fs; 535 struct ufsvfs *ufsvfsp; 536 int cg; 537 int avgbfree, startcg; 538 daddr_t nextblk; 539 540 ufsvfsp = ip->i_ufsvfs; 541 fs = ip->i_fs; 542 if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { 543 if (lbn < NDADDR) { 544 cg = itog(fs, ip->i_number); 545 return (fs->fs_fpg * cg + fs->fs_frag); 546 } 547 /* 548 * Find a cylinder with greater than average 549 * number of unused data blocks. 550 */ 551 if (indx == 0 || bap[indx - 1] == 0) 552 startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg; 553 else 554 startcg = dtog(fs, bap[indx - 1]) + 1; 555 startcg %= fs->fs_ncg; 556 557 mutex_enter(&ufsvfsp->vfs_lock); 558 avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; 559 /* 560 * used for computing log space for writes/truncs 561 */ 562 ufsvfsp->vfs_avgbfree = avgbfree; 563 for (cg = startcg; cg < fs->fs_ncg; cg++) 564 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 565 fs->fs_cgrotor = cg; 566 mutex_exit(&ufsvfsp->vfs_lock); 567 return (fs->fs_fpg * cg + fs->fs_frag); 568 } 569 for (cg = 0; cg <= startcg; cg++) 570 if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { 571 fs->fs_cgrotor = cg; 572 mutex_exit(&ufsvfsp->vfs_lock); 573 return (fs->fs_fpg * cg + fs->fs_frag); 574 } 575 mutex_exit(&ufsvfsp->vfs_lock); 576 return (NULL); 577 } 578 /* 579 * One or more previous blocks have been laid out. If less 580 * than fs_maxcontig previous blocks are contiguous, the 581 * next block is requested contiguously, otherwise it is 582 * requested rotationally delayed by fs_rotdelay milliseconds. 583 */ 584 585 nextblk = bap[indx - 1]; 586 /* 587 * Provision for fallocate to return positive 588 * blk preference based on last allocation 589 */ 590 if (nextblk < 0 && nextblk != UFS_HOLE) { 591 nextblk = (-bap[indx - 1]) + fs->fs_frag; 592 } else { 593 nextblk = bap[indx - 1] + fs->fs_frag; 594 } 595 596 if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] + 597 blkstofrags(fs, fs->fs_maxcontig) != nextblk) { 598 return (nextblk); 599 } 600 if (fs->fs_rotdelay != 0) 601 /* 602 * Here we convert ms of delay to frags as: 603 * (frags) = (ms) * (rev/sec) * (sect/rev) / 604 * ((sect/frag) * (ms/sec)) 605 * then round up to the next block. 606 */ 607 nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect / 608 (NSPF(fs) * 1000), fs->fs_frag); 609 return (nextblk); 610 } 611 612 /* 613 * Free a block or fragment. 614 * 615 * The specified block or fragment is placed back in the 616 * free map. If a fragment is deallocated, a possible 617 * block reassembly is checked. 618 */ 619 void 620 free(struct inode *ip, daddr_t bno, off_t size, int flags) 621 { 622 struct fs *fs = ip->i_fs; 623 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 624 struct ufs_q *delq = &ufsvfsp->vfs_delete; 625 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 626 struct cg *cgp; 627 struct buf *bp; 628 int cg, bmap, bbase; 629 int i; 630 uchar_t *blksfree; 631 int *blktot; 632 short *blks; 633 daddr_t blkno, cylno, rpos; 634 635 /* 636 * fallocate'd files will have negative block address. 637 * So negate it again to get original block address. 638 */ 639 if (bno < 0 && (bno % fs->fs_frag == 0) && bno != UFS_HOLE) { 640 bno = -bno; 641 } 642 643 if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) { 644 (void) ufs_fault(ITOV(ip), 645 "free: bad size, dev = 0x%lx, bsize = %d, size = %d, " 646 "fs = %s\n", ip->i_dev, fs->fs_bsize, 647 (int)size, fs->fs_fsmnt); 648 return; 649 } 650 cg = dtog(fs, bno); 651 ASSERT(!ufs_badblock(ip, bno)); 652 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 653 (int)fs->fs_cgsize); 654 655 cgp = bp->b_un.b_cg; 656 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 657 brelse(bp); 658 return; 659 } 660 661 if (!(flags & I_NOCANCEL)) 662 TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags); 663 if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) { 664 TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size); 665 } 666 blksfree = cg_blksfree(cgp); 667 blktot = cg_blktot(cgp); 668 mutex_enter(&ufsvfsp->vfs_lock); 669 cgp->cg_time = gethrestime_sec(); 670 bno = dtogd(fs, bno); 671 if (size == fs->fs_bsize) { 672 blkno = fragstoblks(fs, bno); 673 cylno = cbtocylno(fs, bno); 674 rpos = cbtorpos(ufsvfsp, bno); 675 blks = cg_blks(ufsvfsp, cgp, cylno); 676 if (!isclrblock(fs, blksfree, blkno)) { 677 mutex_exit(&ufsvfsp->vfs_lock); 678 brelse(bp); 679 (void) ufs_fault(ITOV(ip), "free: freeing free block, " 680 "dev:0x%lx, block:%ld, ino:%lu, fs:%s", 681 ip->i_dev, bno, ip->i_number, fs->fs_fsmnt); 682 return; 683 } 684 setblock(fs, blksfree, blkno); 685 blks[rpos]++; 686 blktot[cylno]++; 687 cgp->cg_cs.cs_nbfree++; /* Log below */ 688 fs->fs_cstotal.cs_nbfree++; 689 fs->fs_cs(fs, cg).cs_nbfree++; 690 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) { 691 mutex_enter(&delq->uq_mutex); 692 delq_info->delq_unreclaimed_blocks -= 693 btodb(fs->fs_bsize); 694 mutex_exit(&delq->uq_mutex); 695 } 696 } else { 697 bbase = bno - fragnum(fs, bno); 698 /* 699 * Decrement the counts associated with the old frags 700 */ 701 bmap = blkmap(fs, blksfree, bbase); 702 fragacct(fs, bmap, cgp->cg_frsum, -1); 703 /* 704 * Deallocate the fragment 705 */ 706 for (i = 0; i < numfrags(fs, size); i++) { 707 if (isset(blksfree, bno + i)) { 708 brelse(bp); 709 mutex_exit(&ufsvfsp->vfs_lock); 710 (void) ufs_fault(ITOV(ip), 711 "free: freeing free frag, " 712 "dev:0x%lx, blk:%ld, cg:%d, " 713 "ino:%lu, fs:%s", 714 ip->i_dev, 715 bno + i, 716 cgp->cg_cgx, 717 ip->i_number, 718 fs->fs_fsmnt); 719 return; 720 } 721 setbit(blksfree, bno + i); 722 } 723 cgp->cg_cs.cs_nffree += i; 724 fs->fs_cstotal.cs_nffree += i; 725 fs->fs_cs(fs, cg).cs_nffree += i; 726 if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) { 727 mutex_enter(&delq->uq_mutex); 728 delq_info->delq_unreclaimed_blocks -= 729 btodb(i * fs->fs_fsize); 730 mutex_exit(&delq->uq_mutex); 731 } 732 /* 733 * Add back in counts associated with the new frags 734 */ 735 bmap = blkmap(fs, blksfree, bbase); 736 fragacct(fs, bmap, cgp->cg_frsum, 1); 737 /* 738 * If a complete block has been reassembled, account for it 739 */ 740 blkno = fragstoblks(fs, bbase); 741 if (isblock(fs, blksfree, blkno)) { 742 cylno = cbtocylno(fs, bbase); 743 rpos = cbtorpos(ufsvfsp, bbase); 744 blks = cg_blks(ufsvfsp, cgp, cylno); 745 blks[rpos]++; 746 blktot[cylno]++; 747 cgp->cg_cs.cs_nffree -= fs->fs_frag; 748 fs->fs_cstotal.cs_nffree -= fs->fs_frag; 749 fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; 750 cgp->cg_cs.cs_nbfree++; 751 fs->fs_cstotal.cs_nbfree++; 752 fs->fs_cs(fs, cg).cs_nbfree++; 753 } 754 } 755 fs->fs_fmod = 1; 756 ufs_notclean(ufsvfsp); 757 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 758 TRANS_SI(ufsvfsp, fs, cg); 759 bdrwrite(bp); 760 } 761 762 /* 763 * Free an inode. 764 * 765 * The specified inode is placed back in the free map. 766 */ 767 void 768 ufs_ifree(struct inode *ip, ino_t ino, mode_t mode) 769 { 770 struct fs *fs = ip->i_fs; 771 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 772 struct cg *cgp; 773 struct buf *bp; 774 unsigned int inot; 775 int cg; 776 char *iused; 777 778 if (ip->i_number == ino && ip->i_mode != 0) { 779 (void) ufs_fault(ITOV(ip), 780 "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, " 781 "fs = %s\n", 782 ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt); 783 return; 784 } 785 if (ino >= fs->fs_ipg * fs->fs_ncg) { 786 (void) ufs_fault(ITOV(ip), 787 "ifree: range, dev = 0x%x, ino = %d, fs = %s\n", 788 (int)ip->i_dev, (int)ino, fs->fs_fsmnt); 789 return; 790 } 791 cg = (int)itog(fs, ino); 792 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 793 (int)fs->fs_cgsize); 794 795 cgp = bp->b_un.b_cg; 796 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 797 brelse(bp); 798 return; 799 } 800 mutex_enter(&ufsvfsp->vfs_lock); 801 cgp->cg_time = gethrestime_sec(); 802 iused = cg_inosused(cgp); 803 inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg); 804 if (isclr(iused, inot)) { 805 mutex_exit(&ufsvfsp->vfs_lock); 806 brelse(bp); 807 (void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, " 808 "mode: (imode) %o, (omode) %o, ino:%d, " 809 "fs:%s", 810 ip->i_mode, mode, (int)ino, fs->fs_fsmnt); 811 return; 812 } 813 clrbit(iused, inot); 814 815 if (inot < (ulong_t)cgp->cg_irotor) 816 cgp->cg_irotor = inot; 817 cgp->cg_cs.cs_nifree++; 818 fs->fs_cstotal.cs_nifree++; 819 fs->fs_cs(fs, cg).cs_nifree++; 820 if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) { 821 cgp->cg_cs.cs_ndir--; 822 fs->fs_cstotal.cs_ndir--; 823 fs->fs_cs(fs, cg).cs_ndir--; 824 } 825 fs->fs_fmod = 1; 826 ufs_notclean(ufsvfsp); 827 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 828 TRANS_SI(ufsvfsp, fs, cg); 829 bdrwrite(bp); 830 } 831 832 /* 833 * Implement the cylinder overflow algorithm. 834 * 835 * The policy implemented by this algorithm is: 836 * 1) allocate the block in its requested cylinder group. 837 * 2) quadratically rehash on the cylinder group number. 838 * 3) brute force search for a free block. 839 * The size parameter means size for data blocks, mode for inodes. 840 */ 841 static ino_t 842 hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)()) 843 { 844 struct fs *fs; 845 int i; 846 long result; 847 int icg = cg; 848 849 fs = ip->i_fs; 850 /* 851 * 1: preferred cylinder group 852 */ 853 result = (*allocator)(ip, cg, pref, size); 854 if (result) 855 return (result); 856 /* 857 * 2: quadratic rehash 858 */ 859 for (i = 1; i < fs->fs_ncg; i *= 2) { 860 cg += i; 861 if (cg >= fs->fs_ncg) 862 cg -= fs->fs_ncg; 863 result = (*allocator)(ip, cg, 0, size); 864 if (result) 865 return (result); 866 } 867 /* 868 * 3: brute force search 869 * Note that we start at i == 2, since 0 was checked initially, 870 * and 1 is always checked in the quadratic rehash. 871 */ 872 cg = (icg + 2) % fs->fs_ncg; 873 for (i = 2; i < fs->fs_ncg; i++) { 874 result = (*allocator)(ip, cg, 0, size); 875 if (result) 876 return (result); 877 cg++; 878 if (cg == fs->fs_ncg) 879 cg = 0; 880 } 881 return (NULL); 882 } 883 884 /* 885 * Determine whether a fragment can be extended. 886 * 887 * Check to see if the necessary fragments are available, and 888 * if they are, allocate them. 889 */ 890 static daddr_t 891 fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize) 892 { 893 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 894 struct fs *fs = ip->i_fs; 895 struct buf *bp; 896 struct cg *cgp; 897 uchar_t *blksfree; 898 long bno; 899 int frags, bbase; 900 int i, j; 901 902 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) 903 return (NULL); 904 frags = numfrags(fs, nsize); 905 bbase = (int)fragnum(fs, bprev); 906 if (bbase > fragnum(fs, (bprev + frags - 1))) { 907 /* cannot extend across a block boundary */ 908 return (NULL); 909 } 910 911 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 912 (int)fs->fs_cgsize); 913 cgp = bp->b_un.b_cg; 914 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 915 brelse(bp); 916 return (NULL); 917 } 918 919 blksfree = cg_blksfree(cgp); 920 mutex_enter(&ufsvfsp->vfs_lock); 921 bno = dtogd(fs, bprev); 922 for (i = numfrags(fs, osize); i < frags; i++) { 923 if (isclr(blksfree, bno + i)) { 924 mutex_exit(&ufsvfsp->vfs_lock); 925 brelse(bp); 926 return (NULL); 927 } 928 if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)), 929 fs->fs_fsize))) { 930 mutex_exit(&ufsvfsp->vfs_lock); 931 brelse(bp); 932 return (NULL); 933 } 934 } 935 936 cgp->cg_time = gethrestime_sec(); 937 /* 938 * The current fragment can be extended, 939 * deduct the count on fragment being extended into 940 * increase the count on the remaining fragment (if any) 941 * allocate the extended piece. 942 */ 943 for (i = frags; i < fs->fs_frag - bbase; i++) 944 if (isclr(blksfree, bno + i)) 945 break; 946 j = i - numfrags(fs, osize); 947 cgp->cg_frsum[j]--; 948 ASSERT(cgp->cg_frsum[j] >= 0); 949 if (i != frags) 950 cgp->cg_frsum[i - frags]++; 951 for (i = numfrags(fs, osize); i < frags; i++) { 952 clrbit(blksfree, bno + i); 953 cgp->cg_cs.cs_nffree--; 954 fs->fs_cs(fs, cg).cs_nffree--; 955 fs->fs_cstotal.cs_nffree--; 956 } 957 fs->fs_fmod = 1; 958 ufs_notclean(ufsvfsp); 959 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 960 TRANS_SI(ufsvfsp, fs, cg); 961 bdrwrite(bp); 962 return ((daddr_t)bprev); 963 } 964 965 /* 966 * Determine whether a block can be allocated. 967 * 968 * Check to see if a block of the apprpriate size 969 * is available, and if it is, allocate it. 970 */ 971 static daddr_t 972 alloccg(struct inode *ip, int cg, daddr_t bpref, int size) 973 { 974 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 975 struct fs *fs = ip->i_fs; 976 struct buf *bp; 977 struct cg *cgp; 978 uchar_t *blksfree; 979 int bno, frags; 980 int allocsiz; 981 int i; 982 983 /* 984 * Searching for space could be time expensive so do some 985 * up front checking to verify that there is actually space 986 * available (free blocks or free frags). 987 */ 988 if (fs->fs_cs(fs, cg).cs_nbfree == 0) { 989 if (size == fs->fs_bsize) 990 return (0); 991 992 /* 993 * If there are not enough free frags then return. 994 */ 995 if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, size)) 996 return (0); 997 } 998 999 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 1000 (int)fs->fs_cgsize); 1001 1002 cgp = bp->b_un.b_cg; 1003 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) || 1004 (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { 1005 brelse(bp); 1006 return (0); 1007 } 1008 blksfree = cg_blksfree(cgp); 1009 mutex_enter(&ufsvfsp->vfs_lock); 1010 cgp->cg_time = gethrestime_sec(); 1011 if (size == fs->fs_bsize) { 1012 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0) 1013 goto errout; 1014 fs->fs_fmod = 1; 1015 ufs_notclean(ufsvfsp); 1016 TRANS_SI(ufsvfsp, fs, cg); 1017 bdrwrite(bp); 1018 return (bno); 1019 } 1020 /* 1021 * Check to see if any fragments are already available 1022 * allocsiz is the size which will be allocated, hacking 1023 * it down to a smaller size if necessary. 1024 */ 1025 frags = numfrags(fs, size); 1026 for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) 1027 if (cgp->cg_frsum[allocsiz] != 0) 1028 break; 1029 1030 if (allocsiz != fs->fs_frag) 1031 bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz); 1032 1033 if (allocsiz == fs->fs_frag || bno < 0) { 1034 /* 1035 * No fragments were available, so a block 1036 * will be allocated and hacked up. 1037 */ 1038 if (cgp->cg_cs.cs_nbfree == 0) 1039 goto errout; 1040 if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0) 1041 goto errout; 1042 bpref = dtogd(fs, bno); 1043 for (i = frags; i < fs->fs_frag; i++) 1044 setbit(blksfree, bpref + i); 1045 i = fs->fs_frag - frags; 1046 cgp->cg_cs.cs_nffree += i; 1047 fs->fs_cstotal.cs_nffree += i; 1048 fs->fs_cs(fs, cg).cs_nffree += i; 1049 cgp->cg_frsum[i]++; 1050 fs->fs_fmod = 1; 1051 ufs_notclean(ufsvfsp); 1052 TRANS_SI(ufsvfsp, fs, cg); 1053 bdrwrite(bp); 1054 return (bno); 1055 } 1056 1057 for (i = 0; i < frags; i++) 1058 clrbit(blksfree, bno + i); 1059 cgp->cg_cs.cs_nffree -= frags; 1060 fs->fs_cstotal.cs_nffree -= frags; 1061 fs->fs_cs(fs, cg).cs_nffree -= frags; 1062 cgp->cg_frsum[allocsiz]--; 1063 ASSERT(cgp->cg_frsum[allocsiz] >= 0); 1064 if (frags != allocsiz) { 1065 cgp->cg_frsum[allocsiz - frags]++; 1066 } 1067 fs->fs_fmod = 1; 1068 ufs_notclean(ufsvfsp); 1069 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1070 TRANS_SI(ufsvfsp, fs, cg); 1071 bdrwrite(bp); 1072 return (cg * fs->fs_fpg + bno); 1073 errout: 1074 mutex_exit(&ufsvfsp->vfs_lock); 1075 brelse(bp); 1076 return (0); 1077 } 1078 1079 /* 1080 * Allocate a block in a cylinder group. 1081 * 1082 * This algorithm implements the following policy: 1083 * 1) allocate the requested block. 1084 * 2) allocate a rotationally optimal block in the same cylinder. 1085 * 3) allocate the next available block on the block rotor for the 1086 * specified cylinder group. 1087 * Note that this routine only allocates fs_bsize blocks; these 1088 * blocks may be fragmented by the routine that allocates them. 1089 */ 1090 static daddr_t 1091 alloccgblk( 1092 struct ufsvfs *ufsvfsp, 1093 struct cg *cgp, 1094 daddr_t bpref, 1095 struct buf *bp) 1096 { 1097 daddr_t bno; 1098 int cylno, pos, delta, rotbl_size; 1099 short *cylbp; 1100 int i; 1101 struct fs *fs; 1102 uchar_t *blksfree; 1103 daddr_t blkno, rpos, frag; 1104 short *blks; 1105 int32_t *blktot; 1106 1107 ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock)); 1108 fs = ufsvfsp->vfs_fs; 1109 blksfree = cg_blksfree(cgp); 1110 if (bpref == 0) { 1111 bpref = cgp->cg_rotor; 1112 goto norot; 1113 } 1114 bpref = blknum(fs, bpref); 1115 bpref = dtogd(fs, bpref); 1116 /* 1117 * If the requested block is available, use it. 1118 */ 1119 if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) { 1120 bno = bpref; 1121 goto gotit; 1122 } 1123 /* 1124 * Check for a block available on the same cylinder. 1125 */ 1126 cylno = cbtocylno(fs, bpref); 1127 if (cg_blktot(cgp)[cylno] == 0) 1128 goto norot; 1129 if (fs->fs_cpc == 0) { 1130 /* 1131 * Block layout info is not available, so just 1132 * have to take any block in this cylinder. 1133 */ 1134 bpref = howmany(fs->fs_spc * cylno, NSPF(fs)); 1135 goto norot; 1136 } 1137 /* 1138 * Check the summary information to see if a block is 1139 * available in the requested cylinder starting at the 1140 * requested rotational position and proceeding around. 1141 */ 1142 cylbp = cg_blks(ufsvfsp, cgp, cylno); 1143 pos = cbtorpos(ufsvfsp, bpref); 1144 for (i = pos; i < ufsvfsp->vfs_nrpos; i++) 1145 if (cylbp[i] > 0) 1146 break; 1147 if (i == ufsvfsp->vfs_nrpos) 1148 for (i = 0; i < pos; i++) 1149 if (cylbp[i] > 0) 1150 break; 1151 if (cylbp[i] > 0) { 1152 /* 1153 * Found a rotational position, now find the actual 1154 * block. A "panic" if none is actually there. 1155 */ 1156 1157 /* 1158 * Up to this point, "pos" has referred to the rotational 1159 * position of the desired block. From now on, it holds 1160 * the offset of the current cylinder within a cylinder 1161 * cycle. (A cylinder cycle refers to a set of cylinders 1162 * which are described by a single rotational table; the 1163 * size of the cycle is fs_cpc.) 1164 * 1165 * bno is set to the block number of the first block within 1166 * the current cylinder cycle. 1167 */ 1168 1169 pos = cylno % fs->fs_cpc; 1170 bno = (cylno - pos) * fs->fs_spc / NSPB(fs); 1171 1172 /* 1173 * The blocks within a cylinder are grouped into equivalence 1174 * classes according to their "rotational position." There 1175 * are two tables used to determine these classes. 1176 * 1177 * The positional offset table (fs_postbl) has an entry for 1178 * each rotational position of each cylinder in a cylinder 1179 * cycle. This entry contains the relative block number 1180 * (counting from the start of the cylinder cycle) of the 1181 * first block in the equivalence class for that position 1182 * and that cylinder. Positions for which no blocks exist 1183 * are indicated by a -1. 1184 * 1185 * The rotational delta table (fs_rotbl) has an entry for 1186 * each block in a cylinder cycle. This entry contains 1187 * the offset from that block to the next block in the 1188 * same equivalence class. The last block in the class 1189 * is indicated by a zero in the table. 1190 * 1191 * The following code, then, walks through all of the blocks 1192 * in the cylinder (cylno) which we're allocating within 1193 * which are in the equivalence class for the rotational 1194 * position (i) which we're allocating within. 1195 */ 1196 1197 if (fs_postbl(ufsvfsp, pos)[i] == -1) { 1198 (void) ufs_fault(ufsvfsp->vfs_root, 1199 "alloccgblk: cyl groups corrupted, pos = %d, " 1200 "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt); 1201 return (0); 1202 } 1203 1204 /* 1205 * There is one entry in the rotational table for each block 1206 * in the cylinder cycle. These are whole blocks, not frags. 1207 */ 1208 1209 rotbl_size = (fs->fs_cpc * fs->fs_spc) >> 1210 (fs->fs_fragshift + fs->fs_fsbtodb); 1211 1212 /* 1213 * As we start, "i" is the rotational position within which 1214 * we're searching. After the next line, it will be a block 1215 * number (relative to the start of the cylinder cycle) 1216 * within the equivalence class of that rotational position. 1217 */ 1218 1219 i = fs_postbl(ufsvfsp, pos)[i]; 1220 1221 for (;;) { 1222 if (isblock(fs, blksfree, (daddr_t)(bno + i))) { 1223 bno = blkstofrags(fs, (bno + i)); 1224 goto gotit; 1225 } 1226 delta = fs_rotbl(fs)[i]; 1227 if (delta <= 0 || /* End of chain, or */ 1228 delta + i > rotbl_size) /* end of table? */ 1229 break; /* If so, panic. */ 1230 i += delta; 1231 } 1232 (void) ufs_fault(ufsvfsp->vfs_root, 1233 "alloccgblk: can't find blk in cyl, pos:%d, i:%d, " 1234 "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno); 1235 return (0); 1236 } 1237 norot: 1238 /* 1239 * No blocks in the requested cylinder, so take 1240 * next available one in this cylinder group. 1241 */ 1242 bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag); 1243 if (bno < 0) 1244 return (0); 1245 cgp->cg_rotor = bno; 1246 gotit: 1247 blkno = fragstoblks(fs, bno); 1248 frag = (cgp->cg_cgx * fs->fs_fpg) + bno; 1249 if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize)) 1250 goto norot; 1251 clrblock(fs, blksfree, (long)blkno); 1252 /* 1253 * the other cg/sb/si fields are TRANS'ed by the caller 1254 */ 1255 cgp->cg_cs.cs_nbfree--; 1256 fs->fs_cstotal.cs_nbfree--; 1257 fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; 1258 cylno = cbtocylno(fs, bno); 1259 blks = cg_blks(ufsvfsp, cgp, cylno); 1260 rpos = cbtorpos(ufsvfsp, bno); 1261 blktot = cg_blktot(cgp); 1262 blks[rpos]--; 1263 blktot[cylno]--; 1264 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1265 fs->fs_fmod = 1; 1266 return (frag); 1267 } 1268 1269 /* 1270 * Determine whether an inode can be allocated. 1271 * 1272 * Check to see if an inode is available, and if it is, 1273 * allocate it using the following policy: 1274 * 1) allocate the requested inode. 1275 * 2) allocate the next available inode after the requested 1276 * inode in the specified cylinder group. 1277 */ 1278 static ino_t 1279 ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode) 1280 { 1281 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 1282 struct fs *fs = ip->i_fs; 1283 struct cg *cgp; 1284 struct buf *bp; 1285 int start, len, loc, map, i; 1286 char *iused; 1287 1288 if (fs->fs_cs(fs, cg).cs_nifree == 0) 1289 return (0); 1290 bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 1291 (int)fs->fs_cgsize); 1292 1293 cgp = bp->b_un.b_cg; 1294 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) || 1295 cgp->cg_cs.cs_nifree == 0) { 1296 brelse(bp); 1297 return (0); 1298 } 1299 iused = cg_inosused(cgp); 1300 mutex_enter(&ufsvfsp->vfs_lock); 1301 /* 1302 * While we are waiting for the mutex, someone may have taken 1303 * the last available inode. Need to recheck. 1304 */ 1305 if (cgp->cg_cs.cs_nifree == 0) { 1306 mutex_exit(&ufsvfsp->vfs_lock); 1307 brelse(bp); 1308 return (0); 1309 } 1310 1311 cgp->cg_time = gethrestime_sec(); 1312 if (ipref) { 1313 ipref %= fs->fs_ipg; 1314 if (isclr(iused, ipref)) 1315 goto gotit; 1316 } 1317 start = cgp->cg_irotor / NBBY; 1318 len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); 1319 loc = skpc(0xff, (uint_t)len, &iused[start]); 1320 if (loc == 0) { 1321 len = start + 1; 1322 start = 0; 1323 loc = skpc(0xff, (uint_t)len, &iused[0]); 1324 if (loc == 0) { 1325 mutex_exit(&ufsvfsp->vfs_lock); 1326 (void) ufs_fault(ITOV(ip), 1327 "ialloccg: map corrupted, cg = %d, irotor = %d, " 1328 "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt); 1329 return (0); 1330 } 1331 } 1332 i = start + len - loc; 1333 map = iused[i]; 1334 ipref = i * NBBY; 1335 for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) { 1336 if ((map & i) == 0) { 1337 cgp->cg_irotor = ipref; 1338 goto gotit; 1339 } 1340 } 1341 1342 mutex_exit(&ufsvfsp->vfs_lock); 1343 (void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s", 1344 fs->fs_fsmnt); 1345 return (0); 1346 gotit: 1347 setbit(iused, ipref); 1348 cgp->cg_cs.cs_nifree--; 1349 fs->fs_cstotal.cs_nifree--; 1350 fs->fs_cs(fs, cg).cs_nifree--; 1351 if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) { 1352 cgp->cg_cs.cs_ndir++; 1353 fs->fs_cstotal.cs_ndir++; 1354 fs->fs_cs(fs, cg).cs_ndir++; 1355 } 1356 fs->fs_fmod = 1; 1357 ufs_notclean(ufsvfsp); 1358 TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG); 1359 TRANS_SI(ufsvfsp, fs, cg); 1360 bdrwrite(bp); 1361 return (cg * fs->fs_ipg + ipref); 1362 } 1363 1364 /* 1365 * Find a block of the specified size in the specified cylinder group. 1366 * 1367 * It is a panic if a request is made to find a block if none are 1368 * available. 1369 */ 1370 static daddr_t 1371 mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref, 1372 int allocsiz) 1373 { 1374 struct fs *fs = ufsvfsp->vfs_fs; 1375 daddr_t bno, cfrag; 1376 int start, len, loc, i, last, first, secondtime; 1377 int blk, field, subfield, pos; 1378 int gotit; 1379 1380 /* 1381 * ufsvfs->vfs_lock is held when calling this. 1382 */ 1383 /* 1384 * Find the fragment by searching through the 1385 * free block map for an appropriate bit pattern. 1386 */ 1387 if (bpref) 1388 start = dtogd(fs, bpref) / NBBY; 1389 else 1390 start = cgp->cg_frotor / NBBY; 1391 /* 1392 * the following loop performs two scans -- the first scan 1393 * searches the bottom half of the array for a match and the 1394 * second scan searches the top half of the array. The loops 1395 * have been merged just to make things difficult. 1396 */ 1397 first = start; 1398 last = howmany(fs->fs_fpg, NBBY); 1399 secondtime = 0; 1400 cfrag = cgp->cg_cgx * fs->fs_fpg; 1401 while (first < last) { 1402 len = last - first; 1403 /* 1404 * search the array for a match 1405 */ 1406 loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first], 1407 (uchar_t *)fragtbl[fs->fs_frag], 1408 (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); 1409 /* 1410 * match found 1411 */ 1412 if (loc) { 1413 bno = (last - loc) * NBBY; 1414 1415 /* 1416 * Found the byte in the map, sift 1417 * through the bits to find the selected frag 1418 */ 1419 cgp->cg_frotor = bno; 1420 gotit = 0; 1421 for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { 1422 blk = blkmap(fs, cg_blksfree(cgp), bno); 1423 blk <<= 1; 1424 field = around[allocsiz]; 1425 subfield = inside[allocsiz]; 1426 for (pos = 0; 1427 pos <= fs->fs_frag - allocsiz; 1428 pos++) { 1429 if ((blk & field) == subfield) { 1430 gotit++; 1431 break; 1432 } 1433 field <<= 1; 1434 subfield <<= 1; 1435 } 1436 if (gotit) 1437 break; 1438 } 1439 bno += pos; 1440 1441 /* 1442 * success if block is *not* being converted from 1443 * metadata into userdata (harpy). If so, ignore. 1444 */ 1445 if (!TRANS_ISCANCEL(ufsvfsp, 1446 ldbtob(fsbtodb(fs, (cfrag+bno))), 1447 allocsiz * fs->fs_fsize)) 1448 return (bno); 1449 1450 /* 1451 * keep looking -- this block is being converted 1452 */ 1453 first = (last - loc) + 1; 1454 loc = 0; 1455 if (first < last) 1456 continue; 1457 } 1458 /* 1459 * no usable matches in bottom half -- now search the top half 1460 */ 1461 if (secondtime) 1462 /* 1463 * no usable matches in top half -- all done 1464 */ 1465 break; 1466 secondtime = 1; 1467 last = start + 1; 1468 first = 0; 1469 } 1470 /* 1471 * no usable matches 1472 */ 1473 return ((daddr_t)-1); 1474 } 1475 1476 #define UFSNADDR (NDADDR + NIADDR) /* NADDR applies to (obsolete) S5FS */ 1477 #define IB(i) (NDADDR + (i)) /* index of i'th indirect block ptr */ 1478 #define SINGLE 0 /* single indirect block ptr */ 1479 #define DOUBLE 1 /* double indirect block ptr */ 1480 #define TRIPLE 2 /* triple indirect block ptr */ 1481 1482 /* 1483 * Acquire a write lock, and keep trying till we get it 1484 */ 1485 static int 1486 allocsp_wlockfs(struct vnode *vp, struct lockfs *lf) 1487 { 1488 int err = 0; 1489 1490 lockagain: 1491 do { 1492 err = ufs_fiolfss(vp, lf); 1493 if (err) 1494 return (err); 1495 } while (!LOCKFS_IS_ULOCK(lf)); 1496 1497 lf->lf_lock = LOCKFS_WLOCK; 1498 lf->lf_flags = 0; 1499 lf->lf_comment = NULL; 1500 err = ufs__fiolfs(vp, lf, 1, 0); 1501 1502 if (err == EBUSY || err == EINVAL) 1503 goto lockagain; 1504 1505 return (err); 1506 } 1507 1508 /* 1509 * Release the write lock 1510 */ 1511 static int 1512 allocsp_unlockfs(struct vnode *vp, struct lockfs *lf) 1513 { 1514 int err = 0; 1515 1516 lf->lf_lock = LOCKFS_ULOCK; 1517 lf->lf_flags = 0; 1518 err = ufs__fiolfs(vp, lf, 1, 0); 1519 return (err); 1520 } 1521 1522 struct allocsp_undo { 1523 daddr_t offset; 1524 daddr_t blk; 1525 struct allocsp_undo *next; 1526 }; 1527 1528 /* 1529 * ufs_allocsp() can be used to pre-allocate blocks for a file on a given 1530 * file system. For direct blocks, the blocks are allocated from the offset 1531 * requested to the block boundary, then any full blocks are allocated, 1532 * and finally any remainder. 1533 * For indirect blocks the blocks are not initialized and are 1534 * only marked as allocated. These addresses are then stored as negative 1535 * block numbers in the inode to imply special handling. UFS has been modified 1536 * where necessary to understand this new notion. 1537 * Successfully fallocated files will have IFALLOCATE cflag set in the inode. 1538 */ 1539 int 1540 ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr) 1541 { 1542 struct lockfs lf; 1543 int berr, err, resv, issync; 1544 off_t istart, len; /* istart, special for idb */ 1545 struct inode *ip; 1546 struct fs *fs; 1547 struct ufsvfs *ufsvfsp; 1548 u_offset_t resid, i, uoff; 1549 daddr32_t db_undo[NDADDR]; /* old direct blocks */ 1550 struct allocsp_undo *ib_undo = NULL; /* ib undo */ 1551 struct allocsp_undo *undo = NULL; 1552 u_offset_t osz; /* old file size */ 1553 int chunkblks = 0; /* # of blocks in 1 allocation */ 1554 int cnt = 0; 1555 daddr_t allocblk; 1556 daddr_t totblks = 0; 1557 struct ulockfs *ulp; 1558 size_t done_len; 1559 int nbytes, offsetn; 1560 1561 1562 ASSERT(vp->v_type == VREG); 1563 1564 ip = VTOI(vp); 1565 fs = ip->i_fs; 1566 if ((ufsvfsp = ip->i_ufsvfs) == NULL) { 1567 err = EIO; 1568 goto out_allocsp; 1569 } 1570 1571 istart = blkroundup(fs, (lp->l_start)); 1572 len = blkroundup(fs, (lp->l_len)); 1573 chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize; 1574 ulp = &ufsvfsp->vfs_ulockfs; 1575 1576 if (lp->l_start < 0 || lp->l_len <= 0) 1577 return (EINVAL); 1578 1579 /* Quickly check to make sure we have space before we proceed */ 1580 if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) { 1581 if (TRANS_ISTRANS(ufsvfsp)) { 1582 ufs_delete_drain_wait(ufsvfsp, 1); 1583 if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) 1584 return (ENOSPC); 1585 } else 1586 return (ENOSPC); 1587 } 1588 1589 /* 1590 * We will keep i_rwlock locked as WRITER through out the function 1591 * since we don't want anyone else reading or writing to the inode 1592 * while we are in the middle of fallocating the file. 1593 */ 1594 rw_enter(&ip->i_rwlock, RW_WRITER); 1595 1596 /* Back up the direct block list, used for undo later if necessary */ 1597 rw_enter(&ip->i_contents, RW_READER); 1598 for (i = 0; i < NDADDR; i++) 1599 db_undo[i] = ip->i_db[i]; 1600 osz = ip->i_size; 1601 rw_exit(&ip->i_contents); 1602 1603 /* Write lock the file system */ 1604 if (err = allocsp_wlockfs(vp, &lf)) 1605 goto exit; 1606 1607 /* 1608 * Allocate any direct blocks now. 1609 * Blocks are allocated from the offset requested to the block 1610 * boundary, then any full blocks are allocated, and finally any 1611 * remainder. 1612 */ 1613 if (lblkno(fs, lp->l_start) < NDADDR) { 1614 ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize), 1615 &resv, &resid); 1616 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1617 1618 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1619 rw_enter(&ip->i_contents, RW_WRITER); 1620 1621 done_len = 0; 1622 while ((done_len < lp->l_len) && 1623 (lblkno(fs, lp->l_start + done_len) < NDADDR)) { 1624 uoff = (offset_t)(lp->l_start + done_len); 1625 offsetn = (int)blkoff(fs, uoff); 1626 nbytes = (int)MIN(fs->fs_bsize - offsetn, 1627 lp->l_len - done_len); 1628 1629 berr = bmap_write(ip, uoff, offsetn + nbytes, 1630 BI_FALLOCATE, &allocblk, cr); 1631 /* Yikes error, quit */ 1632 if (berr) { 1633 TRANS_INODE(ufsvfsp, ip); 1634 rw_exit(&ip->i_contents); 1635 rw_exit(&ufsvfsp->vfs_dqrwlock); 1636 TRANS_END_CSYNC(ufsvfsp, err, issync, 1637 TOP_ALLOCSP, resv); 1638 err = allocsp_unlockfs(vp, &lf); 1639 goto exit; 1640 } 1641 1642 if (allocblk) { 1643 totblks++; 1644 if ((uoff + nbytes) > ip->i_size) 1645 ip->i_size = (uoff + nbytes); 1646 } 1647 done_len += nbytes; 1648 } 1649 1650 TRANS_INODE(ufsvfsp, ip); 1651 rw_exit(&ip->i_contents); 1652 rw_exit(&ufsvfsp->vfs_dqrwlock); 1653 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1654 1655 /* start offset for indirect allocation */ 1656 istart = (uoff + nbytes); 1657 } 1658 1659 /* Break the transactions into vfs_iotransz units */ 1660 ufs_trans_trunc_resv(ip, ip->i_size + 1661 blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid); 1662 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1663 1664 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1665 rw_enter(&ip->i_contents, RW_WRITER); 1666 1667 /* Now go about fallocating necessary indirect blocks */ 1668 for (i = istart; i < (lp->l_start + lp->l_len); i += fs->fs_bsize) { 1669 berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE, 1670 &allocblk, cr); 1671 if (berr) { 1672 TRANS_INODE(ufsvfsp, ip); 1673 rw_exit(&ip->i_contents); 1674 rw_exit(&ufsvfsp->vfs_dqrwlock); 1675 TRANS_END_CSYNC(ufsvfsp, err, issync, 1676 TOP_ALLOCSP, resv); 1677 err = allocsp_unlockfs(vp, &lf); 1678 goto exit; 1679 } 1680 1681 /* Update the blk counter only if new block was added */ 1682 if (allocblk) { 1683 /* Save undo information */ 1684 undo = kmem_alloc(sizeof (struct allocsp_undo), 1685 KM_SLEEP); 1686 undo->offset = i; 1687 undo->blk = allocblk; 1688 undo->next = ib_undo; 1689 ib_undo = undo; 1690 totblks++; 1691 1692 if (i >= ip->i_size) 1693 ip->i_size += fs->fs_bsize; 1694 } 1695 cnt++; 1696 1697 /* Being a good UFS citizen, let others get a share */ 1698 if (cnt == chunkblks) { 1699 /* 1700 * If there are waiters or the fs is hard locked, 1701 * error locked, or read-only error locked, 1702 * quit with EIO 1703 */ 1704 if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) || 1705 ULOCKFS_IS_ROELOCK(ulp)) { 1706 ip->i_cflags |= IFALLOCATE; 1707 TRANS_INODE(ufsvfsp, ip); 1708 rw_exit(&ip->i_contents); 1709 rw_exit(&ufsvfsp->vfs_dqrwlock); 1710 1711 TRANS_END_CSYNC(ufsvfsp, err, issync, 1712 TOP_ALLOCSP, resv); 1713 rw_exit(&ip->i_rwlock); 1714 (void) allocsp_unlockfs(vp, &lf); 1715 return (EIO); 1716 } 1717 1718 TRANS_INODE(ufsvfsp, ip); 1719 rw_exit(&ip->i_contents); 1720 rw_exit(&ufsvfsp->vfs_dqrwlock); 1721 1722 /* End the current transaction */ 1723 TRANS_END_CSYNC(ufsvfsp, err, issync, 1724 TOP_ALLOCSP, resv); 1725 1726 if (CV_HAS_WAITERS(&ulp->ul_cv)) { 1727 /* Release the write lock */ 1728 if (err = allocsp_unlockfs(vp, &lf)) 1729 goto exit; 1730 1731 /* Wake up others waiting to do operations */ 1732 mutex_enter(&ulp->ul_lock); 1733 cv_broadcast(&ulp->ul_cv); 1734 mutex_exit(&ulp->ul_lock); 1735 1736 /* Grab the write lock again */ 1737 if (err = allocsp_wlockfs(vp, &lf)) 1738 goto exit; 1739 } /* end of CV_HAS_WAITERS(&ulp->ul_cv) */ 1740 1741 /* Reserve more space in log for this file */ 1742 ufs_trans_trunc_resv(ip, 1743 ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz), 1744 &resv, &resid); 1745 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1746 1747 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1748 rw_enter(&ip->i_contents, RW_WRITER); 1749 1750 cnt = 0; /* reset cnt b/c of new transaction */ 1751 } 1752 } 1753 1754 if (!err && !berr) 1755 ip->i_cflags |= IFALLOCATE; 1756 1757 /* If the file has grown then correct the file size */ 1758 if (osz < (lp->l_start + lp->l_len)) 1759 ip->i_size = (lp->l_start + lp->l_len); 1760 1761 /* Release locks, end log transaction and unlock fs */ 1762 TRANS_INODE(ufsvfsp, ip); 1763 rw_exit(&ip->i_contents); 1764 rw_exit(&ufsvfsp->vfs_dqrwlock); 1765 1766 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1767 err = allocsp_unlockfs(vp, &lf); 1768 1769 /* 1770 * @ exit label, we should no longer be holding the fs write lock, and 1771 * all logging transactions should have been ended. We still hold 1772 * ip->i_rwlock. 1773 */ 1774 exit: 1775 /* 1776 * File has grown larger than 2GB. Set flag 1777 * in superblock to indicate this, if it 1778 * is not already set. 1779 */ 1780 if ((ip->i_size > MAXOFF32_T) && 1781 !(fs->fs_flags & FSLARGEFILES)) { 1782 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); 1783 mutex_enter(&ufsvfsp->vfs_lock); 1784 fs->fs_flags |= FSLARGEFILES; 1785 ufs_sbwrite(ufsvfsp); 1786 mutex_exit(&ufsvfsp->vfs_lock); 1787 } 1788 1789 /* 1790 * Since we couldn't allocate completely, we will undo the allocations. 1791 */ 1792 if (berr) { 1793 ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid); 1794 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); 1795 1796 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 1797 rw_enter(&ip->i_contents, RW_WRITER); 1798 1799 /* Direct blocks */ 1800 for (i = 0; i < NDADDR; i++) { 1801 /* 1802 * Only free the block if they are not same, and 1803 * the old one isn't zero (the fragment was 1804 * re-allocated). 1805 */ 1806 if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) { 1807 free(ip, ip->i_db[i], fs->fs_bsize, 0); 1808 ip->i_db[i] = 0; 1809 } 1810 } 1811 1812 /* Undo the indirect blocks */ 1813 while (ib_undo != NULL) { 1814 undo = ib_undo; 1815 err = bmap_set_bn(vp, undo->offset, 0); 1816 if (err) 1817 cmn_err(CE_PANIC, "ufs_allocsp(): failed to " 1818 "undo allocation of block %ld", 1819 undo->offset); 1820 free(ip, undo->blk, fs->fs_bsize, I_IBLK); 1821 ib_undo = undo->next; 1822 kmem_free(undo, sizeof (struct allocsp_undo)); 1823 } 1824 1825 ip->i_size = osz; 1826 TRANS_INODE(ufsvfsp, ip); 1827 1828 rw_exit(&ip->i_contents); 1829 rw_exit(&ufsvfsp->vfs_dqrwlock); 1830 1831 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); 1832 1833 rw_exit(&ip->i_rwlock); 1834 return (berr); 1835 } 1836 1837 /* 1838 * Don't forget to free the undo chain :) 1839 */ 1840 while (ib_undo != NULL) { 1841 undo = ib_undo; 1842 ib_undo = undo->next; 1843 kmem_free(undo, sizeof (struct allocsp_undo)); 1844 } 1845 1846 rw_exit(&ip->i_rwlock); 1847 1848 out_allocsp: 1849 return (err); 1850 } 1851 1852 /* 1853 * Free storage space associated with the specified inode. The portion 1854 * to be freed is specified by lp->l_start and lp->l_len (already 1855 * normalized to a "whence" of 0). 1856 * 1857 * This is an experimental facility whose continued existence is not 1858 * guaranteed. Currently, we only support the special case 1859 * of l_len == 0, meaning free to end of file. 1860 * 1861 * Blocks are freed in reverse order. This FILO algorithm will tend to 1862 * maintain a contiguous free list much longer than FIFO. 1863 * See also ufs_itrunc() in ufs_inode.c. 1864 * 1865 * Bug: unused bytes in the last retained block are not cleared. 1866 * This may result in a "hole" in the file that does not read as zeroes. 1867 */ 1868 /* ARGSUSED */ 1869 int 1870 ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr) 1871 { 1872 int i; 1873 struct inode *ip = VTOI(vp); 1874 int error; 1875 1876 ASSERT(vp->v_type == VREG); 1877 ASSERT(lp->l_start >= 0); /* checked by convoff */ 1878 1879 if (lp->l_len != 0) 1880 return (EINVAL); 1881 1882 rw_enter(&ip->i_contents, RW_READER); 1883 if (ip->i_size == (u_offset_t)lp->l_start) { 1884 rw_exit(&ip->i_contents); 1885 return (0); 1886 } 1887 1888 /* 1889 * Check if there is any active mandatory lock on the 1890 * range that will be truncated/expanded. 1891 */ 1892 if (MANDLOCK(vp, ip->i_mode)) { 1893 offset_t save_start; 1894 1895 save_start = lp->l_start; 1896 1897 if (ip->i_size < lp->l_start) { 1898 /* 1899 * "Truncate up" case: need to make sure there 1900 * is no lock beyond current end-of-file. To 1901 * do so, we need to set l_start to the size 1902 * of the file temporarily. 1903 */ 1904 lp->l_start = ip->i_size; 1905 } 1906 lp->l_type = F_WRLCK; 1907 lp->l_sysid = 0; 1908 lp->l_pid = ttoproc(curthread)->p_pid; 1909 i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK; 1910 rw_exit(&ip->i_contents); 1911 if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 || 1912 lp->l_type != F_UNLCK) { 1913 return (i ? i : EAGAIN); 1914 } 1915 rw_enter(&ip->i_contents, RW_READER); 1916 1917 lp->l_start = save_start; 1918 } 1919 1920 /* 1921 * Make sure a write isn't in progress (allocating blocks) 1922 * by acquiring i_rwlock (we promised ufs_bmap we wouldn't 1923 * truncate while it was allocating blocks). 1924 * Grab the locks in the right order. 1925 */ 1926 rw_exit(&ip->i_contents); 1927 rw_enter(&ip->i_rwlock, RW_WRITER); 1928 error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr); 1929 rw_exit(&ip->i_rwlock); 1930 return (error); 1931 } 1932 1933 /* 1934 * Find a cg with as close to nb contiguous bytes as possible 1935 * THIS MAY TAKE MANY DISK READS! 1936 * 1937 * Implemented in an attempt to allocate contiguous blocks for 1938 * writing the ufs log file to, minimizing future disk head seeking 1939 */ 1940 daddr_t 1941 contigpref(ufsvfs_t *ufsvfsp, size_t nb) 1942 { 1943 struct fs *fs = ufsvfsp->vfs_fs; 1944 daddr_t nblk = lblkno(fs, blkroundup(fs, nb)); 1945 daddr_t savebno, curbno, cgbno; 1946 int cg, cgblks, savecg, savenblk, curnblk; 1947 uchar_t *blksfree; 1948 buf_t *bp; 1949 struct cg *cgp; 1950 1951 savenblk = 0; 1952 savecg = 0; 1953 savebno = 0; 1954 for (cg = 0; cg < fs->fs_ncg; ++cg) { 1955 1956 /* not enough free blks for a contig check */ 1957 if (fs->fs_cs(fs, cg).cs_nbfree < nblk) 1958 continue; 1959 1960 /* 1961 * find the largest contiguous range in this cg 1962 */ 1963 bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, 1964 (daddr_t)fsbtodb(fs, cgtod(fs, cg)), 1965 (int)fs->fs_cgsize); 1966 cgp = bp->b_un.b_cg; 1967 if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { 1968 brelse(bp); 1969 continue; 1970 } 1971 blksfree = cg_blksfree(cgp); /* free array */ 1972 cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */ 1973 cgbno = 0; 1974 while (cgbno < cgblks && savenblk < nblk) { 1975 /* find a free block */ 1976 for (; cgbno < cgblks; ++cgbno) 1977 if (isblock(fs, blksfree, cgbno)) 1978 break; 1979 curbno = cgbno; 1980 /* count the number of free blocks */ 1981 for (curnblk = 0; cgbno < cgblks; ++cgbno) { 1982 if (!isblock(fs, blksfree, cgbno)) 1983 break; 1984 if (++curnblk >= nblk) 1985 break; 1986 } 1987 if (curnblk > savenblk) { 1988 savecg = cg; 1989 savenblk = curnblk; 1990 savebno = curbno; 1991 } 1992 } 1993 brelse(bp); 1994 if (savenblk >= nblk) 1995 break; 1996 } 1997 1998 /* convert block offset in cg to frag offset in cg */ 1999 savebno = blkstofrags(fs, savebno); 2000 2001 /* convert frag offset in cg to frag offset in fs */ 2002 savebno += (savecg * fs->fs_fpg); 2003 2004 return (savebno); 2005 } 2006