1 /* 2 * Copyright (C) International Business Machines Corp., 2000-2004 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 12 * the GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write to the Free Software 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 */ 18 19 #include <linux/fs.h> 20 #include "jfs_incore.h" 21 #include "jfs_superblock.h" 22 #include "jfs_dmap.h" 23 #include "jfs_imap.h" 24 #include "jfs_lock.h" 25 #include "jfs_metapage.h" 26 #include "jfs_debug.h" 27 28 /* 29 * SERIALIZATION of the Block Allocation Map. 30 * 31 * the working state of the block allocation map is accessed in 32 * two directions: 33 * 34 * 1) allocation and free requests that start at the dmap 35 * level and move up through the dmap control pages (i.e. 36 * the vast majority of requests). 37 * 38 * 2) allocation requests that start at dmap control page 39 * level and work down towards the dmaps. 40 * 41 * the serialization scheme used here is as follows. 42 * 43 * requests which start at the bottom are serialized against each 44 * other through buffers and each requests holds onto its buffers 45 * as it works it way up from a single dmap to the required level 46 * of dmap control page. 47 * requests that start at the top are serialized against each other 48 * and request that start from the bottom by the multiple read/single 49 * write inode lock of the bmap inode. requests starting at the top 50 * take this lock in write mode while request starting at the bottom 51 * take the lock in read mode. a single top-down request may proceed 52 * exclusively while multiple bottoms-up requests may proceed 53 * simultaneously (under the protection of busy buffers). 54 * 55 * in addition to information found in dmaps and dmap control pages, 56 * the working state of the block allocation map also includes read/ 57 * write information maintained in the bmap descriptor (i.e. total 58 * free block count, allocation group level free block counts). 59 * a single exclusive lock (BMAP_LOCK) is used to guard this information 60 * in the face of multiple-bottoms up requests. 61 * (lock ordering: IREAD_LOCK, BMAP_LOCK); 62 * 63 * accesses to the persistent state of the block allocation map (limited 64 * to the persistent bitmaps in dmaps) is guarded by (busy) buffers. 65 */ 66 67 #define BMAP_LOCK_INIT(bmp) init_MUTEX(&bmp->db_bmaplock) 68 #define BMAP_LOCK(bmp) down(&bmp->db_bmaplock) 69 #define BMAP_UNLOCK(bmp) up(&bmp->db_bmaplock) 70 71 /* 72 * forward references 73 */ 74 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, 75 int nblocks); 76 static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval); 77 static void dbBackSplit(dmtree_t * tp, int leafno); 78 static int dbJoin(dmtree_t * tp, int leafno, int newval); 79 static void dbAdjTree(dmtree_t * tp, int leafno, int newval); 80 static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, 81 int level); 82 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results); 83 static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, 84 int nblocks); 85 static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno, 86 int nblocks, 87 int l2nb, s64 * results); 88 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, 89 int nblocks); 90 static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks, 91 int l2nb, 92 s64 * results); 93 static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, 94 s64 * results); 95 static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, 96 s64 * results); 97 static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks); 98 static int dbFindBits(u32 word, int l2nb); 99 static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno); 100 static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx); 101 static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, 102 int nblocks); 103 static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, 104 int nblocks); 105 static int dbMaxBud(u8 * cp); 106 s64 dbMapFileSizeToMapSize(struct inode *ipbmap); 107 static int blkstol2(s64 nb); 108 109 static int cntlz(u32 value); 110 static int cnttz(u32 word); 111 112 static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, 113 int nblocks); 114 static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks); 115 static int dbInitDmapTree(struct dmap * dp); 116 static int dbInitTree(struct dmaptree * dtp); 117 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i); 118 static int dbGetL2AGSize(s64 nblocks); 119 120 /* 121 * buddy table 122 * 123 * table used for determining buddy sizes within characters of 124 * dmap bitmap words. the characters themselves serve as indexes 125 * into the table, with the table elements yielding the maximum 126 * binary buddy of free bits within the character. 127 */ 128 static s8 budtab[256] = { 129 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 130 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 131 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 132 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 133 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 134 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 135 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 136 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 137 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 138 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 139 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 140 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 141 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 142 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 143 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 144 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1 145 }; 146 147 148 /* 149 * NAME: dbMount() 150 * 151 * FUNCTION: initializate the block allocation map. 152 * 153 * memory is allocated for the in-core bmap descriptor and 154 * the in-core descriptor is initialized from disk. 155 * 156 * PARAMETERS: 157 * ipbmap - pointer to in-core inode for the block map. 158 * 159 * RETURN VALUES: 160 * 0 - success 161 * -ENOMEM - insufficient memory 162 * -EIO - i/o error 163 */ 164 int dbMount(struct inode *ipbmap) 165 { 166 struct bmap *bmp; 167 struct dbmap_disk *dbmp_le; 168 struct metapage *mp; 169 int i; 170 171 /* 172 * allocate/initialize the in-memory bmap descriptor 173 */ 174 /* allocate memory for the in-memory bmap descriptor */ 175 bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL); 176 if (bmp == NULL) 177 return -ENOMEM; 178 179 /* read the on-disk bmap descriptor. */ 180 mp = read_metapage(ipbmap, 181 BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, 182 PSIZE, 0); 183 if (mp == NULL) { 184 kfree(bmp); 185 return -EIO; 186 } 187 188 /* copy the on-disk bmap descriptor to its in-memory version. */ 189 dbmp_le = (struct dbmap_disk *) mp->data; 190 bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize); 191 bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); 192 bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); 193 bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); 194 bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); 195 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); 196 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); 197 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); 198 bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth); 199 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); 200 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); 201 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); 202 for (i = 0; i < MAXAG; i++) 203 bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); 204 bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); 205 bmp->db_maxfreebud = dbmp_le->dn_maxfreebud; 206 207 /* release the buffer. */ 208 release_metapage(mp); 209 210 /* bind the bmap inode and the bmap descriptor to each other. */ 211 bmp->db_ipbmap = ipbmap; 212 JFS_SBI(ipbmap->i_sb)->bmap = bmp; 213 214 memset(bmp->db_active, 0, sizeof(bmp->db_active)); 215 216 /* 217 * allocate/initialize the bmap lock 218 */ 219 BMAP_LOCK_INIT(bmp); 220 221 return (0); 222 } 223 224 225 /* 226 * NAME: dbUnmount() 227 * 228 * FUNCTION: terminate the block allocation map in preparation for 229 * file system unmount. 230 * 231 * the in-core bmap descriptor is written to disk and 232 * the memory for this descriptor is freed. 233 * 234 * PARAMETERS: 235 * ipbmap - pointer to in-core inode for the block map. 236 * 237 * RETURN VALUES: 238 * 0 - success 239 * -EIO - i/o error 240 */ 241 int dbUnmount(struct inode *ipbmap, int mounterror) 242 { 243 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; 244 245 if (!(mounterror || isReadOnly(ipbmap))) 246 dbSync(ipbmap); 247 248 /* 249 * Invalidate the page cache buffers 250 */ 251 truncate_inode_pages(ipbmap->i_mapping, 0); 252 253 /* free the memory for the in-memory bmap. */ 254 kfree(bmp); 255 256 return (0); 257 } 258 259 /* 260 * dbSync() 261 */ 262 int dbSync(struct inode *ipbmap) 263 { 264 struct dbmap_disk *dbmp_le; 265 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; 266 struct metapage *mp; 267 int i; 268 269 /* 270 * write bmap global control page 271 */ 272 /* get the buffer for the on-disk bmap descriptor. */ 273 mp = read_metapage(ipbmap, 274 BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, 275 PSIZE, 0); 276 if (mp == NULL) { 277 jfs_err("dbSync: read_metapage failed!"); 278 return -EIO; 279 } 280 /* copy the in-memory version of the bmap to the on-disk version */ 281 dbmp_le = (struct dbmap_disk *) mp->data; 282 dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize); 283 dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree); 284 dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage); 285 dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag); 286 dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel); 287 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); 288 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); 289 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); 290 dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth); 291 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); 292 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); 293 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); 294 for (i = 0; i < MAXAG; i++) 295 dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]); 296 dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize); 297 dbmp_le->dn_maxfreebud = bmp->db_maxfreebud; 298 299 /* write the buffer */ 300 write_metapage(mp); 301 302 /* 303 * write out dirty pages of bmap 304 */ 305 filemap_fdatawrite(ipbmap->i_mapping); 306 filemap_fdatawait(ipbmap->i_mapping); 307 308 ipbmap->i_state |= I_DIRTY; 309 diWriteSpecial(ipbmap, 0); 310 311 return (0); 312 } 313 314 315 /* 316 * NAME: dbFree() 317 * 318 * FUNCTION: free the specified block range from the working block 319 * allocation map. 320 * 321 * the blocks will be free from the working map one dmap 322 * at a time. 323 * 324 * PARAMETERS: 325 * ip - pointer to in-core inode; 326 * blkno - starting block number to be freed. 327 * nblocks - number of blocks to be freed. 328 * 329 * RETURN VALUES: 330 * 0 - success 331 * -EIO - i/o error 332 */ 333 int dbFree(struct inode *ip, s64 blkno, s64 nblocks) 334 { 335 struct metapage *mp; 336 struct dmap *dp; 337 int nb, rc; 338 s64 lblkno, rem; 339 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; 340 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; 341 342 IREAD_LOCK(ipbmap); 343 344 /* block to be freed better be within the mapsize. */ 345 if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) { 346 IREAD_UNLOCK(ipbmap); 347 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", 348 (unsigned long long) blkno, 349 (unsigned long long) nblocks); 350 jfs_error(ip->i_sb, 351 "dbFree: block to be freed is outside the map"); 352 return -EIO; 353 } 354 355 /* 356 * free the blocks a dmap at a time. 357 */ 358 mp = NULL; 359 for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { 360 /* release previous dmap if any */ 361 if (mp) { 362 write_metapage(mp); 363 } 364 365 /* get the buffer for the current dmap. */ 366 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); 367 mp = read_metapage(ipbmap, lblkno, PSIZE, 0); 368 if (mp == NULL) { 369 IREAD_UNLOCK(ipbmap); 370 return -EIO; 371 } 372 dp = (struct dmap *) mp->data; 373 374 /* determine the number of blocks to be freed from 375 * this dmap. 376 */ 377 nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); 378 379 /* free the blocks. */ 380 if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { 381 jfs_error(ip->i_sb, "dbFree: error in block map\n"); 382 release_metapage(mp); 383 IREAD_UNLOCK(ipbmap); 384 return (rc); 385 } 386 } 387 388 /* write the last buffer. */ 389 write_metapage(mp); 390 391 IREAD_UNLOCK(ipbmap); 392 393 return (0); 394 } 395 396 397 /* 398 * NAME: dbUpdatePMap() 399 * 400 * FUNCTION: update the allocation state (free or allocate) of the 401 * specified block range in the persistent block allocation map. 402 * 403 * the blocks will be updated in the persistent map one 404 * dmap at a time. 405 * 406 * PARAMETERS: 407 * ipbmap - pointer to in-core inode for the block map. 408 * free - TRUE if block range is to be freed from the persistent 409 * map; FALSE if it is to be allocated. 410 * blkno - starting block number of the range. 411 * nblocks - number of contiguous blocks in the range. 412 * tblk - transaction block; 413 * 414 * RETURN VALUES: 415 * 0 - success 416 * -EIO - i/o error 417 */ 418 int 419 dbUpdatePMap(struct inode *ipbmap, 420 int free, s64 blkno, s64 nblocks, struct tblock * tblk) 421 { 422 int nblks, dbitno, wbitno, rbits; 423 int word, nbits, nwords; 424 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; 425 s64 lblkno, rem, lastlblkno; 426 u32 mask; 427 struct dmap *dp; 428 struct metapage *mp; 429 struct jfs_log *log; 430 int lsn, difft, diffp; 431 unsigned long flags; 432 433 /* the blocks better be within the mapsize. */ 434 if (blkno + nblocks > bmp->db_mapsize) { 435 printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", 436 (unsigned long long) blkno, 437 (unsigned long long) nblocks); 438 jfs_error(ipbmap->i_sb, 439 "dbUpdatePMap: blocks are outside the map"); 440 return -EIO; 441 } 442 443 /* compute delta of transaction lsn from log syncpt */ 444 lsn = tblk->lsn; 445 log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; 446 logdiff(difft, lsn, log); 447 448 /* 449 * update the block state a dmap at a time. 450 */ 451 mp = NULL; 452 lastlblkno = 0; 453 for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) { 454 /* get the buffer for the current dmap. */ 455 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); 456 if (lblkno != lastlblkno) { 457 if (mp) { 458 write_metapage(mp); 459 } 460 461 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 462 0); 463 if (mp == NULL) 464 return -EIO; 465 metapage_wait_for_io(mp); 466 } 467 dp = (struct dmap *) mp->data; 468 469 /* determine the bit number and word within the dmap of 470 * the starting block. also determine how many blocks 471 * are to be updated within this dmap. 472 */ 473 dbitno = blkno & (BPERDMAP - 1); 474 word = dbitno >> L2DBWORD; 475 nblks = min(rem, (s64)BPERDMAP - dbitno); 476 477 /* update the bits of the dmap words. the first and last 478 * words may only have a subset of their bits updated. if 479 * this is the case, we'll work against that word (i.e. 480 * partial first and/or last) only in a single pass. a 481 * single pass will also be used to update all words that 482 * are to have all their bits updated. 483 */ 484 for (rbits = nblks; rbits > 0; 485 rbits -= nbits, dbitno += nbits) { 486 /* determine the bit number within the word and 487 * the number of bits within the word. 488 */ 489 wbitno = dbitno & (DBWORD - 1); 490 nbits = min(rbits, DBWORD - wbitno); 491 492 /* check if only part of the word is to be updated. */ 493 if (nbits < DBWORD) { 494 /* update (free or allocate) the bits 495 * in this word. 496 */ 497 mask = 498 (ONES << (DBWORD - nbits) >> wbitno); 499 if (free) 500 dp->pmap[word] &= 501 cpu_to_le32(~mask); 502 else 503 dp->pmap[word] |= 504 cpu_to_le32(mask); 505 506 word += 1; 507 } else { 508 /* one or more words are to have all 509 * their bits updated. determine how 510 * many words and how many bits. 511 */ 512 nwords = rbits >> L2DBWORD; 513 nbits = nwords << L2DBWORD; 514 515 /* update (free or allocate) the bits 516 * in these words. 517 */ 518 if (free) 519 memset(&dp->pmap[word], 0, 520 nwords * 4); 521 else 522 memset(&dp->pmap[word], (int) ONES, 523 nwords * 4); 524 525 word += nwords; 526 } 527 } 528 529 /* 530 * update dmap lsn 531 */ 532 if (lblkno == lastlblkno) 533 continue; 534 535 lastlblkno = lblkno; 536 537 if (mp->lsn != 0) { 538 /* inherit older/smaller lsn */ 539 logdiff(diffp, mp->lsn, log); 540 LOGSYNC_LOCK(log, flags); 541 if (difft < diffp) { 542 mp->lsn = lsn; 543 544 /* move bp after tblock in logsync list */ 545 list_move(&mp->synclist, &tblk->synclist); 546 } 547 548 /* inherit younger/larger clsn */ 549 logdiff(difft, tblk->clsn, log); 550 logdiff(diffp, mp->clsn, log); 551 if (difft > diffp) 552 mp->clsn = tblk->clsn; 553 LOGSYNC_UNLOCK(log, flags); 554 } else { 555 mp->log = log; 556 mp->lsn = lsn; 557 558 /* insert bp after tblock in logsync list */ 559 LOGSYNC_LOCK(log, flags); 560 561 log->count++; 562 list_add(&mp->synclist, &tblk->synclist); 563 564 mp->clsn = tblk->clsn; 565 LOGSYNC_UNLOCK(log, flags); 566 } 567 } 568 569 /* write the last buffer. */ 570 if (mp) { 571 write_metapage(mp); 572 } 573 574 return (0); 575 } 576 577 578 /* 579 * NAME: dbNextAG() 580 * 581 * FUNCTION: find the preferred allocation group for new allocations. 582 * 583 * Within the allocation groups, we maintain a preferred 584 * allocation group which consists of a group with at least 585 * average free space. It is the preferred group that we target 586 * new inode allocation towards. The tie-in between inode 587 * allocation and block allocation occurs as we allocate the 588 * first (data) block of an inode and specify the inode (block) 589 * as the allocation hint for this block. 590 * 591 * We try to avoid having more than one open file growing in 592 * an allocation group, as this will lead to fragmentation. 593 * This differs from the old OS/2 method of trying to keep 594 * empty ags around for large allocations. 595 * 596 * PARAMETERS: 597 * ipbmap - pointer to in-core inode for the block map. 598 * 599 * RETURN VALUES: 600 * the preferred allocation group number. 601 */ 602 int dbNextAG(struct inode *ipbmap) 603 { 604 s64 avgfree; 605 int agpref; 606 s64 hwm = 0; 607 int i; 608 int next_best = -1; 609 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; 610 611 BMAP_LOCK(bmp); 612 613 /* determine the average number of free blocks within the ags. */ 614 avgfree = (u32)bmp->db_nfree / bmp->db_numag; 615 616 /* 617 * if the current preferred ag does not have an active allocator 618 * and has at least average freespace, return it 619 */ 620 agpref = bmp->db_agpref; 621 if ((atomic_read(&bmp->db_active[agpref]) == 0) && 622 (bmp->db_agfree[agpref] >= avgfree)) 623 goto unlock; 624 625 /* From the last preferred ag, find the next one with at least 626 * average free space. 627 */ 628 for (i = 0 ; i < bmp->db_numag; i++, agpref++) { 629 if (agpref == bmp->db_numag) 630 agpref = 0; 631 632 if (atomic_read(&bmp->db_active[agpref])) 633 /* open file is currently growing in this ag */ 634 continue; 635 if (bmp->db_agfree[agpref] >= avgfree) { 636 /* Return this one */ 637 bmp->db_agpref = agpref; 638 goto unlock; 639 } else if (bmp->db_agfree[agpref] > hwm) { 640 /* Less than avg. freespace, but best so far */ 641 hwm = bmp->db_agfree[agpref]; 642 next_best = agpref; 643 } 644 } 645 646 /* 647 * If no inactive ag was found with average freespace, use the 648 * next best 649 */ 650 if (next_best != -1) 651 bmp->db_agpref = next_best; 652 /* else leave db_agpref unchanged */ 653 unlock: 654 BMAP_UNLOCK(bmp); 655 656 /* return the preferred group. 657 */ 658 return (bmp->db_agpref); 659 } 660 661 /* 662 * NAME: dbAlloc() 663 * 664 * FUNCTION: attempt to allocate a specified number of contiguous free 665 * blocks from the working allocation block map. 666 * 667 * the block allocation policy uses hints and a multi-step 668 * approach. 669 * 670 * for allocation requests smaller than the number of blocks 671 * per dmap, we first try to allocate the new blocks 672 * immediately following the hint. if these blocks are not 673 * available, we try to allocate blocks near the hint. if 674 * no blocks near the hint are available, we next try to 675 * allocate within the same dmap as contains the hint. 676 * 677 * if no blocks are available in the dmap or the allocation 678 * request is larger than the dmap size, we try to allocate 679 * within the same allocation group as contains the hint. if 680 * this does not succeed, we finally try to allocate anywhere 681 * within the aggregate. 682 * 683 * we also try to allocate anywhere within the aggregate for 684 * for allocation requests larger than the allocation group 685 * size or requests that specify no hint value. 686 * 687 * PARAMETERS: 688 * ip - pointer to in-core inode; 689 * hint - allocation hint. 690 * nblocks - number of contiguous blocks in the range. 691 * results - on successful return, set to the starting block number 692 * of the newly allocated contiguous range. 693 * 694 * RETURN VALUES: 695 * 0 - success 696 * -ENOSPC - insufficient disk resources 697 * -EIO - i/o error 698 */ 699 int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) 700 { 701 int rc, agno; 702 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; 703 struct bmap *bmp; 704 struct metapage *mp; 705 s64 lblkno, blkno; 706 struct dmap *dp; 707 int l2nb; 708 s64 mapSize; 709 int writers; 710 711 /* assert that nblocks is valid */ 712 assert(nblocks > 0); 713 714 #ifdef _STILL_TO_PORT 715 /* DASD limit check F226941 */ 716 if (OVER_LIMIT(ip, nblocks)) 717 return -ENOSPC; 718 #endif /* _STILL_TO_PORT */ 719 720 /* get the log2 number of blocks to be allocated. 721 * if the number of blocks is not a log2 multiple, 722 * it will be rounded up to the next log2 multiple. 723 */ 724 l2nb = BLKSTOL2(nblocks); 725 726 bmp = JFS_SBI(ip->i_sb)->bmap; 727 728 //retry: /* serialize w.r.t.extendfs() */ 729 mapSize = bmp->db_mapsize; 730 731 /* the hint should be within the map */ 732 if (hint >= mapSize) { 733 jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map"); 734 return -EIO; 735 } 736 737 /* if the number of blocks to be allocated is greater than the 738 * allocation group size, try to allocate anywhere. 739 */ 740 if (l2nb > bmp->db_agl2size) { 741 IWRITE_LOCK(ipbmap); 742 743 rc = dbAllocAny(bmp, nblocks, l2nb, results); 744 745 goto write_unlock; 746 } 747 748 /* 749 * If no hint, let dbNextAG recommend an allocation group 750 */ 751 if (hint == 0) 752 goto pref_ag; 753 754 /* we would like to allocate close to the hint. adjust the 755 * hint to the block following the hint since the allocators 756 * will start looking for free space starting at this point. 757 */ 758 blkno = hint + 1; 759 760 if (blkno >= bmp->db_mapsize) 761 goto pref_ag; 762 763 agno = blkno >> bmp->db_agl2size; 764 765 /* check if blkno crosses over into a new allocation group. 766 * if so, check if we should allow allocations within this 767 * allocation group. 768 */ 769 if ((blkno & (bmp->db_agsize - 1)) == 0) 770 /* check if the AG is currenly being written to. 771 * if so, call dbNextAG() to find a non-busy 772 * AG with sufficient free space. 773 */ 774 if (atomic_read(&bmp->db_active[agno])) 775 goto pref_ag; 776 777 /* check if the allocation request size can be satisfied from a 778 * single dmap. if so, try to allocate from the dmap containing 779 * the hint using a tiered strategy. 780 */ 781 if (nblocks <= BPERDMAP) { 782 IREAD_LOCK(ipbmap); 783 784 /* get the buffer for the dmap containing the hint. 785 */ 786 rc = -EIO; 787 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); 788 mp = read_metapage(ipbmap, lblkno, PSIZE, 0); 789 if (mp == NULL) 790 goto read_unlock; 791 792 dp = (struct dmap *) mp->data; 793 794 /* first, try to satisfy the allocation request with the 795 * blocks beginning at the hint. 796 */ 797 if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks)) 798 != -ENOSPC) { 799 if (rc == 0) { 800 *results = blkno; 801 mark_metapage_dirty(mp); 802 } 803 804 release_metapage(mp); 805 goto read_unlock; 806 } 807 808 writers = atomic_read(&bmp->db_active[agno]); 809 if ((writers > 1) || 810 ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) { 811 /* 812 * Someone else is writing in this allocation 813 * group. To avoid fragmenting, try another ag 814 */ 815 release_metapage(mp); 816 IREAD_UNLOCK(ipbmap); 817 goto pref_ag; 818 } 819 820 /* next, try to satisfy the allocation request with blocks 821 * near the hint. 822 */ 823 if ((rc = 824 dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results)) 825 != -ENOSPC) { 826 if (rc == 0) 827 mark_metapage_dirty(mp); 828 829 release_metapage(mp); 830 goto read_unlock; 831 } 832 833 /* try to satisfy the allocation request with blocks within 834 * the same dmap as the hint. 835 */ 836 if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results)) 837 != -ENOSPC) { 838 if (rc == 0) 839 mark_metapage_dirty(mp); 840 841 release_metapage(mp); 842 goto read_unlock; 843 } 844 845 release_metapage(mp); 846 IREAD_UNLOCK(ipbmap); 847 } 848 849 /* try to satisfy the allocation request with blocks within 850 * the same allocation group as the hint. 851 */ 852 IWRITE_LOCK(ipbmap); 853 if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC) 854 goto write_unlock; 855 856 IWRITE_UNLOCK(ipbmap); 857 858 859 pref_ag: 860 /* 861 * Let dbNextAG recommend a preferred allocation group 862 */ 863 agno = dbNextAG(ipbmap); 864 IWRITE_LOCK(ipbmap); 865 866 /* Try to allocate within this allocation group. if that fails, try to 867 * allocate anywhere in the map. 868 */ 869 if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC) 870 rc = dbAllocAny(bmp, nblocks, l2nb, results); 871 872 write_unlock: 873 IWRITE_UNLOCK(ipbmap); 874 875 return (rc); 876 877 read_unlock: 878 IREAD_UNLOCK(ipbmap); 879 880 return (rc); 881 } 882 883 #ifdef _NOTYET 884 /* 885 * NAME: dbAllocExact() 886 * 887 * FUNCTION: try to allocate the requested extent; 888 * 889 * PARAMETERS: 890 * ip - pointer to in-core inode; 891 * blkno - extent address; 892 * nblocks - extent length; 893 * 894 * RETURN VALUES: 895 * 0 - success 896 * -ENOSPC - insufficient disk resources 897 * -EIO - i/o error 898 */ 899 int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) 900 { 901 int rc; 902 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; 903 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; 904 struct dmap *dp; 905 s64 lblkno; 906 struct metapage *mp; 907 908 IREAD_LOCK(ipbmap); 909 910 /* 911 * validate extent request: 912 * 913 * note: defragfs policy: 914 * max 64 blocks will be moved. 915 * allocation request size must be satisfied from a single dmap. 916 */ 917 if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) { 918 IREAD_UNLOCK(ipbmap); 919 return -EINVAL; 920 } 921 922 if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) { 923 /* the free space is no longer available */ 924 IREAD_UNLOCK(ipbmap); 925 return -ENOSPC; 926 } 927 928 /* read in the dmap covering the extent */ 929 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); 930 mp = read_metapage(ipbmap, lblkno, PSIZE, 0); 931 if (mp == NULL) { 932 IREAD_UNLOCK(ipbmap); 933 return -EIO; 934 } 935 dp = (struct dmap *) mp->data; 936 937 /* try to allocate the requested extent */ 938 rc = dbAllocNext(bmp, dp, blkno, nblocks); 939 940 IREAD_UNLOCK(ipbmap); 941 942 if (rc == 0) 943 mark_metapage_dirty(mp); 944 945 release_metapage(mp); 946 947 return (rc); 948 } 949 #endif /* _NOTYET */ 950 951 /* 952 * NAME: dbReAlloc() 953 * 954 * FUNCTION: attempt to extend a current allocation by a specified 955 * number of blocks. 956 * 957 * this routine attempts to satisfy the allocation request 958 * by first trying to extend the existing allocation in 959 * place by allocating the additional blocks as the blocks 960 * immediately following the current allocation. if these 961 * blocks are not available, this routine will attempt to 962 * allocate a new set of contiguous blocks large enough 963 * to cover the existing allocation plus the additional 964 * number of blocks required. 965 * 966 * PARAMETERS: 967 * ip - pointer to in-core inode requiring allocation. 968 * blkno - starting block of the current allocation. 969 * nblocks - number of contiguous blocks within the current 970 * allocation. 971 * addnblocks - number of blocks to add to the allocation. 972 * results - on successful return, set to the starting block number 973 * of the existing allocation if the existing allocation 974 * was extended in place or to a newly allocated contiguous 975 * range if the existing allocation could not be extended 976 * in place. 977 * 978 * RETURN VALUES: 979 * 0 - success 980 * -ENOSPC - insufficient disk resources 981 * -EIO - i/o error 982 */ 983 int 984 dbReAlloc(struct inode *ip, 985 s64 blkno, s64 nblocks, s64 addnblocks, s64 * results) 986 { 987 int rc; 988 989 /* try to extend the allocation in place. 990 */ 991 if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) { 992 *results = blkno; 993 return (0); 994 } else { 995 if (rc != -ENOSPC) 996 return (rc); 997 } 998 999 /* could not extend the allocation in place, so allocate a 1000 * new set of blocks for the entire request (i.e. try to get 1001 * a range of contiguous blocks large enough to cover the 1002 * existing allocation plus the additional blocks.) 1003 */ 1004 return (dbAlloc 1005 (ip, blkno + nblocks - 1, addnblocks + nblocks, results)); 1006 } 1007 1008 1009 /* 1010 * NAME: dbExtend() 1011 * 1012 * FUNCTION: attempt to extend a current allocation by a specified 1013 * number of blocks. 1014 * 1015 * this routine attempts to satisfy the allocation request 1016 * by first trying to extend the existing allocation in 1017 * place by allocating the additional blocks as the blocks 1018 * immediately following the current allocation. 1019 * 1020 * PARAMETERS: 1021 * ip - pointer to in-core inode requiring allocation. 1022 * blkno - starting block of the current allocation. 1023 * nblocks - number of contiguous blocks within the current 1024 * allocation. 1025 * addnblocks - number of blocks to add to the allocation. 1026 * 1027 * RETURN VALUES: 1028 * 0 - success 1029 * -ENOSPC - insufficient disk resources 1030 * -EIO - i/o error 1031 */ 1032 static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) 1033 { 1034 struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); 1035 s64 lblkno, lastblkno, extblkno; 1036 uint rel_block; 1037 struct metapage *mp; 1038 struct dmap *dp; 1039 int rc; 1040 struct inode *ipbmap = sbi->ipbmap; 1041 struct bmap *bmp; 1042 1043 /* 1044 * We don't want a non-aligned extent to cross a page boundary 1045 */ 1046 if (((rel_block = blkno & (sbi->nbperpage - 1))) && 1047 (rel_block + nblocks + addnblocks > sbi->nbperpage)) 1048 return -ENOSPC; 1049 1050 /* get the last block of the current allocation */ 1051 lastblkno = blkno + nblocks - 1; 1052 1053 /* determine the block number of the block following 1054 * the existing allocation. 1055 */ 1056 extblkno = lastblkno + 1; 1057 1058 IREAD_LOCK(ipbmap); 1059 1060 /* better be within the file system */ 1061 bmp = sbi->bmap; 1062 if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { 1063 IREAD_UNLOCK(ipbmap); 1064 jfs_error(ip->i_sb, 1065 "dbExtend: the block is outside the filesystem"); 1066 return -EIO; 1067 } 1068 1069 /* we'll attempt to extend the current allocation in place by 1070 * allocating the additional blocks as the blocks immediately 1071 * following the current allocation. we only try to extend the 1072 * current allocation in place if the number of additional blocks 1073 * can fit into a dmap, the last block of the current allocation 1074 * is not the last block of the file system, and the start of the 1075 * inplace extension is not on an allocation group boundary. 1076 */ 1077 if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize || 1078 (extblkno & (bmp->db_agsize - 1)) == 0) { 1079 IREAD_UNLOCK(ipbmap); 1080 return -ENOSPC; 1081 } 1082 1083 /* get the buffer for the dmap containing the first block 1084 * of the extension. 1085 */ 1086 lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage); 1087 mp = read_metapage(ipbmap, lblkno, PSIZE, 0); 1088 if (mp == NULL) { 1089 IREAD_UNLOCK(ipbmap); 1090 return -EIO; 1091 } 1092 1093 dp = (struct dmap *) mp->data; 1094 1095 /* try to allocate the blocks immediately following the 1096 * current allocation. 1097 */ 1098 rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks); 1099 1100 IREAD_UNLOCK(ipbmap); 1101 1102 /* were we successful ? */ 1103 if (rc == 0) 1104 write_metapage(mp); 1105 else 1106 /* we were not successful */ 1107 release_metapage(mp); 1108 1109 1110 return (rc); 1111 } 1112 1113 1114 /* 1115 * NAME: dbAllocNext() 1116 * 1117 * FUNCTION: attempt to allocate the blocks of the specified block 1118 * range within a dmap. 1119 * 1120 * PARAMETERS: 1121 * bmp - pointer to bmap descriptor 1122 * dp - pointer to dmap. 1123 * blkno - starting block number of the range. 1124 * nblocks - number of contiguous free blocks of the range. 1125 * 1126 * RETURN VALUES: 1127 * 0 - success 1128 * -ENOSPC - insufficient disk resources 1129 * -EIO - i/o error 1130 * 1131 * serialization: IREAD_LOCK(ipbmap) held on entry/exit; 1132 */ 1133 static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, 1134 int nblocks) 1135 { 1136 int dbitno, word, rembits, nb, nwords, wbitno, nw; 1137 int l2size; 1138 s8 *leaf; 1139 u32 mask; 1140 1141 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { 1142 jfs_error(bmp->db_ipbmap->i_sb, 1143 "dbAllocNext: Corrupt dmap page"); 1144 return -EIO; 1145 } 1146 1147 /* pick up a pointer to the leaves of the dmap tree. 1148 */ 1149 leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); 1150 1151 /* determine the bit number and word within the dmap of the 1152 * starting block. 1153 */ 1154 dbitno = blkno & (BPERDMAP - 1); 1155 word = dbitno >> L2DBWORD; 1156 1157 /* check if the specified block range is contained within 1158 * this dmap. 1159 */ 1160 if (dbitno + nblocks > BPERDMAP) 1161 return -ENOSPC; 1162 1163 /* check if the starting leaf indicates that anything 1164 * is free. 1165 */ 1166 if (leaf[word] == NOFREE) 1167 return -ENOSPC; 1168 1169 /* check the dmaps words corresponding to block range to see 1170 * if the block range is free. not all bits of the first and 1171 * last words may be contained within the block range. if this 1172 * is the case, we'll work against those words (i.e. partial first 1173 * and/or last) on an individual basis (a single pass) and examine 1174 * the actual bits to determine if they are free. a single pass 1175 * will be used for all dmap words fully contained within the 1176 * specified range. within this pass, the leaves of the dmap 1177 * tree will be examined to determine if the blocks are free. a 1178 * single leaf may describe the free space of multiple dmap 1179 * words, so we may visit only a subset of the actual leaves 1180 * corresponding to the dmap words of the block range. 1181 */ 1182 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { 1183 /* determine the bit number within the word and 1184 * the number of bits within the word. 1185 */ 1186 wbitno = dbitno & (DBWORD - 1); 1187 nb = min(rembits, DBWORD - wbitno); 1188 1189 /* check if only part of the word is to be examined. 1190 */ 1191 if (nb < DBWORD) { 1192 /* check if the bits are free. 1193 */ 1194 mask = (ONES << (DBWORD - nb) >> wbitno); 1195 if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask) 1196 return -ENOSPC; 1197 1198 word += 1; 1199 } else { 1200 /* one or more dmap words are fully contained 1201 * within the block range. determine how many 1202 * words and how many bits. 1203 */ 1204 nwords = rembits >> L2DBWORD; 1205 nb = nwords << L2DBWORD; 1206 1207 /* now examine the appropriate leaves to determine 1208 * if the blocks are free. 1209 */ 1210 while (nwords > 0) { 1211 /* does the leaf describe any free space ? 1212 */ 1213 if (leaf[word] < BUDMIN) 1214 return -ENOSPC; 1215 1216 /* determine the l2 number of bits provided 1217 * by this leaf. 1218 */ 1219 l2size = 1220 min((int)leaf[word], NLSTOL2BSZ(nwords)); 1221 1222 /* determine how many words were handled. 1223 */ 1224 nw = BUDSIZE(l2size, BUDMIN); 1225 1226 nwords -= nw; 1227 word += nw; 1228 } 1229 } 1230 } 1231 1232 /* allocate the blocks. 1233 */ 1234 return (dbAllocDmap(bmp, dp, blkno, nblocks)); 1235 } 1236 1237 1238 /* 1239 * NAME: dbAllocNear() 1240 * 1241 * FUNCTION: attempt to allocate a number of contiguous free blocks near 1242 * a specified block (hint) within a dmap. 1243 * 1244 * starting with the dmap leaf that covers the hint, we'll 1245 * check the next four contiguous leaves for sufficient free 1246 * space. if sufficient free space is found, we'll allocate 1247 * the desired free space. 1248 * 1249 * PARAMETERS: 1250 * bmp - pointer to bmap descriptor 1251 * dp - pointer to dmap. 1252 * blkno - block number to allocate near. 1253 * nblocks - actual number of contiguous free blocks desired. 1254 * l2nb - log2 number of contiguous free blocks desired. 1255 * results - on successful return, set to the starting block number 1256 * of the newly allocated range. 1257 * 1258 * RETURN VALUES: 1259 * 0 - success 1260 * -ENOSPC - insufficient disk resources 1261 * -EIO - i/o error 1262 * 1263 * serialization: IREAD_LOCK(ipbmap) held on entry/exit; 1264 */ 1265 static int 1266 dbAllocNear(struct bmap * bmp, 1267 struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results) 1268 { 1269 int word, lword, rc; 1270 s8 *leaf; 1271 1272 if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { 1273 jfs_error(bmp->db_ipbmap->i_sb, 1274 "dbAllocNear: Corrupt dmap page"); 1275 return -EIO; 1276 } 1277 1278 leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); 1279 1280 /* determine the word within the dmap that holds the hint 1281 * (i.e. blkno). also, determine the last word in the dmap 1282 * that we'll include in our examination. 1283 */ 1284 word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; 1285 lword = min(word + 4, LPERDMAP); 1286 1287 /* examine the leaves for sufficient free space. 1288 */ 1289 for (; word < lword; word++) { 1290 /* does the leaf describe sufficient free space ? 1291 */ 1292 if (leaf[word] < l2nb) 1293 continue; 1294 1295 /* determine the block number within the file system 1296 * of the first block described by this dmap word. 1297 */ 1298 blkno = le64_to_cpu(dp->start) + (word << L2DBWORD); 1299 1300 /* if not all bits of the dmap word are free, get the 1301 * starting bit number within the dmap word of the required 1302 * string of free bits and adjust the block number with the 1303 * value. 1304 */ 1305 if (leaf[word] < BUDMIN) 1306 blkno += 1307 dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb); 1308 1309 /* allocate the blocks. 1310 */ 1311 if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) 1312 *results = blkno; 1313 1314 return (rc); 1315 } 1316 1317 return -ENOSPC; 1318 } 1319 1320 1321 /* 1322 * NAME: dbAllocAG() 1323 * 1324 * FUNCTION: attempt to allocate the specified number of contiguous 1325 * free blocks within the specified allocation group. 1326 * 1327 * unless the allocation group size is equal to the number 1328 * of blocks per dmap, the dmap control pages will be used to 1329 * find the required free space, if available. we start the 1330 * search at the highest dmap control page level which 1331 * distinctly describes the allocation group's free space 1332 * (i.e. the highest level at which the allocation group's 1333 * free space is not mixed in with that of any other group). 1334 * in addition, we start the search within this level at a 1335 * height of the dmapctl dmtree at which the nodes distinctly 1336 * describe the allocation group's free space. at this height, 1337 * the allocation group's free space may be represented by 1 1338 * or two sub-trees, depending on the allocation group size. 1339 * we search the top nodes of these subtrees left to right for 1340 * sufficient free space. if sufficient free space is found, 1341 * the subtree is searched to find the leftmost leaf that 1342 * has free space. once we have made it to the leaf, we 1343 * move the search to the next lower level dmap control page 1344 * corresponding to this leaf. we continue down the dmap control 1345 * pages until we find the dmap that contains or starts the 1346 * sufficient free space and we allocate at this dmap. 1347 * 1348 * if the allocation group size is equal to the dmap size, 1349 * we'll start at the dmap corresponding to the allocation 1350 * group and attempt the allocation at this level. 1351 * 1352 * the dmap control page search is also not performed if the 1353 * allocation group is completely free and we go to the first 1354 * dmap of the allocation group to do the allocation. this is 1355 * done because the allocation group may be part (not the first 1356 * part) of a larger binary buddy system, causing the dmap 1357 * control pages to indicate no free space (NOFREE) within 1358 * the allocation group. 1359 * 1360 * PARAMETERS: 1361 * bmp - pointer to bmap descriptor 1362 * agno - allocation group number. 1363 * nblocks - actual number of contiguous free blocks desired. 1364 * l2nb - log2 number of contiguous free blocks desired. 1365 * results - on successful return, set to the starting block number 1366 * of the newly allocated range. 1367 * 1368 * RETURN VALUES: 1369 * 0 - success 1370 * -ENOSPC - insufficient disk resources 1371 * -EIO - i/o error 1372 * 1373 * note: IWRITE_LOCK(ipmap) held on entry/exit; 1374 */ 1375 static int 1376 dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) 1377 { 1378 struct metapage *mp; 1379 struct dmapctl *dcp; 1380 int rc, ti, i, k, m, n, agperlev; 1381 s64 blkno, lblkno; 1382 int budmin; 1383 1384 /* allocation request should not be for more than the 1385 * allocation group size. 1386 */ 1387 if (l2nb > bmp->db_agl2size) { 1388 jfs_error(bmp->db_ipbmap->i_sb, 1389 "dbAllocAG: allocation request is larger than the " 1390 "allocation group size"); 1391 return -EIO; 1392 } 1393 1394 /* determine the starting block number of the allocation 1395 * group. 1396 */ 1397 blkno = (s64) agno << bmp->db_agl2size; 1398 1399 /* check if the allocation group size is the minimum allocation 1400 * group size or if the allocation group is completely free. if 1401 * the allocation group size is the minimum size of BPERDMAP (i.e. 1402 * 1 dmap), there is no need to search the dmap control page (below) 1403 * that fully describes the allocation group since the allocation 1404 * group is already fully described by a dmap. in this case, we 1405 * just call dbAllocCtl() to search the dmap tree and allocate the 1406 * required space if available. 1407 * 1408 * if the allocation group is completely free, dbAllocCtl() is 1409 * also called to allocate the required space. this is done for 1410 * two reasons. first, it makes no sense searching the dmap control 1411 * pages for free space when we know that free space exists. second, 1412 * the dmap control pages may indicate that the allocation group 1413 * has no free space if the allocation group is part (not the first 1414 * part) of a larger binary buddy system. 1415 */ 1416 if (bmp->db_agsize == BPERDMAP 1417 || bmp->db_agfree[agno] == bmp->db_agsize) { 1418 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); 1419 if ((rc == -ENOSPC) && 1420 (bmp->db_agfree[agno] == bmp->db_agsize)) { 1421 printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n", 1422 (unsigned long long) blkno, 1423 (unsigned long long) nblocks); 1424 jfs_error(bmp->db_ipbmap->i_sb, 1425 "dbAllocAG: dbAllocCtl failed in free AG"); 1426 } 1427 return (rc); 1428 } 1429 1430 /* the buffer for the dmap control page that fully describes the 1431 * allocation group. 1432 */ 1433 lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel); 1434 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); 1435 if (mp == NULL) 1436 return -EIO; 1437 dcp = (struct dmapctl *) mp->data; 1438 budmin = dcp->budmin; 1439 1440 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 1441 jfs_error(bmp->db_ipbmap->i_sb, 1442 "dbAllocAG: Corrupt dmapctl page"); 1443 release_metapage(mp); 1444 return -EIO; 1445 } 1446 1447 /* search the subtree(s) of the dmap control page that describes 1448 * the allocation group, looking for sufficient free space. to begin, 1449 * determine how many allocation groups are represented in a dmap 1450 * control page at the control page level (i.e. L0, L1, L2) that 1451 * fully describes an allocation group. next, determine the starting 1452 * tree index of this allocation group within the control page. 1453 */ 1454 agperlev = 1455 (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth; 1456 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); 1457 1458 /* dmap control page trees fan-out by 4 and a single allocation 1459 * group may be described by 1 or 2 subtrees within the ag level 1460 * dmap control page, depending upon the ag size. examine the ag's 1461 * subtrees for sufficient free space, starting with the leftmost 1462 * subtree. 1463 */ 1464 for (i = 0; i < bmp->db_agwidth; i++, ti++) { 1465 /* is there sufficient free space ? 1466 */ 1467 if (l2nb > dcp->stree[ti]) 1468 continue; 1469 1470 /* sufficient free space found in a subtree. now search down 1471 * the subtree to find the leftmost leaf that describes this 1472 * free space. 1473 */ 1474 for (k = bmp->db_agheigth; k > 0; k--) { 1475 for (n = 0, m = (ti << 2) + 1; n < 4; n++) { 1476 if (l2nb <= dcp->stree[m + n]) { 1477 ti = m + n; 1478 break; 1479 } 1480 } 1481 if (n == 4) { 1482 jfs_error(bmp->db_ipbmap->i_sb, 1483 "dbAllocAG: failed descending stree"); 1484 release_metapage(mp); 1485 return -EIO; 1486 } 1487 } 1488 1489 /* determine the block number within the file system 1490 * that corresponds to this leaf. 1491 */ 1492 if (bmp->db_aglevel == 2) 1493 blkno = 0; 1494 else if (bmp->db_aglevel == 1) 1495 blkno &= ~(MAXL1SIZE - 1); 1496 else /* bmp->db_aglevel == 0 */ 1497 blkno &= ~(MAXL0SIZE - 1); 1498 1499 blkno += 1500 ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin; 1501 1502 /* release the buffer in preparation for going down 1503 * the next level of dmap control pages. 1504 */ 1505 release_metapage(mp); 1506 1507 /* check if we need to continue to search down the lower 1508 * level dmap control pages. we need to if the number of 1509 * blocks required is less than maximum number of blocks 1510 * described at the next lower level. 1511 */ 1512 if (l2nb < budmin) { 1513 1514 /* search the lower level dmap control pages to get 1515 * the starting block number of the the dmap that 1516 * contains or starts off the free space. 1517 */ 1518 if ((rc = 1519 dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1, 1520 &blkno))) { 1521 if (rc == -ENOSPC) { 1522 jfs_error(bmp->db_ipbmap->i_sb, 1523 "dbAllocAG: control page " 1524 "inconsistent"); 1525 return -EIO; 1526 } 1527 return (rc); 1528 } 1529 } 1530 1531 /* allocate the blocks. 1532 */ 1533 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); 1534 if (rc == -ENOSPC) { 1535 jfs_error(bmp->db_ipbmap->i_sb, 1536 "dbAllocAG: unable to allocate blocks"); 1537 rc = -EIO; 1538 } 1539 return (rc); 1540 } 1541 1542 /* no space in the allocation group. release the buffer and 1543 * return -ENOSPC. 1544 */ 1545 release_metapage(mp); 1546 1547 return -ENOSPC; 1548 } 1549 1550 1551 /* 1552 * NAME: dbAllocAny() 1553 * 1554 * FUNCTION: attempt to allocate the specified number of contiguous 1555 * free blocks anywhere in the file system. 1556 * 1557 * dbAllocAny() attempts to find the sufficient free space by 1558 * searching down the dmap control pages, starting with the 1559 * highest level (i.e. L0, L1, L2) control page. if free space 1560 * large enough to satisfy the desired free space is found, the 1561 * desired free space is allocated. 1562 * 1563 * PARAMETERS: 1564 * bmp - pointer to bmap descriptor 1565 * nblocks - actual number of contiguous free blocks desired. 1566 * l2nb - log2 number of contiguous free blocks desired. 1567 * results - on successful return, set to the starting block number 1568 * of the newly allocated range. 1569 * 1570 * RETURN VALUES: 1571 * 0 - success 1572 * -ENOSPC - insufficient disk resources 1573 * -EIO - i/o error 1574 * 1575 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1576 */ 1577 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) 1578 { 1579 int rc; 1580 s64 blkno = 0; 1581 1582 /* starting with the top level dmap control page, search 1583 * down the dmap control levels for sufficient free space. 1584 * if free space is found, dbFindCtl() returns the starting 1585 * block number of the dmap that contains or starts off the 1586 * range of free space. 1587 */ 1588 if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno))) 1589 return (rc); 1590 1591 /* allocate the blocks. 1592 */ 1593 rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); 1594 if (rc == -ENOSPC) { 1595 jfs_error(bmp->db_ipbmap->i_sb, 1596 "dbAllocAny: unable to allocate blocks"); 1597 return -EIO; 1598 } 1599 return (rc); 1600 } 1601 1602 1603 /* 1604 * NAME: dbFindCtl() 1605 * 1606 * FUNCTION: starting at a specified dmap control page level and block 1607 * number, search down the dmap control levels for a range of 1608 * contiguous free blocks large enough to satisfy an allocation 1609 * request for the specified number of free blocks. 1610 * 1611 * if sufficient contiguous free blocks are found, this routine 1612 * returns the starting block number within a dmap page that 1613 * contains or starts a range of contiqious free blocks that 1614 * is sufficient in size. 1615 * 1616 * PARAMETERS: 1617 * bmp - pointer to bmap descriptor 1618 * level - starting dmap control page level. 1619 * l2nb - log2 number of contiguous free blocks desired. 1620 * *blkno - on entry, starting block number for conducting the search. 1621 * on successful return, the first block within a dmap page 1622 * that contains or starts a range of contiguous free blocks. 1623 * 1624 * RETURN VALUES: 1625 * 0 - success 1626 * -ENOSPC - insufficient disk resources 1627 * -EIO - i/o error 1628 * 1629 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1630 */ 1631 static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) 1632 { 1633 int rc, leafidx, lev; 1634 s64 b, lblkno; 1635 struct dmapctl *dcp; 1636 int budmin; 1637 struct metapage *mp; 1638 1639 /* starting at the specified dmap control page level and block 1640 * number, search down the dmap control levels for the starting 1641 * block number of a dmap page that contains or starts off 1642 * sufficient free blocks. 1643 */ 1644 for (lev = level, b = *blkno; lev >= 0; lev--) { 1645 /* get the buffer of the dmap control page for the block 1646 * number and level (i.e. L0, L1, L2). 1647 */ 1648 lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev); 1649 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); 1650 if (mp == NULL) 1651 return -EIO; 1652 dcp = (struct dmapctl *) mp->data; 1653 budmin = dcp->budmin; 1654 1655 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 1656 jfs_error(bmp->db_ipbmap->i_sb, 1657 "dbFindCtl: Corrupt dmapctl page"); 1658 release_metapage(mp); 1659 return -EIO; 1660 } 1661 1662 /* search the tree within the dmap control page for 1663 * sufficent free space. if sufficient free space is found, 1664 * dbFindLeaf() returns the index of the leaf at which 1665 * free space was found. 1666 */ 1667 rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx); 1668 1669 /* release the buffer. 1670 */ 1671 release_metapage(mp); 1672 1673 /* space found ? 1674 */ 1675 if (rc) { 1676 if (lev != level) { 1677 jfs_error(bmp->db_ipbmap->i_sb, 1678 "dbFindCtl: dmap inconsistent"); 1679 return -EIO; 1680 } 1681 return -ENOSPC; 1682 } 1683 1684 /* adjust the block number to reflect the location within 1685 * the dmap control page (i.e. the leaf) at which free 1686 * space was found. 1687 */ 1688 b += (((s64) leafidx) << budmin); 1689 1690 /* we stop the search at this dmap control page level if 1691 * the number of blocks required is greater than or equal 1692 * to the maximum number of blocks described at the next 1693 * (lower) level. 1694 */ 1695 if (l2nb >= budmin) 1696 break; 1697 } 1698 1699 *blkno = b; 1700 return (0); 1701 } 1702 1703 1704 /* 1705 * NAME: dbAllocCtl() 1706 * 1707 * FUNCTION: attempt to allocate a specified number of contiguous 1708 * blocks starting within a specific dmap. 1709 * 1710 * this routine is called by higher level routines that search 1711 * the dmap control pages above the actual dmaps for contiguous 1712 * free space. the result of successful searches by these 1713 * routines are the starting block numbers within dmaps, with 1714 * the dmaps themselves containing the desired contiguous free 1715 * space or starting a contiguous free space of desired size 1716 * that is made up of the blocks of one or more dmaps. these 1717 * calls should not fail due to insufficent resources. 1718 * 1719 * this routine is called in some cases where it is not known 1720 * whether it will fail due to insufficient resources. more 1721 * specifically, this occurs when allocating from an allocation 1722 * group whose size is equal to the number of blocks per dmap. 1723 * in this case, the dmap control pages are not examined prior 1724 * to calling this routine (to save pathlength) and the call 1725 * might fail. 1726 * 1727 * for a request size that fits within a dmap, this routine relies 1728 * upon the dmap's dmtree to find the requested contiguous free 1729 * space. for request sizes that are larger than a dmap, the 1730 * requested free space will start at the first block of the 1731 * first dmap (i.e. blkno). 1732 * 1733 * PARAMETERS: 1734 * bmp - pointer to bmap descriptor 1735 * nblocks - actual number of contiguous free blocks to allocate. 1736 * l2nb - log2 number of contiguous free blocks to allocate. 1737 * blkno - starting block number of the dmap to start the allocation 1738 * from. 1739 * results - on successful return, set to the starting block number 1740 * of the newly allocated range. 1741 * 1742 * RETURN VALUES: 1743 * 0 - success 1744 * -ENOSPC - insufficient disk resources 1745 * -EIO - i/o error 1746 * 1747 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; 1748 */ 1749 static int 1750 dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) 1751 { 1752 int rc, nb; 1753 s64 b, lblkno, n; 1754 struct metapage *mp; 1755 struct dmap *dp; 1756 1757 /* check if the allocation request is confined to a single dmap. 1758 */ 1759 if (l2nb <= L2BPERDMAP) { 1760 /* get the buffer for the dmap. 1761 */ 1762 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); 1763 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); 1764 if (mp == NULL) 1765 return -EIO; 1766 dp = (struct dmap *) mp->data; 1767 1768 /* try to allocate the blocks. 1769 */ 1770 rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results); 1771 if (rc == 0) 1772 mark_metapage_dirty(mp); 1773 1774 release_metapage(mp); 1775 1776 return (rc); 1777 } 1778 1779 /* allocation request involving multiple dmaps. it must start on 1780 * a dmap boundary. 1781 */ 1782 assert((blkno & (BPERDMAP - 1)) == 0); 1783 1784 /* allocate the blocks dmap by dmap. 1785 */ 1786 for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) { 1787 /* get the buffer for the dmap. 1788 */ 1789 lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); 1790 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); 1791 if (mp == NULL) { 1792 rc = -EIO; 1793 goto backout; 1794 } 1795 dp = (struct dmap *) mp->data; 1796 1797 /* the dmap better be all free. 1798 */ 1799 if (dp->tree.stree[ROOT] != L2BPERDMAP) { 1800 release_metapage(mp); 1801 jfs_error(bmp->db_ipbmap->i_sb, 1802 "dbAllocCtl: the dmap is not all free"); 1803 rc = -EIO; 1804 goto backout; 1805 } 1806 1807 /* determine how many blocks to allocate from this dmap. 1808 */ 1809 nb = min(n, (s64)BPERDMAP); 1810 1811 /* allocate the blocks from the dmap. 1812 */ 1813 if ((rc = dbAllocDmap(bmp, dp, b, nb))) { 1814 release_metapage(mp); 1815 goto backout; 1816 } 1817 1818 /* write the buffer. 1819 */ 1820 write_metapage(mp); 1821 } 1822 1823 /* set the results (starting block number) and return. 1824 */ 1825 *results = blkno; 1826 return (0); 1827 1828 /* something failed in handling an allocation request involving 1829 * multiple dmaps. we'll try to clean up by backing out any 1830 * allocation that has already happened for this request. if 1831 * we fail in backing out the allocation, we'll mark the file 1832 * system to indicate that blocks have been leaked. 1833 */ 1834 backout: 1835 1836 /* try to backout the allocations dmap by dmap. 1837 */ 1838 for (n = nblocks - n, b = blkno; n > 0; 1839 n -= BPERDMAP, b += BPERDMAP) { 1840 /* get the buffer for this dmap. 1841 */ 1842 lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); 1843 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); 1844 if (mp == NULL) { 1845 /* could not back out. mark the file system 1846 * to indicate that we have leaked blocks. 1847 */ 1848 jfs_error(bmp->db_ipbmap->i_sb, 1849 "dbAllocCtl: I/O Error: Block Leakage."); 1850 continue; 1851 } 1852 dp = (struct dmap *) mp->data; 1853 1854 /* free the blocks is this dmap. 1855 */ 1856 if (dbFreeDmap(bmp, dp, b, BPERDMAP)) { 1857 /* could not back out. mark the file system 1858 * to indicate that we have leaked blocks. 1859 */ 1860 release_metapage(mp); 1861 jfs_error(bmp->db_ipbmap->i_sb, 1862 "dbAllocCtl: Block Leakage."); 1863 continue; 1864 } 1865 1866 /* write the buffer. 1867 */ 1868 write_metapage(mp); 1869 } 1870 1871 return (rc); 1872 } 1873 1874 1875 /* 1876 * NAME: dbAllocDmapLev() 1877 * 1878 * FUNCTION: attempt to allocate a specified number of contiguous blocks 1879 * from a specified dmap. 1880 * 1881 * this routine checks if the contiguous blocks are available. 1882 * if so, nblocks of blocks are allocated; otherwise, ENOSPC is 1883 * returned. 1884 * 1885 * PARAMETERS: 1886 * mp - pointer to bmap descriptor 1887 * dp - pointer to dmap to attempt to allocate blocks from. 1888 * l2nb - log2 number of contiguous block desired. 1889 * nblocks - actual number of contiguous block desired. 1890 * results - on successful return, set to the starting block number 1891 * of the newly allocated range. 1892 * 1893 * RETURN VALUES: 1894 * 0 - success 1895 * -ENOSPC - insufficient disk resources 1896 * -EIO - i/o error 1897 * 1898 * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or 1899 * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; 1900 */ 1901 static int 1902 dbAllocDmapLev(struct bmap * bmp, 1903 struct dmap * dp, int nblocks, int l2nb, s64 * results) 1904 { 1905 s64 blkno; 1906 int leafidx, rc; 1907 1908 /* can't be more than a dmaps worth of blocks */ 1909 assert(l2nb <= L2BPERDMAP); 1910 1911 /* search the tree within the dmap page for sufficient 1912 * free space. if sufficient free space is found, dbFindLeaf() 1913 * returns the index of the leaf at which free space was found. 1914 */ 1915 if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx)) 1916 return -ENOSPC; 1917 1918 /* determine the block number within the file system corresponding 1919 * to the leaf at which free space was found. 1920 */ 1921 blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD); 1922 1923 /* if not all bits of the dmap word are free, get the starting 1924 * bit number within the dmap word of the required string of free 1925 * bits and adjust the block number with this value. 1926 */ 1927 if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN) 1928 blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb); 1929 1930 /* allocate the blocks */ 1931 if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) 1932 *results = blkno; 1933 1934 return (rc); 1935 } 1936 1937 1938 /* 1939 * NAME: dbAllocDmap() 1940 * 1941 * FUNCTION: adjust the disk allocation map to reflect the allocation 1942 * of a specified block range within a dmap. 1943 * 1944 * this routine allocates the specified blocks from the dmap 1945 * through a call to dbAllocBits(). if the allocation of the 1946 * block range causes the maximum string of free blocks within 1947 * the dmap to change (i.e. the value of the root of the dmap's 1948 * dmtree), this routine will cause this change to be reflected 1949 * up through the appropriate levels of the dmap control pages 1950 * by a call to dbAdjCtl() for the L0 dmap control page that 1951 * covers this dmap. 1952 * 1953 * PARAMETERS: 1954 * bmp - pointer to bmap descriptor 1955 * dp - pointer to dmap to allocate the block range from. 1956 * blkno - starting block number of the block to be allocated. 1957 * nblocks - number of blocks to be allocated. 1958 * 1959 * RETURN VALUES: 1960 * 0 - success 1961 * -EIO - i/o error 1962 * 1963 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 1964 */ 1965 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, 1966 int nblocks) 1967 { 1968 s8 oldroot; 1969 int rc; 1970 1971 /* save the current value of the root (i.e. maximum free string) 1972 * of the dmap tree. 1973 */ 1974 oldroot = dp->tree.stree[ROOT]; 1975 1976 /* allocate the specified (blocks) bits */ 1977 dbAllocBits(bmp, dp, blkno, nblocks); 1978 1979 /* if the root has not changed, done. */ 1980 if (dp->tree.stree[ROOT] == oldroot) 1981 return (0); 1982 1983 /* root changed. bubble the change up to the dmap control pages. 1984 * if the adjustment of the upper level control pages fails, 1985 * backout the bit allocation (thus making everything consistent). 1986 */ 1987 if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0))) 1988 dbFreeBits(bmp, dp, blkno, nblocks); 1989 1990 return (rc); 1991 } 1992 1993 1994 /* 1995 * NAME: dbFreeDmap() 1996 * 1997 * FUNCTION: adjust the disk allocation map to reflect the allocation 1998 * of a specified block range within a dmap. 1999 * 2000 * this routine frees the specified blocks from the dmap through 2001 * a call to dbFreeBits(). if the deallocation of the block range 2002 * causes the maximum string of free blocks within the dmap to 2003 * change (i.e. the value of the root of the dmap's dmtree), this 2004 * routine will cause this change to be reflected up through the 2005 * appropriate levels of the dmap control pages by a call to 2006 * dbAdjCtl() for the L0 dmap control page that covers this dmap. 2007 * 2008 * PARAMETERS: 2009 * bmp - pointer to bmap descriptor 2010 * dp - pointer to dmap to free the block range from. 2011 * blkno - starting block number of the block to be freed. 2012 * nblocks - number of blocks to be freed. 2013 * 2014 * RETURN VALUES: 2015 * 0 - success 2016 * -EIO - i/o error 2017 * 2018 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2019 */ 2020 static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, 2021 int nblocks) 2022 { 2023 s8 oldroot; 2024 int rc = 0, word; 2025 2026 /* save the current value of the root (i.e. maximum free string) 2027 * of the dmap tree. 2028 */ 2029 oldroot = dp->tree.stree[ROOT]; 2030 2031 /* free the specified (blocks) bits */ 2032 rc = dbFreeBits(bmp, dp, blkno, nblocks); 2033 2034 /* if error or the root has not changed, done. */ 2035 if (rc || (dp->tree.stree[ROOT] == oldroot)) 2036 return (rc); 2037 2038 /* root changed. bubble the change up to the dmap control pages. 2039 * if the adjustment of the upper level control pages fails, 2040 * backout the deallocation. 2041 */ 2042 if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) { 2043 word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; 2044 2045 /* as part of backing out the deallocation, we will have 2046 * to back split the dmap tree if the deallocation caused 2047 * the freed blocks to become part of a larger binary buddy 2048 * system. 2049 */ 2050 if (dp->tree.stree[word] == NOFREE) 2051 dbBackSplit((dmtree_t *) & dp->tree, word); 2052 2053 dbAllocBits(bmp, dp, blkno, nblocks); 2054 } 2055 2056 return (rc); 2057 } 2058 2059 2060 /* 2061 * NAME: dbAllocBits() 2062 * 2063 * FUNCTION: allocate a specified block range from a dmap. 2064 * 2065 * this routine updates the dmap to reflect the working 2066 * state allocation of the specified block range. it directly 2067 * updates the bits of the working map and causes the adjustment 2068 * of the binary buddy system described by the dmap's dmtree 2069 * leaves to reflect the bits allocated. it also causes the 2070 * dmap's dmtree, as a whole, to reflect the allocated range. 2071 * 2072 * PARAMETERS: 2073 * bmp - pointer to bmap descriptor 2074 * dp - pointer to dmap to allocate bits from. 2075 * blkno - starting block number of the bits to be allocated. 2076 * nblocks - number of bits to be allocated. 2077 * 2078 * RETURN VALUES: none 2079 * 2080 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2081 */ 2082 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, 2083 int nblocks) 2084 { 2085 int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; 2086 dmtree_t *tp = (dmtree_t *) & dp->tree; 2087 int size; 2088 s8 *leaf; 2089 2090 /* pick up a pointer to the leaves of the dmap tree */ 2091 leaf = dp->tree.stree + LEAFIND; 2092 2093 /* determine the bit number and word within the dmap of the 2094 * starting block. 2095 */ 2096 dbitno = blkno & (BPERDMAP - 1); 2097 word = dbitno >> L2DBWORD; 2098 2099 /* block range better be within the dmap */ 2100 assert(dbitno + nblocks <= BPERDMAP); 2101 2102 /* allocate the bits of the dmap's words corresponding to the block 2103 * range. not all bits of the first and last words may be contained 2104 * within the block range. if this is the case, we'll work against 2105 * those words (i.e. partial first and/or last) on an individual basis 2106 * (a single pass), allocating the bits of interest by hand and 2107 * updating the leaf corresponding to the dmap word. a single pass 2108 * will be used for all dmap words fully contained within the 2109 * specified range. within this pass, the bits of all fully contained 2110 * dmap words will be marked as free in a single shot and the leaves 2111 * will be updated. a single leaf may describe the free space of 2112 * multiple dmap words, so we may update only a subset of the actual 2113 * leaves corresponding to the dmap words of the block range. 2114 */ 2115 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { 2116 /* determine the bit number within the word and 2117 * the number of bits within the word. 2118 */ 2119 wbitno = dbitno & (DBWORD - 1); 2120 nb = min(rembits, DBWORD - wbitno); 2121 2122 /* check if only part of a word is to be allocated. 2123 */ 2124 if (nb < DBWORD) { 2125 /* allocate (set to 1) the appropriate bits within 2126 * this dmap word. 2127 */ 2128 dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) 2129 >> wbitno); 2130 2131 /* update the leaf for this dmap word. in addition 2132 * to setting the leaf value to the binary buddy max 2133 * of the updated dmap word, dbSplit() will split 2134 * the binary system of the leaves if need be. 2135 */ 2136 dbSplit(tp, word, BUDMIN, 2137 dbMaxBud((u8 *) & dp->wmap[word])); 2138 2139 word += 1; 2140 } else { 2141 /* one or more dmap words are fully contained 2142 * within the block range. determine how many 2143 * words and allocate (set to 1) the bits of these 2144 * words. 2145 */ 2146 nwords = rembits >> L2DBWORD; 2147 memset(&dp->wmap[word], (int) ONES, nwords * 4); 2148 2149 /* determine how many bits. 2150 */ 2151 nb = nwords << L2DBWORD; 2152 2153 /* now update the appropriate leaves to reflect 2154 * the allocated words. 2155 */ 2156 for (; nwords > 0; nwords -= nw) { 2157 if (leaf[word] < BUDMIN) { 2158 jfs_error(bmp->db_ipbmap->i_sb, 2159 "dbAllocBits: leaf page " 2160 "corrupt"); 2161 break; 2162 } 2163 2164 /* determine what the leaf value should be 2165 * updated to as the minimum of the l2 number 2166 * of bits being allocated and the l2 number 2167 * of bits currently described by this leaf. 2168 */ 2169 size = min((int)leaf[word], NLSTOL2BSZ(nwords)); 2170 2171 /* update the leaf to reflect the allocation. 2172 * in addition to setting the leaf value to 2173 * NOFREE, dbSplit() will split the binary 2174 * system of the leaves to reflect the current 2175 * allocation (size). 2176 */ 2177 dbSplit(tp, word, size, NOFREE); 2178 2179 /* get the number of dmap words handled */ 2180 nw = BUDSIZE(size, BUDMIN); 2181 word += nw; 2182 } 2183 } 2184 } 2185 2186 /* update the free count for this dmap */ 2187 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks); 2188 2189 BMAP_LOCK(bmp); 2190 2191 /* if this allocation group is completely free, 2192 * update the maximum allocation group number if this allocation 2193 * group is the new max. 2194 */ 2195 agno = blkno >> bmp->db_agl2size; 2196 if (agno > bmp->db_maxag) 2197 bmp->db_maxag = agno; 2198 2199 /* update the free count for the allocation group and map */ 2200 bmp->db_agfree[agno] -= nblocks; 2201 bmp->db_nfree -= nblocks; 2202 2203 BMAP_UNLOCK(bmp); 2204 } 2205 2206 2207 /* 2208 * NAME: dbFreeBits() 2209 * 2210 * FUNCTION: free a specified block range from a dmap. 2211 * 2212 * this routine updates the dmap to reflect the working 2213 * state allocation of the specified block range. it directly 2214 * updates the bits of the working map and causes the adjustment 2215 * of the binary buddy system described by the dmap's dmtree 2216 * leaves to reflect the bits freed. it also causes the dmap's 2217 * dmtree, as a whole, to reflect the deallocated range. 2218 * 2219 * PARAMETERS: 2220 * bmp - pointer to bmap descriptor 2221 * dp - pointer to dmap to free bits from. 2222 * blkno - starting block number of the bits to be freed. 2223 * nblocks - number of bits to be freed. 2224 * 2225 * RETURN VALUES: 0 for success 2226 * 2227 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2228 */ 2229 static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, 2230 int nblocks) 2231 { 2232 int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; 2233 dmtree_t *tp = (dmtree_t *) & dp->tree; 2234 int rc = 0; 2235 int size; 2236 2237 /* determine the bit number and word within the dmap of the 2238 * starting block. 2239 */ 2240 dbitno = blkno & (BPERDMAP - 1); 2241 word = dbitno >> L2DBWORD; 2242 2243 /* block range better be within the dmap. 2244 */ 2245 assert(dbitno + nblocks <= BPERDMAP); 2246 2247 /* free the bits of the dmaps words corresponding to the block range. 2248 * not all bits of the first and last words may be contained within 2249 * the block range. if this is the case, we'll work against those 2250 * words (i.e. partial first and/or last) on an individual basis 2251 * (a single pass), freeing the bits of interest by hand and updating 2252 * the leaf corresponding to the dmap word. a single pass will be used 2253 * for all dmap words fully contained within the specified range. 2254 * within this pass, the bits of all fully contained dmap words will 2255 * be marked as free in a single shot and the leaves will be updated. a 2256 * single leaf may describe the free space of multiple dmap words, 2257 * so we may update only a subset of the actual leaves corresponding 2258 * to the dmap words of the block range. 2259 * 2260 * dbJoin() is used to update leaf values and will join the binary 2261 * buddy system of the leaves if the new leaf values indicate this 2262 * should be done. 2263 */ 2264 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { 2265 /* determine the bit number within the word and 2266 * the number of bits within the word. 2267 */ 2268 wbitno = dbitno & (DBWORD - 1); 2269 nb = min(rembits, DBWORD - wbitno); 2270 2271 /* check if only part of a word is to be freed. 2272 */ 2273 if (nb < DBWORD) { 2274 /* free (zero) the appropriate bits within this 2275 * dmap word. 2276 */ 2277 dp->wmap[word] &= 2278 cpu_to_le32(~(ONES << (DBWORD - nb) 2279 >> wbitno)); 2280 2281 /* update the leaf for this dmap word. 2282 */ 2283 rc = dbJoin(tp, word, 2284 dbMaxBud((u8 *) & dp->wmap[word])); 2285 if (rc) 2286 return rc; 2287 2288 word += 1; 2289 } else { 2290 /* one or more dmap words are fully contained 2291 * within the block range. determine how many 2292 * words and free (zero) the bits of these words. 2293 */ 2294 nwords = rembits >> L2DBWORD; 2295 memset(&dp->wmap[word], 0, nwords * 4); 2296 2297 /* determine how many bits. 2298 */ 2299 nb = nwords << L2DBWORD; 2300 2301 /* now update the appropriate leaves to reflect 2302 * the freed words. 2303 */ 2304 for (; nwords > 0; nwords -= nw) { 2305 /* determine what the leaf value should be 2306 * updated to as the minimum of the l2 number 2307 * of bits being freed and the l2 (max) number 2308 * of bits that can be described by this leaf. 2309 */ 2310 size = 2311 min(LITOL2BSZ 2312 (word, L2LPERDMAP, BUDMIN), 2313 NLSTOL2BSZ(nwords)); 2314 2315 /* update the leaf. 2316 */ 2317 rc = dbJoin(tp, word, size); 2318 if (rc) 2319 return rc; 2320 2321 /* get the number of dmap words handled. 2322 */ 2323 nw = BUDSIZE(size, BUDMIN); 2324 word += nw; 2325 } 2326 } 2327 } 2328 2329 /* update the free count for this dmap. 2330 */ 2331 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks); 2332 2333 BMAP_LOCK(bmp); 2334 2335 /* update the free count for the allocation group and 2336 * map. 2337 */ 2338 agno = blkno >> bmp->db_agl2size; 2339 bmp->db_nfree += nblocks; 2340 bmp->db_agfree[agno] += nblocks; 2341 2342 /* check if this allocation group is not completely free and 2343 * if it is currently the maximum (rightmost) allocation group. 2344 * if so, establish the new maximum allocation group number by 2345 * searching left for the first allocation group with allocation. 2346 */ 2347 if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) || 2348 (agno == bmp->db_numag - 1 && 2349 bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) { 2350 while (bmp->db_maxag > 0) { 2351 bmp->db_maxag -= 1; 2352 if (bmp->db_agfree[bmp->db_maxag] != 2353 bmp->db_agsize) 2354 break; 2355 } 2356 2357 /* re-establish the allocation group preference if the 2358 * current preference is right of the maximum allocation 2359 * group. 2360 */ 2361 if (bmp->db_agpref > bmp->db_maxag) 2362 bmp->db_agpref = bmp->db_maxag; 2363 } 2364 2365 BMAP_UNLOCK(bmp); 2366 2367 return 0; 2368 } 2369 2370 2371 /* 2372 * NAME: dbAdjCtl() 2373 * 2374 * FUNCTION: adjust a dmap control page at a specified level to reflect 2375 * the change in a lower level dmap or dmap control page's 2376 * maximum string of free blocks (i.e. a change in the root 2377 * of the lower level object's dmtree) due to the allocation 2378 * or deallocation of a range of blocks with a single dmap. 2379 * 2380 * on entry, this routine is provided with the new value of 2381 * the lower level dmap or dmap control page root and the 2382 * starting block number of the block range whose allocation 2383 * or deallocation resulted in the root change. this range 2384 * is respresented by a single leaf of the current dmapctl 2385 * and the leaf will be updated with this value, possibly 2386 * causing a binary buddy system within the leaves to be 2387 * split or joined. the update may also cause the dmapctl's 2388 * dmtree to be updated. 2389 * 2390 * if the adjustment of the dmap control page, itself, causes its 2391 * root to change, this change will be bubbled up to the next dmap 2392 * control level by a recursive call to this routine, specifying 2393 * the new root value and the next dmap control page level to 2394 * be adjusted. 2395 * PARAMETERS: 2396 * bmp - pointer to bmap descriptor 2397 * blkno - the first block of a block range within a dmap. it is 2398 * the allocation or deallocation of this block range that 2399 * requires the dmap control page to be adjusted. 2400 * newval - the new value of the lower level dmap or dmap control 2401 * page root. 2402 * alloc - TRUE if adjustment is due to an allocation. 2403 * level - current level of dmap control page (i.e. L0, L1, L2) to 2404 * be adjusted. 2405 * 2406 * RETURN VALUES: 2407 * 0 - success 2408 * -EIO - i/o error 2409 * 2410 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2411 */ 2412 static int 2413 dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) 2414 { 2415 struct metapage *mp; 2416 s8 oldroot; 2417 int oldval; 2418 s64 lblkno; 2419 struct dmapctl *dcp; 2420 int rc, leafno, ti; 2421 2422 /* get the buffer for the dmap control page for the specified 2423 * block number and control page level. 2424 */ 2425 lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level); 2426 mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); 2427 if (mp == NULL) 2428 return -EIO; 2429 dcp = (struct dmapctl *) mp->data; 2430 2431 if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { 2432 jfs_error(bmp->db_ipbmap->i_sb, 2433 "dbAdjCtl: Corrupt dmapctl page"); 2434 release_metapage(mp); 2435 return -EIO; 2436 } 2437 2438 /* determine the leaf number corresponding to the block and 2439 * the index within the dmap control tree. 2440 */ 2441 leafno = BLKTOCTLLEAF(blkno, dcp->budmin); 2442 ti = leafno + le32_to_cpu(dcp->leafidx); 2443 2444 /* save the current leaf value and the current root level (i.e. 2445 * maximum l2 free string described by this dmapctl). 2446 */ 2447 oldval = dcp->stree[ti]; 2448 oldroot = dcp->stree[ROOT]; 2449 2450 /* check if this is a control page update for an allocation. 2451 * if so, update the leaf to reflect the new leaf value using 2452 * dbSplit(); otherwise (deallocation), use dbJoin() to udpate 2453 * the leaf with the new value. in addition to updating the 2454 * leaf, dbSplit() will also split the binary buddy system of 2455 * the leaves, if required, and bubble new values within the 2456 * dmapctl tree, if required. similarly, dbJoin() will join 2457 * the binary buddy system of leaves and bubble new values up 2458 * the dmapctl tree as required by the new leaf value. 2459 */ 2460 if (alloc) { 2461 /* check if we are in the middle of a binary buddy 2462 * system. this happens when we are performing the 2463 * first allocation out of an allocation group that 2464 * is part (not the first part) of a larger binary 2465 * buddy system. if we are in the middle, back split 2466 * the system prior to calling dbSplit() which assumes 2467 * that it is at the front of a binary buddy system. 2468 */ 2469 if (oldval == NOFREE) { 2470 dbBackSplit((dmtree_t *) dcp, leafno); 2471 oldval = dcp->stree[ti]; 2472 } 2473 dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval); 2474 } else { 2475 rc = dbJoin((dmtree_t *) dcp, leafno, newval); 2476 if (rc) 2477 return rc; 2478 } 2479 2480 /* check if the root of the current dmap control page changed due 2481 * to the update and if the current dmap control page is not at 2482 * the current top level (i.e. L0, L1, L2) of the map. if so (i.e. 2483 * root changed and this is not the top level), call this routine 2484 * again (recursion) for the next higher level of the mapping to 2485 * reflect the change in root for the current dmap control page. 2486 */ 2487 if (dcp->stree[ROOT] != oldroot) { 2488 /* are we below the top level of the map. if so, 2489 * bubble the root up to the next higher level. 2490 */ 2491 if (level < bmp->db_maxlevel) { 2492 /* bubble up the new root of this dmap control page to 2493 * the next level. 2494 */ 2495 if ((rc = 2496 dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc, 2497 level + 1))) { 2498 /* something went wrong in bubbling up the new 2499 * root value, so backout the changes to the 2500 * current dmap control page. 2501 */ 2502 if (alloc) { 2503 dbJoin((dmtree_t *) dcp, leafno, 2504 oldval); 2505 } else { 2506 /* the dbJoin() above might have 2507 * caused a larger binary buddy system 2508 * to form and we may now be in the 2509 * middle of it. if this is the case, 2510 * back split the buddies. 2511 */ 2512 if (dcp->stree[ti] == NOFREE) 2513 dbBackSplit((dmtree_t *) 2514 dcp, leafno); 2515 dbSplit((dmtree_t *) dcp, leafno, 2516 dcp->budmin, oldval); 2517 } 2518 2519 /* release the buffer and return the error. 2520 */ 2521 release_metapage(mp); 2522 return (rc); 2523 } 2524 } else { 2525 /* we're at the top level of the map. update 2526 * the bmap control page to reflect the size 2527 * of the maximum free buddy system. 2528 */ 2529 assert(level == bmp->db_maxlevel); 2530 if (bmp->db_maxfreebud != oldroot) { 2531 jfs_error(bmp->db_ipbmap->i_sb, 2532 "dbAdjCtl: the maximum free buddy is " 2533 "not the old root"); 2534 } 2535 bmp->db_maxfreebud = dcp->stree[ROOT]; 2536 } 2537 } 2538 2539 /* write the buffer. 2540 */ 2541 write_metapage(mp); 2542 2543 return (0); 2544 } 2545 2546 2547 /* 2548 * NAME: dbSplit() 2549 * 2550 * FUNCTION: update the leaf of a dmtree with a new value, splitting 2551 * the leaf from the binary buddy system of the dmtree's 2552 * leaves, as required. 2553 * 2554 * PARAMETERS: 2555 * tp - pointer to the tree containing the leaf. 2556 * leafno - the number of the leaf to be updated. 2557 * splitsz - the size the binary buddy system starting at the leaf 2558 * must be split to, specified as the log2 number of blocks. 2559 * newval - the new value for the leaf. 2560 * 2561 * RETURN VALUES: none 2562 * 2563 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2564 */ 2565 static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) 2566 { 2567 int budsz; 2568 int cursz; 2569 s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); 2570 2571 /* check if the leaf needs to be split. 2572 */ 2573 if (leaf[leafno] > tp->dmt_budmin) { 2574 /* the split occurs by cutting the buddy system in half 2575 * at the specified leaf until we reach the specified 2576 * size. pick up the starting split size (current size 2577 * - 1 in l2) and the corresponding buddy size. 2578 */ 2579 cursz = leaf[leafno] - 1; 2580 budsz = BUDSIZE(cursz, tp->dmt_budmin); 2581 2582 /* split until we reach the specified size. 2583 */ 2584 while (cursz >= splitsz) { 2585 /* update the buddy's leaf with its new value. 2586 */ 2587 dbAdjTree(tp, leafno ^ budsz, cursz); 2588 2589 /* on to the next size and buddy. 2590 */ 2591 cursz -= 1; 2592 budsz >>= 1; 2593 } 2594 } 2595 2596 /* adjust the dmap tree to reflect the specified leaf's new 2597 * value. 2598 */ 2599 dbAdjTree(tp, leafno, newval); 2600 } 2601 2602 2603 /* 2604 * NAME: dbBackSplit() 2605 * 2606 * FUNCTION: back split the binary buddy system of dmtree leaves 2607 * that hold a specified leaf until the specified leaf 2608 * starts its own binary buddy system. 2609 * 2610 * the allocators typically perform allocations at the start 2611 * of binary buddy systems and dbSplit() is used to accomplish 2612 * any required splits. in some cases, however, allocation 2613 * may occur in the middle of a binary system and requires a 2614 * back split, with the split proceeding out from the middle of 2615 * the system (less efficient) rather than the start of the 2616 * system (more efficient). the cases in which a back split 2617 * is required are rare and are limited to the first allocation 2618 * within an allocation group which is a part (not first part) 2619 * of a larger binary buddy system and a few exception cases 2620 * in which a previous join operation must be backed out. 2621 * 2622 * PARAMETERS: 2623 * tp - pointer to the tree containing the leaf. 2624 * leafno - the number of the leaf to be updated. 2625 * 2626 * RETURN VALUES: none 2627 * 2628 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; 2629 */ 2630 static void dbBackSplit(dmtree_t * tp, int leafno) 2631 { 2632 int budsz, bud, w, bsz, size; 2633 int cursz; 2634 s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); 2635 2636 /* leaf should be part (not first part) of a binary 2637 * buddy system. 2638 */ 2639 assert(leaf[leafno] == NOFREE); 2640 2641 /* the back split is accomplished by iteratively finding the leaf 2642 * that starts the buddy system that contains the specified leaf and 2643 * splitting that system in two. this iteration continues until 2644 * the specified leaf becomes the start of a buddy system. 2645 * 2646 * determine maximum possible l2 size for the specified leaf. 2647 */ 2648 size = 2649 LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs), 2650 tp->dmt_budmin); 2651 2652 /* determine the number of leaves covered by this size. this 2653 * is the buddy size that we will start with as we search for 2654 * the buddy system that contains the specified leaf. 2655 */ 2656 budsz = BUDSIZE(size, tp->dmt_budmin); 2657 2658 /* back split. 2659 */ 2660 while (leaf[leafno] == NOFREE) { 2661 /* find the leftmost buddy leaf. 2662 */ 2663 for (w = leafno, bsz = budsz;; bsz <<= 1, 2664 w = (w < bud) ? w : bud) { 2665 assert(bsz < le32_to_cpu(tp->dmt_nleafs)); 2666 2667 /* determine the buddy. 2668 */ 2669 bud = w ^ bsz; 2670 2671 /* check if this buddy is the start of the system. 2672 */ 2673 if (leaf[bud] != NOFREE) { 2674 /* split the leaf at the start of the 2675 * system in two. 2676 */ 2677 cursz = leaf[bud] - 1; 2678 dbSplit(tp, bud, cursz, cursz); 2679 break; 2680 } 2681 } 2682 } 2683 2684 assert(leaf[leafno] == size); 2685 } 2686 2687 2688 /* 2689 * NAME: dbJoin() 2690 * 2691 * FUNCTION: update the leaf of a dmtree with a new value, joining 2692 * the leaf with other leaves of the dmtree into a multi-leaf 2693 * binary buddy system, as required. 2694 * 2695 * PARAMETERS: 2696 * tp - pointer to the tree containing the leaf. 2697 * leafno - the number of the leaf to be updated. 2698 * newval - the new value for the leaf. 2699 * 2700 * RETURN VALUES: none 2701 */ 2702 static int dbJoin(dmtree_t * tp, int leafno, int newval) 2703 { 2704 int budsz, buddy; 2705 s8 *leaf; 2706 2707 /* can the new leaf value require a join with other leaves ? 2708 */ 2709 if (newval >= tp->dmt_budmin) { 2710 /* pickup a pointer to the leaves of the tree. 2711 */ 2712 leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); 2713 2714 /* try to join the specified leaf into a large binary 2715 * buddy system. the join proceeds by attempting to join 2716 * the specified leafno with its buddy (leaf) at new value. 2717 * if the join occurs, we attempt to join the left leaf 2718 * of the joined buddies with its buddy at new value + 1. 2719 * we continue to join until we find a buddy that cannot be 2720 * joined (does not have a value equal to the size of the 2721 * last join) or until all leaves have been joined into a 2722 * single system. 2723 * 2724 * get the buddy size (number of words covered) of 2725 * the new value. 2726 */ 2727 budsz = BUDSIZE(newval, tp->dmt_budmin); 2728 2729 /* try to join. 2730 */ 2731 while (budsz < le32_to_cpu(tp->dmt_nleafs)) { 2732 /* get the buddy leaf. 2733 */ 2734 buddy = leafno ^ budsz; 2735 2736 /* if the leaf's new value is greater than its 2737 * buddy's value, we join no more. 2738 */ 2739 if (newval > leaf[buddy]) 2740 break; 2741 2742 /* It shouldn't be less */ 2743 if (newval < leaf[buddy]) 2744 return -EIO; 2745 2746 /* check which (leafno or buddy) is the left buddy. 2747 * the left buddy gets to claim the blocks resulting 2748 * from the join while the right gets to claim none. 2749 * the left buddy is also eligable to participate in 2750 * a join at the next higher level while the right 2751 * is not. 2752 * 2753 */ 2754 if (leafno < buddy) { 2755 /* leafno is the left buddy. 2756 */ 2757 dbAdjTree(tp, buddy, NOFREE); 2758 } else { 2759 /* buddy is the left buddy and becomes 2760 * leafno. 2761 */ 2762 dbAdjTree(tp, leafno, NOFREE); 2763 leafno = buddy; 2764 } 2765 2766 /* on to try the next join. 2767 */ 2768 newval += 1; 2769 budsz <<= 1; 2770 } 2771 } 2772 2773 /* update the leaf value. 2774 */ 2775 dbAdjTree(tp, leafno, newval); 2776 2777 return 0; 2778 } 2779 2780 2781 /* 2782 * NAME: dbAdjTree() 2783 * 2784 * FUNCTION: update a leaf of a dmtree with a new value, adjusting 2785 * the dmtree, as required, to reflect the new leaf value. 2786 * the combination of any buddies must already be done before 2787 * this is called. 2788 * 2789 * PARAMETERS: 2790 * tp - pointer to the tree to be adjusted. 2791 * leafno - the number of the leaf to be updated. 2792 * newval - the new value for the leaf. 2793 * 2794 * RETURN VALUES: none 2795 */ 2796 static void dbAdjTree(dmtree_t * tp, int leafno, int newval) 2797 { 2798 int lp, pp, k; 2799 int max; 2800 2801 /* pick up the index of the leaf for this leafno. 2802 */ 2803 lp = leafno + le32_to_cpu(tp->dmt_leafidx); 2804 2805 /* is the current value the same as the old value ? if so, 2806 * there is nothing to do. 2807 */ 2808 if (tp->dmt_stree[lp] == newval) 2809 return; 2810 2811 /* set the new value. 2812 */ 2813 tp->dmt_stree[lp] = newval; 2814 2815 /* bubble the new value up the tree as required. 2816 */ 2817 for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) { 2818 /* get the index of the first leaf of the 4 leaf 2819 * group containing the specified leaf (leafno). 2820 */ 2821 lp = ((lp - 1) & ~0x03) + 1; 2822 2823 /* get the index of the parent of this 4 leaf group. 2824 */ 2825 pp = (lp - 1) >> 2; 2826 2827 /* determine the maximum of the 4 leaves. 2828 */ 2829 max = TREEMAX(&tp->dmt_stree[lp]); 2830 2831 /* if the maximum of the 4 is the same as the 2832 * parent's value, we're done. 2833 */ 2834 if (tp->dmt_stree[pp] == max) 2835 break; 2836 2837 /* parent gets new value. 2838 */ 2839 tp->dmt_stree[pp] = max; 2840 2841 /* parent becomes leaf for next go-round. 2842 */ 2843 lp = pp; 2844 } 2845 } 2846 2847 2848 /* 2849 * NAME: dbFindLeaf() 2850 * 2851 * FUNCTION: search a dmtree_t for sufficient free blocks, returning 2852 * the index of a leaf describing the free blocks if 2853 * sufficient free blocks are found. 2854 * 2855 * the search starts at the top of the dmtree_t tree and 2856 * proceeds down the tree to the leftmost leaf with sufficient 2857 * free space. 2858 * 2859 * PARAMETERS: 2860 * tp - pointer to the tree to be searched. 2861 * l2nb - log2 number of free blocks to search for. 2862 * leafidx - return pointer to be set to the index of the leaf 2863 * describing at least l2nb free blocks if sufficient 2864 * free blocks are found. 2865 * 2866 * RETURN VALUES: 2867 * 0 - success 2868 * -ENOSPC - insufficient free blocks. 2869 */ 2870 static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) 2871 { 2872 int ti, n = 0, k, x = 0; 2873 2874 /* first check the root of the tree to see if there is 2875 * sufficient free space. 2876 */ 2877 if (l2nb > tp->dmt_stree[ROOT]) 2878 return -ENOSPC; 2879 2880 /* sufficient free space available. now search down the tree 2881 * starting at the next level for the leftmost leaf that 2882 * describes sufficient free space. 2883 */ 2884 for (k = le32_to_cpu(tp->dmt_height), ti = 1; 2885 k > 0; k--, ti = ((ti + n) << 2) + 1) { 2886 /* search the four nodes at this level, starting from 2887 * the left. 2888 */ 2889 for (x = ti, n = 0; n < 4; n++) { 2890 /* sufficient free space found. move to the next 2891 * level (or quit if this is the last level). 2892 */ 2893 if (l2nb <= tp->dmt_stree[x + n]) 2894 break; 2895 } 2896 2897 /* better have found something since the higher 2898 * levels of the tree said it was here. 2899 */ 2900 assert(n < 4); 2901 } 2902 2903 /* set the return to the leftmost leaf describing sufficient 2904 * free space. 2905 */ 2906 *leafidx = x + n - le32_to_cpu(tp->dmt_leafidx); 2907 2908 return (0); 2909 } 2910 2911 2912 /* 2913 * NAME: dbFindBits() 2914 * 2915 * FUNCTION: find a specified number of binary buddy free bits within a 2916 * dmap bitmap word value. 2917 * 2918 * this routine searches the bitmap value for (1 << l2nb) free 2919 * bits at (1 << l2nb) alignments within the value. 2920 * 2921 * PARAMETERS: 2922 * word - dmap bitmap word value. 2923 * l2nb - number of free bits specified as a log2 number. 2924 * 2925 * RETURN VALUES: 2926 * starting bit number of free bits. 2927 */ 2928 static int dbFindBits(u32 word, int l2nb) 2929 { 2930 int bitno, nb; 2931 u32 mask; 2932 2933 /* get the number of bits. 2934 */ 2935 nb = 1 << l2nb; 2936 assert(nb <= DBWORD); 2937 2938 /* complement the word so we can use a mask (i.e. 0s represent 2939 * free bits) and compute the mask. 2940 */ 2941 word = ~word; 2942 mask = ONES << (DBWORD - nb); 2943 2944 /* scan the word for nb free bits at nb alignments. 2945 */ 2946 for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) { 2947 if ((mask & word) == mask) 2948 break; 2949 } 2950 2951 ASSERT(bitno < 32); 2952 2953 /* return the bit number. 2954 */ 2955 return (bitno); 2956 } 2957 2958 2959 /* 2960 * NAME: dbMaxBud(u8 *cp) 2961 * 2962 * FUNCTION: determine the largest binary buddy string of free 2963 * bits within 32-bits of the map. 2964 * 2965 * PARAMETERS: 2966 * cp - pointer to the 32-bit value. 2967 * 2968 * RETURN VALUES: 2969 * largest binary buddy of free bits within a dmap word. 2970 */ 2971 static int dbMaxBud(u8 * cp) 2972 { 2973 signed char tmp1, tmp2; 2974 2975 /* check if the wmap word is all free. if so, the 2976 * free buddy size is BUDMIN. 2977 */ 2978 if (*((uint *) cp) == 0) 2979 return (BUDMIN); 2980 2981 /* check if the wmap word is half free. if so, the 2982 * free buddy size is BUDMIN-1. 2983 */ 2984 if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0) 2985 return (BUDMIN - 1); 2986 2987 /* not all free or half free. determine the free buddy 2988 * size thru table lookup using quarters of the wmap word. 2989 */ 2990 tmp1 = max(budtab[cp[2]], budtab[cp[3]]); 2991 tmp2 = max(budtab[cp[0]], budtab[cp[1]]); 2992 return (max(tmp1, tmp2)); 2993 } 2994 2995 2996 /* 2997 * NAME: cnttz(uint word) 2998 * 2999 * FUNCTION: determine the number of trailing zeros within a 32-bit 3000 * value. 3001 * 3002 * PARAMETERS: 3003 * value - 32-bit value to be examined. 3004 * 3005 * RETURN VALUES: 3006 * count of trailing zeros 3007 */ 3008 static int cnttz(u32 word) 3009 { 3010 int n; 3011 3012 for (n = 0; n < 32; n++, word >>= 1) { 3013 if (word & 0x01) 3014 break; 3015 } 3016 3017 return (n); 3018 } 3019 3020 3021 /* 3022 * NAME: cntlz(u32 value) 3023 * 3024 * FUNCTION: determine the number of leading zeros within a 32-bit 3025 * value. 3026 * 3027 * PARAMETERS: 3028 * value - 32-bit value to be examined. 3029 * 3030 * RETURN VALUES: 3031 * count of leading zeros 3032 */ 3033 static int cntlz(u32 value) 3034 { 3035 int n; 3036 3037 for (n = 0; n < 32; n++, value <<= 1) { 3038 if (value & HIGHORDER) 3039 break; 3040 } 3041 return (n); 3042 } 3043 3044 3045 /* 3046 * NAME: blkstol2(s64 nb) 3047 * 3048 * FUNCTION: convert a block count to its log2 value. if the block 3049 * count is not a l2 multiple, it is rounded up to the next 3050 * larger l2 multiple. 3051 * 3052 * PARAMETERS: 3053 * nb - number of blocks 3054 * 3055 * RETURN VALUES: 3056 * log2 number of blocks 3057 */ 3058 int blkstol2(s64 nb) 3059 { 3060 int l2nb; 3061 s64 mask; /* meant to be signed */ 3062 3063 mask = (s64) 1 << (64 - 1); 3064 3065 /* count the leading bits. 3066 */ 3067 for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) { 3068 /* leading bit found. 3069 */ 3070 if (nb & mask) { 3071 /* determine the l2 value. 3072 */ 3073 l2nb = (64 - 1) - l2nb; 3074 3075 /* check if we need to round up. 3076 */ 3077 if (~mask & nb) 3078 l2nb++; 3079 3080 return (l2nb); 3081 } 3082 } 3083 assert(0); 3084 return 0; /* fix compiler warning */ 3085 } 3086 3087 3088 /* 3089 * NAME: dbAllocBottomUp() 3090 * 3091 * FUNCTION: alloc the specified block range from the working block 3092 * allocation map. 3093 * 3094 * the blocks will be alloc from the working map one dmap 3095 * at a time. 3096 * 3097 * PARAMETERS: 3098 * ip - pointer to in-core inode; 3099 * blkno - starting block number to be freed. 3100 * nblocks - number of blocks to be freed. 3101 * 3102 * RETURN VALUES: 3103 * 0 - success 3104 * -EIO - i/o error 3105 */ 3106 int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) 3107 { 3108 struct metapage *mp; 3109 struct dmap *dp; 3110 int nb, rc; 3111 s64 lblkno, rem; 3112 struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; 3113 struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; 3114 3115 IREAD_LOCK(ipbmap); 3116 3117 /* block to be allocated better be within the mapsize. */ 3118 ASSERT(nblocks <= bmp->db_mapsize - blkno); 3119 3120 /* 3121 * allocate the blocks a dmap at a time. 3122 */ 3123 mp = NULL; 3124 for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { 3125 /* release previous dmap if any */ 3126 if (mp) { 3127 write_metapage(mp); 3128 } 3129 3130 /* get the buffer for the current dmap. */ 3131 lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); 3132 mp = read_metapage(ipbmap, lblkno, PSIZE, 0); 3133 if (mp == NULL) { 3134 IREAD_UNLOCK(ipbmap); 3135 return -EIO; 3136 } 3137 dp = (struct dmap *) mp->data; 3138 3139 /* determine the number of blocks to be allocated from 3140 * this dmap. 3141 */ 3142 nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); 3143 3144 /* allocate the blocks. */ 3145 if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) { 3146 release_metapage(mp); 3147 IREAD_UNLOCK(ipbmap); 3148 return (rc); 3149 } 3150 } 3151 3152 /* write the last buffer. */ 3153 write_metapage(mp); 3154 3155 IREAD_UNLOCK(ipbmap); 3156 3157 return (0); 3158 } 3159 3160 3161 static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, 3162 int nblocks) 3163 { 3164 int rc; 3165 int dbitno, word, rembits, nb, nwords, wbitno, agno; 3166 s8 oldroot, *leaf; 3167 struct dmaptree *tp = (struct dmaptree *) & dp->tree; 3168 3169 /* save the current value of the root (i.e. maximum free string) 3170 * of the dmap tree. 3171 */ 3172 oldroot = tp->stree[ROOT]; 3173 3174 /* pick up a pointer to the leaves of the dmap tree */ 3175 leaf = tp->stree + LEAFIND; 3176 3177 /* determine the bit number and word within the dmap of the 3178 * starting block. 3179 */ 3180 dbitno = blkno & (BPERDMAP - 1); 3181 word = dbitno >> L2DBWORD; 3182 3183 /* block range better be within the dmap */ 3184 assert(dbitno + nblocks <= BPERDMAP); 3185 3186 /* allocate the bits of the dmap's words corresponding to the block 3187 * range. not all bits of the first and last words may be contained 3188 * within the block range. if this is the case, we'll work against 3189 * those words (i.e. partial first and/or last) on an individual basis 3190 * (a single pass), allocating the bits of interest by hand and 3191 * updating the leaf corresponding to the dmap word. a single pass 3192 * will be used for all dmap words fully contained within the 3193 * specified range. within this pass, the bits of all fully contained 3194 * dmap words will be marked as free in a single shot and the leaves 3195 * will be updated. a single leaf may describe the free space of 3196 * multiple dmap words, so we may update only a subset of the actual 3197 * leaves corresponding to the dmap words of the block range. 3198 */ 3199 for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { 3200 /* determine the bit number within the word and 3201 * the number of bits within the word. 3202 */ 3203 wbitno = dbitno & (DBWORD - 1); 3204 nb = min(rembits, DBWORD - wbitno); 3205 3206 /* check if only part of a word is to be allocated. 3207 */ 3208 if (nb < DBWORD) { 3209 /* allocate (set to 1) the appropriate bits within 3210 * this dmap word. 3211 */ 3212 dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) 3213 >> wbitno); 3214 3215 word++; 3216 } else { 3217 /* one or more dmap words are fully contained 3218 * within the block range. determine how many 3219 * words and allocate (set to 1) the bits of these 3220 * words. 3221 */ 3222 nwords = rembits >> L2DBWORD; 3223 memset(&dp->wmap[word], (int) ONES, nwords * 4); 3224 3225 /* determine how many bits */ 3226 nb = nwords << L2DBWORD; 3227 word += nwords; 3228 } 3229 } 3230 3231 /* update the free count for this dmap */ 3232 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks); 3233 3234 /* reconstruct summary tree */ 3235 dbInitDmapTree(dp); 3236 3237 BMAP_LOCK(bmp); 3238 3239 /* if this allocation group is completely free, 3240 * update the highest active allocation group number 3241 * if this allocation group is the new max. 3242 */ 3243 agno = blkno >> bmp->db_agl2size; 3244 if (agno > bmp->db_maxag) 3245 bmp->db_maxag = agno; 3246 3247 /* update the free count for the allocation group and map */ 3248 bmp->db_agfree[agno] -= nblocks; 3249 bmp->db_nfree -= nblocks; 3250 3251 BMAP_UNLOCK(bmp); 3252 3253 /* if the root has not changed, done. */ 3254 if (tp->stree[ROOT] == oldroot) 3255 return (0); 3256 3257 /* root changed. bubble the change up to the dmap control pages. 3258 * if the adjustment of the upper level control pages fails, 3259 * backout the bit allocation (thus making everything consistent). 3260 */ 3261 if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0))) 3262 dbFreeBits(bmp, dp, blkno, nblocks); 3263 3264 return (rc); 3265 } 3266 3267 3268 /* 3269 * NAME: dbExtendFS() 3270 * 3271 * FUNCTION: extend bmap from blkno for nblocks; 3272 * dbExtendFS() updates bmap ready for dbAllocBottomUp(); 3273 * 3274 * L2 3275 * | 3276 * L1---------------------------------L1 3277 * | | 3278 * L0---------L0---------L0 L0---------L0---------L0 3279 * | | | | | | 3280 * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; 3281 * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm 3282 * 3283 * <---old---><----------------------------extend-----------------------> 3284 */ 3285 int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) 3286 { 3287 struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb); 3288 int nbperpage = sbi->nbperpage; 3289 int i, i0 = TRUE, j, j0 = TRUE, k, n; 3290 s64 newsize; 3291 s64 p; 3292 struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL; 3293 struct dmapctl *l2dcp, *l1dcp, *l0dcp; 3294 struct dmap *dp; 3295 s8 *l0leaf, *l1leaf, *l2leaf; 3296 struct bmap *bmp = sbi->bmap; 3297 int agno, l2agsize, oldl2agsize; 3298 s64 ag_rem; 3299 3300 newsize = blkno + nblocks; 3301 3302 jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld", 3303 (long long) blkno, (long long) nblocks, (long long) newsize); 3304 3305 /* 3306 * initialize bmap control page. 3307 * 3308 * all the data in bmap control page should exclude 3309 * the mkfs hidden dmap page. 3310 */ 3311 3312 /* update mapsize */ 3313 bmp->db_mapsize = newsize; 3314 bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize); 3315 3316 /* compute new AG size */ 3317 l2agsize = dbGetL2AGSize(newsize); 3318 oldl2agsize = bmp->db_agl2size; 3319 3320 bmp->db_agl2size = l2agsize; 3321 bmp->db_agsize = 1 << l2agsize; 3322 3323 /* compute new number of AG */ 3324 agno = bmp->db_numag; 3325 bmp->db_numag = newsize >> l2agsize; 3326 bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; 3327 3328 /* 3329 * reconfigure db_agfree[] 3330 * from old AG configuration to new AG configuration; 3331 * 3332 * coalesce contiguous k (newAGSize/oldAGSize) AGs; 3333 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; 3334 * note: new AG size = old AG size * (2**x). 3335 */ 3336 if (l2agsize == oldl2agsize) 3337 goto extend; 3338 k = 1 << (l2agsize - oldl2agsize); 3339 ag_rem = bmp->db_agfree[0]; /* save agfree[0] */ 3340 for (i = 0, n = 0; i < agno; n++) { 3341 bmp->db_agfree[n] = 0; /* init collection point */ 3342 3343 /* coalesce cotiguous k AGs; */ 3344 for (j = 0; j < k && i < agno; j++, i++) { 3345 /* merge AGi to AGn */ 3346 bmp->db_agfree[n] += bmp->db_agfree[i]; 3347 } 3348 } 3349 bmp->db_agfree[0] += ag_rem; /* restore agfree[0] */ 3350 3351 for (; n < MAXAG; n++) 3352 bmp->db_agfree[n] = 0; 3353 3354 /* 3355 * update highest active ag number 3356 */ 3357 3358 bmp->db_maxag = bmp->db_maxag / k; 3359 3360 /* 3361 * extend bmap 3362 * 3363 * update bit maps and corresponding level control pages; 3364 * global control page db_nfree, db_agfree[agno], db_maxfreebud; 3365 */ 3366 extend: 3367 /* get L2 page */ 3368 p = BMAPBLKNO + nbperpage; /* L2 page */ 3369 l2mp = read_metapage(ipbmap, p, PSIZE, 0); 3370 if (!l2mp) { 3371 jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read"); 3372 return -EIO; 3373 } 3374 l2dcp = (struct dmapctl *) l2mp->data; 3375 3376 /* compute start L1 */ 3377 k = blkno >> L2MAXL1SIZE; 3378 l2leaf = l2dcp->stree + CTLLEAFIND + k; 3379 p = BLKTOL1(blkno, sbi->l2nbperpage); /* L1 page */ 3380 3381 /* 3382 * extend each L1 in L2 3383 */ 3384 for (; k < LPERCTL; k++, p += nbperpage) { 3385 /* get L1 page */ 3386 if (j0) { 3387 /* read in L1 page: (blkno & (MAXL1SIZE - 1)) */ 3388 l1mp = read_metapage(ipbmap, p, PSIZE, 0); 3389 if (l1mp == NULL) 3390 goto errout; 3391 l1dcp = (struct dmapctl *) l1mp->data; 3392 3393 /* compute start L0 */ 3394 j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE; 3395 l1leaf = l1dcp->stree + CTLLEAFIND + j; 3396 p = BLKTOL0(blkno, sbi->l2nbperpage); 3397 j0 = FALSE; 3398 } else { 3399 /* assign/init L1 page */ 3400 l1mp = get_metapage(ipbmap, p, PSIZE, 0); 3401 if (l1mp == NULL) 3402 goto errout; 3403 3404 l1dcp = (struct dmapctl *) l1mp->data; 3405 3406 /* compute start L0 */ 3407 j = 0; 3408 l1leaf = l1dcp->stree + CTLLEAFIND; 3409 p += nbperpage; /* 1st L0 of L1.k */ 3410 } 3411 3412 /* 3413 * extend each L0 in L1 3414 */ 3415 for (; j < LPERCTL; j++) { 3416 /* get L0 page */ 3417 if (i0) { 3418 /* read in L0 page: (blkno & (MAXL0SIZE - 1)) */ 3419 3420 l0mp = read_metapage(ipbmap, p, PSIZE, 0); 3421 if (l0mp == NULL) 3422 goto errout; 3423 l0dcp = (struct dmapctl *) l0mp->data; 3424 3425 /* compute start dmap */ 3426 i = (blkno & (MAXL0SIZE - 1)) >> 3427 L2BPERDMAP; 3428 l0leaf = l0dcp->stree + CTLLEAFIND + i; 3429 p = BLKTODMAP(blkno, 3430 sbi->l2nbperpage); 3431 i0 = FALSE; 3432 } else { 3433 /* assign/init L0 page */ 3434 l0mp = get_metapage(ipbmap, p, PSIZE, 0); 3435 if (l0mp == NULL) 3436 goto errout; 3437 3438 l0dcp = (struct dmapctl *) l0mp->data; 3439 3440 /* compute start dmap */ 3441 i = 0; 3442 l0leaf = l0dcp->stree + CTLLEAFIND; 3443 p += nbperpage; /* 1st dmap of L0.j */ 3444 } 3445 3446 /* 3447 * extend each dmap in L0 3448 */ 3449 for (; i < LPERCTL; i++) { 3450 /* 3451 * reconstruct the dmap page, and 3452 * initialize corresponding parent L0 leaf 3453 */ 3454 if ((n = blkno & (BPERDMAP - 1))) { 3455 /* read in dmap page: */ 3456 mp = read_metapage(ipbmap, p, 3457 PSIZE, 0); 3458 if (mp == NULL) 3459 goto errout; 3460 n = min(nblocks, (s64)BPERDMAP - n); 3461 } else { 3462 /* assign/init dmap page */ 3463 mp = read_metapage(ipbmap, p, 3464 PSIZE, 0); 3465 if (mp == NULL) 3466 goto errout; 3467 3468 n = min(nblocks, (s64)BPERDMAP); 3469 } 3470 3471 dp = (struct dmap *) mp->data; 3472 *l0leaf = dbInitDmap(dp, blkno, n); 3473 3474 bmp->db_nfree += n; 3475 agno = le64_to_cpu(dp->start) >> l2agsize; 3476 bmp->db_agfree[agno] += n; 3477 3478 write_metapage(mp); 3479 3480 l0leaf++; 3481 p += nbperpage; 3482 3483 blkno += n; 3484 nblocks -= n; 3485 if (nblocks == 0) 3486 break; 3487 } /* for each dmap in a L0 */ 3488 3489 /* 3490 * build current L0 page from its leaves, and 3491 * initialize corresponding parent L1 leaf 3492 */ 3493 *l1leaf = dbInitDmapCtl(l0dcp, 0, ++i); 3494 write_metapage(l0mp); 3495 l0mp = NULL; 3496 3497 if (nblocks) 3498 l1leaf++; /* continue for next L0 */ 3499 else { 3500 /* more than 1 L0 ? */ 3501 if (j > 0) 3502 break; /* build L1 page */ 3503 else { 3504 /* summarize in global bmap page */ 3505 bmp->db_maxfreebud = *l1leaf; 3506 release_metapage(l1mp); 3507 release_metapage(l2mp); 3508 goto finalize; 3509 } 3510 } 3511 } /* for each L0 in a L1 */ 3512 3513 /* 3514 * build current L1 page from its leaves, and 3515 * initialize corresponding parent L2 leaf 3516 */ 3517 *l2leaf = dbInitDmapCtl(l1dcp, 1, ++j); 3518 write_metapage(l1mp); 3519 l1mp = NULL; 3520 3521 if (nblocks) 3522 l2leaf++; /* continue for next L1 */ 3523 else { 3524 /* more than 1 L1 ? */ 3525 if (k > 0) 3526 break; /* build L2 page */ 3527 else { 3528 /* summarize in global bmap page */ 3529 bmp->db_maxfreebud = *l2leaf; 3530 release_metapage(l2mp); 3531 goto finalize; 3532 } 3533 } 3534 } /* for each L1 in a L2 */ 3535 3536 jfs_error(ipbmap->i_sb, 3537 "dbExtendFS: function has not returned as expected"); 3538 errout: 3539 if (l0mp) 3540 release_metapage(l0mp); 3541 if (l1mp) 3542 release_metapage(l1mp); 3543 release_metapage(l2mp); 3544 return -EIO; 3545 3546 /* 3547 * finalize bmap control page 3548 */ 3549 finalize: 3550 3551 return 0; 3552 } 3553 3554 3555 /* 3556 * dbFinalizeBmap() 3557 */ 3558 void dbFinalizeBmap(struct inode *ipbmap) 3559 { 3560 struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; 3561 int actags, inactags, l2nl; 3562 s64 ag_rem, actfree, inactfree, avgfree; 3563 int i, n; 3564 3565 /* 3566 * finalize bmap control page 3567 */ 3568 //finalize: 3569 /* 3570 * compute db_agpref: preferred ag to allocate from 3571 * (the leftmost ag with average free space in it); 3572 */ 3573 //agpref: 3574 /* get the number of active ags and inacitve ags */ 3575 actags = bmp->db_maxag + 1; 3576 inactags = bmp->db_numag - actags; 3577 ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1); /* ??? */ 3578 3579 /* determine how many blocks are in the inactive allocation 3580 * groups. in doing this, we must account for the fact that 3581 * the rightmost group might be a partial group (i.e. file 3582 * system size is not a multiple of the group size). 3583 */ 3584 inactfree = (inactags && ag_rem) ? 3585 ((inactags - 1) << bmp->db_agl2size) + ag_rem 3586 : inactags << bmp->db_agl2size; 3587 3588 /* determine how many free blocks are in the active 3589 * allocation groups plus the average number of free blocks 3590 * within the active ags. 3591 */ 3592 actfree = bmp->db_nfree - inactfree; 3593 avgfree = (u32) actfree / (u32) actags; 3594 3595 /* if the preferred allocation group has not average free space. 3596 * re-establish the preferred group as the leftmost 3597 * group with average free space. 3598 */ 3599 if (bmp->db_agfree[bmp->db_agpref] < avgfree) { 3600 for (bmp->db_agpref = 0; bmp->db_agpref < actags; 3601 bmp->db_agpref++) { 3602 if (bmp->db_agfree[bmp->db_agpref] >= avgfree) 3603 break; 3604 } 3605 if (bmp->db_agpref >= bmp->db_numag) { 3606 jfs_error(ipbmap->i_sb, 3607 "cannot find ag with average freespace"); 3608 } 3609 } 3610 3611 /* 3612 * compute db_aglevel, db_agheigth, db_width, db_agstart: 3613 * an ag is covered in aglevel dmapctl summary tree, 3614 * at agheight level height (from leaf) with agwidth number of nodes 3615 * each, which starts at agstart index node of the smmary tree node 3616 * array; 3617 */ 3618 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); 3619 l2nl = 3620 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); 3621 bmp->db_agheigth = l2nl >> 1; 3622 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1)); 3623 for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0; 3624 i--) { 3625 bmp->db_agstart += n; 3626 n <<= 2; 3627 } 3628 3629 } 3630 3631 3632 /* 3633 * NAME: dbInitDmap()/ujfs_idmap_page() 3634 * 3635 * FUNCTION: initialize working/persistent bitmap of the dmap page 3636 * for the specified number of blocks: 3637 * 3638 * at entry, the bitmaps had been initialized as free (ZEROS); 3639 * The number of blocks will only account for the actually 3640 * existing blocks. Blocks which don't actually exist in 3641 * the aggregate will be marked as allocated (ONES); 3642 * 3643 * PARAMETERS: 3644 * dp - pointer to page of map 3645 * nblocks - number of blocks this page 3646 * 3647 * RETURNS: NONE 3648 */ 3649 static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks) 3650 { 3651 int blkno, w, b, r, nw, nb, i; 3652 3653 /* starting block number within the dmap */ 3654 blkno = Blkno & (BPERDMAP - 1); 3655 3656 if (blkno == 0) { 3657 dp->nblocks = dp->nfree = cpu_to_le32(nblocks); 3658 dp->start = cpu_to_le64(Blkno); 3659 3660 if (nblocks == BPERDMAP) { 3661 memset(&dp->wmap[0], 0, LPERDMAP * 4); 3662 memset(&dp->pmap[0], 0, LPERDMAP * 4); 3663 goto initTree; 3664 } 3665 } else { 3666 dp->nblocks = 3667 cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks); 3668 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks); 3669 } 3670 3671 /* word number containing start block number */ 3672 w = blkno >> L2DBWORD; 3673 3674 /* 3675 * free the bits corresponding to the block range (ZEROS): 3676 * note: not all bits of the first and last words may be contained 3677 * within the block range. 3678 */ 3679 for (r = nblocks; r > 0; r -= nb, blkno += nb) { 3680 /* number of bits preceding range to be freed in the word */ 3681 b = blkno & (DBWORD - 1); 3682 /* number of bits to free in the word */ 3683 nb = min(r, DBWORD - b); 3684 3685 /* is partial word to be freed ? */ 3686 if (nb < DBWORD) { 3687 /* free (set to 0) from the bitmap word */ 3688 dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) 3689 >> b)); 3690 dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) 3691 >> b)); 3692 3693 /* skip the word freed */ 3694 w++; 3695 } else { 3696 /* free (set to 0) contiguous bitmap words */ 3697 nw = r >> L2DBWORD; 3698 memset(&dp->wmap[w], 0, nw * 4); 3699 memset(&dp->pmap[w], 0, nw * 4); 3700 3701 /* skip the words freed */ 3702 nb = nw << L2DBWORD; 3703 w += nw; 3704 } 3705 } 3706 3707 /* 3708 * mark bits following the range to be freed (non-existing 3709 * blocks) as allocated (ONES) 3710 */ 3711 3712 if (blkno == BPERDMAP) 3713 goto initTree; 3714 3715 /* the first word beyond the end of existing blocks */ 3716 w = blkno >> L2DBWORD; 3717 3718 /* does nblocks fall on a 32-bit boundary ? */ 3719 b = blkno & (DBWORD - 1); 3720 if (b) { 3721 /* mark a partial word allocated */ 3722 dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b); 3723 w++; 3724 } 3725 3726 /* set the rest of the words in the page to allocated (ONES) */ 3727 for (i = w; i < LPERDMAP; i++) 3728 dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES); 3729 3730 /* 3731 * init tree 3732 */ 3733 initTree: 3734 return (dbInitDmapTree(dp)); 3735 } 3736 3737 3738 /* 3739 * NAME: dbInitDmapTree()/ujfs_complete_dmap() 3740 * 3741 * FUNCTION: initialize summary tree of the specified dmap: 3742 * 3743 * at entry, bitmap of the dmap has been initialized; 3744 * 3745 * PARAMETERS: 3746 * dp - dmap to complete 3747 * blkno - starting block number for this dmap 3748 * treemax - will be filled in with max free for this dmap 3749 * 3750 * RETURNS: max free string at the root of the tree 3751 */ 3752 static int dbInitDmapTree(struct dmap * dp) 3753 { 3754 struct dmaptree *tp; 3755 s8 *cp; 3756 int i; 3757 3758 /* init fixed info of tree */ 3759 tp = &dp->tree; 3760 tp->nleafs = cpu_to_le32(LPERDMAP); 3761 tp->l2nleafs = cpu_to_le32(L2LPERDMAP); 3762 tp->leafidx = cpu_to_le32(LEAFIND); 3763 tp->height = cpu_to_le32(4); 3764 tp->budmin = BUDMIN; 3765 3766 /* init each leaf from corresponding wmap word: 3767 * note: leaf is set to NOFREE(-1) if all blocks of corresponding 3768 * bitmap word are allocated. 3769 */ 3770 cp = tp->stree + le32_to_cpu(tp->leafidx); 3771 for (i = 0; i < LPERDMAP; i++) 3772 *cp++ = dbMaxBud((u8 *) & dp->wmap[i]); 3773 3774 /* build the dmap's binary buddy summary tree */ 3775 return (dbInitTree(tp)); 3776 } 3777 3778 3779 /* 3780 * NAME: dbInitTree()/ujfs_adjtree() 3781 * 3782 * FUNCTION: initialize binary buddy summary tree of a dmap or dmapctl. 3783 * 3784 * at entry, the leaves of the tree has been initialized 3785 * from corresponding bitmap word or root of summary tree 3786 * of the child control page; 3787 * configure binary buddy system at the leaf level, then 3788 * bubble up the values of the leaf nodes up the tree. 3789 * 3790 * PARAMETERS: 3791 * cp - Pointer to the root of the tree 3792 * l2leaves- Number of leaf nodes as a power of 2 3793 * l2min - Number of blocks that can be covered by a leaf 3794 * as a power of 2 3795 * 3796 * RETURNS: max free string at the root of the tree 3797 */ 3798 static int dbInitTree(struct dmaptree * dtp) 3799 { 3800 int l2max, l2free, bsize, nextb, i; 3801 int child, parent, nparent; 3802 s8 *tp, *cp, *cp1; 3803 3804 tp = dtp->stree; 3805 3806 /* Determine the maximum free string possible for the leaves */ 3807 l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin; 3808 3809 /* 3810 * configure the leaf levevl into binary buddy system 3811 * 3812 * Try to combine buddies starting with a buddy size of 1 3813 * (i.e. two leaves). At a buddy size of 1 two buddy leaves 3814 * can be combined if both buddies have a maximum free of l2min; 3815 * the combination will result in the left-most buddy leaf having 3816 * a maximum free of l2min+1. 3817 * After processing all buddies for a given size, process buddies 3818 * at the next higher buddy size (i.e. current size * 2) and 3819 * the next maximum free (current free + 1). 3820 * This continues until the maximum possible buddy combination 3821 * yields maximum free. 3822 */ 3823 for (l2free = dtp->budmin, bsize = 1; l2free < l2max; 3824 l2free++, bsize = nextb) { 3825 /* get next buddy size == current buddy pair size */ 3826 nextb = bsize << 1; 3827 3828 /* scan each adjacent buddy pair at current buddy size */ 3829 for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx); 3830 i < le32_to_cpu(dtp->nleafs); 3831 i += nextb, cp += nextb) { 3832 /* coalesce if both adjacent buddies are max free */ 3833 if (*cp == l2free && *(cp + bsize) == l2free) { 3834 *cp = l2free + 1; /* left take right */ 3835 *(cp + bsize) = -1; /* right give left */ 3836 } 3837 } 3838 } 3839 3840 /* 3841 * bubble summary information of leaves up the tree. 3842 * 3843 * Starting at the leaf node level, the four nodes described by 3844 * the higher level parent node are compared for a maximum free and 3845 * this maximum becomes the value of the parent node. 3846 * when all lower level nodes are processed in this fashion then 3847 * move up to the next level (parent becomes a lower level node) and 3848 * continue the process for that level. 3849 */ 3850 for (child = le32_to_cpu(dtp->leafidx), 3851 nparent = le32_to_cpu(dtp->nleafs) >> 2; 3852 nparent > 0; nparent >>= 2, child = parent) { 3853 /* get index of 1st node of parent level */ 3854 parent = (child - 1) >> 2; 3855 3856 /* set the value of the parent node as the maximum 3857 * of the four nodes of the current level. 3858 */ 3859 for (i = 0, cp = tp + child, cp1 = tp + parent; 3860 i < nparent; i++, cp += 4, cp1++) 3861 *cp1 = TREEMAX(cp); 3862 } 3863 3864 return (*tp); 3865 } 3866 3867 3868 /* 3869 * dbInitDmapCtl() 3870 * 3871 * function: initialize dmapctl page 3872 */ 3873 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i) 3874 { /* start leaf index not covered by range */ 3875 s8 *cp; 3876 3877 dcp->nleafs = cpu_to_le32(LPERCTL); 3878 dcp->l2nleafs = cpu_to_le32(L2LPERCTL); 3879 dcp->leafidx = cpu_to_le32(CTLLEAFIND); 3880 dcp->height = cpu_to_le32(5); 3881 dcp->budmin = L2BPERDMAP + L2LPERCTL * level; 3882 3883 /* 3884 * initialize the leaves of current level that were not covered 3885 * by the specified input block range (i.e. the leaves have no 3886 * low level dmapctl or dmap). 3887 */ 3888 cp = &dcp->stree[CTLLEAFIND + i]; 3889 for (; i < LPERCTL; i++) 3890 *cp++ = NOFREE; 3891 3892 /* build the dmap's binary buddy summary tree */ 3893 return (dbInitTree((struct dmaptree *) dcp)); 3894 } 3895 3896 3897 /* 3898 * NAME: dbGetL2AGSize()/ujfs_getagl2size() 3899 * 3900 * FUNCTION: Determine log2(allocation group size) from aggregate size 3901 * 3902 * PARAMETERS: 3903 * nblocks - Number of blocks in aggregate 3904 * 3905 * RETURNS: log2(allocation group size) in aggregate blocks 3906 */ 3907 static int dbGetL2AGSize(s64 nblocks) 3908 { 3909 s64 sz; 3910 s64 m; 3911 int l2sz; 3912 3913 if (nblocks < BPERDMAP * MAXAG) 3914 return (L2BPERDMAP); 3915 3916 /* round up aggregate size to power of 2 */ 3917 m = ((u64) 1 << (64 - 1)); 3918 for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) { 3919 if (m & nblocks) 3920 break; 3921 } 3922 3923 sz = (s64) 1 << l2sz; 3924 if (sz < nblocks) 3925 l2sz += 1; 3926 3927 /* agsize = roundupSize/max_number_of_ag */ 3928 return (l2sz - L2MAXAG); 3929 } 3930 3931 3932 /* 3933 * NAME: dbMapFileSizeToMapSize() 3934 * 3935 * FUNCTION: compute number of blocks the block allocation map file 3936 * can cover from the map file size; 3937 * 3938 * RETURNS: Number of blocks which can be covered by this block map file; 3939 */ 3940 3941 /* 3942 * maximum number of map pages at each level including control pages 3943 */ 3944 #define MAXL0PAGES (1 + LPERCTL) 3945 #define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES) 3946 #define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES) 3947 3948 /* 3949 * convert number of map pages to the zero origin top dmapctl level 3950 */ 3951 #define BMAPPGTOLEV(npages) \ 3952 (((npages) <= 3 + MAXL0PAGES) ? 0 \ 3953 : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) 3954 3955 s64 dbMapFileSizeToMapSize(struct inode * ipbmap) 3956 { 3957 struct super_block *sb = ipbmap->i_sb; 3958 s64 nblocks; 3959 s64 npages, ndmaps; 3960 int level, i; 3961 int complete, factor; 3962 3963 nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize; 3964 npages = nblocks >> JFS_SBI(sb)->l2nbperpage; 3965 level = BMAPPGTOLEV(npages); 3966 3967 /* At each level, accumulate the number of dmap pages covered by 3968 * the number of full child levels below it; 3969 * repeat for the last incomplete child level. 3970 */ 3971 ndmaps = 0; 3972 npages--; /* skip the first global control page */ 3973 /* skip higher level control pages above top level covered by map */ 3974 npages -= (2 - level); 3975 npages--; /* skip top level's control page */ 3976 for (i = level; i >= 0; i--) { 3977 factor = 3978 (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); 3979 complete = (u32) npages / factor; 3980 ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL 3981 : ((i == 1) ? LPERCTL : 1)); 3982 3983 /* pages in last/incomplete child */ 3984 npages = (u32) npages % factor; 3985 /* skip incomplete child's level control page */ 3986 npages--; 3987 } 3988 3989 /* convert the number of dmaps into the number of blocks 3990 * which can be covered by the dmaps; 3991 */ 3992 nblocks = ndmaps << L2BPERDMAP; 3993 3994 return (nblocks); 3995 } 3996