xref: /linux/fs/jfs/jfs_dmap.c (revision 54a8a2220c936a47840c9a3d74910c5a56fae2ed)
1 /*
2  *   Copyright (C) International Business Machines Corp., 2000-2004
3  *
4  *   This program is free software;  you can redistribute it and/or modify
5  *   it under the terms of the GNU General Public License as published by
6  *   the Free Software Foundation; either version 2 of the License, or
7  *   (at your option) any later version.
8  *
9  *   This program is distributed in the hope that it will be useful,
10  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
11  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
12  *   the GNU General Public License for more details.
13  *
14  *   You should have received a copy of the GNU General Public License
15  *   along with this program;  if not, write to the Free Software
16  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17  */
18 
19 #include <linux/fs.h>
20 #include "jfs_incore.h"
21 #include "jfs_superblock.h"
22 #include "jfs_dmap.h"
23 #include "jfs_imap.h"
24 #include "jfs_lock.h"
25 #include "jfs_metapage.h"
26 #include "jfs_debug.h"
27 
28 /*
29  *	SERIALIZATION of the Block Allocation Map.
30  *
31  *	the working state of the block allocation map is accessed in
32  *	two directions:
33  *
34  *	1) allocation and free requests that start at the dmap
35  *	   level and move up through the dmap control pages (i.e.
36  *	   the vast majority of requests).
37  *
38  * 	2) allocation requests that start at dmap control page
39  *	   level and work down towards the dmaps.
40  *
41  *	the serialization scheme used here is as follows.
42  *
43  *	requests which start at the bottom are serialized against each
44  *	other through buffers and each requests holds onto its buffers
45  *	as it works it way up from a single dmap to the required level
46  *	of dmap control page.
47  *	requests that start at the top are serialized against each other
48  *	and request that start from the bottom by the multiple read/single
49  *	write inode lock of the bmap inode. requests starting at the top
50  *	take this lock in write mode while request starting at the bottom
51  *	take the lock in read mode.  a single top-down request may proceed
52  *	exclusively while multiple bottoms-up requests may proceed
53  * 	simultaneously (under the protection of busy buffers).
54  *
55  *	in addition to information found in dmaps and dmap control pages,
56  *	the working state of the block allocation map also includes read/
57  *	write information maintained in the bmap descriptor (i.e. total
58  *	free block count, allocation group level free block counts).
59  *	a single exclusive lock (BMAP_LOCK) is used to guard this information
60  *	in the face of multiple-bottoms up requests.
61  *	(lock ordering: IREAD_LOCK, BMAP_LOCK);
62  *
63  *	accesses to the persistent state of the block allocation map (limited
64  *	to the persistent bitmaps in dmaps) is guarded by (busy) buffers.
65  */
66 
67 #define BMAP_LOCK_INIT(bmp)	init_MUTEX(&bmp->db_bmaplock)
68 #define BMAP_LOCK(bmp)		down(&bmp->db_bmaplock)
69 #define BMAP_UNLOCK(bmp)	up(&bmp->db_bmaplock)
70 
71 /*
72  * forward references
73  */
74 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
75 			int nblocks);
76 static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
77 static void dbBackSplit(dmtree_t * tp, int leafno);
78 static int dbJoin(dmtree_t * tp, int leafno, int newval);
79 static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
80 static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
81 		    int level);
82 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results);
83 static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
84 		       int nblocks);
85 static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno,
86 		       int nblocks,
87 		       int l2nb, s64 * results);
88 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
89 		       int nblocks);
90 static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks,
91 			  int l2nb,
92 			  s64 * results);
93 static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb,
94 		     s64 * results);
95 static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno,
96 		      s64 * results);
97 static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
98 static int dbFindBits(u32 word, int l2nb);
99 static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno);
100 static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx);
101 static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
102 		      int nblocks);
103 static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
104 		      int nblocks);
105 static int dbMaxBud(u8 * cp);
106 s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
107 static int blkstol2(s64 nb);
108 
109 static int cntlz(u32 value);
110 static int cnttz(u32 word);
111 
112 static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
113 			 int nblocks);
114 static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks);
115 static int dbInitDmapTree(struct dmap * dp);
116 static int dbInitTree(struct dmaptree * dtp);
117 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i);
118 static int dbGetL2AGSize(s64 nblocks);
119 
120 /*
121  *	buddy table
122  *
123  * table used for determining buddy sizes within characters of
124  * dmap bitmap words.  the characters themselves serve as indexes
125  * into the table, with the table elements yielding the maximum
126  * binary buddy of free bits within the character.
127  */
128 static s8 budtab[256] = {
129 	3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
130 	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
131 	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
132 	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
133 	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
134 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
135 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
136 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
137 	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
138 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
139 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
140 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
141 	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
142 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
143 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
144 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1
145 };
146 
147 
148 /*
149  * NAME:    	dbMount()
150  *
151  * FUNCTION:	initializate the block allocation map.
152  *
153  *		memory is allocated for the in-core bmap descriptor and
154  *		the in-core descriptor is initialized from disk.
155  *
156  * PARAMETERS:
157  *      ipbmap	-  pointer to in-core inode for the block map.
158  *
159  * RETURN VALUES:
160  *      0	- success
161  *      -ENOMEM	- insufficient memory
162  *      -EIO	- i/o error
163  */
164 int dbMount(struct inode *ipbmap)
165 {
166 	struct bmap *bmp;
167 	struct dbmap_disk *dbmp_le;
168 	struct metapage *mp;
169 	int i;
170 
171 	/*
172 	 * allocate/initialize the in-memory bmap descriptor
173 	 */
174 	/* allocate memory for the in-memory bmap descriptor */
175 	bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL);
176 	if (bmp == NULL)
177 		return -ENOMEM;
178 
179 	/* read the on-disk bmap descriptor. */
180 	mp = read_metapage(ipbmap,
181 			   BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
182 			   PSIZE, 0);
183 	if (mp == NULL) {
184 		kfree(bmp);
185 		return -EIO;
186 	}
187 
188 	/* copy the on-disk bmap descriptor to its in-memory version. */
189 	dbmp_le = (struct dbmap_disk *) mp->data;
190 	bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
191 	bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
192 	bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
193 	bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
194 	bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
195 	bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
196 	bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
197 	bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
198 	bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth);
199 	bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
200 	bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
201 	bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
202 	for (i = 0; i < MAXAG; i++)
203 		bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
204 	bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
205 	bmp->db_maxfreebud = dbmp_le->dn_maxfreebud;
206 
207 	/* release the buffer. */
208 	release_metapage(mp);
209 
210 	/* bind the bmap inode and the bmap descriptor to each other. */
211 	bmp->db_ipbmap = ipbmap;
212 	JFS_SBI(ipbmap->i_sb)->bmap = bmp;
213 
214 	memset(bmp->db_active, 0, sizeof(bmp->db_active));
215 
216 	/*
217 	 * allocate/initialize the bmap lock
218 	 */
219 	BMAP_LOCK_INIT(bmp);
220 
221 	return (0);
222 }
223 
224 
225 /*
226  * NAME:    	dbUnmount()
227  *
228  * FUNCTION:	terminate the block allocation map in preparation for
229  *		file system unmount.
230  *
231  * 		the in-core bmap descriptor is written to disk and
232  *		the memory for this descriptor is freed.
233  *
234  * PARAMETERS:
235  *      ipbmap	-  pointer to in-core inode for the block map.
236  *
237  * RETURN VALUES:
238  *      0	- success
239  *      -EIO	- i/o error
240  */
241 int dbUnmount(struct inode *ipbmap, int mounterror)
242 {
243 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
244 
245 	if (!(mounterror || isReadOnly(ipbmap)))
246 		dbSync(ipbmap);
247 
248 	/*
249 	 * Invalidate the page cache buffers
250 	 */
251 	truncate_inode_pages(ipbmap->i_mapping, 0);
252 
253 	/* free the memory for the in-memory bmap. */
254 	kfree(bmp);
255 
256 	return (0);
257 }
258 
259 /*
260  *	dbSync()
261  */
262 int dbSync(struct inode *ipbmap)
263 {
264 	struct dbmap_disk *dbmp_le;
265 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
266 	struct metapage *mp;
267 	int i;
268 
269 	/*
270 	 * write bmap global control page
271 	 */
272 	/* get the buffer for the on-disk bmap descriptor. */
273 	mp = read_metapage(ipbmap,
274 			   BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
275 			   PSIZE, 0);
276 	if (mp == NULL) {
277 		jfs_err("dbSync: read_metapage failed!");
278 		return -EIO;
279 	}
280 	/* copy the in-memory version of the bmap to the on-disk version */
281 	dbmp_le = (struct dbmap_disk *) mp->data;
282 	dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize);
283 	dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree);
284 	dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage);
285 	dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag);
286 	dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel);
287 	dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
288 	dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
289 	dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
290 	dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth);
291 	dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
292 	dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
293 	dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
294 	for (i = 0; i < MAXAG; i++)
295 		dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]);
296 	dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize);
297 	dbmp_le->dn_maxfreebud = bmp->db_maxfreebud;
298 
299 	/* write the buffer */
300 	write_metapage(mp);
301 
302 	/*
303 	 * write out dirty pages of bmap
304 	 */
305 	filemap_fdatawrite(ipbmap->i_mapping);
306 	filemap_fdatawait(ipbmap->i_mapping);
307 
308 	ipbmap->i_state |= I_DIRTY;
309 	diWriteSpecial(ipbmap, 0);
310 
311 	return (0);
312 }
313 
314 
315 /*
316  * NAME:    	dbFree()
317  *
318  * FUNCTION:	free the specified block range from the working block
319  *		allocation map.
320  *
321  *		the blocks will be free from the working map one dmap
322  *		at a time.
323  *
324  * PARAMETERS:
325  *      ip	-  pointer to in-core inode;
326  *      blkno	-  starting block number to be freed.
327  *      nblocks	-  number of blocks to be freed.
328  *
329  * RETURN VALUES:
330  *      0	- success
331  *      -EIO	- i/o error
332  */
333 int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
334 {
335 	struct metapage *mp;
336 	struct dmap *dp;
337 	int nb, rc;
338 	s64 lblkno, rem;
339 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
340 	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
341 
342 	IREAD_LOCK(ipbmap);
343 
344 	/* block to be freed better be within the mapsize. */
345 	if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
346 		IREAD_UNLOCK(ipbmap);
347 		printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
348 		       (unsigned long long) blkno,
349 		       (unsigned long long) nblocks);
350 		jfs_error(ip->i_sb,
351 			  "dbFree: block to be freed is outside the map");
352 		return -EIO;
353 	}
354 
355 	/*
356 	 * free the blocks a dmap at a time.
357 	 */
358 	mp = NULL;
359 	for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
360 		/* release previous dmap if any */
361 		if (mp) {
362 			write_metapage(mp);
363 		}
364 
365 		/* get the buffer for the current dmap. */
366 		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
367 		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
368 		if (mp == NULL) {
369 			IREAD_UNLOCK(ipbmap);
370 			return -EIO;
371 		}
372 		dp = (struct dmap *) mp->data;
373 
374 		/* determine the number of blocks to be freed from
375 		 * this dmap.
376 		 */
377 		nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
378 
379 		/* free the blocks. */
380 		if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
381 			jfs_error(ip->i_sb, "dbFree: error in block map\n");
382 			release_metapage(mp);
383 			IREAD_UNLOCK(ipbmap);
384 			return (rc);
385 		}
386 	}
387 
388 	/* write the last buffer. */
389 	write_metapage(mp);
390 
391 	IREAD_UNLOCK(ipbmap);
392 
393 	return (0);
394 }
395 
396 
397 /*
398  * NAME:	dbUpdatePMap()
399  *
400  * FUNCTION:    update the allocation state (free or allocate) of the
401  *		specified block range in the persistent block allocation map.
402  *
403  *		the blocks will be updated in the persistent map one
404  *		dmap at a time.
405  *
406  * PARAMETERS:
407  *      ipbmap	-  pointer to in-core inode for the block map.
408  *      free	- TRUE if block range is to be freed from the persistent
409  *		  map; FALSE if it is to   be allocated.
410  *      blkno	-  starting block number of the range.
411  *      nblocks	-  number of contiguous blocks in the range.
412  *      tblk	-  transaction block;
413  *
414  * RETURN VALUES:
415  *      0	- success
416  *      -EIO	- i/o error
417  */
418 int
419 dbUpdatePMap(struct inode *ipbmap,
420 	     int free, s64 blkno, s64 nblocks, struct tblock * tblk)
421 {
422 	int nblks, dbitno, wbitno, rbits;
423 	int word, nbits, nwords;
424 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
425 	s64 lblkno, rem, lastlblkno;
426 	u32 mask;
427 	struct dmap *dp;
428 	struct metapage *mp;
429 	struct jfs_log *log;
430 	int lsn, difft, diffp;
431 	unsigned long flags;
432 
433 	/* the blocks better be within the mapsize. */
434 	if (blkno + nblocks > bmp->db_mapsize) {
435 		printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
436 		       (unsigned long long) blkno,
437 		       (unsigned long long) nblocks);
438 		jfs_error(ipbmap->i_sb,
439 			  "dbUpdatePMap: blocks are outside the map");
440 		return -EIO;
441 	}
442 
443 	/* compute delta of transaction lsn from log syncpt */
444 	lsn = tblk->lsn;
445 	log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
446 	logdiff(difft, lsn, log);
447 
448 	/*
449 	 * update the block state a dmap at a time.
450 	 */
451 	mp = NULL;
452 	lastlblkno = 0;
453 	for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) {
454 		/* get the buffer for the current dmap. */
455 		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
456 		if (lblkno != lastlblkno) {
457 			if (mp) {
458 				write_metapage(mp);
459 			}
460 
461 			mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE,
462 					   0);
463 			if (mp == NULL)
464 				return -EIO;
465 			metapage_wait_for_io(mp);
466 		}
467 		dp = (struct dmap *) mp->data;
468 
469 		/* determine the bit number and word within the dmap of
470 		 * the starting block.  also determine how many blocks
471 		 * are to be updated within this dmap.
472 		 */
473 		dbitno = blkno & (BPERDMAP - 1);
474 		word = dbitno >> L2DBWORD;
475 		nblks = min(rem, (s64)BPERDMAP - dbitno);
476 
477 		/* update the bits of the dmap words. the first and last
478 		 * words may only have a subset of their bits updated. if
479 		 * this is the case, we'll work against that word (i.e.
480 		 * partial first and/or last) only in a single pass.  a
481 		 * single pass will also be used to update all words that
482 		 * are to have all their bits updated.
483 		 */
484 		for (rbits = nblks; rbits > 0;
485 		     rbits -= nbits, dbitno += nbits) {
486 			/* determine the bit number within the word and
487 			 * the number of bits within the word.
488 			 */
489 			wbitno = dbitno & (DBWORD - 1);
490 			nbits = min(rbits, DBWORD - wbitno);
491 
492 			/* check if only part of the word is to be updated. */
493 			if (nbits < DBWORD) {
494 				/* update (free or allocate) the bits
495 				 * in this word.
496 				 */
497 				mask =
498 				    (ONES << (DBWORD - nbits) >> wbitno);
499 				if (free)
500 					dp->pmap[word] &=
501 					    cpu_to_le32(~mask);
502 				else
503 					dp->pmap[word] |=
504 					    cpu_to_le32(mask);
505 
506 				word += 1;
507 			} else {
508 				/* one or more words are to have all
509 				 * their bits updated.  determine how
510 				 * many words and how many bits.
511 				 */
512 				nwords = rbits >> L2DBWORD;
513 				nbits = nwords << L2DBWORD;
514 
515 				/* update (free or allocate) the bits
516 				 * in these words.
517 				 */
518 				if (free)
519 					memset(&dp->pmap[word], 0,
520 					       nwords * 4);
521 				else
522 					memset(&dp->pmap[word], (int) ONES,
523 					       nwords * 4);
524 
525 				word += nwords;
526 			}
527 		}
528 
529 		/*
530 		 * update dmap lsn
531 		 */
532 		if (lblkno == lastlblkno)
533 			continue;
534 
535 		lastlblkno = lblkno;
536 
537 		if (mp->lsn != 0) {
538 			/* inherit older/smaller lsn */
539 			logdiff(diffp, mp->lsn, log);
540 			LOGSYNC_LOCK(log, flags);
541 			if (difft < diffp) {
542 				mp->lsn = lsn;
543 
544 				/* move bp after tblock in logsync list */
545 				list_move(&mp->synclist, &tblk->synclist);
546 			}
547 
548 			/* inherit younger/larger clsn */
549 			logdiff(difft, tblk->clsn, log);
550 			logdiff(diffp, mp->clsn, log);
551 			if (difft > diffp)
552 				mp->clsn = tblk->clsn;
553 			LOGSYNC_UNLOCK(log, flags);
554 		} else {
555 			mp->log = log;
556 			mp->lsn = lsn;
557 
558 			/* insert bp after tblock in logsync list */
559 			LOGSYNC_LOCK(log, flags);
560 
561 			log->count++;
562 			list_add(&mp->synclist, &tblk->synclist);
563 
564 			mp->clsn = tblk->clsn;
565 			LOGSYNC_UNLOCK(log, flags);
566 		}
567 	}
568 
569 	/* write the last buffer. */
570 	if (mp) {
571 		write_metapage(mp);
572 	}
573 
574 	return (0);
575 }
576 
577 
578 /*
579  * NAME:	dbNextAG()
580  *
581  * FUNCTION:    find the preferred allocation group for new allocations.
582  *
583  *		Within the allocation groups, we maintain a preferred
584  *		allocation group which consists of a group with at least
585  *		average free space.  It is the preferred group that we target
586  *		new inode allocation towards.  The tie-in between inode
587  *		allocation and block allocation occurs as we allocate the
588  *		first (data) block of an inode and specify the inode (block)
589  *		as the allocation hint for this block.
590  *
591  *		We try to avoid having more than one open file growing in
592  *		an allocation group, as this will lead to fragmentation.
593  *		This differs from the old OS/2 method of trying to keep
594  *		empty ags around for large allocations.
595  *
596  * PARAMETERS:
597  *      ipbmap	-  pointer to in-core inode for the block map.
598  *
599  * RETURN VALUES:
600  *      the preferred allocation group number.
601  */
602 int dbNextAG(struct inode *ipbmap)
603 {
604 	s64 avgfree;
605 	int agpref;
606 	s64 hwm = 0;
607 	int i;
608 	int next_best = -1;
609 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
610 
611 	BMAP_LOCK(bmp);
612 
613 	/* determine the average number of free blocks within the ags. */
614 	avgfree = (u32)bmp->db_nfree / bmp->db_numag;
615 
616 	/*
617 	 * if the current preferred ag does not have an active allocator
618 	 * and has at least average freespace, return it
619 	 */
620 	agpref = bmp->db_agpref;
621 	if ((atomic_read(&bmp->db_active[agpref]) == 0) &&
622 	    (bmp->db_agfree[agpref] >= avgfree))
623 		goto unlock;
624 
625 	/* From the last preferred ag, find the next one with at least
626 	 * average free space.
627 	 */
628 	for (i = 0 ; i < bmp->db_numag; i++, agpref++) {
629 		if (agpref == bmp->db_numag)
630 			agpref = 0;
631 
632 		if (atomic_read(&bmp->db_active[agpref]))
633 			/* open file is currently growing in this ag */
634 			continue;
635 		if (bmp->db_agfree[agpref] >= avgfree) {
636 			/* Return this one */
637 			bmp->db_agpref = agpref;
638 			goto unlock;
639 		} else if (bmp->db_agfree[agpref] > hwm) {
640 			/* Less than avg. freespace, but best so far */
641 			hwm = bmp->db_agfree[agpref];
642 			next_best = agpref;
643 		}
644 	}
645 
646 	/*
647 	 * If no inactive ag was found with average freespace, use the
648 	 * next best
649 	 */
650 	if (next_best != -1)
651 		bmp->db_agpref = next_best;
652 	/* else leave db_agpref unchanged */
653 unlock:
654 	BMAP_UNLOCK(bmp);
655 
656 	/* return the preferred group.
657 	 */
658 	return (bmp->db_agpref);
659 }
660 
661 /*
662  * NAME:	dbAlloc()
663  *
664  * FUNCTION:    attempt to allocate a specified number of contiguous free
665  *		blocks from the working allocation block map.
666  *
667  *		the block allocation policy uses hints and a multi-step
668  *		approach.
669  *
670  *	  	for allocation requests smaller than the number of blocks
671  *		per dmap, we first try to allocate the new blocks
672  *		immediately following the hint.  if these blocks are not
673  *		available, we try to allocate blocks near the hint.  if
674  *		no blocks near the hint are available, we next try to
675  *		allocate within the same dmap as contains the hint.
676  *
677  *		if no blocks are available in the dmap or the allocation
678  *		request is larger than the dmap size, we try to allocate
679  *		within the same allocation group as contains the hint. if
680  *		this does not succeed, we finally try to allocate anywhere
681  *		within the aggregate.
682  *
683  *		we also try to allocate anywhere within the aggregate for
684  *		for allocation requests larger than the allocation group
685  *		size or requests that specify no hint value.
686  *
687  * PARAMETERS:
688  *      ip	-  pointer to in-core inode;
689  *      hint	- allocation hint.
690  *      nblocks	- number of contiguous blocks in the range.
691  *      results	- on successful return, set to the starting block number
692  *		  of the newly allocated contiguous range.
693  *
694  * RETURN VALUES:
695  *      0	- success
696  *      -ENOSPC	- insufficient disk resources
697  *      -EIO	- i/o error
698  */
699 int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
700 {
701 	int rc, agno;
702 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
703 	struct bmap *bmp;
704 	struct metapage *mp;
705 	s64 lblkno, blkno;
706 	struct dmap *dp;
707 	int l2nb;
708 	s64 mapSize;
709 	int writers;
710 
711 	/* assert that nblocks is valid */
712 	assert(nblocks > 0);
713 
714 #ifdef _STILL_TO_PORT
715 	/* DASD limit check                                     F226941 */
716 	if (OVER_LIMIT(ip, nblocks))
717 		return -ENOSPC;
718 #endif				/* _STILL_TO_PORT */
719 
720 	/* get the log2 number of blocks to be allocated.
721 	 * if the number of blocks is not a log2 multiple,
722 	 * it will be rounded up to the next log2 multiple.
723 	 */
724 	l2nb = BLKSTOL2(nblocks);
725 
726 	bmp = JFS_SBI(ip->i_sb)->bmap;
727 
728 //retry:        /* serialize w.r.t.extendfs() */
729 	mapSize = bmp->db_mapsize;
730 
731 	/* the hint should be within the map */
732 	if (hint >= mapSize) {
733 		jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map");
734 		return -EIO;
735 	}
736 
737 	/* if the number of blocks to be allocated is greater than the
738 	 * allocation group size, try to allocate anywhere.
739 	 */
740 	if (l2nb > bmp->db_agl2size) {
741 		IWRITE_LOCK(ipbmap);
742 
743 		rc = dbAllocAny(bmp, nblocks, l2nb, results);
744 
745 		goto write_unlock;
746 	}
747 
748 	/*
749 	 * If no hint, let dbNextAG recommend an allocation group
750 	 */
751 	if (hint == 0)
752 		goto pref_ag;
753 
754 	/* we would like to allocate close to the hint.  adjust the
755 	 * hint to the block following the hint since the allocators
756 	 * will start looking for free space starting at this point.
757 	 */
758 	blkno = hint + 1;
759 
760 	if (blkno >= bmp->db_mapsize)
761 		goto pref_ag;
762 
763 	agno = blkno >> bmp->db_agl2size;
764 
765 	/* check if blkno crosses over into a new allocation group.
766 	 * if so, check if we should allow allocations within this
767 	 * allocation group.
768 	 */
769 	if ((blkno & (bmp->db_agsize - 1)) == 0)
770 		/* check if the AG is currenly being written to.
771 		 * if so, call dbNextAG() to find a non-busy
772 		 * AG with sufficient free space.
773 		 */
774 		if (atomic_read(&bmp->db_active[agno]))
775 			goto pref_ag;
776 
777 	/* check if the allocation request size can be satisfied from a
778 	 * single dmap.  if so, try to allocate from the dmap containing
779 	 * the hint using a tiered strategy.
780 	 */
781 	if (nblocks <= BPERDMAP) {
782 		IREAD_LOCK(ipbmap);
783 
784 		/* get the buffer for the dmap containing the hint.
785 		 */
786 		rc = -EIO;
787 		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
788 		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
789 		if (mp == NULL)
790 			goto read_unlock;
791 
792 		dp = (struct dmap *) mp->data;
793 
794 		/* first, try to satisfy the allocation request with the
795 		 * blocks beginning at the hint.
796 		 */
797 		if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks))
798 		    != -ENOSPC) {
799 			if (rc == 0) {
800 				*results = blkno;
801 				mark_metapage_dirty(mp);
802 			}
803 
804 			release_metapage(mp);
805 			goto read_unlock;
806 		}
807 
808 		writers = atomic_read(&bmp->db_active[agno]);
809 		if ((writers > 1) ||
810 		    ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) {
811 			/*
812 			 * Someone else is writing in this allocation
813 			 * group.  To avoid fragmenting, try another ag
814 			 */
815 			release_metapage(mp);
816 			IREAD_UNLOCK(ipbmap);
817 			goto pref_ag;
818 		}
819 
820 		/* next, try to satisfy the allocation request with blocks
821 		 * near the hint.
822 		 */
823 		if ((rc =
824 		     dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results))
825 		    != -ENOSPC) {
826 			if (rc == 0)
827 				mark_metapage_dirty(mp);
828 
829 			release_metapage(mp);
830 			goto read_unlock;
831 		}
832 
833 		/* try to satisfy the allocation request with blocks within
834 		 * the same dmap as the hint.
835 		 */
836 		if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
837 		    != -ENOSPC) {
838 			if (rc == 0)
839 				mark_metapage_dirty(mp);
840 
841 			release_metapage(mp);
842 			goto read_unlock;
843 		}
844 
845 		release_metapage(mp);
846 		IREAD_UNLOCK(ipbmap);
847 	}
848 
849 	/* try to satisfy the allocation request with blocks within
850 	 * the same allocation group as the hint.
851 	 */
852 	IWRITE_LOCK(ipbmap);
853 	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC)
854 		goto write_unlock;
855 
856 	IWRITE_UNLOCK(ipbmap);
857 
858 
859       pref_ag:
860 	/*
861 	 * Let dbNextAG recommend a preferred allocation group
862 	 */
863 	agno = dbNextAG(ipbmap);
864 	IWRITE_LOCK(ipbmap);
865 
866 	/* Try to allocate within this allocation group.  if that fails, try to
867 	 * allocate anywhere in the map.
868 	 */
869 	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC)
870 		rc = dbAllocAny(bmp, nblocks, l2nb, results);
871 
872       write_unlock:
873 	IWRITE_UNLOCK(ipbmap);
874 
875 	return (rc);
876 
877       read_unlock:
878 	IREAD_UNLOCK(ipbmap);
879 
880 	return (rc);
881 }
882 
883 #ifdef _NOTYET
884 /*
885  * NAME:	dbAllocExact()
886  *
887  * FUNCTION:    try to allocate the requested extent;
888  *
889  * PARAMETERS:
890  *      ip	- pointer to in-core inode;
891  *      blkno	- extent address;
892  *      nblocks	- extent length;
893  *
894  * RETURN VALUES:
895  *      0	- success
896  *      -ENOSPC	- insufficient disk resources
897  *      -EIO	- i/o error
898  */
899 int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
900 {
901 	int rc;
902 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
903 	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
904 	struct dmap *dp;
905 	s64 lblkno;
906 	struct metapage *mp;
907 
908 	IREAD_LOCK(ipbmap);
909 
910 	/*
911 	 * validate extent request:
912 	 *
913 	 * note: defragfs policy:
914 	 *  max 64 blocks will be moved.
915 	 *  allocation request size must be satisfied from a single dmap.
916 	 */
917 	if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) {
918 		IREAD_UNLOCK(ipbmap);
919 		return -EINVAL;
920 	}
921 
922 	if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) {
923 		/* the free space is no longer available */
924 		IREAD_UNLOCK(ipbmap);
925 		return -ENOSPC;
926 	}
927 
928 	/* read in the dmap covering the extent */
929 	lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
930 	mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
931 	if (mp == NULL) {
932 		IREAD_UNLOCK(ipbmap);
933 		return -EIO;
934 	}
935 	dp = (struct dmap *) mp->data;
936 
937 	/* try to allocate the requested extent */
938 	rc = dbAllocNext(bmp, dp, blkno, nblocks);
939 
940 	IREAD_UNLOCK(ipbmap);
941 
942 	if (rc == 0)
943 		mark_metapage_dirty(mp);
944 
945 	release_metapage(mp);
946 
947 	return (rc);
948 }
949 #endif /* _NOTYET */
950 
951 /*
952  * NAME:	dbReAlloc()
953  *
954  * FUNCTION:    attempt to extend a current allocation by a specified
955  *		number of blocks.
956  *
957  *		this routine attempts to satisfy the allocation request
958  *		by first trying to extend the existing allocation in
959  *		place by allocating the additional blocks as the blocks
960  *		immediately following the current allocation.  if these
961  *		blocks are not available, this routine will attempt to
962  *		allocate a new set of contiguous blocks large enough
963  *		to cover the existing allocation plus the additional
964  *		number of blocks required.
965  *
966  * PARAMETERS:
967  *      ip	    -  pointer to in-core inode requiring allocation.
968  *      blkno	    -  starting block of the current allocation.
969  *      nblocks	    -  number of contiguous blocks within the current
970  *		       allocation.
971  *      addnblocks  -  number of blocks to add to the allocation.
972  *      results	-      on successful return, set to the starting block number
973  *		       of the existing allocation if the existing allocation
974  *		       was extended in place or to a newly allocated contiguous
975  *		       range if the existing allocation could not be extended
976  *		       in place.
977  *
978  * RETURN VALUES:
979  *      0	- success
980  *      -ENOSPC	- insufficient disk resources
981  *      -EIO	- i/o error
982  */
983 int
984 dbReAlloc(struct inode *ip,
985 	  s64 blkno, s64 nblocks, s64 addnblocks, s64 * results)
986 {
987 	int rc;
988 
989 	/* try to extend the allocation in place.
990 	 */
991 	if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) {
992 		*results = blkno;
993 		return (0);
994 	} else {
995 		if (rc != -ENOSPC)
996 			return (rc);
997 	}
998 
999 	/* could not extend the allocation in place, so allocate a
1000 	 * new set of blocks for the entire request (i.e. try to get
1001 	 * a range of contiguous blocks large enough to cover the
1002 	 * existing allocation plus the additional blocks.)
1003 	 */
1004 	return (dbAlloc
1005 		(ip, blkno + nblocks - 1, addnblocks + nblocks, results));
1006 }
1007 
1008 
1009 /*
1010  * NAME:	dbExtend()
1011  *
1012  * FUNCTION:    attempt to extend a current allocation by a specified
1013  *		number of blocks.
1014  *
1015  *		this routine attempts to satisfy the allocation request
1016  *		by first trying to extend the existing allocation in
1017  *		place by allocating the additional blocks as the blocks
1018  *		immediately following the current allocation.
1019  *
1020  * PARAMETERS:
1021  *      ip	    -  pointer to in-core inode requiring allocation.
1022  *      blkno	    -  starting block of the current allocation.
1023  *      nblocks	    -  number of contiguous blocks within the current
1024  *		       allocation.
1025  *      addnblocks  -  number of blocks to add to the allocation.
1026  *
1027  * RETURN VALUES:
1028  *      0	- success
1029  *      -ENOSPC	- insufficient disk resources
1030  *      -EIO	- i/o error
1031  */
1032 static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1033 {
1034 	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
1035 	s64 lblkno, lastblkno, extblkno;
1036 	uint rel_block;
1037 	struct metapage *mp;
1038 	struct dmap *dp;
1039 	int rc;
1040 	struct inode *ipbmap = sbi->ipbmap;
1041 	struct bmap *bmp;
1042 
1043 	/*
1044 	 * We don't want a non-aligned extent to cross a page boundary
1045 	 */
1046 	if (((rel_block = blkno & (sbi->nbperpage - 1))) &&
1047 	    (rel_block + nblocks + addnblocks > sbi->nbperpage))
1048 		return -ENOSPC;
1049 
1050 	/* get the last block of the current allocation */
1051 	lastblkno = blkno + nblocks - 1;
1052 
1053 	/* determine the block number of the block following
1054 	 * the existing allocation.
1055 	 */
1056 	extblkno = lastblkno + 1;
1057 
1058 	IREAD_LOCK(ipbmap);
1059 
1060 	/* better be within the file system */
1061 	bmp = sbi->bmap;
1062 	if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
1063 		IREAD_UNLOCK(ipbmap);
1064 		jfs_error(ip->i_sb,
1065 			  "dbExtend: the block is outside the filesystem");
1066 		return -EIO;
1067 	}
1068 
1069 	/* we'll attempt to extend the current allocation in place by
1070 	 * allocating the additional blocks as the blocks immediately
1071 	 * following the current allocation.  we only try to extend the
1072 	 * current allocation in place if the number of additional blocks
1073 	 * can fit into a dmap, the last block of the current allocation
1074 	 * is not the last block of the file system, and the start of the
1075 	 * inplace extension is not on an allocation group boundary.
1076 	 */
1077 	if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize ||
1078 	    (extblkno & (bmp->db_agsize - 1)) == 0) {
1079 		IREAD_UNLOCK(ipbmap);
1080 		return -ENOSPC;
1081 	}
1082 
1083 	/* get the buffer for the dmap containing the first block
1084 	 * of the extension.
1085 	 */
1086 	lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage);
1087 	mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
1088 	if (mp == NULL) {
1089 		IREAD_UNLOCK(ipbmap);
1090 		return -EIO;
1091 	}
1092 
1093 	dp = (struct dmap *) mp->data;
1094 
1095 	/* try to allocate the blocks immediately following the
1096 	 * current allocation.
1097 	 */
1098 	rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks);
1099 
1100 	IREAD_UNLOCK(ipbmap);
1101 
1102 	/* were we successful ? */
1103 	if (rc == 0)
1104 		write_metapage(mp);
1105 	else
1106 		/* we were not successful */
1107 		release_metapage(mp);
1108 
1109 
1110 	return (rc);
1111 }
1112 
1113 
1114 /*
1115  * NAME:	dbAllocNext()
1116  *
1117  * FUNCTION:    attempt to allocate the blocks of the specified block
1118  *		range within a dmap.
1119  *
1120  * PARAMETERS:
1121  *      bmp	-  pointer to bmap descriptor
1122  *      dp	-  pointer to dmap.
1123  *      blkno	-  starting block number of the range.
1124  *      nblocks	-  number of contiguous free blocks of the range.
1125  *
1126  * RETURN VALUES:
1127  *      0	- success
1128  *      -ENOSPC	- insufficient disk resources
1129  *      -EIO	- i/o error
1130  *
1131  * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1132  */
1133 static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1134 		       int nblocks)
1135 {
1136 	int dbitno, word, rembits, nb, nwords, wbitno, nw;
1137 	int l2size;
1138 	s8 *leaf;
1139 	u32 mask;
1140 
1141 	if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1142 		jfs_error(bmp->db_ipbmap->i_sb,
1143 			  "dbAllocNext: Corrupt dmap page");
1144 		return -EIO;
1145 	}
1146 
1147 	/* pick up a pointer to the leaves of the dmap tree.
1148 	 */
1149 	leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
1150 
1151 	/* determine the bit number and word within the dmap of the
1152 	 * starting block.
1153 	 */
1154 	dbitno = blkno & (BPERDMAP - 1);
1155 	word = dbitno >> L2DBWORD;
1156 
1157 	/* check if the specified block range is contained within
1158 	 * this dmap.
1159 	 */
1160 	if (dbitno + nblocks > BPERDMAP)
1161 		return -ENOSPC;
1162 
1163 	/* check if the starting leaf indicates that anything
1164 	 * is free.
1165 	 */
1166 	if (leaf[word] == NOFREE)
1167 		return -ENOSPC;
1168 
1169 	/* check the dmaps words corresponding to block range to see
1170 	 * if the block range is free.  not all bits of the first and
1171 	 * last words may be contained within the block range.  if this
1172 	 * is the case, we'll work against those words (i.e. partial first
1173 	 * and/or last) on an individual basis (a single pass) and examine
1174 	 * the actual bits to determine if they are free.  a single pass
1175 	 * will be used for all dmap words fully contained within the
1176 	 * specified range.  within this pass, the leaves of the dmap
1177 	 * tree will be examined to determine if the blocks are free. a
1178 	 * single leaf may describe the free space of multiple dmap
1179 	 * words, so we may visit only a subset of the actual leaves
1180 	 * corresponding to the dmap words of the block range.
1181 	 */
1182 	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
1183 		/* determine the bit number within the word and
1184 		 * the number of bits within the word.
1185 		 */
1186 		wbitno = dbitno & (DBWORD - 1);
1187 		nb = min(rembits, DBWORD - wbitno);
1188 
1189 		/* check if only part of the word is to be examined.
1190 		 */
1191 		if (nb < DBWORD) {
1192 			/* check if the bits are free.
1193 			 */
1194 			mask = (ONES << (DBWORD - nb) >> wbitno);
1195 			if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask)
1196 				return -ENOSPC;
1197 
1198 			word += 1;
1199 		} else {
1200 			/* one or more dmap words are fully contained
1201 			 * within the block range.  determine how many
1202 			 * words and how many bits.
1203 			 */
1204 			nwords = rembits >> L2DBWORD;
1205 			nb = nwords << L2DBWORD;
1206 
1207 			/* now examine the appropriate leaves to determine
1208 			 * if the blocks are free.
1209 			 */
1210 			while (nwords > 0) {
1211 				/* does the leaf describe any free space ?
1212 				 */
1213 				if (leaf[word] < BUDMIN)
1214 					return -ENOSPC;
1215 
1216 				/* determine the l2 number of bits provided
1217 				 * by this leaf.
1218 				 */
1219 				l2size =
1220 				    min((int)leaf[word], NLSTOL2BSZ(nwords));
1221 
1222 				/* determine how many words were handled.
1223 				 */
1224 				nw = BUDSIZE(l2size, BUDMIN);
1225 
1226 				nwords -= nw;
1227 				word += nw;
1228 			}
1229 		}
1230 	}
1231 
1232 	/* allocate the blocks.
1233 	 */
1234 	return (dbAllocDmap(bmp, dp, blkno, nblocks));
1235 }
1236 
1237 
1238 /*
1239  * NAME:	dbAllocNear()
1240  *
1241  * FUNCTION:    attempt to allocate a number of contiguous free blocks near
1242  *		a specified block (hint) within a dmap.
1243  *
1244  *		starting with the dmap leaf that covers the hint, we'll
1245  *		check the next four contiguous leaves for sufficient free
1246  *		space.  if sufficient free space is found, we'll allocate
1247  *		the desired free space.
1248  *
1249  * PARAMETERS:
1250  *      bmp	-  pointer to bmap descriptor
1251  *      dp	-  pointer to dmap.
1252  *      blkno	-  block number to allocate near.
1253  *      nblocks	-  actual number of contiguous free blocks desired.
1254  *      l2nb	-  log2 number of contiguous free blocks desired.
1255  *      results	-  on successful return, set to the starting block number
1256  *		   of the newly allocated range.
1257  *
1258  * RETURN VALUES:
1259  *      0	- success
1260  *      -ENOSPC	- insufficient disk resources
1261  *      -EIO	- i/o error
1262  *
1263  * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1264  */
1265 static int
1266 dbAllocNear(struct bmap * bmp,
1267 	    struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results)
1268 {
1269 	int word, lword, rc;
1270 	s8 *leaf;
1271 
1272 	if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1273 		jfs_error(bmp->db_ipbmap->i_sb,
1274 			  "dbAllocNear: Corrupt dmap page");
1275 		return -EIO;
1276 	}
1277 
1278 	leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
1279 
1280 	/* determine the word within the dmap that holds the hint
1281 	 * (i.e. blkno).  also, determine the last word in the dmap
1282 	 * that we'll include in our examination.
1283 	 */
1284 	word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
1285 	lword = min(word + 4, LPERDMAP);
1286 
1287 	/* examine the leaves for sufficient free space.
1288 	 */
1289 	for (; word < lword; word++) {
1290 		/* does the leaf describe sufficient free space ?
1291 		 */
1292 		if (leaf[word] < l2nb)
1293 			continue;
1294 
1295 		/* determine the block number within the file system
1296 		 * of the first block described by this dmap word.
1297 		 */
1298 		blkno = le64_to_cpu(dp->start) + (word << L2DBWORD);
1299 
1300 		/* if not all bits of the dmap word are free, get the
1301 		 * starting bit number within the dmap word of the required
1302 		 * string of free bits and adjust the block number with the
1303 		 * value.
1304 		 */
1305 		if (leaf[word] < BUDMIN)
1306 			blkno +=
1307 			    dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb);
1308 
1309 		/* allocate the blocks.
1310 		 */
1311 		if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
1312 			*results = blkno;
1313 
1314 		return (rc);
1315 	}
1316 
1317 	return -ENOSPC;
1318 }
1319 
1320 
1321 /*
1322  * NAME:	dbAllocAG()
1323  *
1324  * FUNCTION:    attempt to allocate the specified number of contiguous
1325  *		free blocks within the specified allocation group.
1326  *
1327  *		unless the allocation group size is equal to the number
1328  *		of blocks per dmap, the dmap control pages will be used to
1329  *		find the required free space, if available.  we start the
1330  *		search at the highest dmap control page level which
1331  *		distinctly describes the allocation group's free space
1332  *		(i.e. the highest level at which the allocation group's
1333  *		free space is not mixed in with that of any other group).
1334  *		in addition, we start the search within this level at a
1335  *		height of the dmapctl dmtree at which the nodes distinctly
1336  *		describe the allocation group's free space.  at this height,
1337  *		the allocation group's free space may be represented by 1
1338  *		or two sub-trees, depending on the allocation group size.
1339  *		we search the top nodes of these subtrees left to right for
1340  *		sufficient free space.  if sufficient free space is found,
1341  *		the subtree is searched to find the leftmost leaf that
1342  *		has free space.  once we have made it to the leaf, we
1343  *		move the search to the next lower level dmap control page
1344  *		corresponding to this leaf.  we continue down the dmap control
1345  *		pages until we find the dmap that contains or starts the
1346  *		sufficient free space and we allocate at this dmap.
1347  *
1348  *		if the allocation group size is equal to the dmap size,
1349  *		we'll start at the dmap corresponding to the allocation
1350  *		group and attempt the allocation at this level.
1351  *
1352  *		the dmap control page search is also not performed if the
1353  *		allocation group is completely free and we go to the first
1354  *		dmap of the allocation group to do the allocation.  this is
1355  *		done because the allocation group may be part (not the first
1356  *		part) of a larger binary buddy system, causing the dmap
1357  *		control pages to indicate no free space (NOFREE) within
1358  *		the allocation group.
1359  *
1360  * PARAMETERS:
1361  *      bmp	-  pointer to bmap descriptor
1362  *	agno	- allocation group number.
1363  *      nblocks	-  actual number of contiguous free blocks desired.
1364  *      l2nb	-  log2 number of contiguous free blocks desired.
1365  *      results	-  on successful return, set to the starting block number
1366  *		   of the newly allocated range.
1367  *
1368  * RETURN VALUES:
1369  *      0	- success
1370  *      -ENOSPC	- insufficient disk resources
1371  *      -EIO	- i/o error
1372  *
1373  * note: IWRITE_LOCK(ipmap) held on entry/exit;
1374  */
1375 static int
1376 dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1377 {
1378 	struct metapage *mp;
1379 	struct dmapctl *dcp;
1380 	int rc, ti, i, k, m, n, agperlev;
1381 	s64 blkno, lblkno;
1382 	int budmin;
1383 
1384 	/* allocation request should not be for more than the
1385 	 * allocation group size.
1386 	 */
1387 	if (l2nb > bmp->db_agl2size) {
1388 		jfs_error(bmp->db_ipbmap->i_sb,
1389 			  "dbAllocAG: allocation request is larger than the "
1390 			  "allocation group size");
1391 		return -EIO;
1392 	}
1393 
1394 	/* determine the starting block number of the allocation
1395 	 * group.
1396 	 */
1397 	blkno = (s64) agno << bmp->db_agl2size;
1398 
1399 	/* check if the allocation group size is the minimum allocation
1400 	 * group size or if the allocation group is completely free. if
1401 	 * the allocation group size is the minimum size of BPERDMAP (i.e.
1402 	 * 1 dmap), there is no need to search the dmap control page (below)
1403 	 * that fully describes the allocation group since the allocation
1404 	 * group is already fully described by a dmap.  in this case, we
1405 	 * just call dbAllocCtl() to search the dmap tree and allocate the
1406 	 * required space if available.
1407 	 *
1408 	 * if the allocation group is completely free, dbAllocCtl() is
1409 	 * also called to allocate the required space.  this is done for
1410 	 * two reasons.  first, it makes no sense searching the dmap control
1411 	 * pages for free space when we know that free space exists.  second,
1412 	 * the dmap control pages may indicate that the allocation group
1413 	 * has no free space if the allocation group is part (not the first
1414 	 * part) of a larger binary buddy system.
1415 	 */
1416 	if (bmp->db_agsize == BPERDMAP
1417 	    || bmp->db_agfree[agno] == bmp->db_agsize) {
1418 		rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1419 		if ((rc == -ENOSPC) &&
1420 		    (bmp->db_agfree[agno] == bmp->db_agsize)) {
1421 			printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n",
1422 			       (unsigned long long) blkno,
1423 			       (unsigned long long) nblocks);
1424 			jfs_error(bmp->db_ipbmap->i_sb,
1425 				  "dbAllocAG: dbAllocCtl failed in free AG");
1426 		}
1427 		return (rc);
1428 	}
1429 
1430 	/* the buffer for the dmap control page that fully describes the
1431 	 * allocation group.
1432 	 */
1433 	lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel);
1434 	mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1435 	if (mp == NULL)
1436 		return -EIO;
1437 	dcp = (struct dmapctl *) mp->data;
1438 	budmin = dcp->budmin;
1439 
1440 	if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1441 		jfs_error(bmp->db_ipbmap->i_sb,
1442 			  "dbAllocAG: Corrupt dmapctl page");
1443 		release_metapage(mp);
1444 		return -EIO;
1445 	}
1446 
1447 	/* search the subtree(s) of the dmap control page that describes
1448 	 * the allocation group, looking for sufficient free space.  to begin,
1449 	 * determine how many allocation groups are represented in a dmap
1450 	 * control page at the control page level (i.e. L0, L1, L2) that
1451 	 * fully describes an allocation group. next, determine the starting
1452 	 * tree index of this allocation group within the control page.
1453 	 */
1454 	agperlev =
1455 	    (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth;
1456 	ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
1457 
1458 	/* dmap control page trees fan-out by 4 and a single allocation
1459 	 * group may be described by 1 or 2 subtrees within the ag level
1460 	 * dmap control page, depending upon the ag size. examine the ag's
1461 	 * subtrees for sufficient free space, starting with the leftmost
1462 	 * subtree.
1463 	 */
1464 	for (i = 0; i < bmp->db_agwidth; i++, ti++) {
1465 		/* is there sufficient free space ?
1466 		 */
1467 		if (l2nb > dcp->stree[ti])
1468 			continue;
1469 
1470 		/* sufficient free space found in a subtree. now search down
1471 		 * the subtree to find the leftmost leaf that describes this
1472 		 * free space.
1473 		 */
1474 		for (k = bmp->db_agheigth; k > 0; k--) {
1475 			for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
1476 				if (l2nb <= dcp->stree[m + n]) {
1477 					ti = m + n;
1478 					break;
1479 				}
1480 			}
1481 			if (n == 4) {
1482 				jfs_error(bmp->db_ipbmap->i_sb,
1483 					  "dbAllocAG: failed descending stree");
1484 				release_metapage(mp);
1485 				return -EIO;
1486 			}
1487 		}
1488 
1489 		/* determine the block number within the file system
1490 		 * that corresponds to this leaf.
1491 		 */
1492 		if (bmp->db_aglevel == 2)
1493 			blkno = 0;
1494 		else if (bmp->db_aglevel == 1)
1495 			blkno &= ~(MAXL1SIZE - 1);
1496 		else		/* bmp->db_aglevel == 0 */
1497 			blkno &= ~(MAXL0SIZE - 1);
1498 
1499 		blkno +=
1500 		    ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin;
1501 
1502 		/* release the buffer in preparation for going down
1503 		 * the next level of dmap control pages.
1504 		 */
1505 		release_metapage(mp);
1506 
1507 		/* check if we need to continue to search down the lower
1508 		 * level dmap control pages.  we need to if the number of
1509 		 * blocks required is less than maximum number of blocks
1510 		 * described at the next lower level.
1511 		 */
1512 		if (l2nb < budmin) {
1513 
1514 			/* search the lower level dmap control pages to get
1515 			 * the starting block number of the the dmap that
1516 			 * contains or starts off the free space.
1517 			 */
1518 			if ((rc =
1519 			     dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1,
1520 				       &blkno))) {
1521 				if (rc == -ENOSPC) {
1522 					jfs_error(bmp->db_ipbmap->i_sb,
1523 						  "dbAllocAG: control page "
1524 						  "inconsistent");
1525 					return -EIO;
1526 				}
1527 				return (rc);
1528 			}
1529 		}
1530 
1531 		/* allocate the blocks.
1532 		 */
1533 		rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1534 		if (rc == -ENOSPC) {
1535 			jfs_error(bmp->db_ipbmap->i_sb,
1536 				  "dbAllocAG: unable to allocate blocks");
1537 			rc = -EIO;
1538 		}
1539 		return (rc);
1540 	}
1541 
1542 	/* no space in the allocation group.  release the buffer and
1543 	 * return -ENOSPC.
1544 	 */
1545 	release_metapage(mp);
1546 
1547 	return -ENOSPC;
1548 }
1549 
1550 
1551 /*
1552  * NAME:	dbAllocAny()
1553  *
1554  * FUNCTION:    attempt to allocate the specified number of contiguous
1555  *		free blocks anywhere in the file system.
1556  *
1557  *		dbAllocAny() attempts to find the sufficient free space by
1558  *		searching down the dmap control pages, starting with the
1559  *		highest level (i.e. L0, L1, L2) control page.  if free space
1560  *		large enough to satisfy the desired free space is found, the
1561  *		desired free space is allocated.
1562  *
1563  * PARAMETERS:
1564  *      bmp	-  pointer to bmap descriptor
1565  *      nblocks	 -  actual number of contiguous free blocks desired.
1566  *      l2nb	 -  log2 number of contiguous free blocks desired.
1567  *      results	-  on successful return, set to the starting block number
1568  *		   of the newly allocated range.
1569  *
1570  * RETURN VALUES:
1571  *      0	- success
1572  *      -ENOSPC	- insufficient disk resources
1573  *      -EIO	- i/o error
1574  *
1575  * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1576  */
1577 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1578 {
1579 	int rc;
1580 	s64 blkno = 0;
1581 
1582 	/* starting with the top level dmap control page, search
1583 	 * down the dmap control levels for sufficient free space.
1584 	 * if free space is found, dbFindCtl() returns the starting
1585 	 * block number of the dmap that contains or starts off the
1586 	 * range of free space.
1587 	 */
1588 	if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno)))
1589 		return (rc);
1590 
1591 	/* allocate the blocks.
1592 	 */
1593 	rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1594 	if (rc == -ENOSPC) {
1595 		jfs_error(bmp->db_ipbmap->i_sb,
1596 			  "dbAllocAny: unable to allocate blocks");
1597 		return -EIO;
1598 	}
1599 	return (rc);
1600 }
1601 
1602 
1603 /*
1604  * NAME:	dbFindCtl()
1605  *
1606  * FUNCTION:    starting at a specified dmap control page level and block
1607  *		number, search down the dmap control levels for a range of
1608  *	        contiguous free blocks large enough to satisfy an allocation
1609  *		request for the specified number of free blocks.
1610  *
1611  *		if sufficient contiguous free blocks are found, this routine
1612  *		returns the starting block number within a dmap page that
1613  *		contains or starts a range of contiqious free blocks that
1614  *		is sufficient in size.
1615  *
1616  * PARAMETERS:
1617  *      bmp	-  pointer to bmap descriptor
1618  *      level	-  starting dmap control page level.
1619  *      l2nb	-  log2 number of contiguous free blocks desired.
1620  *      *blkno	-  on entry, starting block number for conducting the search.
1621  *		   on successful return, the first block within a dmap page
1622  *		   that contains or starts a range of contiguous free blocks.
1623  *
1624  * RETURN VALUES:
1625  *      0	- success
1626  *      -ENOSPC	- insufficient disk resources
1627  *      -EIO	- i/o error
1628  *
1629  * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1630  */
1631 static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1632 {
1633 	int rc, leafidx, lev;
1634 	s64 b, lblkno;
1635 	struct dmapctl *dcp;
1636 	int budmin;
1637 	struct metapage *mp;
1638 
1639 	/* starting at the specified dmap control page level and block
1640 	 * number, search down the dmap control levels for the starting
1641 	 * block number of a dmap page that contains or starts off
1642 	 * sufficient free blocks.
1643 	 */
1644 	for (lev = level, b = *blkno; lev >= 0; lev--) {
1645 		/* get the buffer of the dmap control page for the block
1646 		 * number and level (i.e. L0, L1, L2).
1647 		 */
1648 		lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev);
1649 		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1650 		if (mp == NULL)
1651 			return -EIO;
1652 		dcp = (struct dmapctl *) mp->data;
1653 		budmin = dcp->budmin;
1654 
1655 		if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1656 			jfs_error(bmp->db_ipbmap->i_sb,
1657 				  "dbFindCtl: Corrupt dmapctl page");
1658 			release_metapage(mp);
1659 			return -EIO;
1660 		}
1661 
1662 		/* search the tree within the dmap control page for
1663 		 * sufficent free space.  if sufficient free space is found,
1664 		 * dbFindLeaf() returns the index of the leaf at which
1665 		 * free space was found.
1666 		 */
1667 		rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx);
1668 
1669 		/* release the buffer.
1670 		 */
1671 		release_metapage(mp);
1672 
1673 		/* space found ?
1674 		 */
1675 		if (rc) {
1676 			if (lev != level) {
1677 				jfs_error(bmp->db_ipbmap->i_sb,
1678 					  "dbFindCtl: dmap inconsistent");
1679 				return -EIO;
1680 			}
1681 			return -ENOSPC;
1682 		}
1683 
1684 		/* adjust the block number to reflect the location within
1685 		 * the dmap control page (i.e. the leaf) at which free
1686 		 * space was found.
1687 		 */
1688 		b += (((s64) leafidx) << budmin);
1689 
1690 		/* we stop the search at this dmap control page level if
1691 		 * the number of blocks required is greater than or equal
1692 		 * to the maximum number of blocks described at the next
1693 		 * (lower) level.
1694 		 */
1695 		if (l2nb >= budmin)
1696 			break;
1697 	}
1698 
1699 	*blkno = b;
1700 	return (0);
1701 }
1702 
1703 
1704 /*
1705  * NAME:	dbAllocCtl()
1706  *
1707  * FUNCTION:    attempt to allocate a specified number of contiguous
1708  *		blocks starting within a specific dmap.
1709  *
1710  *		this routine is called by higher level routines that search
1711  *		the dmap control pages above the actual dmaps for contiguous
1712  *		free space.  the result of successful searches by these
1713  * 		routines are the starting block numbers within dmaps, with
1714  *		the dmaps themselves containing the desired contiguous free
1715  *		space or starting a contiguous free space of desired size
1716  *		that is made up of the blocks of one or more dmaps. these
1717  *		calls should not fail due to insufficent resources.
1718  *
1719  *		this routine is called in some cases where it is not known
1720  *		whether it will fail due to insufficient resources.  more
1721  *		specifically, this occurs when allocating from an allocation
1722  *		group whose size is equal to the number of blocks per dmap.
1723  *		in this case, the dmap control pages are not examined prior
1724  *		to calling this routine (to save pathlength) and the call
1725  *		might fail.
1726  *
1727  *		for a request size that fits within a dmap, this routine relies
1728  *		upon the dmap's dmtree to find the requested contiguous free
1729  *		space.  for request sizes that are larger than a dmap, the
1730  *		requested free space will start at the first block of the
1731  *		first dmap (i.e. blkno).
1732  *
1733  * PARAMETERS:
1734  *      bmp	-  pointer to bmap descriptor
1735  *      nblocks	 -  actual number of contiguous free blocks to allocate.
1736  *      l2nb	 -  log2 number of contiguous free blocks to allocate.
1737  *      blkno	 -  starting block number of the dmap to start the allocation
1738  *		    from.
1739  *      results	-  on successful return, set to the starting block number
1740  *		   of the newly allocated range.
1741  *
1742  * RETURN VALUES:
1743  *      0	- success
1744  *      -ENOSPC	- insufficient disk resources
1745  *      -EIO	- i/o error
1746  *
1747  * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1748  */
1749 static int
1750 dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1751 {
1752 	int rc, nb;
1753 	s64 b, lblkno, n;
1754 	struct metapage *mp;
1755 	struct dmap *dp;
1756 
1757 	/* check if the allocation request is confined to a single dmap.
1758 	 */
1759 	if (l2nb <= L2BPERDMAP) {
1760 		/* get the buffer for the dmap.
1761 		 */
1762 		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
1763 		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1764 		if (mp == NULL)
1765 			return -EIO;
1766 		dp = (struct dmap *) mp->data;
1767 
1768 		/* try to allocate the blocks.
1769 		 */
1770 		rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results);
1771 		if (rc == 0)
1772 			mark_metapage_dirty(mp);
1773 
1774 		release_metapage(mp);
1775 
1776 		return (rc);
1777 	}
1778 
1779 	/* allocation request involving multiple dmaps. it must start on
1780 	 * a dmap boundary.
1781 	 */
1782 	assert((blkno & (BPERDMAP - 1)) == 0);
1783 
1784 	/* allocate the blocks dmap by dmap.
1785 	 */
1786 	for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) {
1787 		/* get the buffer for the dmap.
1788 		 */
1789 		lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
1790 		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1791 		if (mp == NULL) {
1792 			rc = -EIO;
1793 			goto backout;
1794 		}
1795 		dp = (struct dmap *) mp->data;
1796 
1797 		/* the dmap better be all free.
1798 		 */
1799 		if (dp->tree.stree[ROOT] != L2BPERDMAP) {
1800 			release_metapage(mp);
1801 			jfs_error(bmp->db_ipbmap->i_sb,
1802 				  "dbAllocCtl: the dmap is not all free");
1803 			rc = -EIO;
1804 			goto backout;
1805 		}
1806 
1807 		/* determine how many blocks to allocate from this dmap.
1808 		 */
1809 		nb = min(n, (s64)BPERDMAP);
1810 
1811 		/* allocate the blocks from the dmap.
1812 		 */
1813 		if ((rc = dbAllocDmap(bmp, dp, b, nb))) {
1814 			release_metapage(mp);
1815 			goto backout;
1816 		}
1817 
1818 		/* write the buffer.
1819 		 */
1820 		write_metapage(mp);
1821 	}
1822 
1823 	/* set the results (starting block number) and return.
1824 	 */
1825 	*results = blkno;
1826 	return (0);
1827 
1828 	/* something failed in handling an allocation request involving
1829 	 * multiple dmaps.  we'll try to clean up by backing out any
1830 	 * allocation that has already happened for this request.  if
1831 	 * we fail in backing out the allocation, we'll mark the file
1832 	 * system to indicate that blocks have been leaked.
1833 	 */
1834       backout:
1835 
1836 	/* try to backout the allocations dmap by dmap.
1837 	 */
1838 	for (n = nblocks - n, b = blkno; n > 0;
1839 	     n -= BPERDMAP, b += BPERDMAP) {
1840 		/* get the buffer for this dmap.
1841 		 */
1842 		lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
1843 		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1844 		if (mp == NULL) {
1845 			/* could not back out.  mark the file system
1846 			 * to indicate that we have leaked blocks.
1847 			 */
1848 			jfs_error(bmp->db_ipbmap->i_sb,
1849 				  "dbAllocCtl: I/O Error: Block Leakage.");
1850 			continue;
1851 		}
1852 		dp = (struct dmap *) mp->data;
1853 
1854 		/* free the blocks is this dmap.
1855 		 */
1856 		if (dbFreeDmap(bmp, dp, b, BPERDMAP)) {
1857 			/* could not back out.  mark the file system
1858 			 * to indicate that we have leaked blocks.
1859 			 */
1860 			release_metapage(mp);
1861 			jfs_error(bmp->db_ipbmap->i_sb,
1862 				  "dbAllocCtl: Block Leakage.");
1863 			continue;
1864 		}
1865 
1866 		/* write the buffer.
1867 		 */
1868 		write_metapage(mp);
1869 	}
1870 
1871 	return (rc);
1872 }
1873 
1874 
1875 /*
1876  * NAME:	dbAllocDmapLev()
1877  *
1878  * FUNCTION:    attempt to allocate a specified number of contiguous blocks
1879  *		from a specified dmap.
1880  *
1881  *		this routine checks if the contiguous blocks are available.
1882  *		if so, nblocks of blocks are allocated; otherwise, ENOSPC is
1883  *		returned.
1884  *
1885  * PARAMETERS:
1886  *      mp	-  pointer to bmap descriptor
1887  *      dp	-  pointer to dmap to attempt to allocate blocks from.
1888  *      l2nb	-  log2 number of contiguous block desired.
1889  *      nblocks	-  actual number of contiguous block desired.
1890  *      results	-  on successful return, set to the starting block number
1891  *		   of the newly allocated range.
1892  *
1893  * RETURN VALUES:
1894  *      0	- success
1895  *      -ENOSPC	- insufficient disk resources
1896  *      -EIO	- i/o error
1897  *
1898  * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or
1899  *	IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
1900  */
1901 static int
1902 dbAllocDmapLev(struct bmap * bmp,
1903 	       struct dmap * dp, int nblocks, int l2nb, s64 * results)
1904 {
1905 	s64 blkno;
1906 	int leafidx, rc;
1907 
1908 	/* can't be more than a dmaps worth of blocks */
1909 	assert(l2nb <= L2BPERDMAP);
1910 
1911 	/* search the tree within the dmap page for sufficient
1912 	 * free space.  if sufficient free space is found, dbFindLeaf()
1913 	 * returns the index of the leaf at which free space was found.
1914 	 */
1915 	if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx))
1916 		return -ENOSPC;
1917 
1918 	/* determine the block number within the file system corresponding
1919 	 * to the leaf at which free space was found.
1920 	 */
1921 	blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD);
1922 
1923 	/* if not all bits of the dmap word are free, get the starting
1924 	 * bit number within the dmap word of the required string of free
1925 	 * bits and adjust the block number with this value.
1926 	 */
1927 	if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN)
1928 		blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb);
1929 
1930 	/* allocate the blocks */
1931 	if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
1932 		*results = blkno;
1933 
1934 	return (rc);
1935 }
1936 
1937 
1938 /*
1939  * NAME:	dbAllocDmap()
1940  *
1941  * FUNCTION:    adjust the disk allocation map to reflect the allocation
1942  *		of a specified block range within a dmap.
1943  *
1944  *		this routine allocates the specified blocks from the dmap
1945  *		through a call to dbAllocBits(). if the allocation of the
1946  *		block range causes the maximum string of free blocks within
1947  *		the dmap to change (i.e. the value of the root of the dmap's
1948  *		dmtree), this routine will cause this change to be reflected
1949  *		up through the appropriate levels of the dmap control pages
1950  *		by a call to dbAdjCtl() for the L0 dmap control page that
1951  *		covers this dmap.
1952  *
1953  * PARAMETERS:
1954  *      bmp	-  pointer to bmap descriptor
1955  *      dp	-  pointer to dmap to allocate the block range from.
1956  *      blkno	-  starting block number of the block to be allocated.
1957  *      nblocks	-  number of blocks to be allocated.
1958  *
1959  * RETURN VALUES:
1960  *      0	- success
1961  *      -EIO	- i/o error
1962  *
1963  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
1964  */
1965 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
1966 		       int nblocks)
1967 {
1968 	s8 oldroot;
1969 	int rc;
1970 
1971 	/* save the current value of the root (i.e. maximum free string)
1972 	 * of the dmap tree.
1973 	 */
1974 	oldroot = dp->tree.stree[ROOT];
1975 
1976 	/* allocate the specified (blocks) bits */
1977 	dbAllocBits(bmp, dp, blkno, nblocks);
1978 
1979 	/* if the root has not changed, done. */
1980 	if (dp->tree.stree[ROOT] == oldroot)
1981 		return (0);
1982 
1983 	/* root changed. bubble the change up to the dmap control pages.
1984 	 * if the adjustment of the upper level control pages fails,
1985 	 * backout the bit allocation (thus making everything consistent).
1986 	 */
1987 	if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0)))
1988 		dbFreeBits(bmp, dp, blkno, nblocks);
1989 
1990 	return (rc);
1991 }
1992 
1993 
1994 /*
1995  * NAME:	dbFreeDmap()
1996  *
1997  * FUNCTION:    adjust the disk allocation map to reflect the allocation
1998  *		of a specified block range within a dmap.
1999  *
2000  *		this routine frees the specified blocks from the dmap through
2001  *		a call to dbFreeBits(). if the deallocation of the block range
2002  *		causes the maximum string of free blocks within the dmap to
2003  *		change (i.e. the value of the root of the dmap's dmtree), this
2004  *		routine will cause this change to be reflected up through the
2005  *	        appropriate levels of the dmap control pages by a call to
2006  *		dbAdjCtl() for the L0 dmap control page that covers this dmap.
2007  *
2008  * PARAMETERS:
2009  *      bmp	-  pointer to bmap descriptor
2010  *      dp	-  pointer to dmap to free the block range from.
2011  *      blkno	-  starting block number of the block to be freed.
2012  *      nblocks	-  number of blocks to be freed.
2013  *
2014  * RETURN VALUES:
2015  *      0	- success
2016  *      -EIO	- i/o error
2017  *
2018  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2019  */
2020 static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
2021 		      int nblocks)
2022 {
2023 	s8 oldroot;
2024 	int rc = 0, word;
2025 
2026 	/* save the current value of the root (i.e. maximum free string)
2027 	 * of the dmap tree.
2028 	 */
2029 	oldroot = dp->tree.stree[ROOT];
2030 
2031 	/* free the specified (blocks) bits */
2032 	rc = dbFreeBits(bmp, dp, blkno, nblocks);
2033 
2034 	/* if error or the root has not changed, done. */
2035 	if (rc || (dp->tree.stree[ROOT] == oldroot))
2036 		return (rc);
2037 
2038 	/* root changed. bubble the change up to the dmap control pages.
2039 	 * if the adjustment of the upper level control pages fails,
2040 	 * backout the deallocation.
2041 	 */
2042 	if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) {
2043 		word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
2044 
2045 		/* as part of backing out the deallocation, we will have
2046 		 * to back split the dmap tree if the deallocation caused
2047 		 * the freed blocks to become part of a larger binary buddy
2048 		 * system.
2049 		 */
2050 		if (dp->tree.stree[word] == NOFREE)
2051 			dbBackSplit((dmtree_t *) & dp->tree, word);
2052 
2053 		dbAllocBits(bmp, dp, blkno, nblocks);
2054 	}
2055 
2056 	return (rc);
2057 }
2058 
2059 
2060 /*
2061  * NAME:	dbAllocBits()
2062  *
2063  * FUNCTION:    allocate a specified block range from a dmap.
2064  *
2065  *		this routine updates the dmap to reflect the working
2066  *		state allocation of the specified block range. it directly
2067  *		updates the bits of the working map and causes the adjustment
2068  *		of the binary buddy system described by the dmap's dmtree
2069  *		leaves to reflect the bits allocated.  it also causes the
2070  *		dmap's dmtree, as a whole, to reflect the allocated range.
2071  *
2072  * PARAMETERS:
2073  *      bmp	-  pointer to bmap descriptor
2074  *      dp	-  pointer to dmap to allocate bits from.
2075  *      blkno	-  starting block number of the bits to be allocated.
2076  *      nblocks	-  number of bits to be allocated.
2077  *
2078  * RETURN VALUES: none
2079  *
2080  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2081  */
2082 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2083 			int nblocks)
2084 {
2085 	int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
2086 	dmtree_t *tp = (dmtree_t *) & dp->tree;
2087 	int size;
2088 	s8 *leaf;
2089 
2090 	/* pick up a pointer to the leaves of the dmap tree */
2091 	leaf = dp->tree.stree + LEAFIND;
2092 
2093 	/* determine the bit number and word within the dmap of the
2094 	 * starting block.
2095 	 */
2096 	dbitno = blkno & (BPERDMAP - 1);
2097 	word = dbitno >> L2DBWORD;
2098 
2099 	/* block range better be within the dmap */
2100 	assert(dbitno + nblocks <= BPERDMAP);
2101 
2102 	/* allocate the bits of the dmap's words corresponding to the block
2103 	 * range. not all bits of the first and last words may be contained
2104 	 * within the block range.  if this is the case, we'll work against
2105 	 * those words (i.e. partial first and/or last) on an individual basis
2106 	 * (a single pass), allocating the bits of interest by hand and
2107 	 * updating the leaf corresponding to the dmap word. a single pass
2108 	 * will be used for all dmap words fully contained within the
2109 	 * specified range.  within this pass, the bits of all fully contained
2110 	 * dmap words will be marked as free in a single shot and the leaves
2111 	 * will be updated. a single leaf may describe the free space of
2112 	 * multiple dmap words, so we may update only a subset of the actual
2113 	 * leaves corresponding to the dmap words of the block range.
2114 	 */
2115 	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
2116 		/* determine the bit number within the word and
2117 		 * the number of bits within the word.
2118 		 */
2119 		wbitno = dbitno & (DBWORD - 1);
2120 		nb = min(rembits, DBWORD - wbitno);
2121 
2122 		/* check if only part of a word is to be allocated.
2123 		 */
2124 		if (nb < DBWORD) {
2125 			/* allocate (set to 1) the appropriate bits within
2126 			 * this dmap word.
2127 			 */
2128 			dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
2129 						      >> wbitno);
2130 
2131 			/* update the leaf for this dmap word. in addition
2132 			 * to setting the leaf value to the binary buddy max
2133 			 * of the updated dmap word, dbSplit() will split
2134 			 * the binary system of the leaves if need be.
2135 			 */
2136 			dbSplit(tp, word, BUDMIN,
2137 				dbMaxBud((u8 *) & dp->wmap[word]));
2138 
2139 			word += 1;
2140 		} else {
2141 			/* one or more dmap words are fully contained
2142 			 * within the block range.  determine how many
2143 			 * words and allocate (set to 1) the bits of these
2144 			 * words.
2145 			 */
2146 			nwords = rembits >> L2DBWORD;
2147 			memset(&dp->wmap[word], (int) ONES, nwords * 4);
2148 
2149 			/* determine how many bits.
2150 			 */
2151 			nb = nwords << L2DBWORD;
2152 
2153 			/* now update the appropriate leaves to reflect
2154 			 * the allocated words.
2155 			 */
2156 			for (; nwords > 0; nwords -= nw) {
2157 			        if (leaf[word] < BUDMIN) {
2158 					jfs_error(bmp->db_ipbmap->i_sb,
2159 						  "dbAllocBits: leaf page "
2160 						  "corrupt");
2161 					break;
2162 				}
2163 
2164 				/* determine what the leaf value should be
2165 				 * updated to as the minimum of the l2 number
2166 				 * of bits being allocated and the l2 number
2167 				 * of bits currently described by this leaf.
2168 				 */
2169 				size = min((int)leaf[word], NLSTOL2BSZ(nwords));
2170 
2171 				/* update the leaf to reflect the allocation.
2172 				 * in addition to setting the leaf value to
2173 				 * NOFREE, dbSplit() will split the binary
2174 				 * system of the leaves to reflect the current
2175 				 * allocation (size).
2176 				 */
2177 				dbSplit(tp, word, size, NOFREE);
2178 
2179 				/* get the number of dmap words handled */
2180 				nw = BUDSIZE(size, BUDMIN);
2181 				word += nw;
2182 			}
2183 		}
2184 	}
2185 
2186 	/* update the free count for this dmap */
2187 	dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
2188 
2189 	BMAP_LOCK(bmp);
2190 
2191 	/* if this allocation group is completely free,
2192 	 * update the maximum allocation group number if this allocation
2193 	 * group is the new max.
2194 	 */
2195 	agno = blkno >> bmp->db_agl2size;
2196 	if (agno > bmp->db_maxag)
2197 		bmp->db_maxag = agno;
2198 
2199 	/* update the free count for the allocation group and map */
2200 	bmp->db_agfree[agno] -= nblocks;
2201 	bmp->db_nfree -= nblocks;
2202 
2203 	BMAP_UNLOCK(bmp);
2204 }
2205 
2206 
2207 /*
2208  * NAME:	dbFreeBits()
2209  *
2210  * FUNCTION:    free a specified block range from a dmap.
2211  *
2212  *		this routine updates the dmap to reflect the working
2213  *		state allocation of the specified block range. it directly
2214  *		updates the bits of the working map and causes the adjustment
2215  *		of the binary buddy system described by the dmap's dmtree
2216  *		leaves to reflect the bits freed.  it also causes the dmap's
2217  *		dmtree, as a whole, to reflect the deallocated range.
2218  *
2219  * PARAMETERS:
2220  *      bmp	-  pointer to bmap descriptor
2221  *      dp	-  pointer to dmap to free bits from.
2222  *      blkno	-  starting block number of the bits to be freed.
2223  *      nblocks	-  number of bits to be freed.
2224  *
2225  * RETURN VALUES: 0 for success
2226  *
2227  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2228  */
2229 static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2230 		       int nblocks)
2231 {
2232 	int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
2233 	dmtree_t *tp = (dmtree_t *) & dp->tree;
2234 	int rc = 0;
2235 	int size;
2236 
2237 	/* determine the bit number and word within the dmap of the
2238 	 * starting block.
2239 	 */
2240 	dbitno = blkno & (BPERDMAP - 1);
2241 	word = dbitno >> L2DBWORD;
2242 
2243 	/* block range better be within the dmap.
2244 	 */
2245 	assert(dbitno + nblocks <= BPERDMAP);
2246 
2247 	/* free the bits of the dmaps words corresponding to the block range.
2248 	 * not all bits of the first and last words may be contained within
2249 	 * the block range.  if this is the case, we'll work against those
2250 	 * words (i.e. partial first and/or last) on an individual basis
2251 	 * (a single pass), freeing the bits of interest by hand and updating
2252 	 * the leaf corresponding to the dmap word. a single pass will be used
2253 	 * for all dmap words fully contained within the specified range.
2254 	 * within this pass, the bits of all fully contained dmap words will
2255 	 * be marked as free in a single shot and the leaves will be updated. a
2256 	 * single leaf may describe the free space of multiple dmap words,
2257 	 * so we may update only a subset of the actual leaves corresponding
2258 	 * to the dmap words of the block range.
2259 	 *
2260 	 * dbJoin() is used to update leaf values and will join the binary
2261 	 * buddy system of the leaves if the new leaf values indicate this
2262 	 * should be done.
2263 	 */
2264 	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
2265 		/* determine the bit number within the word and
2266 		 * the number of bits within the word.
2267 		 */
2268 		wbitno = dbitno & (DBWORD - 1);
2269 		nb = min(rembits, DBWORD - wbitno);
2270 
2271 		/* check if only part of a word is to be freed.
2272 		 */
2273 		if (nb < DBWORD) {
2274 			/* free (zero) the appropriate bits within this
2275 			 * dmap word.
2276 			 */
2277 			dp->wmap[word] &=
2278 			    cpu_to_le32(~(ONES << (DBWORD - nb)
2279 					  >> wbitno));
2280 
2281 			/* update the leaf for this dmap word.
2282 			 */
2283 			rc = dbJoin(tp, word,
2284 				    dbMaxBud((u8 *) & dp->wmap[word]));
2285 			if (rc)
2286 				return rc;
2287 
2288 			word += 1;
2289 		} else {
2290 			/* one or more dmap words are fully contained
2291 			 * within the block range.  determine how many
2292 			 * words and free (zero) the bits of these words.
2293 			 */
2294 			nwords = rembits >> L2DBWORD;
2295 			memset(&dp->wmap[word], 0, nwords * 4);
2296 
2297 			/* determine how many bits.
2298 			 */
2299 			nb = nwords << L2DBWORD;
2300 
2301 			/* now update the appropriate leaves to reflect
2302 			 * the freed words.
2303 			 */
2304 			for (; nwords > 0; nwords -= nw) {
2305 				/* determine what the leaf value should be
2306 				 * updated to as the minimum of the l2 number
2307 				 * of bits being freed and the l2 (max) number
2308 				 * of bits that can be described by this leaf.
2309 				 */
2310 				size =
2311 				    min(LITOL2BSZ
2312 					(word, L2LPERDMAP, BUDMIN),
2313 					NLSTOL2BSZ(nwords));
2314 
2315 				/* update the leaf.
2316 				 */
2317 				rc = dbJoin(tp, word, size);
2318 				if (rc)
2319 					return rc;
2320 
2321 				/* get the number of dmap words handled.
2322 				 */
2323 				nw = BUDSIZE(size, BUDMIN);
2324 				word += nw;
2325 			}
2326 		}
2327 	}
2328 
2329 	/* update the free count for this dmap.
2330 	 */
2331 	dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
2332 
2333 	BMAP_LOCK(bmp);
2334 
2335 	/* update the free count for the allocation group and
2336 	 * map.
2337 	 */
2338 	agno = blkno >> bmp->db_agl2size;
2339 	bmp->db_nfree += nblocks;
2340 	bmp->db_agfree[agno] += nblocks;
2341 
2342 	/* check if this allocation group is not completely free and
2343 	 * if it is currently the maximum (rightmost) allocation group.
2344 	 * if so, establish the new maximum allocation group number by
2345 	 * searching left for the first allocation group with allocation.
2346 	 */
2347 	if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) ||
2348 	    (agno == bmp->db_numag - 1 &&
2349 	     bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) {
2350 		while (bmp->db_maxag > 0) {
2351 			bmp->db_maxag -= 1;
2352 			if (bmp->db_agfree[bmp->db_maxag] !=
2353 			    bmp->db_agsize)
2354 				break;
2355 		}
2356 
2357 		/* re-establish the allocation group preference if the
2358 		 * current preference is right of the maximum allocation
2359 		 * group.
2360 		 */
2361 		if (bmp->db_agpref > bmp->db_maxag)
2362 			bmp->db_agpref = bmp->db_maxag;
2363 	}
2364 
2365 	BMAP_UNLOCK(bmp);
2366 
2367 	return 0;
2368 }
2369 
2370 
2371 /*
2372  * NAME:	dbAdjCtl()
2373  *
2374  * FUNCTION:	adjust a dmap control page at a specified level to reflect
2375  *		the change in a lower level dmap or dmap control page's
2376  *		maximum string of free blocks (i.e. a change in the root
2377  *		of the lower level object's dmtree) due to the allocation
2378  *		or deallocation of a range of blocks with a single dmap.
2379  *
2380  *		on entry, this routine is provided with the new value of
2381  *		the lower level dmap or dmap control page root and the
2382  *		starting block number of the block range whose allocation
2383  *		or deallocation resulted in the root change.  this range
2384  *		is respresented by a single leaf of the current dmapctl
2385  *		and the leaf will be updated with this value, possibly
2386  *		causing a binary buddy system within the leaves to be
2387  *		split or joined.  the update may also cause the dmapctl's
2388  *		dmtree to be updated.
2389  *
2390  *		if the adjustment of the dmap control page, itself, causes its
2391  *		root to change, this change will be bubbled up to the next dmap
2392  *		control level by a recursive call to this routine, specifying
2393  *		the new root value and the next dmap control page level to
2394  *		be adjusted.
2395  * PARAMETERS:
2396  *      bmp	-  pointer to bmap descriptor
2397  *      blkno	-  the first block of a block range within a dmap.  it is
2398  *		   the allocation or deallocation of this block range that
2399  *		   requires the dmap control page to be adjusted.
2400  *      newval	-  the new value of the lower level dmap or dmap control
2401  *		   page root.
2402  *      alloc	-  TRUE if adjustment is due to an allocation.
2403  *      level	-  current level of dmap control page (i.e. L0, L1, L2) to
2404  *		   be adjusted.
2405  *
2406  * RETURN VALUES:
2407  *      0	- success
2408  *      -EIO	- i/o error
2409  *
2410  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2411  */
2412 static int
2413 dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2414 {
2415 	struct metapage *mp;
2416 	s8 oldroot;
2417 	int oldval;
2418 	s64 lblkno;
2419 	struct dmapctl *dcp;
2420 	int rc, leafno, ti;
2421 
2422 	/* get the buffer for the dmap control page for the specified
2423 	 * block number and control page level.
2424 	 */
2425 	lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level);
2426 	mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
2427 	if (mp == NULL)
2428 		return -EIO;
2429 	dcp = (struct dmapctl *) mp->data;
2430 
2431 	if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
2432 		jfs_error(bmp->db_ipbmap->i_sb,
2433 			  "dbAdjCtl: Corrupt dmapctl page");
2434 		release_metapage(mp);
2435 		return -EIO;
2436 	}
2437 
2438 	/* determine the leaf number corresponding to the block and
2439 	 * the index within the dmap control tree.
2440 	 */
2441 	leafno = BLKTOCTLLEAF(blkno, dcp->budmin);
2442 	ti = leafno + le32_to_cpu(dcp->leafidx);
2443 
2444 	/* save the current leaf value and the current root level (i.e.
2445 	 * maximum l2 free string described by this dmapctl).
2446 	 */
2447 	oldval = dcp->stree[ti];
2448 	oldroot = dcp->stree[ROOT];
2449 
2450 	/* check if this is a control page update for an allocation.
2451 	 * if so, update the leaf to reflect the new leaf value using
2452 	 * dbSplit(); otherwise (deallocation), use dbJoin() to udpate
2453 	 * the leaf with the new value.  in addition to updating the
2454 	 * leaf, dbSplit() will also split the binary buddy system of
2455 	 * the leaves, if required, and bubble new values within the
2456 	 * dmapctl tree, if required.  similarly, dbJoin() will join
2457 	 * the binary buddy system of leaves and bubble new values up
2458 	 * the dmapctl tree as required by the new leaf value.
2459 	 */
2460 	if (alloc) {
2461 		/* check if we are in the middle of a binary buddy
2462 		 * system.  this happens when we are performing the
2463 		 * first allocation out of an allocation group that
2464 		 * is part (not the first part) of a larger binary
2465 		 * buddy system.  if we are in the middle, back split
2466 		 * the system prior to calling dbSplit() which assumes
2467 		 * that it is at the front of a binary buddy system.
2468 		 */
2469 		if (oldval == NOFREE) {
2470 			dbBackSplit((dmtree_t *) dcp, leafno);
2471 			oldval = dcp->stree[ti];
2472 		}
2473 		dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
2474 	} else {
2475 		rc = dbJoin((dmtree_t *) dcp, leafno, newval);
2476 		if (rc)
2477 			return rc;
2478 	}
2479 
2480 	/* check if the root of the current dmap control page changed due
2481 	 * to the update and if the current dmap control page is not at
2482 	 * the current top level (i.e. L0, L1, L2) of the map.  if so (i.e.
2483 	 * root changed and this is not the top level), call this routine
2484 	 * again (recursion) for the next higher level of the mapping to
2485 	 * reflect the change in root for the current dmap control page.
2486 	 */
2487 	if (dcp->stree[ROOT] != oldroot) {
2488 		/* are we below the top level of the map.  if so,
2489 		 * bubble the root up to the next higher level.
2490 		 */
2491 		if (level < bmp->db_maxlevel) {
2492 			/* bubble up the new root of this dmap control page to
2493 			 * the next level.
2494 			 */
2495 			if ((rc =
2496 			     dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc,
2497 				      level + 1))) {
2498 				/* something went wrong in bubbling up the new
2499 				 * root value, so backout the changes to the
2500 				 * current dmap control page.
2501 				 */
2502 				if (alloc) {
2503 					dbJoin((dmtree_t *) dcp, leafno,
2504 					       oldval);
2505 				} else {
2506 					/* the dbJoin() above might have
2507 					 * caused a larger binary buddy system
2508 					 * to form and we may now be in the
2509 					 * middle of it.  if this is the case,
2510 					 * back split the buddies.
2511 					 */
2512 					if (dcp->stree[ti] == NOFREE)
2513 						dbBackSplit((dmtree_t *)
2514 							    dcp, leafno);
2515 					dbSplit((dmtree_t *) dcp, leafno,
2516 						dcp->budmin, oldval);
2517 				}
2518 
2519 				/* release the buffer and return the error.
2520 				 */
2521 				release_metapage(mp);
2522 				return (rc);
2523 			}
2524 		} else {
2525 			/* we're at the top level of the map. update
2526 			 * the bmap control page to reflect the size
2527 			 * of the maximum free buddy system.
2528 			 */
2529 			assert(level == bmp->db_maxlevel);
2530 			if (bmp->db_maxfreebud != oldroot) {
2531 				jfs_error(bmp->db_ipbmap->i_sb,
2532 					  "dbAdjCtl: the maximum free buddy is "
2533 					  "not the old root");
2534 			}
2535 			bmp->db_maxfreebud = dcp->stree[ROOT];
2536 		}
2537 	}
2538 
2539 	/* write the buffer.
2540 	 */
2541 	write_metapage(mp);
2542 
2543 	return (0);
2544 }
2545 
2546 
2547 /*
2548  * NAME:	dbSplit()
2549  *
2550  * FUNCTION:    update the leaf of a dmtree with a new value, splitting
2551  *		the leaf from the binary buddy system of the dmtree's
2552  *		leaves, as required.
2553  *
2554  * PARAMETERS:
2555  *      tp	- pointer to the tree containing the leaf.
2556  *      leafno	- the number of the leaf to be updated.
2557  *      splitsz	- the size the binary buddy system starting at the leaf
2558  *		  must be split to, specified as the log2 number of blocks.
2559  *      newval	- the new value for the leaf.
2560  *
2561  * RETURN VALUES: none
2562  *
2563  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2564  */
2565 static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
2566 {
2567 	int budsz;
2568 	int cursz;
2569 	s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2570 
2571 	/* check if the leaf needs to be split.
2572 	 */
2573 	if (leaf[leafno] > tp->dmt_budmin) {
2574 		/* the split occurs by cutting the buddy system in half
2575 		 * at the specified leaf until we reach the specified
2576 		 * size.  pick up the starting split size (current size
2577 		 * - 1 in l2) and the corresponding buddy size.
2578 		 */
2579 		cursz = leaf[leafno] - 1;
2580 		budsz = BUDSIZE(cursz, tp->dmt_budmin);
2581 
2582 		/* split until we reach the specified size.
2583 		 */
2584 		while (cursz >= splitsz) {
2585 			/* update the buddy's leaf with its new value.
2586 			 */
2587 			dbAdjTree(tp, leafno ^ budsz, cursz);
2588 
2589 			/* on to the next size and buddy.
2590 			 */
2591 			cursz -= 1;
2592 			budsz >>= 1;
2593 		}
2594 	}
2595 
2596 	/* adjust the dmap tree to reflect the specified leaf's new
2597 	 * value.
2598 	 */
2599 	dbAdjTree(tp, leafno, newval);
2600 }
2601 
2602 
2603 /*
2604  * NAME:	dbBackSplit()
2605  *
2606  * FUNCTION:    back split the binary buddy system of dmtree leaves
2607  *		that hold a specified leaf until the specified leaf
2608  *		starts its own binary buddy system.
2609  *
2610  *		the allocators typically perform allocations at the start
2611  *		of binary buddy systems and dbSplit() is used to accomplish
2612  *		any required splits.  in some cases, however, allocation
2613  *		may occur in the middle of a binary system and requires a
2614  *		back split, with the split proceeding out from the middle of
2615  *		the system (less efficient) rather than the start of the
2616  *		system (more efficient).  the cases in which a back split
2617  *		is required are rare and are limited to the first allocation
2618  *		within an allocation group which is a part (not first part)
2619  *		of a larger binary buddy system and a few exception cases
2620  *		in which a previous join operation must be backed out.
2621  *
2622  * PARAMETERS:
2623  *      tp	- pointer to the tree containing the leaf.
2624  *      leafno	- the number of the leaf to be updated.
2625  *
2626  * RETURN VALUES: none
2627  *
2628  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2629  */
2630 static void dbBackSplit(dmtree_t * tp, int leafno)
2631 {
2632 	int budsz, bud, w, bsz, size;
2633 	int cursz;
2634 	s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2635 
2636 	/* leaf should be part (not first part) of a binary
2637 	 * buddy system.
2638 	 */
2639 	assert(leaf[leafno] == NOFREE);
2640 
2641 	/* the back split is accomplished by iteratively finding the leaf
2642 	 * that starts the buddy system that contains the specified leaf and
2643 	 * splitting that system in two.  this iteration continues until
2644 	 * the specified leaf becomes the start of a buddy system.
2645 	 *
2646 	 * determine maximum possible l2 size for the specified leaf.
2647 	 */
2648 	size =
2649 	    LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs),
2650 		      tp->dmt_budmin);
2651 
2652 	/* determine the number of leaves covered by this size.  this
2653 	 * is the buddy size that we will start with as we search for
2654 	 * the buddy system that contains the specified leaf.
2655 	 */
2656 	budsz = BUDSIZE(size, tp->dmt_budmin);
2657 
2658 	/* back split.
2659 	 */
2660 	while (leaf[leafno] == NOFREE) {
2661 		/* find the leftmost buddy leaf.
2662 		 */
2663 		for (w = leafno, bsz = budsz;; bsz <<= 1,
2664 		     w = (w < bud) ? w : bud) {
2665 			assert(bsz < le32_to_cpu(tp->dmt_nleafs));
2666 
2667 			/* determine the buddy.
2668 			 */
2669 			bud = w ^ bsz;
2670 
2671 			/* check if this buddy is the start of the system.
2672 			 */
2673 			if (leaf[bud] != NOFREE) {
2674 				/* split the leaf at the start of the
2675 				 * system in two.
2676 				 */
2677 				cursz = leaf[bud] - 1;
2678 				dbSplit(tp, bud, cursz, cursz);
2679 				break;
2680 			}
2681 		}
2682 	}
2683 
2684 	assert(leaf[leafno] == size);
2685 }
2686 
2687 
2688 /*
2689  * NAME:	dbJoin()
2690  *
2691  * FUNCTION:    update the leaf of a dmtree with a new value, joining
2692  *		the leaf with other leaves of the dmtree into a multi-leaf
2693  *		binary buddy system, as required.
2694  *
2695  * PARAMETERS:
2696  *      tp	- pointer to the tree containing the leaf.
2697  *      leafno	- the number of the leaf to be updated.
2698  *      newval	- the new value for the leaf.
2699  *
2700  * RETURN VALUES: none
2701  */
2702 static int dbJoin(dmtree_t * tp, int leafno, int newval)
2703 {
2704 	int budsz, buddy;
2705 	s8 *leaf;
2706 
2707 	/* can the new leaf value require a join with other leaves ?
2708 	 */
2709 	if (newval >= tp->dmt_budmin) {
2710 		/* pickup a pointer to the leaves of the tree.
2711 		 */
2712 		leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2713 
2714 		/* try to join the specified leaf into a large binary
2715 		 * buddy system.  the join proceeds by attempting to join
2716 		 * the specified leafno with its buddy (leaf) at new value.
2717 		 * if the join occurs, we attempt to join the left leaf
2718 		 * of the joined buddies with its buddy at new value + 1.
2719 		 * we continue to join until we find a buddy that cannot be
2720 		 * joined (does not have a value equal to the size of the
2721 		 * last join) or until all leaves have been joined into a
2722 		 * single system.
2723 		 *
2724 		 * get the buddy size (number of words covered) of
2725 		 * the new value.
2726 		 */
2727 		budsz = BUDSIZE(newval, tp->dmt_budmin);
2728 
2729 		/* try to join.
2730 		 */
2731 		while (budsz < le32_to_cpu(tp->dmt_nleafs)) {
2732 			/* get the buddy leaf.
2733 			 */
2734 			buddy = leafno ^ budsz;
2735 
2736 			/* if the leaf's new value is greater than its
2737 			 * buddy's value, we join no more.
2738 			 */
2739 			if (newval > leaf[buddy])
2740 				break;
2741 
2742 			/* It shouldn't be less */
2743 			if (newval < leaf[buddy])
2744 				return -EIO;
2745 
2746 			/* check which (leafno or buddy) is the left buddy.
2747 			 * the left buddy gets to claim the blocks resulting
2748 			 * from the join while the right gets to claim none.
2749 			 * the left buddy is also eligable to participate in
2750 			 * a join at the next higher level while the right
2751 			 * is not.
2752 			 *
2753 			 */
2754 			if (leafno < buddy) {
2755 				/* leafno is the left buddy.
2756 				 */
2757 				dbAdjTree(tp, buddy, NOFREE);
2758 			} else {
2759 				/* buddy is the left buddy and becomes
2760 				 * leafno.
2761 				 */
2762 				dbAdjTree(tp, leafno, NOFREE);
2763 				leafno = buddy;
2764 			}
2765 
2766 			/* on to try the next join.
2767 			 */
2768 			newval += 1;
2769 			budsz <<= 1;
2770 		}
2771 	}
2772 
2773 	/* update the leaf value.
2774 	 */
2775 	dbAdjTree(tp, leafno, newval);
2776 
2777 	return 0;
2778 }
2779 
2780 
2781 /*
2782  * NAME:	dbAdjTree()
2783  *
2784  * FUNCTION:    update a leaf of a dmtree with a new value, adjusting
2785  *		the dmtree, as required, to reflect the new leaf value.
2786  *		the combination of any buddies must already be done before
2787  *		this is called.
2788  *
2789  * PARAMETERS:
2790  *      tp	- pointer to the tree to be adjusted.
2791  *      leafno	- the number of the leaf to be updated.
2792  *      newval	- the new value for the leaf.
2793  *
2794  * RETURN VALUES: none
2795  */
2796 static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
2797 {
2798 	int lp, pp, k;
2799 	int max;
2800 
2801 	/* pick up the index of the leaf for this leafno.
2802 	 */
2803 	lp = leafno + le32_to_cpu(tp->dmt_leafidx);
2804 
2805 	/* is the current value the same as the old value ?  if so,
2806 	 * there is nothing to do.
2807 	 */
2808 	if (tp->dmt_stree[lp] == newval)
2809 		return;
2810 
2811 	/* set the new value.
2812 	 */
2813 	tp->dmt_stree[lp] = newval;
2814 
2815 	/* bubble the new value up the tree as required.
2816 	 */
2817 	for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) {
2818 		/* get the index of the first leaf of the 4 leaf
2819 		 * group containing the specified leaf (leafno).
2820 		 */
2821 		lp = ((lp - 1) & ~0x03) + 1;
2822 
2823 		/* get the index of the parent of this 4 leaf group.
2824 		 */
2825 		pp = (lp - 1) >> 2;
2826 
2827 		/* determine the maximum of the 4 leaves.
2828 		 */
2829 		max = TREEMAX(&tp->dmt_stree[lp]);
2830 
2831 		/* if the maximum of the 4 is the same as the
2832 		 * parent's value, we're done.
2833 		 */
2834 		if (tp->dmt_stree[pp] == max)
2835 			break;
2836 
2837 		/* parent gets new value.
2838 		 */
2839 		tp->dmt_stree[pp] = max;
2840 
2841 		/* parent becomes leaf for next go-round.
2842 		 */
2843 		lp = pp;
2844 	}
2845 }
2846 
2847 
2848 /*
2849  * NAME:	dbFindLeaf()
2850  *
2851  * FUNCTION:    search a dmtree_t for sufficient free blocks, returning
2852  *		the index of a leaf describing the free blocks if
2853  *		sufficient free blocks are found.
2854  *
2855  *		the search starts at the top of the dmtree_t tree and
2856  *		proceeds down the tree to the leftmost leaf with sufficient
2857  *		free space.
2858  *
2859  * PARAMETERS:
2860  *      tp	- pointer to the tree to be searched.
2861  *      l2nb	- log2 number of free blocks to search for.
2862  *	leafidx	- return pointer to be set to the index of the leaf
2863  *		  describing at least l2nb free blocks if sufficient
2864  *		  free blocks are found.
2865  *
2866  * RETURN VALUES:
2867  *      0	- success
2868  *      -ENOSPC	- insufficient free blocks.
2869  */
2870 static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
2871 {
2872 	int ti, n = 0, k, x = 0;
2873 
2874 	/* first check the root of the tree to see if there is
2875 	 * sufficient free space.
2876 	 */
2877 	if (l2nb > tp->dmt_stree[ROOT])
2878 		return -ENOSPC;
2879 
2880 	/* sufficient free space available. now search down the tree
2881 	 * starting at the next level for the leftmost leaf that
2882 	 * describes sufficient free space.
2883 	 */
2884 	for (k = le32_to_cpu(tp->dmt_height), ti = 1;
2885 	     k > 0; k--, ti = ((ti + n) << 2) + 1) {
2886 		/* search the four nodes at this level, starting from
2887 		 * the left.
2888 		 */
2889 		for (x = ti, n = 0; n < 4; n++) {
2890 			/* sufficient free space found.  move to the next
2891 			 * level (or quit if this is the last level).
2892 			 */
2893 			if (l2nb <= tp->dmt_stree[x + n])
2894 				break;
2895 		}
2896 
2897 		/* better have found something since the higher
2898 		 * levels of the tree said it was here.
2899 		 */
2900 		assert(n < 4);
2901 	}
2902 
2903 	/* set the return to the leftmost leaf describing sufficient
2904 	 * free space.
2905 	 */
2906 	*leafidx = x + n - le32_to_cpu(tp->dmt_leafidx);
2907 
2908 	return (0);
2909 }
2910 
2911 
2912 /*
2913  * NAME:	dbFindBits()
2914  *
2915  * FUNCTION:    find a specified number of binary buddy free bits within a
2916  *		dmap bitmap word value.
2917  *
2918  *		this routine searches the bitmap value for (1 << l2nb) free
2919  *		bits at (1 << l2nb) alignments within the value.
2920  *
2921  * PARAMETERS:
2922  *      word	-  dmap bitmap word value.
2923  *      l2nb	-  number of free bits specified as a log2 number.
2924  *
2925  * RETURN VALUES:
2926  *      starting bit number of free bits.
2927  */
2928 static int dbFindBits(u32 word, int l2nb)
2929 {
2930 	int bitno, nb;
2931 	u32 mask;
2932 
2933 	/* get the number of bits.
2934 	 */
2935 	nb = 1 << l2nb;
2936 	assert(nb <= DBWORD);
2937 
2938 	/* complement the word so we can use a mask (i.e. 0s represent
2939 	 * free bits) and compute the mask.
2940 	 */
2941 	word = ~word;
2942 	mask = ONES << (DBWORD - nb);
2943 
2944 	/* scan the word for nb free bits at nb alignments.
2945 	 */
2946 	for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) {
2947 		if ((mask & word) == mask)
2948 			break;
2949 	}
2950 
2951 	ASSERT(bitno < 32);
2952 
2953 	/* return the bit number.
2954 	 */
2955 	return (bitno);
2956 }
2957 
2958 
2959 /*
2960  * NAME:	dbMaxBud(u8 *cp)
2961  *
2962  * FUNCTION:    determine the largest binary buddy string of free
2963  *		bits within 32-bits of the map.
2964  *
2965  * PARAMETERS:
2966  *      cp	-  pointer to the 32-bit value.
2967  *
2968  * RETURN VALUES:
2969  *      largest binary buddy of free bits within a dmap word.
2970  */
2971 static int dbMaxBud(u8 * cp)
2972 {
2973 	signed char tmp1, tmp2;
2974 
2975 	/* check if the wmap word is all free. if so, the
2976 	 * free buddy size is BUDMIN.
2977 	 */
2978 	if (*((uint *) cp) == 0)
2979 		return (BUDMIN);
2980 
2981 	/* check if the wmap word is half free. if so, the
2982 	 * free buddy size is BUDMIN-1.
2983 	 */
2984 	if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0)
2985 		return (BUDMIN - 1);
2986 
2987 	/* not all free or half free. determine the free buddy
2988 	 * size thru table lookup using quarters of the wmap word.
2989 	 */
2990 	tmp1 = max(budtab[cp[2]], budtab[cp[3]]);
2991 	tmp2 = max(budtab[cp[0]], budtab[cp[1]]);
2992 	return (max(tmp1, tmp2));
2993 }
2994 
2995 
2996 /*
2997  * NAME:	cnttz(uint word)
2998  *
2999  * FUNCTION:    determine the number of trailing zeros within a 32-bit
3000  *		value.
3001  *
3002  * PARAMETERS:
3003  *      value	-  32-bit value to be examined.
3004  *
3005  * RETURN VALUES:
3006  *      count of trailing zeros
3007  */
3008 static int cnttz(u32 word)
3009 {
3010 	int n;
3011 
3012 	for (n = 0; n < 32; n++, word >>= 1) {
3013 		if (word & 0x01)
3014 			break;
3015 	}
3016 
3017 	return (n);
3018 }
3019 
3020 
3021 /*
3022  * NAME:	cntlz(u32 value)
3023  *
3024  * FUNCTION:    determine the number of leading zeros within a 32-bit
3025  *		value.
3026  *
3027  * PARAMETERS:
3028  *      value	-  32-bit value to be examined.
3029  *
3030  * RETURN VALUES:
3031  *      count of leading zeros
3032  */
3033 static int cntlz(u32 value)
3034 {
3035 	int n;
3036 
3037 	for (n = 0; n < 32; n++, value <<= 1) {
3038 		if (value & HIGHORDER)
3039 			break;
3040 	}
3041 	return (n);
3042 }
3043 
3044 
3045 /*
3046  * NAME:	blkstol2(s64 nb)
3047  *
3048  * FUNCTION:	convert a block count to its log2 value. if the block
3049  *	        count is not a l2 multiple, it is rounded up to the next
3050  *		larger l2 multiple.
3051  *
3052  * PARAMETERS:
3053  *      nb	-  number of blocks
3054  *
3055  * RETURN VALUES:
3056  *      log2 number of blocks
3057  */
3058 static int blkstol2(s64 nb)
3059 {
3060 	int l2nb;
3061 	s64 mask;		/* meant to be signed */
3062 
3063 	mask = (s64) 1 << (64 - 1);
3064 
3065 	/* count the leading bits.
3066 	 */
3067 	for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) {
3068 		/* leading bit found.
3069 		 */
3070 		if (nb & mask) {
3071 			/* determine the l2 value.
3072 			 */
3073 			l2nb = (64 - 1) - l2nb;
3074 
3075 			/* check if we need to round up.
3076 			 */
3077 			if (~mask & nb)
3078 				l2nb++;
3079 
3080 			return (l2nb);
3081 		}
3082 	}
3083 	assert(0);
3084 	return 0;		/* fix compiler warning */
3085 }
3086 
3087 
3088 /*
3089  * NAME:    	dbAllocBottomUp()
3090  *
3091  * FUNCTION:	alloc the specified block range from the working block
3092  *		allocation map.
3093  *
3094  *		the blocks will be alloc from the working map one dmap
3095  *		at a time.
3096  *
3097  * PARAMETERS:
3098  *      ip	-  pointer to in-core inode;
3099  *      blkno	-  starting block number to be freed.
3100  *      nblocks	-  number of blocks to be freed.
3101  *
3102  * RETURN VALUES:
3103  *      0	- success
3104  *      -EIO	- i/o error
3105  */
3106 int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
3107 {
3108 	struct metapage *mp;
3109 	struct dmap *dp;
3110 	int nb, rc;
3111 	s64 lblkno, rem;
3112 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
3113 	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
3114 
3115 	IREAD_LOCK(ipbmap);
3116 
3117 	/* block to be allocated better be within the mapsize. */
3118 	ASSERT(nblocks <= bmp->db_mapsize - blkno);
3119 
3120 	/*
3121 	 * allocate the blocks a dmap at a time.
3122 	 */
3123 	mp = NULL;
3124 	for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
3125 		/* release previous dmap if any */
3126 		if (mp) {
3127 			write_metapage(mp);
3128 		}
3129 
3130 		/* get the buffer for the current dmap. */
3131 		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
3132 		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
3133 		if (mp == NULL) {
3134 			IREAD_UNLOCK(ipbmap);
3135 			return -EIO;
3136 		}
3137 		dp = (struct dmap *) mp->data;
3138 
3139 		/* determine the number of blocks to be allocated from
3140 		 * this dmap.
3141 		 */
3142 		nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
3143 
3144 		/* allocate the blocks. */
3145 		if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) {
3146 			release_metapage(mp);
3147 			IREAD_UNLOCK(ipbmap);
3148 			return (rc);
3149 		}
3150 	}
3151 
3152 	/* write the last buffer. */
3153 	write_metapage(mp);
3154 
3155 	IREAD_UNLOCK(ipbmap);
3156 
3157 	return (0);
3158 }
3159 
3160 
3161 static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
3162 			 int nblocks)
3163 {
3164 	int rc;
3165 	int dbitno, word, rembits, nb, nwords, wbitno, agno;
3166 	s8 oldroot, *leaf;
3167 	struct dmaptree *tp = (struct dmaptree *) & dp->tree;
3168 
3169 	/* save the current value of the root (i.e. maximum free string)
3170 	 * of the dmap tree.
3171 	 */
3172 	oldroot = tp->stree[ROOT];
3173 
3174 	/* pick up a pointer to the leaves of the dmap tree */
3175 	leaf = tp->stree + LEAFIND;
3176 
3177 	/* determine the bit number and word within the dmap of the
3178 	 * starting block.
3179 	 */
3180 	dbitno = blkno & (BPERDMAP - 1);
3181 	word = dbitno >> L2DBWORD;
3182 
3183 	/* block range better be within the dmap */
3184 	assert(dbitno + nblocks <= BPERDMAP);
3185 
3186 	/* allocate the bits of the dmap's words corresponding to the block
3187 	 * range. not all bits of the first and last words may be contained
3188 	 * within the block range.  if this is the case, we'll work against
3189 	 * those words (i.e. partial first and/or last) on an individual basis
3190 	 * (a single pass), allocating the bits of interest by hand and
3191 	 * updating the leaf corresponding to the dmap word. a single pass
3192 	 * will be used for all dmap words fully contained within the
3193 	 * specified range.  within this pass, the bits of all fully contained
3194 	 * dmap words will be marked as free in a single shot and the leaves
3195 	 * will be updated. a single leaf may describe the free space of
3196 	 * multiple dmap words, so we may update only a subset of the actual
3197 	 * leaves corresponding to the dmap words of the block range.
3198 	 */
3199 	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
3200 		/* determine the bit number within the word and
3201 		 * the number of bits within the word.
3202 		 */
3203 		wbitno = dbitno & (DBWORD - 1);
3204 		nb = min(rembits, DBWORD - wbitno);
3205 
3206 		/* check if only part of a word is to be allocated.
3207 		 */
3208 		if (nb < DBWORD) {
3209 			/* allocate (set to 1) the appropriate bits within
3210 			 * this dmap word.
3211 			 */
3212 			dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
3213 						      >> wbitno);
3214 
3215 			word++;
3216 		} else {
3217 			/* one or more dmap words are fully contained
3218 			 * within the block range.  determine how many
3219 			 * words and allocate (set to 1) the bits of these
3220 			 * words.
3221 			 */
3222 			nwords = rembits >> L2DBWORD;
3223 			memset(&dp->wmap[word], (int) ONES, nwords * 4);
3224 
3225 			/* determine how many bits */
3226 			nb = nwords << L2DBWORD;
3227 			word += nwords;
3228 		}
3229 	}
3230 
3231 	/* update the free count for this dmap */
3232 	dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
3233 
3234 	/* reconstruct summary tree */
3235 	dbInitDmapTree(dp);
3236 
3237 	BMAP_LOCK(bmp);
3238 
3239 	/* if this allocation group is completely free,
3240 	 * update the highest active allocation group number
3241 	 * if this allocation group is the new max.
3242 	 */
3243 	agno = blkno >> bmp->db_agl2size;
3244 	if (agno > bmp->db_maxag)
3245 		bmp->db_maxag = agno;
3246 
3247 	/* update the free count for the allocation group and map */
3248 	bmp->db_agfree[agno] -= nblocks;
3249 	bmp->db_nfree -= nblocks;
3250 
3251 	BMAP_UNLOCK(bmp);
3252 
3253 	/* if the root has not changed, done. */
3254 	if (tp->stree[ROOT] == oldroot)
3255 		return (0);
3256 
3257 	/* root changed. bubble the change up to the dmap control pages.
3258 	 * if the adjustment of the upper level control pages fails,
3259 	 * backout the bit allocation (thus making everything consistent).
3260 	 */
3261 	if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0)))
3262 		dbFreeBits(bmp, dp, blkno, nblocks);
3263 
3264 	return (rc);
3265 }
3266 
3267 
3268 /*
3269  * NAME:	dbExtendFS()
3270  *
3271  * FUNCTION:	extend bmap from blkno for nblocks;
3272  * 		dbExtendFS() updates bmap ready for dbAllocBottomUp();
3273  *
3274  * L2
3275  *  |
3276  *   L1---------------------------------L1
3277  *    |                                  |
3278  *     L0---------L0---------L0           L0---------L0---------L0
3279  *      |          |          |            |          |          |
3280  *       d0,...,dn  d0,...,dn  d0,...,dn    d0,...,dn  d0,...,dn  d0,.,dm;
3281  * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
3282  *
3283  * <---old---><----------------------------extend----------------------->
3284  */
3285 int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
3286 {
3287 	struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb);
3288 	int nbperpage = sbi->nbperpage;
3289 	int i, i0 = TRUE, j, j0 = TRUE, k, n;
3290 	s64 newsize;
3291 	s64 p;
3292 	struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL;
3293 	struct dmapctl *l2dcp, *l1dcp, *l0dcp;
3294 	struct dmap *dp;
3295 	s8 *l0leaf, *l1leaf, *l2leaf;
3296 	struct bmap *bmp = sbi->bmap;
3297 	int agno, l2agsize, oldl2agsize;
3298 	s64 ag_rem;
3299 
3300 	newsize = blkno + nblocks;
3301 
3302 	jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld",
3303 		 (long long) blkno, (long long) nblocks, (long long) newsize);
3304 
3305 	/*
3306 	 *      initialize bmap control page.
3307 	 *
3308 	 * all the data in bmap control page should exclude
3309 	 * the mkfs hidden dmap page.
3310 	 */
3311 
3312 	/* update mapsize */
3313 	bmp->db_mapsize = newsize;
3314 	bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize);
3315 
3316 	/* compute new AG size */
3317 	l2agsize = dbGetL2AGSize(newsize);
3318 	oldl2agsize = bmp->db_agl2size;
3319 
3320 	bmp->db_agl2size = l2agsize;
3321 	bmp->db_agsize = 1 << l2agsize;
3322 
3323 	/* compute new number of AG */
3324 	agno = bmp->db_numag;
3325 	bmp->db_numag = newsize >> l2agsize;
3326 	bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;
3327 
3328 	/*
3329 	 *      reconfigure db_agfree[]
3330 	 * from old AG configuration to new AG configuration;
3331 	 *
3332 	 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
3333 	 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
3334 	 * note: new AG size = old AG size * (2**x).
3335 	 */
3336 	if (l2agsize == oldl2agsize)
3337 		goto extend;
3338 	k = 1 << (l2agsize - oldl2agsize);
3339 	ag_rem = bmp->db_agfree[0];	/* save agfree[0] */
3340 	for (i = 0, n = 0; i < agno; n++) {
3341 		bmp->db_agfree[n] = 0;	/* init collection point */
3342 
3343 		/* coalesce cotiguous k AGs; */
3344 		for (j = 0; j < k && i < agno; j++, i++) {
3345 			/* merge AGi to AGn */
3346 			bmp->db_agfree[n] += bmp->db_agfree[i];
3347 		}
3348 	}
3349 	bmp->db_agfree[0] += ag_rem;	/* restore agfree[0] */
3350 
3351 	for (; n < MAXAG; n++)
3352 		bmp->db_agfree[n] = 0;
3353 
3354 	/*
3355 	 * update highest active ag number
3356 	 */
3357 
3358 	bmp->db_maxag = bmp->db_maxag / k;
3359 
3360 	/*
3361 	 *      extend bmap
3362 	 *
3363 	 * update bit maps and corresponding level control pages;
3364 	 * global control page db_nfree, db_agfree[agno], db_maxfreebud;
3365 	 */
3366       extend:
3367 	/* get L2 page */
3368 	p = BMAPBLKNO + nbperpage;	/* L2 page */
3369 	l2mp = read_metapage(ipbmap, p, PSIZE, 0);
3370 	if (!l2mp) {
3371 		jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read");
3372 		return -EIO;
3373 	}
3374 	l2dcp = (struct dmapctl *) l2mp->data;
3375 
3376 	/* compute start L1 */
3377 	k = blkno >> L2MAXL1SIZE;
3378 	l2leaf = l2dcp->stree + CTLLEAFIND + k;
3379 	p = BLKTOL1(blkno, sbi->l2nbperpage);	/* L1 page */
3380 
3381 	/*
3382 	 * extend each L1 in L2
3383 	 */
3384 	for (; k < LPERCTL; k++, p += nbperpage) {
3385 		/* get L1 page */
3386 		if (j0) {
3387 			/* read in L1 page: (blkno & (MAXL1SIZE - 1)) */
3388 			l1mp = read_metapage(ipbmap, p, PSIZE, 0);
3389 			if (l1mp == NULL)
3390 				goto errout;
3391 			l1dcp = (struct dmapctl *) l1mp->data;
3392 
3393 			/* compute start L0 */
3394 			j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE;
3395 			l1leaf = l1dcp->stree + CTLLEAFIND + j;
3396 			p = BLKTOL0(blkno, sbi->l2nbperpage);
3397 			j0 = FALSE;
3398 		} else {
3399 			/* assign/init L1 page */
3400 			l1mp = get_metapage(ipbmap, p, PSIZE, 0);
3401 			if (l1mp == NULL)
3402 				goto errout;
3403 
3404 			l1dcp = (struct dmapctl *) l1mp->data;
3405 
3406 			/* compute start L0 */
3407 			j = 0;
3408 			l1leaf = l1dcp->stree + CTLLEAFIND;
3409 			p += nbperpage;	/* 1st L0 of L1.k  */
3410 		}
3411 
3412 		/*
3413 		 * extend each L0 in L1
3414 		 */
3415 		for (; j < LPERCTL; j++) {
3416 			/* get L0 page */
3417 			if (i0) {
3418 				/* read in L0 page: (blkno & (MAXL0SIZE - 1)) */
3419 
3420 				l0mp = read_metapage(ipbmap, p, PSIZE, 0);
3421 				if (l0mp == NULL)
3422 					goto errout;
3423 				l0dcp = (struct dmapctl *) l0mp->data;
3424 
3425 				/* compute start dmap */
3426 				i = (blkno & (MAXL0SIZE - 1)) >>
3427 				    L2BPERDMAP;
3428 				l0leaf = l0dcp->stree + CTLLEAFIND + i;
3429 				p = BLKTODMAP(blkno,
3430 					      sbi->l2nbperpage);
3431 				i0 = FALSE;
3432 			} else {
3433 				/* assign/init L0 page */
3434 				l0mp = get_metapage(ipbmap, p, PSIZE, 0);
3435 				if (l0mp == NULL)
3436 					goto errout;
3437 
3438 				l0dcp = (struct dmapctl *) l0mp->data;
3439 
3440 				/* compute start dmap */
3441 				i = 0;
3442 				l0leaf = l0dcp->stree + CTLLEAFIND;
3443 				p += nbperpage;	/* 1st dmap of L0.j */
3444 			}
3445 
3446 			/*
3447 			 * extend each dmap in L0
3448 			 */
3449 			for (; i < LPERCTL; i++) {
3450 				/*
3451 				 * reconstruct the dmap page, and
3452 				 * initialize corresponding parent L0 leaf
3453 				 */
3454 				if ((n = blkno & (BPERDMAP - 1))) {
3455 					/* read in dmap page: */
3456 					mp = read_metapage(ipbmap, p,
3457 							   PSIZE, 0);
3458 					if (mp == NULL)
3459 						goto errout;
3460 					n = min(nblocks, (s64)BPERDMAP - n);
3461 				} else {
3462 					/* assign/init dmap page */
3463 					mp = read_metapage(ipbmap, p,
3464 							   PSIZE, 0);
3465 					if (mp == NULL)
3466 						goto errout;
3467 
3468 					n = min(nblocks, (s64)BPERDMAP);
3469 				}
3470 
3471 				dp = (struct dmap *) mp->data;
3472 				*l0leaf = dbInitDmap(dp, blkno, n);
3473 
3474 				bmp->db_nfree += n;
3475 				agno = le64_to_cpu(dp->start) >> l2agsize;
3476 				bmp->db_agfree[agno] += n;
3477 
3478 				write_metapage(mp);
3479 
3480 				l0leaf++;
3481 				p += nbperpage;
3482 
3483 				blkno += n;
3484 				nblocks -= n;
3485 				if (nblocks == 0)
3486 					break;
3487 			}	/* for each dmap in a L0 */
3488 
3489 			/*
3490 			 * build current L0 page from its leaves, and
3491 			 * initialize corresponding parent L1 leaf
3492 			 */
3493 			*l1leaf = dbInitDmapCtl(l0dcp, 0, ++i);
3494 			write_metapage(l0mp);
3495 			l0mp = NULL;
3496 
3497 			if (nblocks)
3498 				l1leaf++;	/* continue for next L0 */
3499 			else {
3500 				/* more than 1 L0 ? */
3501 				if (j > 0)
3502 					break;	/* build L1 page */
3503 				else {
3504 					/* summarize in global bmap page */
3505 					bmp->db_maxfreebud = *l1leaf;
3506 					release_metapage(l1mp);
3507 					release_metapage(l2mp);
3508 					goto finalize;
3509 				}
3510 			}
3511 		}		/* for each L0 in a L1 */
3512 
3513 		/*
3514 		 * build current L1 page from its leaves, and
3515 		 * initialize corresponding parent L2 leaf
3516 		 */
3517 		*l2leaf = dbInitDmapCtl(l1dcp, 1, ++j);
3518 		write_metapage(l1mp);
3519 		l1mp = NULL;
3520 
3521 		if (nblocks)
3522 			l2leaf++;	/* continue for next L1 */
3523 		else {
3524 			/* more than 1 L1 ? */
3525 			if (k > 0)
3526 				break;	/* build L2 page */
3527 			else {
3528 				/* summarize in global bmap page */
3529 				bmp->db_maxfreebud = *l2leaf;
3530 				release_metapage(l2mp);
3531 				goto finalize;
3532 			}
3533 		}
3534 	}			/* for each L1 in a L2 */
3535 
3536 	jfs_error(ipbmap->i_sb,
3537 		  "dbExtendFS: function has not returned as expected");
3538 errout:
3539 	if (l0mp)
3540 		release_metapage(l0mp);
3541 	if (l1mp)
3542 		release_metapage(l1mp);
3543 	release_metapage(l2mp);
3544 	return -EIO;
3545 
3546 	/*
3547 	 *      finalize bmap control page
3548 	 */
3549 finalize:
3550 
3551 	return 0;
3552 }
3553 
3554 
3555 /*
3556  *	dbFinalizeBmap()
3557  */
3558 void dbFinalizeBmap(struct inode *ipbmap)
3559 {
3560 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
3561 	int actags, inactags, l2nl;
3562 	s64 ag_rem, actfree, inactfree, avgfree;
3563 	int i, n;
3564 
3565 	/*
3566 	 *      finalize bmap control page
3567 	 */
3568 //finalize:
3569 	/*
3570 	 * compute db_agpref: preferred ag to allocate from
3571 	 * (the leftmost ag with average free space in it);
3572 	 */
3573 //agpref:
3574 	/* get the number of active ags and inacitve ags */
3575 	actags = bmp->db_maxag + 1;
3576 	inactags = bmp->db_numag - actags;
3577 	ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1);	/* ??? */
3578 
3579 	/* determine how many blocks are in the inactive allocation
3580 	 * groups. in doing this, we must account for the fact that
3581 	 * the rightmost group might be a partial group (i.e. file
3582 	 * system size is not a multiple of the group size).
3583 	 */
3584 	inactfree = (inactags && ag_rem) ?
3585 	    ((inactags - 1) << bmp->db_agl2size) + ag_rem
3586 	    : inactags << bmp->db_agl2size;
3587 
3588 	/* determine how many free blocks are in the active
3589 	 * allocation groups plus the average number of free blocks
3590 	 * within the active ags.
3591 	 */
3592 	actfree = bmp->db_nfree - inactfree;
3593 	avgfree = (u32) actfree / (u32) actags;
3594 
3595 	/* if the preferred allocation group has not average free space.
3596 	 * re-establish the preferred group as the leftmost
3597 	 * group with average free space.
3598 	 */
3599 	if (bmp->db_agfree[bmp->db_agpref] < avgfree) {
3600 		for (bmp->db_agpref = 0; bmp->db_agpref < actags;
3601 		     bmp->db_agpref++) {
3602 			if (bmp->db_agfree[bmp->db_agpref] >= avgfree)
3603 				break;
3604 		}
3605 		if (bmp->db_agpref >= bmp->db_numag) {
3606 			jfs_error(ipbmap->i_sb,
3607 				  "cannot find ag with average freespace");
3608 		}
3609 	}
3610 
3611 	/*
3612 	 * compute db_aglevel, db_agheigth, db_width, db_agstart:
3613 	 * an ag is covered in aglevel dmapctl summary tree,
3614 	 * at agheight level height (from leaf) with agwidth number of nodes
3615 	 * each, which starts at agstart index node of the smmary tree node
3616 	 * array;
3617 	 */
3618 	bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
3619 	l2nl =
3620 	    bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
3621 	bmp->db_agheigth = l2nl >> 1;
3622 	bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1));
3623 	for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0;
3624 	     i--) {
3625 		bmp->db_agstart += n;
3626 		n <<= 2;
3627 	}
3628 
3629 }
3630 
3631 
3632 /*
3633  * NAME:	dbInitDmap()/ujfs_idmap_page()
3634  *
3635  * FUNCTION:	initialize working/persistent bitmap of the dmap page
3636  *		for the specified number of blocks:
3637  *
3638  *		at entry, the bitmaps had been initialized as free (ZEROS);
3639  *		The number of blocks will only account for the actually
3640  *		existing blocks. Blocks which don't actually exist in
3641  *		the aggregate will be marked as allocated (ONES);
3642  *
3643  * PARAMETERS:
3644  *	dp	- pointer to page of map
3645  *	nblocks	- number of blocks this page
3646  *
3647  * RETURNS: NONE
3648  */
3649 static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
3650 {
3651 	int blkno, w, b, r, nw, nb, i;
3652 
3653 	/* starting block number within the dmap */
3654 	blkno = Blkno & (BPERDMAP - 1);
3655 
3656 	if (blkno == 0) {
3657 		dp->nblocks = dp->nfree = cpu_to_le32(nblocks);
3658 		dp->start = cpu_to_le64(Blkno);
3659 
3660 		if (nblocks == BPERDMAP) {
3661 			memset(&dp->wmap[0], 0, LPERDMAP * 4);
3662 			memset(&dp->pmap[0], 0, LPERDMAP * 4);
3663 			goto initTree;
3664 		}
3665 	} else {
3666 		dp->nblocks =
3667 		    cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks);
3668 		dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
3669 	}
3670 
3671 	/* word number containing start block number */
3672 	w = blkno >> L2DBWORD;
3673 
3674 	/*
3675 	 * free the bits corresponding to the block range (ZEROS):
3676 	 * note: not all bits of the first and last words may be contained
3677 	 * within the block range.
3678 	 */
3679 	for (r = nblocks; r > 0; r -= nb, blkno += nb) {
3680 		/* number of bits preceding range to be freed in the word */
3681 		b = blkno & (DBWORD - 1);
3682 		/* number of bits to free in the word */
3683 		nb = min(r, DBWORD - b);
3684 
3685 		/* is partial word to be freed ? */
3686 		if (nb < DBWORD) {
3687 			/* free (set to 0) from the bitmap word */
3688 			dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
3689 						     >> b));
3690 			dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
3691 						     >> b));
3692 
3693 			/* skip the word freed */
3694 			w++;
3695 		} else {
3696 			/* free (set to 0) contiguous bitmap words */
3697 			nw = r >> L2DBWORD;
3698 			memset(&dp->wmap[w], 0, nw * 4);
3699 			memset(&dp->pmap[w], 0, nw * 4);
3700 
3701 			/* skip the words freed */
3702 			nb = nw << L2DBWORD;
3703 			w += nw;
3704 		}
3705 	}
3706 
3707 	/*
3708 	 * mark bits following the range to be freed (non-existing
3709 	 * blocks) as allocated (ONES)
3710 	 */
3711 
3712 	if (blkno == BPERDMAP)
3713 		goto initTree;
3714 
3715 	/* the first word beyond the end of existing blocks */
3716 	w = blkno >> L2DBWORD;
3717 
3718 	/* does nblocks fall on a 32-bit boundary ? */
3719 	b = blkno & (DBWORD - 1);
3720 	if (b) {
3721 		/* mark a partial word allocated */
3722 		dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b);
3723 		w++;
3724 	}
3725 
3726 	/* set the rest of the words in the page to allocated (ONES) */
3727 	for (i = w; i < LPERDMAP; i++)
3728 		dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES);
3729 
3730 	/*
3731 	 * init tree
3732 	 */
3733       initTree:
3734 	return (dbInitDmapTree(dp));
3735 }
3736 
3737 
3738 /*
3739  * NAME:	dbInitDmapTree()/ujfs_complete_dmap()
3740  *
3741  * FUNCTION:	initialize summary tree of the specified dmap:
3742  *
3743  *		at entry, bitmap of the dmap has been initialized;
3744  *
3745  * PARAMETERS:
3746  *	dp	- dmap to complete
3747  *	blkno	- starting block number for this dmap
3748  *	treemax	- will be filled in with max free for this dmap
3749  *
3750  * RETURNS:	max free string at the root of the tree
3751  */
3752 static int dbInitDmapTree(struct dmap * dp)
3753 {
3754 	struct dmaptree *tp;
3755 	s8 *cp;
3756 	int i;
3757 
3758 	/* init fixed info of tree */
3759 	tp = &dp->tree;
3760 	tp->nleafs = cpu_to_le32(LPERDMAP);
3761 	tp->l2nleafs = cpu_to_le32(L2LPERDMAP);
3762 	tp->leafidx = cpu_to_le32(LEAFIND);
3763 	tp->height = cpu_to_le32(4);
3764 	tp->budmin = BUDMIN;
3765 
3766 	/* init each leaf from corresponding wmap word:
3767 	 * note: leaf is set to NOFREE(-1) if all blocks of corresponding
3768 	 * bitmap word are allocated.
3769 	 */
3770 	cp = tp->stree + le32_to_cpu(tp->leafidx);
3771 	for (i = 0; i < LPERDMAP; i++)
3772 		*cp++ = dbMaxBud((u8 *) & dp->wmap[i]);
3773 
3774 	/* build the dmap's binary buddy summary tree */
3775 	return (dbInitTree(tp));
3776 }
3777 
3778 
3779 /*
3780  * NAME:	dbInitTree()/ujfs_adjtree()
3781  *
3782  * FUNCTION:	initialize binary buddy summary tree of a dmap or dmapctl.
3783  *
3784  *		at entry, the leaves of the tree has been initialized
3785  *		from corresponding bitmap word or root of summary tree
3786  *		of the child control page;
3787  *		configure binary buddy system at the leaf level, then
3788  *		bubble up the values of the leaf nodes up the tree.
3789  *
3790  * PARAMETERS:
3791  *	cp	- Pointer to the root of the tree
3792  *	l2leaves- Number of leaf nodes as a power of 2
3793  *	l2min	- Number of blocks that can be covered by a leaf
3794  *		  as a power of 2
3795  *
3796  * RETURNS: max free string at the root of the tree
3797  */
3798 static int dbInitTree(struct dmaptree * dtp)
3799 {
3800 	int l2max, l2free, bsize, nextb, i;
3801 	int child, parent, nparent;
3802 	s8 *tp, *cp, *cp1;
3803 
3804 	tp = dtp->stree;
3805 
3806 	/* Determine the maximum free string possible for the leaves */
3807 	l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin;
3808 
3809 	/*
3810 	 * configure the leaf levevl into binary buddy system
3811 	 *
3812 	 * Try to combine buddies starting with a buddy size of 1
3813 	 * (i.e. two leaves). At a buddy size of 1 two buddy leaves
3814 	 * can be combined if both buddies have a maximum free of l2min;
3815 	 * the combination will result in the left-most buddy leaf having
3816 	 * a maximum free of l2min+1.
3817 	 * After processing all buddies for a given size, process buddies
3818 	 * at the next higher buddy size (i.e. current size * 2) and
3819 	 * the next maximum free (current free + 1).
3820 	 * This continues until the maximum possible buddy combination
3821 	 * yields maximum free.
3822 	 */
3823 	for (l2free = dtp->budmin, bsize = 1; l2free < l2max;
3824 	     l2free++, bsize = nextb) {
3825 		/* get next buddy size == current buddy pair size */
3826 		nextb = bsize << 1;
3827 
3828 		/* scan each adjacent buddy pair at current buddy size */
3829 		for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx);
3830 		     i < le32_to_cpu(dtp->nleafs);
3831 		     i += nextb, cp += nextb) {
3832 			/* coalesce if both adjacent buddies are max free */
3833 			if (*cp == l2free && *(cp + bsize) == l2free) {
3834 				*cp = l2free + 1;	/* left take right */
3835 				*(cp + bsize) = -1;	/* right give left */
3836 			}
3837 		}
3838 	}
3839 
3840 	/*
3841 	 * bubble summary information of leaves up the tree.
3842 	 *
3843 	 * Starting at the leaf node level, the four nodes described by
3844 	 * the higher level parent node are compared for a maximum free and
3845 	 * this maximum becomes the value of the parent node.
3846 	 * when all lower level nodes are processed in this fashion then
3847 	 * move up to the next level (parent becomes a lower level node) and
3848 	 * continue the process for that level.
3849 	 */
3850 	for (child = le32_to_cpu(dtp->leafidx),
3851 	     nparent = le32_to_cpu(dtp->nleafs) >> 2;
3852 	     nparent > 0; nparent >>= 2, child = parent) {
3853 		/* get index of 1st node of parent level */
3854 		parent = (child - 1) >> 2;
3855 
3856 		/* set the value of the parent node as the maximum
3857 		 * of the four nodes of the current level.
3858 		 */
3859 		for (i = 0, cp = tp + child, cp1 = tp + parent;
3860 		     i < nparent; i++, cp += 4, cp1++)
3861 			*cp1 = TREEMAX(cp);
3862 	}
3863 
3864 	return (*tp);
3865 }
3866 
3867 
3868 /*
3869  *	dbInitDmapCtl()
3870  *
3871  * function: initialize dmapctl page
3872  */
3873 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i)
3874 {				/* start leaf index not covered by range */
3875 	s8 *cp;
3876 
3877 	dcp->nleafs = cpu_to_le32(LPERCTL);
3878 	dcp->l2nleafs = cpu_to_le32(L2LPERCTL);
3879 	dcp->leafidx = cpu_to_le32(CTLLEAFIND);
3880 	dcp->height = cpu_to_le32(5);
3881 	dcp->budmin = L2BPERDMAP + L2LPERCTL * level;
3882 
3883 	/*
3884 	 * initialize the leaves of current level that were not covered
3885 	 * by the specified input block range (i.e. the leaves have no
3886 	 * low level dmapctl or dmap).
3887 	 */
3888 	cp = &dcp->stree[CTLLEAFIND + i];
3889 	for (; i < LPERCTL; i++)
3890 		*cp++ = NOFREE;
3891 
3892 	/* build the dmap's binary buddy summary tree */
3893 	return (dbInitTree((struct dmaptree *) dcp));
3894 }
3895 
3896 
3897 /*
3898  * NAME:	dbGetL2AGSize()/ujfs_getagl2size()
3899  *
3900  * FUNCTION:	Determine log2(allocation group size) from aggregate size
3901  *
3902  * PARAMETERS:
3903  *	nblocks	- Number of blocks in aggregate
3904  *
3905  * RETURNS: log2(allocation group size) in aggregate blocks
3906  */
3907 static int dbGetL2AGSize(s64 nblocks)
3908 {
3909 	s64 sz;
3910 	s64 m;
3911 	int l2sz;
3912 
3913 	if (nblocks < BPERDMAP * MAXAG)
3914 		return (L2BPERDMAP);
3915 
3916 	/* round up aggregate size to power of 2 */
3917 	m = ((u64) 1 << (64 - 1));
3918 	for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) {
3919 		if (m & nblocks)
3920 			break;
3921 	}
3922 
3923 	sz = (s64) 1 << l2sz;
3924 	if (sz < nblocks)
3925 		l2sz += 1;
3926 
3927 	/* agsize = roundupSize/max_number_of_ag */
3928 	return (l2sz - L2MAXAG);
3929 }
3930 
3931 
3932 /*
3933  * NAME:	dbMapFileSizeToMapSize()
3934  *
3935  * FUNCTION:	compute number of blocks the block allocation map file
3936  *		can cover from the map file size;
3937  *
3938  * RETURNS:	Number of blocks which can be covered by this block map file;
3939  */
3940 
3941 /*
3942  * maximum number of map pages at each level including control pages
3943  */
3944 #define MAXL0PAGES	(1 + LPERCTL)
3945 #define MAXL1PAGES	(1 + LPERCTL * MAXL0PAGES)
3946 #define MAXL2PAGES	(1 + LPERCTL * MAXL1PAGES)
3947 
3948 /*
3949  * convert number of map pages to the zero origin top dmapctl level
3950  */
3951 #define BMAPPGTOLEV(npages)	\
3952 	(((npages) <= 3 + MAXL0PAGES) ? 0 \
3953        : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2)
3954 
3955 s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
3956 {
3957 	struct super_block *sb = ipbmap->i_sb;
3958 	s64 nblocks;
3959 	s64 npages, ndmaps;
3960 	int level, i;
3961 	int complete, factor;
3962 
3963 	nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize;
3964 	npages = nblocks >> JFS_SBI(sb)->l2nbperpage;
3965 	level = BMAPPGTOLEV(npages);
3966 
3967 	/* At each level, accumulate the number of dmap pages covered by
3968 	 * the number of full child levels below it;
3969 	 * repeat for the last incomplete child level.
3970 	 */
3971 	ndmaps = 0;
3972 	npages--;		/* skip the first global control page */
3973 	/* skip higher level control pages above top level covered by map */
3974 	npages -= (2 - level);
3975 	npages--;		/* skip top level's control page */
3976 	for (i = level; i >= 0; i--) {
3977 		factor =
3978 		    (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1);
3979 		complete = (u32) npages / factor;
3980 		ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL
3981 				      : ((i == 1) ? LPERCTL : 1));
3982 
3983 		/* pages in last/incomplete child */
3984 		npages = (u32) npages % factor;
3985 		/* skip incomplete child's level control page */
3986 		npages--;
3987 	}
3988 
3989 	/* convert the number of dmaps into the number of blocks
3990 	 * which can be covered by the dmaps;
3991 	 */
3992 	nblocks = ndmaps << L2BPERDMAP;
3993 
3994 	return (nblocks);
3995 }
3996