xref: /linux/fs/ocfs2/suballoc.c (revision a33f32244d8550da8b4a26e277ce07d5c6d158b5)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26 
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31 
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34 
35 #include "ocfs2.h"
36 
37 #include "alloc.h"
38 #include "blockcheck.h"
39 #include "dlmglue.h"
40 #include "inode.h"
41 #include "journal.h"
42 #include "localalloc.h"
43 #include "suballoc.h"
44 #include "super.h"
45 #include "sysfile.h"
46 #include "uptodate.h"
47 
48 #include "buffer_head_io.h"
49 
/* Flags passed down the reservation paths in this file. */
#define NOT_ALLOC_NEW_GROUP		0x0	/* fail with -ENOSPC instead of growing */
#define ALLOC_NEW_GROUP			0x1	/* a full allocator may add a block group */
#define ALLOC_GROUPS_FROM_GLOBAL	0x2	/* skip the local alloc when reserving clusters */

/* Steal count below which a recorded steal slot keeps being used. */
#define OCFS2_MAX_TO_STEAL		1024
55 
56 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
58 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
59 static int ocfs2_block_group_fill(handle_t *handle,
60 				  struct inode *alloc_inode,
61 				  struct buffer_head *bg_bh,
62 				  u64 group_blkno,
63 				  u16 my_chain,
64 				  struct ocfs2_chain_list *cl);
65 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
66 				   struct inode *alloc_inode,
67 				   struct buffer_head *bh,
68 				   u64 max_block,
69 				   u64 *last_alloc_group,
70 				   int flags);
71 
72 static int ocfs2_cluster_group_search(struct inode *inode,
73 				      struct buffer_head *group_bh,
74 				      u32 bits_wanted, u32 min_bits,
75 				      u64 max_block,
76 				      u16 *bit_off, u16 *bits_found);
77 static int ocfs2_block_group_search(struct inode *inode,
78 				    struct buffer_head *group_bh,
79 				    u32 bits_wanted, u32 min_bits,
80 				    u64 max_block,
81 				    u16 *bit_off, u16 *bits_found);
82 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
83 				     struct ocfs2_alloc_context *ac,
84 				     handle_t *handle,
85 				     u32 bits_wanted,
86 				     u32 min_bits,
87 				     u16 *bit_off,
88 				     unsigned int *num_bits,
89 				     u64 *bg_blkno);
90 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
91 					 int nr);
92 static inline int ocfs2_block_group_set_bits(handle_t *handle,
93 					     struct inode *alloc_inode,
94 					     struct ocfs2_group_desc *bg,
95 					     struct buffer_head *group_bh,
96 					     unsigned int bit_off,
97 					     unsigned int num_bits);
98 static int ocfs2_relink_block_group(handle_t *handle,
99 				    struct inode *alloc_inode,
100 				    struct buffer_head *fe_bh,
101 				    struct buffer_head *bg_bh,
102 				    struct buffer_head *prev_bg_bh,
103 				    u16 chain);
104 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
105 						     u32 wanted);
106 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
107 						   u64 bg_blkno,
108 						   u16 bg_bit_off);
109 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
110 						u64 data_blkno,
111 						u64 *bg_blkno,
112 						u16 *bg_bit_off);
113 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
114 					     u32 bits_wanted, u64 max_block,
115 					     int flags,
116 					     struct ocfs2_alloc_context **ac);
117 
118 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
119 {
120 	struct inode *inode = ac->ac_inode;
121 
122 	if (inode) {
123 		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
124 			ocfs2_inode_unlock(inode, 1);
125 
126 		mutex_unlock(&inode->i_mutex);
127 
128 		iput(inode);
129 		ac->ac_inode = NULL;
130 	}
131 	brelse(ac->ac_bh);
132 	ac->ac_bh = NULL;
133 }
134 
/*
 * Release the resources held by @ac and free the context itself.
 *
 * A NULL @ac is accepted and ignored, so error paths can call this
 * unconditionally.
 */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	if (!ac)
		return;

	ocfs2_free_ac_resource(ac);
	kfree(ac);
}
140 
141 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
142 {
143 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
144 }
145 
/*
 * Report a group descriptor problem.  Must expand in a scope that
 * provides `sb` and `resize`: when resize is non-zero the message is
 * only logged through mlog(); otherwise it is handed to ocfs2_error().
 */
#define do_error(fmt, ...)						\
	do {								\
		if (!resize)						\
			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
		else							\
			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
	} while (0)
153 
154 static int ocfs2_validate_gd_self(struct super_block *sb,
155 				  struct buffer_head *bh,
156 				  int resize)
157 {
158 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
159 
160 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
161 		do_error("Group descriptor #%llu has bad signature %.*s",
162 			 (unsigned long long)bh->b_blocknr, 7,
163 			 gd->bg_signature);
164 		return -EINVAL;
165 	}
166 
167 	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
168 		do_error("Group descriptor #%llu has an invalid bg_blkno "
169 			 "of %llu",
170 			 (unsigned long long)bh->b_blocknr,
171 			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
172 		return -EINVAL;
173 	}
174 
175 	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
176 		do_error("Group descriptor #%llu has an invalid "
177 			 "fs_generation of #%u",
178 			 (unsigned long long)bh->b_blocknr,
179 			 le32_to_cpu(gd->bg_generation));
180 		return -EINVAL;
181 	}
182 
183 	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
184 		do_error("Group descriptor #%llu has bit count %u but "
185 			 "claims that %u are free",
186 			 (unsigned long long)bh->b_blocknr,
187 			 le16_to_cpu(gd->bg_bits),
188 			 le16_to_cpu(gd->bg_free_bits_count));
189 		return -EINVAL;
190 	}
191 
192 	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
193 		do_error("Group descriptor #%llu has bit count %u but "
194 			 "max bitmap bits of %u",
195 			 (unsigned long long)bh->b_blocknr,
196 			 le16_to_cpu(gd->bg_bits),
197 			 8 * le16_to_cpu(gd->bg_size));
198 		return -EINVAL;
199 	}
200 
201 	return 0;
202 }
203 
204 static int ocfs2_validate_gd_parent(struct super_block *sb,
205 				    struct ocfs2_dinode *di,
206 				    struct buffer_head *bh,
207 				    int resize)
208 {
209 	unsigned int max_bits;
210 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
211 
212 	if (di->i_blkno != gd->bg_parent_dinode) {
213 		do_error("Group descriptor #%llu has bad parent "
214 			 "pointer (%llu, expected %llu)",
215 			 (unsigned long long)bh->b_blocknr,
216 			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
217 			 (unsigned long long)le64_to_cpu(di->i_blkno));
218 		return -EINVAL;
219 	}
220 
221 	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
222 	if (le16_to_cpu(gd->bg_bits) > max_bits) {
223 		do_error("Group descriptor #%llu has bit count of %u",
224 			 (unsigned long long)bh->b_blocknr,
225 			 le16_to_cpu(gd->bg_bits));
226 		return -EINVAL;
227 	}
228 
229 	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
230 	if ((le16_to_cpu(gd->bg_chain) >
231 	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
232 	    ((le16_to_cpu(gd->bg_chain) ==
233 	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
234 		do_error("Group descriptor #%llu has bad chain %u",
235 			 (unsigned long long)bh->b_blocknr,
236 			 le16_to_cpu(gd->bg_chain));
237 		return -EINVAL;
238 	}
239 
240 	return 0;
241 }
242 
243 #undef do_error
244 
245 /*
246  * This version only prints errors.  It does not fail the filesystem, and
247  * exists only for resize.
248  */
249 int ocfs2_check_group_descriptor(struct super_block *sb,
250 				 struct ocfs2_dinode *di,
251 				 struct buffer_head *bh)
252 {
253 	int rc;
254 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
255 
256 	BUG_ON(!buffer_uptodate(bh));
257 
258 	/*
259 	 * If the ecc fails, we return the error but otherwise
260 	 * leave the filesystem running.  We know any error is
261 	 * local to this block.
262 	 */
263 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
264 	if (rc) {
265 		mlog(ML_ERROR,
266 		     "Checksum failed for group descriptor %llu\n",
267 		     (unsigned long long)bh->b_blocknr);
268 	} else
269 		rc = ocfs2_validate_gd_self(sb, bh, 1);
270 	if (!rc)
271 		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
272 
273 	return rc;
274 }
275 
276 static int ocfs2_validate_group_descriptor(struct super_block *sb,
277 					   struct buffer_head *bh)
278 {
279 	int rc;
280 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
281 
282 	mlog(0, "Validating group descriptor %llu\n",
283 	     (unsigned long long)bh->b_blocknr);
284 
285 	BUG_ON(!buffer_uptodate(bh));
286 
287 	/*
288 	 * If the ecc fails, we return the error but otherwise
289 	 * leave the filesystem running.  We know any error is
290 	 * local to this block.
291 	 */
292 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
293 	if (rc)
294 		return rc;
295 
296 	/*
297 	 * Errors after here are fatal.
298 	 */
299 
300 	return ocfs2_validate_gd_self(sb, bh, 0);
301 }
302 
303 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
304 				u64 gd_blkno, struct buffer_head **bh)
305 {
306 	int rc;
307 	struct buffer_head *tmp = *bh;
308 
309 	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
310 			      ocfs2_validate_group_descriptor);
311 	if (rc)
312 		goto out;
313 
314 	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
315 	if (rc) {
316 		brelse(tmp);
317 		goto out;
318 	}
319 
320 	/* If ocfs2_read_block() got us a new bh, pass it up. */
321 	if (!*bh)
322 		*bh = tmp;
323 
324 out:
325 	return rc;
326 }
327 
328 static int ocfs2_block_group_fill(handle_t *handle,
329 				  struct inode *alloc_inode,
330 				  struct buffer_head *bg_bh,
331 				  u64 group_blkno,
332 				  u16 my_chain,
333 				  struct ocfs2_chain_list *cl)
334 {
335 	int status = 0;
336 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
337 	struct super_block * sb = alloc_inode->i_sb;
338 
339 	mlog_entry_void();
340 
341 	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
342 		ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
343 			    "b_blocknr (%llu)",
344 			    (unsigned long long)group_blkno,
345 			    (unsigned long long) bg_bh->b_blocknr);
346 		status = -EIO;
347 		goto bail;
348 	}
349 
350 	status = ocfs2_journal_access_gd(handle,
351 					 INODE_CACHE(alloc_inode),
352 					 bg_bh,
353 					 OCFS2_JOURNAL_ACCESS_CREATE);
354 	if (status < 0) {
355 		mlog_errno(status);
356 		goto bail;
357 	}
358 
359 	memset(bg, 0, sb->s_blocksize);
360 	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
361 	bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
362 	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
363 	bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
364 	bg->bg_chain = cpu_to_le16(my_chain);
365 	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
366 	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
367 	bg->bg_blkno = cpu_to_le64(group_blkno);
368 	/* set the 1st bit in the bitmap to account for the descriptor block */
369 	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
370 	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
371 
372 	status = ocfs2_journal_dirty(handle, bg_bh);
373 	if (status < 0)
374 		mlog_errno(status);
375 
376 	/* There is no need to zero out or otherwise initialize the
377 	 * other blocks in a group - All valid FS metadata in a block
378 	 * group stores the superblock fs_generation value at
379 	 * allocation time. */
380 
381 bail:
382 	mlog_exit(status);
383 	return status;
384 }
385 
386 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
387 {
388 	u16 curr, best;
389 
390 	best = curr = 0;
391 	while (curr < le16_to_cpu(cl->cl_count)) {
392 		if (le32_to_cpu(cl->cl_recs[best].c_total) >
393 		    le32_to_cpu(cl->cl_recs[curr].c_total))
394 			best = curr;
395 		curr++;
396 	}
397 	return best;
398 }
399 
400 /*
401  * We expect the block group allocator to already be locked.
402  */
403 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
404 				   struct inode *alloc_inode,
405 				   struct buffer_head *bh,
406 				   u64 max_block,
407 				   u64 *last_alloc_group,
408 				   int flags)
409 {
410 	int status, credits;
411 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
412 	struct ocfs2_chain_list *cl;
413 	struct ocfs2_alloc_context *ac = NULL;
414 	handle_t *handle = NULL;
415 	u32 bit_off, num_bits;
416 	u16 alloc_rec;
417 	u64 bg_blkno;
418 	struct buffer_head *bg_bh = NULL;
419 	struct ocfs2_group_desc *bg;
420 
421 	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
422 
423 	mlog_entry_void();
424 
425 	cl = &fe->id2.i_chain;
426 	status = ocfs2_reserve_clusters_with_limit(osb,
427 						   le16_to_cpu(cl->cl_cpg),
428 						   max_block, flags, &ac);
429 	if (status < 0) {
430 		if (status != -ENOSPC)
431 			mlog_errno(status);
432 		goto bail;
433 	}
434 
435 	credits = ocfs2_calc_group_alloc_credits(osb->sb,
436 						 le16_to_cpu(cl->cl_cpg));
437 	handle = ocfs2_start_trans(osb, credits);
438 	if (IS_ERR(handle)) {
439 		status = PTR_ERR(handle);
440 		handle = NULL;
441 		mlog_errno(status);
442 		goto bail;
443 	}
444 
445 	if (last_alloc_group && *last_alloc_group != 0) {
446 		mlog(0, "use old allocation group %llu for block group alloc\n",
447 		     (unsigned long long)*last_alloc_group);
448 		ac->ac_last_group = *last_alloc_group;
449 	}
450 	status = ocfs2_claim_clusters(osb,
451 				      handle,
452 				      ac,
453 				      le16_to_cpu(cl->cl_cpg),
454 				      &bit_off,
455 				      &num_bits);
456 	if (status < 0) {
457 		if (status != -ENOSPC)
458 			mlog_errno(status);
459 		goto bail;
460 	}
461 
462 	alloc_rec = ocfs2_find_smallest_chain(cl);
463 
464 	/* setup the group */
465 	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
466 	mlog(0, "new descriptor, record %u, at block %llu\n",
467 	     alloc_rec, (unsigned long long)bg_blkno);
468 
469 	bg_bh = sb_getblk(osb->sb, bg_blkno);
470 	if (!bg_bh) {
471 		status = -EIO;
472 		mlog_errno(status);
473 		goto bail;
474 	}
475 	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
476 
477 	status = ocfs2_block_group_fill(handle,
478 					alloc_inode,
479 					bg_bh,
480 					bg_blkno,
481 					alloc_rec,
482 					cl);
483 	if (status < 0) {
484 		mlog_errno(status);
485 		goto bail;
486 	}
487 
488 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
489 
490 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
491 					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
492 	if (status < 0) {
493 		mlog_errno(status);
494 		goto bail;
495 	}
496 
497 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
498 		     le16_to_cpu(bg->bg_free_bits_count));
499 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
500 	cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
501 	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
502 		le16_add_cpu(&cl->cl_next_free_rec, 1);
503 
504 	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
505 					le16_to_cpu(bg->bg_free_bits_count));
506 	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
507 	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
508 
509 	status = ocfs2_journal_dirty(handle, bh);
510 	if (status < 0) {
511 		mlog_errno(status);
512 		goto bail;
513 	}
514 
515 	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
516 	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
517 	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
518 					     le32_to_cpu(fe->i_clusters)));
519 	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
520 	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
521 	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
522 
523 	status = 0;
524 
525 	/* save the new last alloc group so that the caller can cache it. */
526 	if (last_alloc_group)
527 		*last_alloc_group = ac->ac_last_group;
528 
529 bail:
530 	if (handle)
531 		ocfs2_commit_trans(osb, handle);
532 
533 	if (ac)
534 		ocfs2_free_alloc_context(ac);
535 
536 	brelse(bg_bh);
537 
538 	mlog_exit(status);
539 	return status;
540 }
541 
542 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
543 				       struct ocfs2_alloc_context *ac,
544 				       int type,
545 				       u32 slot,
546 				       u64 *last_alloc_group,
547 				       int flags)
548 {
549 	int status;
550 	u32 bits_wanted = ac->ac_bits_wanted;
551 	struct inode *alloc_inode;
552 	struct buffer_head *bh = NULL;
553 	struct ocfs2_dinode *fe;
554 	u32 free_bits;
555 
556 	mlog_entry_void();
557 
558 	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
559 	if (!alloc_inode) {
560 		mlog_errno(-EINVAL);
561 		return -EINVAL;
562 	}
563 
564 	mutex_lock(&alloc_inode->i_mutex);
565 
566 	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
567 	if (status < 0) {
568 		mutex_unlock(&alloc_inode->i_mutex);
569 		iput(alloc_inode);
570 
571 		mlog_errno(status);
572 		return status;
573 	}
574 
575 	ac->ac_inode = alloc_inode;
576 	ac->ac_alloc_slot = slot;
577 
578 	fe = (struct ocfs2_dinode *) bh->b_data;
579 
580 	/* The bh was validated by the inode read inside
581 	 * ocfs2_inode_lock().  Any corruption is a code bug. */
582 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
583 
584 	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
585 		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
586 			    (unsigned long long)le64_to_cpu(fe->i_blkno));
587 		status = -EIO;
588 		goto bail;
589 	}
590 
591 	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
592 		le32_to_cpu(fe->id1.bitmap1.i_used);
593 
594 	if (bits_wanted > free_bits) {
595 		/* cluster bitmap never grows */
596 		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
597 			mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
598 			     bits_wanted, free_bits);
599 			status = -ENOSPC;
600 			goto bail;
601 		}
602 
603 		if (!(flags & ALLOC_NEW_GROUP)) {
604 			mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
605 			     "and we don't alloc a new group for it.\n",
606 			     slot, bits_wanted, free_bits);
607 			status = -ENOSPC;
608 			goto bail;
609 		}
610 
611 		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
612 						 ac->ac_max_block,
613 						 last_alloc_group, flags);
614 		if (status < 0) {
615 			if (status != -ENOSPC)
616 				mlog_errno(status);
617 			goto bail;
618 		}
619 		atomic_inc(&osb->alloc_stats.bg_extends);
620 
621 		/* You should never ask for this much metadata */
622 		BUG_ON(bits_wanted >
623 		       (le32_to_cpu(fe->id1.bitmap1.i_total)
624 			- le32_to_cpu(fe->id1.bitmap1.i_used)));
625 	}
626 
627 	get_bh(bh);
628 	ac->ac_bh = bh;
629 bail:
630 	brelse(bh);
631 
632 	mlog_exit(status);
633 	return status;
634 }
635 
636 static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
637 {
638 	spin_lock(&osb->osb_lock);
639 	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
640 	spin_unlock(&osb->osb_lock);
641 	atomic_set(&osb->s_num_inodes_stolen, 0);
642 }
643 
644 static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
645 {
646 	spin_lock(&osb->osb_lock);
647 	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
648 	spin_unlock(&osb->osb_lock);
649 	atomic_set(&osb->s_num_meta_stolen, 0);
650 }
651 
/* Reset the steal state for both the inode and the metadata allocators. */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}
657 
658 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
659 {
660 	spin_lock(&osb->osb_lock);
661 	if (type == INODE_ALLOC_SYSTEM_INODE)
662 		osb->s_inode_steal_slot = slot;
663 	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
664 		osb->s_meta_steal_slot = slot;
665 	spin_unlock(&osb->osb_lock);
666 }
667 
668 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
669 {
670 	int slot = OCFS2_INVALID_SLOT;
671 
672 	spin_lock(&osb->osb_lock);
673 	if (type == INODE_ALLOC_SYSTEM_INODE)
674 		slot = osb->s_inode_steal_slot;
675 	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
676 		slot = osb->s_meta_steal_slot;
677 	spin_unlock(&osb->osb_lock);
678 
679 	return slot;
680 }
681 
682 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
683 {
684 	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
685 }
686 
687 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
688 {
689 	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
690 }
691 
692 static int ocfs2_steal_resource(struct ocfs2_super *osb,
693 				struct ocfs2_alloc_context *ac,
694 				int type)
695 {
696 	int i, status = -ENOSPC;
697 	int slot = __ocfs2_get_steal_slot(osb, type);
698 
699 	/* Start to steal resource from the first slot after ours. */
700 	if (slot == OCFS2_INVALID_SLOT)
701 		slot = osb->slot_num + 1;
702 
703 	for (i = 0; i < osb->max_slots; i++, slot++) {
704 		if (slot == osb->max_slots)
705 			slot = 0;
706 
707 		if (slot == osb->slot_num)
708 			continue;
709 
710 		status = ocfs2_reserve_suballoc_bits(osb, ac,
711 						     type,
712 						     (u32)slot, NULL,
713 						     NOT_ALLOC_NEW_GROUP);
714 		if (status >= 0) {
715 			__ocfs2_set_steal_slot(osb, slot, type);
716 			break;
717 		}
718 
719 		ocfs2_free_ac_resource(ac);
720 	}
721 
722 	return status;
723 }
724 
725 static int ocfs2_steal_inode(struct ocfs2_super *osb,
726 			     struct ocfs2_alloc_context *ac)
727 {
728 	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
729 }
730 
731 static int ocfs2_steal_meta(struct ocfs2_super *osb,
732 			    struct ocfs2_alloc_context *ac)
733 {
734 	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
735 }
736 
737 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
738 				      int blocks,
739 				      struct ocfs2_alloc_context **ac)
740 {
741 	int status;
742 	int slot = ocfs2_get_meta_steal_slot(osb);
743 
744 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
745 	if (!(*ac)) {
746 		status = -ENOMEM;
747 		mlog_errno(status);
748 		goto bail;
749 	}
750 
751 	(*ac)->ac_bits_wanted = blocks;
752 	(*ac)->ac_which = OCFS2_AC_USE_META;
753 	(*ac)->ac_group_search = ocfs2_block_group_search;
754 
755 	if (slot != OCFS2_INVALID_SLOT &&
756 		atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
757 		goto extent_steal;
758 
759 	atomic_set(&osb->s_num_meta_stolen, 0);
760 	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
761 					     EXTENT_ALLOC_SYSTEM_INODE,
762 					     (u32)osb->slot_num, NULL,
763 					     ALLOC_NEW_GROUP);
764 
765 
766 	if (status >= 0) {
767 		status = 0;
768 		if (slot != OCFS2_INVALID_SLOT)
769 			ocfs2_init_meta_steal_slot(osb);
770 		goto bail;
771 	} else if (status < 0 && status != -ENOSPC) {
772 		mlog_errno(status);
773 		goto bail;
774 	}
775 
776 	ocfs2_free_ac_resource(*ac);
777 
778 extent_steal:
779 	status = ocfs2_steal_meta(osb, *ac);
780 	atomic_inc(&osb->s_num_meta_stolen);
781 	if (status < 0) {
782 		if (status != -ENOSPC)
783 			mlog_errno(status);
784 		goto bail;
785 	}
786 
787 	status = 0;
788 bail:
789 	if ((status < 0) && *ac) {
790 		ocfs2_free_alloc_context(*ac);
791 		*ac = NULL;
792 	}
793 
794 	mlog_exit(status);
795 	return status;
796 }
797 
/*
 * Reserve as many metadata blocks as ocfs2_extend_meta_needed()
 * reports for the extent list @root_el.  See
 * ocfs2_reserve_new_metadata_blocks() for the return convention.
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	int blocks = ocfs2_extend_meta_needed(root_el);

	return ocfs2_reserve_new_metadata_blocks(osb, blocks, ac);
}
806 
807 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
808 			    struct ocfs2_alloc_context **ac)
809 {
810 	int status;
811 	int slot = ocfs2_get_inode_steal_slot(osb);
812 	u64 alloc_group;
813 
814 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
815 	if (!(*ac)) {
816 		status = -ENOMEM;
817 		mlog_errno(status);
818 		goto bail;
819 	}
820 
821 	(*ac)->ac_bits_wanted = 1;
822 	(*ac)->ac_which = OCFS2_AC_USE_INODE;
823 
824 	(*ac)->ac_group_search = ocfs2_block_group_search;
825 
826 	/*
827 	 * stat(2) can't handle i_ino > 32bits, so we tell the
828 	 * lower levels not to allocate us a block group past that
829 	 * limit.  The 'inode64' mount option avoids this behavior.
830 	 */
831 	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
832 		(*ac)->ac_max_block = (u32)~0U;
833 
834 	/*
835 	 * slot is set when we successfully steal inode from other nodes.
836 	 * It is reset in 3 places:
837 	 * 1. when we flush the truncate log
838 	 * 2. when we complete local alloc recovery.
839 	 * 3. when we successfully allocate from our own slot.
840 	 * After it is set, we will go on stealing inodes until we find the
841 	 * need to check our slots to see whether there is some space for us.
842 	 */
843 	if (slot != OCFS2_INVALID_SLOT &&
844 	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
845 		goto inode_steal;
846 
847 	atomic_set(&osb->s_num_inodes_stolen, 0);
848 	alloc_group = osb->osb_inode_alloc_group;
849 	status = ocfs2_reserve_suballoc_bits(osb, *ac,
850 					     INODE_ALLOC_SYSTEM_INODE,
851 					     (u32)osb->slot_num,
852 					     &alloc_group,
853 					     ALLOC_NEW_GROUP |
854 					     ALLOC_GROUPS_FROM_GLOBAL);
855 	if (status >= 0) {
856 		status = 0;
857 
858 		spin_lock(&osb->osb_lock);
859 		osb->osb_inode_alloc_group = alloc_group;
860 		spin_unlock(&osb->osb_lock);
861 		mlog(0, "after reservation, new allocation group is "
862 		     "%llu\n", (unsigned long long)alloc_group);
863 
864 		/*
865 		 * Some inodes must be freed by us, so try to allocate
866 		 * from our own next time.
867 		 */
868 		if (slot != OCFS2_INVALID_SLOT)
869 			ocfs2_init_inode_steal_slot(osb);
870 		goto bail;
871 	} else if (status < 0 && status != -ENOSPC) {
872 		mlog_errno(status);
873 		goto bail;
874 	}
875 
876 	ocfs2_free_ac_resource(*ac);
877 
878 inode_steal:
879 	status = ocfs2_steal_inode(osb, *ac);
880 	atomic_inc(&osb->s_num_inodes_stolen);
881 	if (status < 0) {
882 		if (status != -ENOSPC)
883 			mlog_errno(status);
884 		goto bail;
885 	}
886 
887 	status = 0;
888 bail:
889 	if ((status < 0) && *ac) {
890 		ocfs2_free_alloc_context(*ac);
891 		*ac = NULL;
892 	}
893 
894 	mlog_exit(status);
895 	return status;
896 }
897 
898 /* local alloc code has to do the same thing, so rather than do this
899  * twice.. */
900 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
901 				      struct ocfs2_alloc_context *ac)
902 {
903 	int status;
904 
905 	ac->ac_which = OCFS2_AC_USE_MAIN;
906 	ac->ac_group_search = ocfs2_cluster_group_search;
907 
908 	status = ocfs2_reserve_suballoc_bits(osb, ac,
909 					     GLOBAL_BITMAP_SYSTEM_INODE,
910 					     OCFS2_INVALID_SLOT, NULL,
911 					     ALLOC_NEW_GROUP);
912 	if (status < 0 && status != -ENOSPC) {
913 		mlog_errno(status);
914 		goto bail;
915 	}
916 
917 bail:
918 	return status;
919 }
920 
921 /* Callers don't need to care which bitmap (local alloc or main) to
922  * use so we figure it out for them, but unfortunately this clutters
923  * things a bit. */
924 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
925 					     u32 bits_wanted, u64 max_block,
926 					     int flags,
927 					     struct ocfs2_alloc_context **ac)
928 {
929 	int status;
930 
931 	mlog_entry_void();
932 
933 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
934 	if (!(*ac)) {
935 		status = -ENOMEM;
936 		mlog_errno(status);
937 		goto bail;
938 	}
939 
940 	(*ac)->ac_bits_wanted = bits_wanted;
941 	(*ac)->ac_max_block = max_block;
942 
943 	status = -ENOSPC;
944 	if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
945 	    ocfs2_alloc_should_use_local(osb, bits_wanted)) {
946 		status = ocfs2_reserve_local_alloc_bits(osb,
947 							bits_wanted,
948 							*ac);
949 		if (status == -EFBIG) {
950 			/* The local alloc window is outside ac_max_block.
951 			 * use the main bitmap. */
952 			status = -ENOSPC;
953 		} else if ((status < 0) && (status != -ENOSPC)) {
954 			mlog_errno(status);
955 			goto bail;
956 		}
957 	}
958 
959 	if (status == -ENOSPC) {
960 		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
961 		if (status < 0) {
962 			if (status != -ENOSPC)
963 				mlog_errno(status);
964 			goto bail;
965 		}
966 	}
967 
968 	status = 0;
969 bail:
970 	if ((status < 0) && *ac) {
971 		ocfs2_free_alloc_context(*ac);
972 		*ac = NULL;
973 	}
974 
975 	mlog_exit(status);
976 	return status;
977 }
978 
979 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
980 			   u32 bits_wanted,
981 			   struct ocfs2_alloc_context **ac)
982 {
983 	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
984 						 ALLOC_NEW_GROUP, ac);
985 }
986 
987 /*
988  * More or less lifted from ext3. I'll leave their description below:
989  *
990  * "For ext3 allocations, we must not reuse any blocks which are
991  * allocated in the bitmap buffer's "last committed data" copy.  This
992  * prevents deletes from freeing up the page for reuse until we have
993  * committed the delete transaction.
994  *
995  * If we didn't do this, then deleting something and reallocating it as
996  * data would allow the old block to be overwritten before the
997  * transaction committed (because we force data to disk before commit).
998  * This would lead to corruption if we crashed between overwriting the
999  * data and committing the delete.
1000  *
1001  * @@@ We may want to make this allocation behaviour conditional on
1002  * data-writes at some point, and disable it for metadata allocations or
1003  * sync-data inodes."
1004  *
1005  * Note: OCFS2 already does this differently for metadata vs data
1006  * allocations, as those bitmaps are separate and undo access is never
1007  * called on a metadata group descriptor.
1008  */
1009 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1010 					 int nr)
1011 {
1012 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1013 	int ret;
1014 
1015 	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1016 		return 0;
1017 
1018 	if (!buffer_jbd(bg_bh))
1019 		return 1;
1020 
1021 	jbd_lock_bh_state(bg_bh);
1022 	bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1023 	if (bg)
1024 		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1025 	else
1026 		ret = 1;
1027 	jbd_unlock_bh_state(bg_bh);
1028 
1029 	return ret;
1030 }
1031 
1032 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1033 					     struct buffer_head *bg_bh,
1034 					     unsigned int bits_wanted,
1035 					     unsigned int total_bits,
1036 					     u16 *bit_off,
1037 					     u16 *bits_found)
1038 {
1039 	void *bitmap;
1040 	u16 best_offset, best_size;
1041 	int offset, start, found, status = 0;
1042 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1043 
1044 	/* Callers got this descriptor from
1045 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1046 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1047 
1048 	found = start = best_offset = best_size = 0;
1049 	bitmap = bg->bg_bitmap;
1050 
1051 	while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1052 		if (offset == total_bits)
1053 			break;
1054 
1055 		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1056 			/* We found a zero, but we can't use it as it
1057 			 * hasn't been put to disk yet! */
1058 			found = 0;
1059 			start = offset + 1;
1060 		} else if (offset == start) {
1061 			/* we found a zero */
1062 			found++;
1063 			/* move start to the next bit to test */
1064 			start++;
1065 		} else {
1066 			/* got a zero after some ones */
1067 			found = 1;
1068 			start = offset + 1;
1069 		}
1070 		if (found > best_size) {
1071 			best_size = found;
1072 			best_offset = start - found;
1073 		}
1074 		/* we got everything we needed */
1075 		if (found == bits_wanted) {
1076 			/* mlog(0, "Found it all!\n"); */
1077 			break;
1078 		}
1079 	}
1080 
1081 	/* XXX: I think the first clause is equivalent to the second
1082 	 * 	- jlbec */
1083 	if (found == bits_wanted) {
1084 		*bit_off = start - found;
1085 		*bits_found = found;
1086 	} else if (best_size) {
1087 		*bit_off = best_offset;
1088 		*bits_found = best_size;
1089 	} else {
1090 		status = -ENOSPC;
1091 		/* No error log here -- see the comment above
1092 		 * ocfs2_test_bg_bit_allocatable */
1093 	}
1094 
1095 	return status;
1096 }
1097 
1098 static inline int ocfs2_block_group_set_bits(handle_t *handle,
1099 					     struct inode *alloc_inode,
1100 					     struct ocfs2_group_desc *bg,
1101 					     struct buffer_head *group_bh,
1102 					     unsigned int bit_off,
1103 					     unsigned int num_bits)
1104 {
1105 	int status;
1106 	void *bitmap = bg->bg_bitmap;
1107 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1108 
1109 	mlog_entry_void();
1110 
1111 	/* All callers get the descriptor via
1112 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1113 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1114 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1115 
1116 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
1117 	     num_bits);
1118 
1119 	if (ocfs2_is_cluster_bitmap(alloc_inode))
1120 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1121 
1122 	status = ocfs2_journal_access_gd(handle,
1123 					 INODE_CACHE(alloc_inode),
1124 					 group_bh,
1125 					 journal_type);
1126 	if (status < 0) {
1127 		mlog_errno(status);
1128 		goto bail;
1129 	}
1130 
1131 	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1132 
1133 	while(num_bits--)
1134 		ocfs2_set_bit(bit_off++, bitmap);
1135 
1136 	status = ocfs2_journal_dirty(handle,
1137 				     group_bh);
1138 	if (status < 0) {
1139 		mlog_errno(status);
1140 		goto bail;
1141 	}
1142 
1143 bail:
1144 	mlog_exit(status);
1145 	return status;
1146 }
1147 
1148 /* find the one with the most empty bits */
1149 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1150 {
1151 	u16 curr, best;
1152 
1153 	BUG_ON(!cl->cl_next_free_rec);
1154 
1155 	best = curr = 0;
1156 	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1157 		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1158 		    le32_to_cpu(cl->cl_recs[best].c_free))
1159 			best = curr;
1160 		curr++;
1161 	}
1162 
1163 	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1164 	return best;
1165 }
1166 
1167 static int ocfs2_relink_block_group(handle_t *handle,
1168 				    struct inode *alloc_inode,
1169 				    struct buffer_head *fe_bh,
1170 				    struct buffer_head *bg_bh,
1171 				    struct buffer_head *prev_bg_bh,
1172 				    u16 chain)
1173 {
1174 	int status;
1175 	/* there is a really tiny chance the journal calls could fail,
1176 	 * but we wouldn't want inconsistent blocks in *any* case. */
1177 	u64 fe_ptr, bg_ptr, prev_bg_ptr;
1178 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1179 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1180 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1181 
1182 	/* The caller got these descriptors from
1183 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1184 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1185 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1186 
1187 	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1188 	     (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1189 	     (unsigned long long)le64_to_cpu(bg->bg_blkno),
1190 	     (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1191 
1192 	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1193 	bg_ptr = le64_to_cpu(bg->bg_next_group);
1194 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1195 
1196 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1197 					 prev_bg_bh,
1198 					 OCFS2_JOURNAL_ACCESS_WRITE);
1199 	if (status < 0) {
1200 		mlog_errno(status);
1201 		goto out_rollback;
1202 	}
1203 
1204 	prev_bg->bg_next_group = bg->bg_next_group;
1205 
1206 	status = ocfs2_journal_dirty(handle, prev_bg_bh);
1207 	if (status < 0) {
1208 		mlog_errno(status);
1209 		goto out_rollback;
1210 	}
1211 
1212 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1213 					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1214 	if (status < 0) {
1215 		mlog_errno(status);
1216 		goto out_rollback;
1217 	}
1218 
1219 	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1220 
1221 	status = ocfs2_journal_dirty(handle, bg_bh);
1222 	if (status < 0) {
1223 		mlog_errno(status);
1224 		goto out_rollback;
1225 	}
1226 
1227 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1228 					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1229 	if (status < 0) {
1230 		mlog_errno(status);
1231 		goto out_rollback;
1232 	}
1233 
1234 	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1235 
1236 	status = ocfs2_journal_dirty(handle, fe_bh);
1237 	if (status < 0) {
1238 		mlog_errno(status);
1239 		goto out_rollback;
1240 	}
1241 
1242 	status = 0;
1243 out_rollback:
1244 	if (status < 0) {
1245 		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1246 		bg->bg_next_group = cpu_to_le64(bg_ptr);
1247 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1248 	}
1249 
1250 	mlog_exit(status);
1251 	return status;
1252 }
1253 
1254 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1255 						     u32 wanted)
1256 {
1257 	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1258 }
1259 
1260 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1261  * value on error. */
1262 static int ocfs2_cluster_group_search(struct inode *inode,
1263 				      struct buffer_head *group_bh,
1264 				      u32 bits_wanted, u32 min_bits,
1265 				      u64 max_block,
1266 				      u16 *bit_off, u16 *bits_found)
1267 {
1268 	int search = -ENOSPC;
1269 	int ret;
1270 	u64 blkoff;
1271 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1272 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1273 	u16 tmp_off, tmp_found;
1274 	unsigned int max_bits, gd_cluster_off;
1275 
1276 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1277 
1278 	if (gd->bg_free_bits_count) {
1279 		max_bits = le16_to_cpu(gd->bg_bits);
1280 
1281 		/* Tail groups in cluster bitmaps which aren't cpg
1282 		 * aligned are prone to partial extention by a failed
1283 		 * fs resize. If the file system resize never got to
1284 		 * update the dinode cluster count, then we don't want
1285 		 * to trust any clusters past it, regardless of what
1286 		 * the group descriptor says. */
1287 		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1288 							  le64_to_cpu(gd->bg_blkno));
1289 		if ((gd_cluster_off + max_bits) >
1290 		    OCFS2_I(inode)->ip_clusters) {
1291 			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1292 			mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1293 			     (unsigned long long)le64_to_cpu(gd->bg_blkno),
1294 			     le16_to_cpu(gd->bg_bits),
1295 			     OCFS2_I(inode)->ip_clusters, max_bits);
1296 		}
1297 
1298 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1299 							group_bh, bits_wanted,
1300 							max_bits,
1301 							&tmp_off, &tmp_found);
1302 		if (ret)
1303 			return ret;
1304 
1305 		if (max_block) {
1306 			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1307 							  gd_cluster_off +
1308 							  tmp_off + tmp_found);
1309 			mlog(0, "Checking %llu against %llu\n",
1310 			     (unsigned long long)blkoff,
1311 			     (unsigned long long)max_block);
1312 			if (blkoff > max_block)
1313 				return -ENOSPC;
1314 		}
1315 
1316 		/* ocfs2_block_group_find_clear_bits() might
1317 		 * return success, but we still want to return
1318 		 * -ENOSPC unless it found the minimum number
1319 		 * of bits. */
1320 		if (min_bits <= tmp_found) {
1321 			*bit_off = tmp_off;
1322 			*bits_found = tmp_found;
1323 			search = 0; /* success */
1324 		} else if (tmp_found) {
1325 			/*
1326 			 * Don't show bits which we'll be returning
1327 			 * for allocation to the local alloc bitmap.
1328 			 */
1329 			ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1330 		}
1331 	}
1332 
1333 	return search;
1334 }
1335 
1336 static int ocfs2_block_group_search(struct inode *inode,
1337 				    struct buffer_head *group_bh,
1338 				    u32 bits_wanted, u32 min_bits,
1339 				    u64 max_block,
1340 				    u16 *bit_off, u16 *bits_found)
1341 {
1342 	int ret = -ENOSPC;
1343 	u64 blkoff;
1344 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1345 
1346 	BUG_ON(min_bits != 1);
1347 	BUG_ON(ocfs2_is_cluster_bitmap(inode));
1348 
1349 	if (bg->bg_free_bits_count) {
1350 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1351 							group_bh, bits_wanted,
1352 							le16_to_cpu(bg->bg_bits),
1353 							bit_off, bits_found);
1354 		if (!ret && max_block) {
1355 			blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1356 				*bits_found;
1357 			mlog(0, "Checking %llu against %llu\n",
1358 			     (unsigned long long)blkoff,
1359 			     (unsigned long long)max_block);
1360 			if (blkoff > max_block)
1361 				ret = -ENOSPC;
1362 		}
1363 	}
1364 
1365 	return ret;
1366 }
1367 
1368 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1369 				       handle_t *handle,
1370 				       struct buffer_head *di_bh,
1371 				       u32 num_bits,
1372 				       u16 chain)
1373 {
1374 	int ret;
1375 	u32 tmp_used;
1376 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1377 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1378 
1379 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1380 				      OCFS2_JOURNAL_ACCESS_WRITE);
1381 	if (ret < 0) {
1382 		mlog_errno(ret);
1383 		goto out;
1384 	}
1385 
1386 	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1387 	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1388 	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1389 
1390 	ret = ocfs2_journal_dirty(handle, di_bh);
1391 	if (ret < 0)
1392 		mlog_errno(ret);
1393 
1394 out:
1395 	return ret;
1396 }
1397 
1398 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1399 				  handle_t *handle,
1400 				  u32 bits_wanted,
1401 				  u32 min_bits,
1402 				  u16 *bit_off,
1403 				  unsigned int *num_bits,
1404 				  u64 gd_blkno,
1405 				  u16 *bits_left)
1406 {
1407 	int ret;
1408 	u16 found;
1409 	struct buffer_head *group_bh = NULL;
1410 	struct ocfs2_group_desc *gd;
1411 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1412 	struct inode *alloc_inode = ac->ac_inode;
1413 
1414 	ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
1415 					  &group_bh);
1416 	if (ret < 0) {
1417 		mlog_errno(ret);
1418 		return ret;
1419 	}
1420 
1421 	gd = (struct ocfs2_group_desc *) group_bh->b_data;
1422 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1423 				  ac->ac_max_block, bit_off, &found);
1424 	if (ret < 0) {
1425 		if (ret != -ENOSPC)
1426 			mlog_errno(ret);
1427 		goto out;
1428 	}
1429 
1430 	*num_bits = found;
1431 
1432 	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1433 					       *num_bits,
1434 					       le16_to_cpu(gd->bg_chain));
1435 	if (ret < 0) {
1436 		mlog_errno(ret);
1437 		goto out;
1438 	}
1439 
1440 	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1441 					 *bit_off, *num_bits);
1442 	if (ret < 0)
1443 		mlog_errno(ret);
1444 
1445 	*bits_left = le16_to_cpu(gd->bg_free_bits_count);
1446 
1447 out:
1448 	brelse(group_bh);
1449 
1450 	return ret;
1451 }
1452 
1453 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1454 			      handle_t *handle,
1455 			      u32 bits_wanted,
1456 			      u32 min_bits,
1457 			      u16 *bit_off,
1458 			      unsigned int *num_bits,
1459 			      u64 *bg_blkno,
1460 			      u16 *bits_left)
1461 {
1462 	int status;
1463 	u16 chain, tmp_bits;
1464 	u32 tmp_used;
1465 	u64 next_group;
1466 	struct inode *alloc_inode = ac->ac_inode;
1467 	struct buffer_head *group_bh = NULL;
1468 	struct buffer_head *prev_group_bh = NULL;
1469 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1470 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1471 	struct ocfs2_group_desc *bg;
1472 
1473 	chain = ac->ac_chain;
1474 	mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1475 	     bits_wanted, chain,
1476 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1477 
1478 	status = ocfs2_read_group_descriptor(alloc_inode, fe,
1479 					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
1480 					     &group_bh);
1481 	if (status < 0) {
1482 		mlog_errno(status);
1483 		goto bail;
1484 	}
1485 	bg = (struct ocfs2_group_desc *) group_bh->b_data;
1486 
1487 	status = -ENOSPC;
1488 	/* for now, the chain search is a bit simplistic. We just use
1489 	 * the 1st group with any empty bits. */
1490 	while ((status = ac->ac_group_search(alloc_inode, group_bh,
1491 					     bits_wanted, min_bits,
1492 					     ac->ac_max_block, bit_off,
1493 					     &tmp_bits)) == -ENOSPC) {
1494 		if (!bg->bg_next_group)
1495 			break;
1496 
1497 		brelse(prev_group_bh);
1498 		prev_group_bh = NULL;
1499 
1500 		next_group = le64_to_cpu(bg->bg_next_group);
1501 		prev_group_bh = group_bh;
1502 		group_bh = NULL;
1503 		status = ocfs2_read_group_descriptor(alloc_inode, fe,
1504 						     next_group, &group_bh);
1505 		if (status < 0) {
1506 			mlog_errno(status);
1507 			goto bail;
1508 		}
1509 		bg = (struct ocfs2_group_desc *) group_bh->b_data;
1510 	}
1511 	if (status < 0) {
1512 		if (status != -ENOSPC)
1513 			mlog_errno(status);
1514 		goto bail;
1515 	}
1516 
1517 	mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1518 	     tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1519 
1520 	*num_bits = tmp_bits;
1521 
1522 	BUG_ON(*num_bits == 0);
1523 
1524 	/*
1525 	 * Keep track of previous block descriptor read. When
1526 	 * we find a target, if we have read more than X
1527 	 * number of descriptors, and the target is reasonably
1528 	 * empty, relink him to top of his chain.
1529 	 *
1530 	 * We've read 0 extra blocks and only send one more to
1531 	 * the transaction, yet the next guy to search has a
1532 	 * much easier time.
1533 	 *
1534 	 * Do this *after* figuring out how many bits we're taking out
1535 	 * of our target group.
1536 	 */
1537 	if (ac->ac_allow_chain_relink &&
1538 	    (prev_group_bh) &&
1539 	    (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1540 		status = ocfs2_relink_block_group(handle, alloc_inode,
1541 						  ac->ac_bh, group_bh,
1542 						  prev_group_bh, chain);
1543 		if (status < 0) {
1544 			mlog_errno(status);
1545 			goto bail;
1546 		}
1547 	}
1548 
1549 	/* Ok, claim our bits now: set the info on dinode, chainlist
1550 	 * and then the group */
1551 	status = ocfs2_journal_access_di(handle,
1552 					 INODE_CACHE(alloc_inode),
1553 					 ac->ac_bh,
1554 					 OCFS2_JOURNAL_ACCESS_WRITE);
1555 	if (status < 0) {
1556 		mlog_errno(status);
1557 		goto bail;
1558 	}
1559 
1560 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1561 	fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1562 	le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1563 
1564 	status = ocfs2_journal_dirty(handle,
1565 				     ac->ac_bh);
1566 	if (status < 0) {
1567 		mlog_errno(status);
1568 		goto bail;
1569 	}
1570 
1571 	status = ocfs2_block_group_set_bits(handle,
1572 					    alloc_inode,
1573 					    bg,
1574 					    group_bh,
1575 					    *bit_off,
1576 					    *num_bits);
1577 	if (status < 0) {
1578 		mlog_errno(status);
1579 		goto bail;
1580 	}
1581 
1582 	mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1583 	     (unsigned long long)le64_to_cpu(fe->i_blkno));
1584 
1585 	*bg_blkno = le64_to_cpu(bg->bg_blkno);
1586 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
1587 bail:
1588 	brelse(group_bh);
1589 	brelse(prev_group_bh);
1590 
1591 	mlog_exit(status);
1592 	return status;
1593 }
1594 
1595 /* will give out up to bits_wanted contiguous bits. */
1596 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1597 				     struct ocfs2_alloc_context *ac,
1598 				     handle_t *handle,
1599 				     u32 bits_wanted,
1600 				     u32 min_bits,
1601 				     u16 *bit_off,
1602 				     unsigned int *num_bits,
1603 				     u64 *bg_blkno)
1604 {
1605 	int status;
1606 	u16 victim, i;
1607 	u16 bits_left = 0;
1608 	u64 hint_blkno = ac->ac_last_group;
1609 	struct ocfs2_chain_list *cl;
1610 	struct ocfs2_dinode *fe;
1611 
1612 	mlog_entry_void();
1613 
1614 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1615 	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1616 	BUG_ON(!ac->ac_bh);
1617 
1618 	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1619 
1620 	/* The bh was validated by the inode read during
1621 	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1622 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1623 
1624 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1625 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
1626 		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1627 			    "bits but only %u total.",
1628 			    (unsigned long long)le64_to_cpu(fe->i_blkno),
1629 			    le32_to_cpu(fe->id1.bitmap1.i_used),
1630 			    le32_to_cpu(fe->id1.bitmap1.i_total));
1631 		status = -EIO;
1632 		goto bail;
1633 	}
1634 
1635 	if (hint_blkno) {
1636 		/* Attempt to short-circuit the usual search mechanism
1637 		 * by jumping straight to the most recently used
1638 		 * allocation group. This helps us mantain some
1639 		 * contiguousness across allocations. */
1640 		status = ocfs2_search_one_group(ac, handle, bits_wanted,
1641 						min_bits, bit_off, num_bits,
1642 						hint_blkno, &bits_left);
1643 		if (!status) {
1644 			/* Be careful to update *bg_blkno here as the
1645 			 * caller is expecting it to be filled in, and
1646 			 * ocfs2_search_one_group() won't do that for
1647 			 * us. */
1648 			*bg_blkno = hint_blkno;
1649 			goto set_hint;
1650 		}
1651 		if (status < 0 && status != -ENOSPC) {
1652 			mlog_errno(status);
1653 			goto bail;
1654 		}
1655 	}
1656 
1657 	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1658 
1659 	victim = ocfs2_find_victim_chain(cl);
1660 	ac->ac_chain = victim;
1661 	ac->ac_allow_chain_relink = 1;
1662 
1663 	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1664 				    num_bits, bg_blkno, &bits_left);
1665 	if (!status)
1666 		goto set_hint;
1667 	if (status < 0 && status != -ENOSPC) {
1668 		mlog_errno(status);
1669 		goto bail;
1670 	}
1671 
1672 	mlog(0, "Search of victim chain %u came up with nothing, "
1673 	     "trying all chains now.\n", victim);
1674 
1675 	/* If we didn't pick a good victim, then just default to
1676 	 * searching each chain in order. Don't allow chain relinking
1677 	 * because we only calculate enough journal credits for one
1678 	 * relink per alloc. */
1679 	ac->ac_allow_chain_relink = 0;
1680 	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1681 		if (i == victim)
1682 			continue;
1683 		if (!cl->cl_recs[i].c_free)
1684 			continue;
1685 
1686 		ac->ac_chain = i;
1687 		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1688 					    bit_off, num_bits, bg_blkno,
1689 					    &bits_left);
1690 		if (!status)
1691 			break;
1692 		if (status < 0 && status != -ENOSPC) {
1693 			mlog_errno(status);
1694 			goto bail;
1695 		}
1696 	}
1697 
1698 set_hint:
1699 	if (status != -ENOSPC) {
1700 		/* If the next search of this group is not likely to
1701 		 * yield a suitable extent, then we reset the last
1702 		 * group hint so as to not waste a disk read */
1703 		if (bits_left < min_bits)
1704 			ac->ac_last_group = 0;
1705 		else
1706 			ac->ac_last_group = *bg_blkno;
1707 	}
1708 
1709 bail:
1710 	mlog_exit(status);
1711 	return status;
1712 }
1713 
1714 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1715 			 handle_t *handle,
1716 			 struct ocfs2_alloc_context *ac,
1717 			 u32 bits_wanted,
1718 			 u16 *suballoc_bit_start,
1719 			 unsigned int *num_bits,
1720 			 u64 *blkno_start)
1721 {
1722 	int status;
1723 	u64 bg_blkno;
1724 
1725 	BUG_ON(!ac);
1726 	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1727 	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1728 
1729 	status = ocfs2_claim_suballoc_bits(osb,
1730 					   ac,
1731 					   handle,
1732 					   bits_wanted,
1733 					   1,
1734 					   suballoc_bit_start,
1735 					   num_bits,
1736 					   &bg_blkno);
1737 	if (status < 0) {
1738 		mlog_errno(status);
1739 		goto bail;
1740 	}
1741 	atomic_inc(&osb->alloc_stats.bg_allocs);
1742 
1743 	*blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1744 	ac->ac_bits_given += (*num_bits);
1745 	status = 0;
1746 bail:
1747 	mlog_exit(status);
1748 	return status;
1749 }
1750 
1751 static void ocfs2_init_inode_ac_group(struct inode *dir,
1752 				      struct buffer_head *parent_fe_bh,
1753 				      struct ocfs2_alloc_context *ac)
1754 {
1755 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1756 	/*
1757 	 * Try to allocate inodes from some specific group.
1758 	 *
1759 	 * If the parent dir has recorded the last group used in allocation,
1760 	 * cool, use it. Otherwise if we try to allocate new inode from the
1761 	 * same slot the parent dir belongs to, use the same chunk.
1762 	 *
1763 	 * We are very careful here to avoid the mistake of setting
1764 	 * ac_last_group to a group descriptor from a different (unlocked) slot.
1765 	 */
1766 	if (OCFS2_I(dir)->ip_last_used_group &&
1767 	    OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1768 		ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1769 	else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1770 		ac->ac_last_group = ocfs2_which_suballoc_group(
1771 					le64_to_cpu(fe->i_blkno),
1772 					le16_to_cpu(fe->i_suballoc_bit));
1773 }
1774 
1775 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1776 					     struct ocfs2_alloc_context *ac)
1777 {
1778 	OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1779 	OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1780 }
1781 
1782 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1783 			  handle_t *handle,
1784 			  struct inode *dir,
1785 			  struct buffer_head *parent_fe_bh,
1786 			  struct ocfs2_alloc_context *ac,
1787 			  u16 *suballoc_bit,
1788 			  u64 *fe_blkno)
1789 {
1790 	int status;
1791 	unsigned int num_bits;
1792 	u64 bg_blkno;
1793 
1794 	mlog_entry_void();
1795 
1796 	BUG_ON(!ac);
1797 	BUG_ON(ac->ac_bits_given != 0);
1798 	BUG_ON(ac->ac_bits_wanted != 1);
1799 	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1800 
1801 	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1802 
1803 	status = ocfs2_claim_suballoc_bits(osb,
1804 					   ac,
1805 					   handle,
1806 					   1,
1807 					   1,
1808 					   suballoc_bit,
1809 					   &num_bits,
1810 					   &bg_blkno);
1811 	if (status < 0) {
1812 		mlog_errno(status);
1813 		goto bail;
1814 	}
1815 	atomic_inc(&osb->alloc_stats.bg_allocs);
1816 
1817 	BUG_ON(num_bits != 1);
1818 
1819 	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1820 	ac->ac_bits_given++;
1821 	ocfs2_save_inode_ac_group(dir, ac);
1822 	status = 0;
1823 bail:
1824 	mlog_exit(status);
1825 	return status;
1826 }
1827 
1828 /* translate a group desc. blkno and it's bitmap offset into
1829  * disk cluster offset. */
1830 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1831 						   u64 bg_blkno,
1832 						   u16 bg_bit_off)
1833 {
1834 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1835 	u32 cluster = 0;
1836 
1837 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1838 
1839 	if (bg_blkno != osb->first_cluster_group_blkno)
1840 		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1841 	cluster += (u32) bg_bit_off;
1842 	return cluster;
1843 }
1844 
1845 /* given a cluster offset, calculate which block group it belongs to
1846  * and return that block offset. */
1847 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1848 {
1849 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1850 	u32 group_no;
1851 
1852 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1853 
1854 	group_no = cluster / osb->bitmap_cpg;
1855 	if (!group_no)
1856 		return osb->first_cluster_group_blkno;
1857 	return ocfs2_clusters_to_blocks(inode->i_sb,
1858 					group_no * osb->bitmap_cpg);
1859 }
1860 
1861 /* given the block number of a cluster start, calculate which cluster
1862  * group and descriptor bitmap offset that corresponds to. */
1863 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1864 						u64 data_blkno,
1865 						u64 *bg_blkno,
1866 						u16 *bg_bit_off)
1867 {
1868 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1869 	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1870 
1871 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1872 
1873 	*bg_blkno = ocfs2_which_cluster_group(inode,
1874 					      data_cluster);
1875 
1876 	if (*bg_blkno == osb->first_cluster_group_blkno)
1877 		*bg_bit_off = (u16) data_cluster;
1878 	else
1879 		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1880 							     data_blkno - *bg_blkno);
1881 }
1882 
1883 /*
1884  * min_bits - minimum contiguous chunk from this total allocation we
1885  * can handle. set to what we asked for originally for a full
1886  * contig. allocation, set to '1' to indicate we can deal with extents
1887  * of any size.
1888  */
1889 int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1890 			   handle_t *handle,
1891 			   struct ocfs2_alloc_context *ac,
1892 			   u32 min_clusters,
1893 			   u32 max_clusters,
1894 			   u32 *cluster_start,
1895 			   u32 *num_clusters)
1896 {
1897 	int status;
1898 	unsigned int bits_wanted = max_clusters;
1899 	u64 bg_blkno = 0;
1900 	u16 bg_bit_off;
1901 
1902 	mlog_entry_void();
1903 
1904 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1905 
1906 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1907 	       && ac->ac_which != OCFS2_AC_USE_MAIN);
1908 
1909 	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1910 		status = ocfs2_claim_local_alloc_bits(osb,
1911 						      handle,
1912 						      ac,
1913 						      bits_wanted,
1914 						      cluster_start,
1915 						      num_clusters);
1916 		if (!status)
1917 			atomic_inc(&osb->alloc_stats.local_data);
1918 	} else {
1919 		if (min_clusters > (osb->bitmap_cpg - 1)) {
1920 			/* The only paths asking for contiguousness
1921 			 * should know about this already. */
1922 			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1923 			     "group bitmap size %u!\n", min_clusters,
1924 			     osb->bitmap_cpg);
1925 			status = -ENOSPC;
1926 			goto bail;
1927 		}
1928 		/* clamp the current request down to a realistic size. */
1929 		if (bits_wanted > (osb->bitmap_cpg - 1))
1930 			bits_wanted = osb->bitmap_cpg - 1;
1931 
1932 		status = ocfs2_claim_suballoc_bits(osb,
1933 						   ac,
1934 						   handle,
1935 						   bits_wanted,
1936 						   min_clusters,
1937 						   &bg_bit_off,
1938 						   num_clusters,
1939 						   &bg_blkno);
1940 		if (!status) {
1941 			*cluster_start =
1942 				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1943 								 bg_blkno,
1944 								 bg_bit_off);
1945 			atomic_inc(&osb->alloc_stats.bitmap_data);
1946 		}
1947 	}
1948 	if (status < 0) {
1949 		if (status != -ENOSPC)
1950 			mlog_errno(status);
1951 		goto bail;
1952 	}
1953 
1954 	ac->ac_bits_given += *num_clusters;
1955 
1956 bail:
1957 	mlog_exit(status);
1958 	return status;
1959 }
1960 
1961 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1962 			 handle_t *handle,
1963 			 struct ocfs2_alloc_context *ac,
1964 			 u32 min_clusters,
1965 			 u32 *cluster_start,
1966 			 u32 *num_clusters)
1967 {
1968 	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1969 
1970 	return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
1971 				      bits_wanted, cluster_start, num_clusters);
1972 }
1973 
1974 static int ocfs2_block_group_clear_bits(handle_t *handle,
1975 					struct inode *alloc_inode,
1976 					struct ocfs2_group_desc *bg,
1977 					struct buffer_head *group_bh,
1978 					unsigned int bit_off,
1979 					unsigned int num_bits,
1980 					void (*undo_fn)(unsigned int bit,
1981 							unsigned long *bmap))
1982 {
1983 	int status;
1984 	unsigned int tmp;
1985 	struct ocfs2_group_desc *undo_bg = NULL;
1986 
1987 	mlog_entry_void();
1988 
1989 	/* The caller got this descriptor from
1990 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1991 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1992 
1993 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1994 
1995 	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
1996 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1997 					 group_bh,
1998 					 undo_fn ?
1999 					 OCFS2_JOURNAL_ACCESS_UNDO :
2000 					 OCFS2_JOURNAL_ACCESS_WRITE);
2001 	if (status < 0) {
2002 		mlog_errno(status);
2003 		goto bail;
2004 	}
2005 
2006 	if (undo_fn) {
2007 		jbd_lock_bh_state(group_bh);
2008 		undo_bg = (struct ocfs2_group_desc *)
2009 					bh2jh(group_bh)->b_committed_data;
2010 		BUG_ON(!undo_bg);
2011 	}
2012 
2013 	tmp = num_bits;
2014 	while(tmp--) {
2015 		ocfs2_clear_bit((bit_off + tmp),
2016 				(unsigned long *) bg->bg_bitmap);
2017 		if (undo_fn)
2018 			undo_fn(bit_off + tmp,
2019 				(unsigned long *) undo_bg->bg_bitmap);
2020 	}
2021 	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2022 
2023 	if (undo_fn)
2024 		jbd_unlock_bh_state(group_bh);
2025 
2026 	status = ocfs2_journal_dirty(handle, group_bh);
2027 	if (status < 0)
2028 		mlog_errno(status);
2029 bail:
2030 	return status;
2031 }
2032 
2033 /*
2034  * expects the suballoc inode to already be locked.
2035  */
2036 static int _ocfs2_free_suballoc_bits(handle_t *handle,
2037 				     struct inode *alloc_inode,
2038 				     struct buffer_head *alloc_bh,
2039 				     unsigned int start_bit,
2040 				     u64 bg_blkno,
2041 				     unsigned int count,
2042 				     void (*undo_fn)(unsigned int bit,
2043 						     unsigned long *bitmap))
2044 {
2045 	int status = 0;
2046 	u32 tmp_used;
2047 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2048 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2049 	struct buffer_head *group_bh = NULL;
2050 	struct ocfs2_group_desc *group;
2051 
2052 	mlog_entry_void();
2053 
2054 	/* The alloc_bh comes from ocfs2_free_dinode() or
2055 	 * ocfs2_free_clusters().  The callers have all locked the
2056 	 * allocator and gotten alloc_bh from the lock call.  This
2057 	 * validates the dinode buffer.  Any corruption that has happended
2058 	 * is a code bug. */
2059 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2060 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2061 
2062 	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
2063 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
2064 	     (unsigned long long)bg_blkno, start_bit);
2065 
2066 	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2067 					     &group_bh);
2068 	if (status < 0) {
2069 		mlog_errno(status);
2070 		goto bail;
2071 	}
2072 	group = (struct ocfs2_group_desc *) group_bh->b_data;
2073 
2074 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2075 
2076 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2077 					      group, group_bh,
2078 					      start_bit, count, undo_fn);
2079 	if (status < 0) {
2080 		mlog_errno(status);
2081 		goto bail;
2082 	}
2083 
2084 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2085 					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2086 	if (status < 0) {
2087 		mlog_errno(status);
2088 		goto bail;
2089 	}
2090 
2091 	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2092 		     count);
2093 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2094 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2095 
2096 	status = ocfs2_journal_dirty(handle, alloc_bh);
2097 	if (status < 0) {
2098 		mlog_errno(status);
2099 		goto bail;
2100 	}
2101 
2102 bail:
2103 	brelse(group_bh);
2104 
2105 	mlog_exit(status);
2106 	return status;
2107 }
2108 
2109 int ocfs2_free_suballoc_bits(handle_t *handle,
2110 			     struct inode *alloc_inode,
2111 			     struct buffer_head *alloc_bh,
2112 			     unsigned int start_bit,
2113 			     u64 bg_blkno,
2114 			     unsigned int count)
2115 {
2116 	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2117 					 start_bit, bg_blkno, count, NULL);
2118 }
2119 
2120 int ocfs2_free_dinode(handle_t *handle,
2121 		      struct inode *inode_alloc_inode,
2122 		      struct buffer_head *inode_alloc_bh,
2123 		      struct ocfs2_dinode *di)
2124 {
2125 	u64 blk = le64_to_cpu(di->i_blkno);
2126 	u16 bit = le16_to_cpu(di->i_suballoc_bit);
2127 	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2128 
2129 	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2130 					inode_alloc_bh, bit, bg_blkno, 1);
2131 }
2132 
2133 static int _ocfs2_free_clusters(handle_t *handle,
2134 				struct inode *bitmap_inode,
2135 				struct buffer_head *bitmap_bh,
2136 				u64 start_blk,
2137 				unsigned int num_clusters,
2138 				void (*undo_fn)(unsigned int bit,
2139 						unsigned long *bitmap))
2140 {
2141 	int status;
2142 	u16 bg_start_bit;
2143 	u64 bg_blkno;
2144 	struct ocfs2_dinode *fe;
2145 
2146 	/* You can't ever have a contiguous set of clusters
2147 	 * bigger than a block group bitmap so we never have to worry
2148 	 * about looping on them. */
2149 
2150 	mlog_entry_void();
2151 
2152 	/* This is expensive. We can safely remove once this stuff has
2153 	 * gotten tested really well. */
2154 	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2155 
2156 	fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
2157 
2158 	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2159 				     &bg_start_bit);
2160 
2161 	mlog(0, "want to free %u clusters starting at block %llu\n",
2162 	     num_clusters, (unsigned long long)start_blk);
2163 	mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2164 	     (unsigned long long)bg_blkno, bg_start_bit);
2165 
2166 	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2167 					   bg_start_bit, bg_blkno,
2168 					   num_clusters, undo_fn);
2169 	if (status < 0) {
2170 		mlog_errno(status);
2171 		goto out;
2172 	}
2173 
2174 	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2175 					 num_clusters);
2176 
2177 out:
2178 	mlog_exit(status);
2179 	return status;
2180 }
2181 
2182 int ocfs2_free_clusters(handle_t *handle,
2183 			struct inode *bitmap_inode,
2184 			struct buffer_head *bitmap_bh,
2185 			u64 start_blk,
2186 			unsigned int num_clusters)
2187 {
2188 	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2189 				    start_blk, num_clusters,
2190 				    _ocfs2_set_bit);
2191 }
2192 
2193 /*
2194  * Give never-used clusters back to the global bitmap.  We don't need
2195  * to protect these bits in the undo buffer.
2196  */
2197 int ocfs2_release_clusters(handle_t *handle,
2198 			   struct inode *bitmap_inode,
2199 			   struct buffer_head *bitmap_bh,
2200 			   u64 start_blk,
2201 			   unsigned int num_clusters)
2202 {
2203 	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2204 				    start_blk, num_clusters,
2205 				    _ocfs2_clear_bit);
2206 }
2207 
2208 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2209 {
2210 	printk("Block Group:\n");
2211 	printk("bg_signature:       %s\n", bg->bg_signature);
2212 	printk("bg_size:            %u\n", bg->bg_size);
2213 	printk("bg_bits:            %u\n", bg->bg_bits);
2214 	printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2215 	printk("bg_chain:           %u\n", bg->bg_chain);
2216 	printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
2217 	printk("bg_next_group:      %llu\n",
2218 	       (unsigned long long)bg->bg_next_group);
2219 	printk("bg_parent_dinode:   %llu\n",
2220 	       (unsigned long long)bg->bg_parent_dinode);
2221 	printk("bg_blkno:           %llu\n",
2222 	       (unsigned long long)bg->bg_blkno);
2223 }
2224 
2225 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2226 {
2227 	int i;
2228 
2229 	printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2230 	printk("i_signature:                  %s\n", fe->i_signature);
2231 	printk("i_size:                       %llu\n",
2232 	       (unsigned long long)fe->i_size);
2233 	printk("i_clusters:                   %u\n", fe->i_clusters);
2234 	printk("i_generation:                 %u\n",
2235 	       le32_to_cpu(fe->i_generation));
2236 	printk("id1.bitmap1.i_used:           %u\n",
2237 	       le32_to_cpu(fe->id1.bitmap1.i_used));
2238 	printk("id1.bitmap1.i_total:          %u\n",
2239 	       le32_to_cpu(fe->id1.bitmap1.i_total));
2240 	printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
2241 	printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
2242 	printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
2243 	printk("id2.i_chain.cl_next_free_rec: %u\n",
2244 	       fe->id2.i_chain.cl_next_free_rec);
2245 	for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2246 		printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2247 		       fe->id2.i_chain.cl_recs[i].c_free);
2248 		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2249 		       fe->id2.i_chain.cl_recs[i].c_total);
2250 		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2251 		       (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2252 	}
2253 }
2254 
2255 /*
2256  * For a given allocation, determine which allocators will need to be
2257  * accessed, and lock them, reserving the appropriate number of bits.
2258  *
2259  * Sparse file systems call this from ocfs2_write_begin_nolock()
2260  * and ocfs2_allocate_unwritten_extents().
2261  *
2262  * File systems which don't support holes call this from
2263  * ocfs2_extend_allocation().
2264  */
2265 int ocfs2_lock_allocators(struct inode *inode,
2266 			  struct ocfs2_extent_tree *et,
2267 			  u32 clusters_to_add, u32 extents_to_split,
2268 			  struct ocfs2_alloc_context **data_ac,
2269 			  struct ocfs2_alloc_context **meta_ac)
2270 {
2271 	int ret = 0, num_free_extents;
2272 	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2273 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2274 
2275 	*meta_ac = NULL;
2276 	if (data_ac)
2277 		*data_ac = NULL;
2278 
2279 	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2280 
2281 	num_free_extents = ocfs2_num_free_extents(osb, et);
2282 	if (num_free_extents < 0) {
2283 		ret = num_free_extents;
2284 		mlog_errno(ret);
2285 		goto out;
2286 	}
2287 
2288 	/*
2289 	 * Sparse allocation file systems need to be more conservative
2290 	 * with reserving room for expansion - the actual allocation
2291 	 * happens while we've got a journal handle open so re-taking
2292 	 * a cluster lock (because we ran out of room for another
2293 	 * extent) will violate ordering rules.
2294 	 *
2295 	 * Most of the time we'll only be seeing this 1 cluster at a time
2296 	 * anyway.
2297 	 *
2298 	 * Always lock for any unwritten extents - we might want to
2299 	 * add blocks during a split.
2300 	 */
2301 	if (!num_free_extents ||
2302 	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2303 		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2304 		if (ret < 0) {
2305 			if (ret != -ENOSPC)
2306 				mlog_errno(ret);
2307 			goto out;
2308 		}
2309 	}
2310 
2311 	if (clusters_to_add == 0)
2312 		goto out;
2313 
2314 	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2315 	if (ret < 0) {
2316 		if (ret != -ENOSPC)
2317 			mlog_errno(ret);
2318 		goto out;
2319 	}
2320 
2321 out:
2322 	if (ret) {
2323 		if (*meta_ac) {
2324 			ocfs2_free_alloc_context(*meta_ac);
2325 			*meta_ac = NULL;
2326 		}
2327 
2328 		/*
2329 		 * We cannot have an error and a non null *data_ac.
2330 		 */
2331 	}
2332 
2333 	return ret;
2334 }
2335 
2336 /*
2337  * Read the inode specified by blkno to get suballoc_slot and
2338  * suballoc_bit.
2339  */
2340 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2341 				       u16 *suballoc_slot, u16 *suballoc_bit)
2342 {
2343 	int status;
2344 	struct buffer_head *inode_bh = NULL;
2345 	struct ocfs2_dinode *inode_fe;
2346 
2347 	mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
2348 
2349 	/* dirty read disk */
2350 	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2351 	if (status < 0) {
2352 		mlog(ML_ERROR, "read block %llu failed %d\n",
2353 		     (unsigned long long)blkno, status);
2354 		goto bail;
2355 	}
2356 
2357 	inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2358 	if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2359 		mlog(ML_ERROR, "invalid inode %llu requested\n",
2360 		     (unsigned long long)blkno);
2361 		status = -EINVAL;
2362 		goto bail;
2363 	}
2364 
2365 	if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2366 	    (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2367 		mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2368 		     (unsigned long long)blkno,
2369 		     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2370 		status = -EINVAL;
2371 		goto bail;
2372 	}
2373 
2374 	if (suballoc_slot)
2375 		*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2376 	if (suballoc_bit)
2377 		*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2378 
2379 bail:
2380 	brelse(inode_bh);
2381 
2382 	mlog_exit(status);
2383 	return status;
2384 }
2385 
2386 /*
2387  * test whether bit is SET in allocator bitmap or not.  on success, 0
2388  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2389  * is returned and *res is meaningless.  Call this after you have
2390  * cluster locked against suballoc, or you may get a result based on
2391  * non-up2date contents
2392  */
2393 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2394 				   struct inode *suballoc,
2395 				   struct buffer_head *alloc_bh, u64 blkno,
2396 				   u16 bit, int *res)
2397 {
2398 	struct ocfs2_dinode *alloc_fe;
2399 	struct ocfs2_group_desc *group;
2400 	struct buffer_head *group_bh = NULL;
2401 	u64 bg_blkno;
2402 	int status;
2403 
2404 	mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2405 		   (unsigned int)bit);
2406 
2407 	alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2408 	if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2409 		mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2410 		     (unsigned int)bit,
2411 		     ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2412 		status = -EINVAL;
2413 		goto bail;
2414 	}
2415 
2416 	bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2417 	status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2418 					     &group_bh);
2419 	if (status < 0) {
2420 		mlog(ML_ERROR, "read group %llu failed %d\n",
2421 		     (unsigned long long)bg_blkno, status);
2422 		goto bail;
2423 	}
2424 
2425 	group = (struct ocfs2_group_desc *) group_bh->b_data;
2426 	*res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2427 
2428 bail:
2429 	brelse(group_bh);
2430 
2431 	mlog_exit(status);
2432 	return status;
2433 }
2434 
2435 /*
2436  * Test if the bit representing this inode (blkno) is set in the
2437  * suballocator.
2438  *
2439  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2440  *
2441  * In the event of failure, a negative value is returned and *res is
2442  * meaningless.
2443  *
2444  * Callers must make sure to hold nfs_sync_lock to prevent
2445  * ocfs2_delete_inode() on another node from accessing the same
2446  * suballocator concurrently.
2447  */
2448 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2449 {
2450 	int status;
2451 	u16 suballoc_bit = 0, suballoc_slot = 0;
2452 	struct inode *inode_alloc_inode;
2453 	struct buffer_head *alloc_bh = NULL;
2454 
2455 	mlog_entry("blkno: %llu", (unsigned long long)blkno);
2456 
2457 	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2458 					     &suballoc_bit);
2459 	if (status < 0) {
2460 		mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2461 		goto bail;
2462 	}
2463 
2464 	inode_alloc_inode =
2465 		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2466 					    suballoc_slot);
2467 	if (!inode_alloc_inode) {
2468 		/* the error code could be inaccurate, but we are not able to
2469 		 * get the correct one. */
2470 		status = -EINVAL;
2471 		mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2472 		     (u32)suballoc_slot);
2473 		goto bail;
2474 	}
2475 
2476 	mutex_lock(&inode_alloc_inode->i_mutex);
2477 	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2478 	if (status < 0) {
2479 		mutex_unlock(&inode_alloc_inode->i_mutex);
2480 		mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2481 		     (u32)suballoc_slot, status);
2482 		goto bail;
2483 	}
2484 
2485 	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2486 					 blkno, suballoc_bit, res);
2487 	if (status < 0)
2488 		mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2489 
2490 	ocfs2_inode_unlock(inode_alloc_inode, 0);
2491 	mutex_unlock(&inode_alloc_inode->i_mutex);
2492 
2493 	iput(inode_alloc_inode);
2494 	brelse(alloc_bh);
2495 bail:
2496 	mlog_exit(status);
2497 	return status;
2498 }
2499