xref: /linux/fs/ocfs2/suballoc.c (revision ec63e2a4897075e427c121d863bd89c44578094f)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 021110-1307, USA.
25  */
26 
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31 
32 #include <cluster/masklog.h>
33 
34 #include "ocfs2.h"
35 
36 #include "alloc.h"
37 #include "blockcheck.h"
38 #include "dlmglue.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "sysfile.h"
45 #include "uptodate.h"
46 #include "ocfs2_trace.h"
47 
48 #include "buffer_head_io.h"
49 
50 #define NOT_ALLOC_NEW_GROUP		0
51 #define ALLOC_NEW_GROUP			0x1
52 #define ALLOC_GROUPS_FROM_GLOBAL	0x2
53 
54 #define OCFS2_MAX_TO_STEAL		1024
55 
56 struct ocfs2_suballoc_result {
57 	u64		sr_bg_blkno;	/* The bg we allocated from.  Set
58 					   to 0 when a block group is
59 					   contiguous. */
60 	u64		sr_bg_stable_blkno; /*
61 					     * Doesn't change, always
62 					     * set to target block
63 					     * group descriptor
64 					     * block.
65 					     */
66 	u64		sr_blkno;	/* The first allocated block */
67 	unsigned int	sr_bit_offset;	/* The bit in the bg */
68 	unsigned int	sr_bits;	/* How many bits we claimed */
69 };
70 
71 static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
72 {
73 	if (res->sr_blkno == 0)
74 		return 0;
75 
76 	if (res->sr_bg_blkno)
77 		return res->sr_bg_blkno;
78 
79 	return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
80 }
81 
82 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
83 static int ocfs2_block_group_fill(handle_t *handle,
84 				  struct inode *alloc_inode,
85 				  struct buffer_head *bg_bh,
86 				  u64 group_blkno,
87 				  unsigned int group_clusters,
88 				  u16 my_chain,
89 				  struct ocfs2_chain_list *cl);
90 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
91 				   struct inode *alloc_inode,
92 				   struct buffer_head *bh,
93 				   u64 max_block,
94 				   u64 *last_alloc_group,
95 				   int flags);
96 
97 static int ocfs2_cluster_group_search(struct inode *inode,
98 				      struct buffer_head *group_bh,
99 				      u32 bits_wanted, u32 min_bits,
100 				      u64 max_block,
101 				      struct ocfs2_suballoc_result *res);
102 static int ocfs2_block_group_search(struct inode *inode,
103 				    struct buffer_head *group_bh,
104 				    u32 bits_wanted, u32 min_bits,
105 				    u64 max_block,
106 				    struct ocfs2_suballoc_result *res);
107 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
108 				     handle_t *handle,
109 				     u32 bits_wanted,
110 				     u32 min_bits,
111 				     struct ocfs2_suballoc_result *res);
112 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
113 					 int nr);
114 static int ocfs2_relink_block_group(handle_t *handle,
115 				    struct inode *alloc_inode,
116 				    struct buffer_head *fe_bh,
117 				    struct buffer_head *bg_bh,
118 				    struct buffer_head *prev_bg_bh,
119 				    u16 chain);
120 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
121 						     u32 wanted);
122 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
123 						   u64 bg_blkno,
124 						   u16 bg_bit_off);
125 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
126 						u64 data_blkno,
127 						u64 *bg_blkno,
128 						u16 *bg_bit_off);
129 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
130 					     u32 bits_wanted, u64 max_block,
131 					     int flags,
132 					     struct ocfs2_alloc_context **ac);
133 
134 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
135 {
136 	struct inode *inode = ac->ac_inode;
137 
138 	if (inode) {
139 		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
140 			ocfs2_inode_unlock(inode, 1);
141 
142 		inode_unlock(inode);
143 
144 		iput(inode);
145 		ac->ac_inode = NULL;
146 	}
147 	brelse(ac->ac_bh);
148 	ac->ac_bh = NULL;
149 	ac->ac_resv = NULL;
150 	kfree(ac->ac_find_loc_priv);
151 	ac->ac_find_loc_priv = NULL;
152 }
153 
154 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
155 {
156 	ocfs2_free_ac_resource(ac);
157 	kfree(ac);
158 }
159 
160 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
161 {
162 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
163 }
164 
165 #define do_error(fmt, ...)						\
166 do {									\
167 	if (resize)							\
168 		mlog(ML_ERROR, fmt, ##__VA_ARGS__);			\
169 	else								\
170 		return ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
171 } while (0)
172 
173 static int ocfs2_validate_gd_self(struct super_block *sb,
174 				  struct buffer_head *bh,
175 				  int resize)
176 {
177 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
178 
179 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
180 		do_error("Group descriptor #%llu has bad signature %.*s\n",
181 			 (unsigned long long)bh->b_blocknr, 7,
182 			 gd->bg_signature);
183 	}
184 
185 	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
186 		do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
187 			 (unsigned long long)bh->b_blocknr,
188 			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
189 	}
190 
191 	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
192 		do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
193 			 (unsigned long long)bh->b_blocknr,
194 			 le32_to_cpu(gd->bg_generation));
195 	}
196 
197 	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
198 		do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
199 			 (unsigned long long)bh->b_blocknr,
200 			 le16_to_cpu(gd->bg_bits),
201 			 le16_to_cpu(gd->bg_free_bits_count));
202 	}
203 
204 	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
205 		do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
206 			 (unsigned long long)bh->b_blocknr,
207 			 le16_to_cpu(gd->bg_bits),
208 			 8 * le16_to_cpu(gd->bg_size));
209 	}
210 
211 	return 0;
212 }
213 
214 static int ocfs2_validate_gd_parent(struct super_block *sb,
215 				    struct ocfs2_dinode *di,
216 				    struct buffer_head *bh,
217 				    int resize)
218 {
219 	unsigned int max_bits;
220 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
221 
222 	if (di->i_blkno != gd->bg_parent_dinode) {
223 		do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
224 			 (unsigned long long)bh->b_blocknr,
225 			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
226 			 (unsigned long long)le64_to_cpu(di->i_blkno));
227 	}
228 
229 	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
230 	if (le16_to_cpu(gd->bg_bits) > max_bits) {
231 		do_error("Group descriptor #%llu has bit count of %u\n",
232 			 (unsigned long long)bh->b_blocknr,
233 			 le16_to_cpu(gd->bg_bits));
234 	}
235 
236 	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
237 	if ((le16_to_cpu(gd->bg_chain) >
238 	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
239 	    ((le16_to_cpu(gd->bg_chain) ==
240 	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
241 		do_error("Group descriptor #%llu has bad chain %u\n",
242 			 (unsigned long long)bh->b_blocknr,
243 			 le16_to_cpu(gd->bg_chain));
244 	}
245 
246 	return 0;
247 }
248 
249 #undef do_error
250 
251 /*
252  * This version only prints errors.  It does not fail the filesystem, and
253  * exists only for resize.
254  */
255 int ocfs2_check_group_descriptor(struct super_block *sb,
256 				 struct ocfs2_dinode *di,
257 				 struct buffer_head *bh)
258 {
259 	int rc;
260 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
261 
262 	BUG_ON(!buffer_uptodate(bh));
263 
264 	/*
265 	 * If the ecc fails, we return the error but otherwise
266 	 * leave the filesystem running.  We know any error is
267 	 * local to this block.
268 	 */
269 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
270 	if (rc) {
271 		mlog(ML_ERROR,
272 		     "Checksum failed for group descriptor %llu\n",
273 		     (unsigned long long)bh->b_blocknr);
274 	} else
275 		rc = ocfs2_validate_gd_self(sb, bh, 1);
276 	if (!rc)
277 		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
278 
279 	return rc;
280 }
281 
282 static int ocfs2_validate_group_descriptor(struct super_block *sb,
283 					   struct buffer_head *bh)
284 {
285 	int rc;
286 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
287 
288 	trace_ocfs2_validate_group_descriptor(
289 					(unsigned long long)bh->b_blocknr);
290 
291 	BUG_ON(!buffer_uptodate(bh));
292 
293 	/*
294 	 * If the ecc fails, we return the error but otherwise
295 	 * leave the filesystem running.  We know any error is
296 	 * local to this block.
297 	 */
298 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
299 	if (rc)
300 		return rc;
301 
302 	/*
303 	 * Errors after here are fatal.
304 	 */
305 
306 	return ocfs2_validate_gd_self(sb, bh, 0);
307 }
308 
309 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
310 				u64 gd_blkno, struct buffer_head **bh)
311 {
312 	int rc;
313 	struct buffer_head *tmp = *bh;
314 
315 	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
316 			      ocfs2_validate_group_descriptor);
317 	if (rc)
318 		goto out;
319 
320 	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
321 	if (rc) {
322 		brelse(tmp);
323 		goto out;
324 	}
325 
326 	/* If ocfs2_read_block() got us a new bh, pass it up. */
327 	if (!*bh)
328 		*bh = tmp;
329 
330 out:
331 	return rc;
332 }
333 
334 static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
335 					  struct ocfs2_group_desc *bg,
336 					  struct ocfs2_chain_list *cl,
337 					  u64 p_blkno, unsigned int clusters)
338 {
339 	struct ocfs2_extent_list *el = &bg->bg_list;
340 	struct ocfs2_extent_rec *rec;
341 
342 	BUG_ON(!ocfs2_supports_discontig_bg(osb));
343 	if (!el->l_next_free_rec)
344 		el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
345 	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
346 	rec->e_blkno = cpu_to_le64(p_blkno);
347 	rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
348 				  le16_to_cpu(cl->cl_bpc));
349 	rec->e_leaf_clusters = cpu_to_le16(clusters);
350 	le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
351 	le16_add_cpu(&bg->bg_free_bits_count,
352 		     clusters * le16_to_cpu(cl->cl_bpc));
353 	le16_add_cpu(&el->l_next_free_rec, 1);
354 }
355 
356 static int ocfs2_block_group_fill(handle_t *handle,
357 				  struct inode *alloc_inode,
358 				  struct buffer_head *bg_bh,
359 				  u64 group_blkno,
360 				  unsigned int group_clusters,
361 				  u16 my_chain,
362 				  struct ocfs2_chain_list *cl)
363 {
364 	int status = 0;
365 	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
366 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
367 	struct super_block * sb = alloc_inode->i_sb;
368 
369 	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
370 		status = ocfs2_error(alloc_inode->i_sb,
371 				     "group block (%llu) != b_blocknr (%llu)\n",
372 				     (unsigned long long)group_blkno,
373 				     (unsigned long long) bg_bh->b_blocknr);
374 		goto bail;
375 	}
376 
377 	status = ocfs2_journal_access_gd(handle,
378 					 INODE_CACHE(alloc_inode),
379 					 bg_bh,
380 					 OCFS2_JOURNAL_ACCESS_CREATE);
381 	if (status < 0) {
382 		mlog_errno(status);
383 		goto bail;
384 	}
385 
386 	memset(bg, 0, sb->s_blocksize);
387 	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
388 	bg->bg_generation = cpu_to_le32(osb->fs_generation);
389 	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
390 						osb->s_feature_incompat));
391 	bg->bg_chain = cpu_to_le16(my_chain);
392 	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
393 	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
394 	bg->bg_blkno = cpu_to_le64(group_blkno);
395 	if (group_clusters == le16_to_cpu(cl->cl_cpg))
396 		bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
397 	else
398 		ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
399 					      group_clusters);
400 
401 	/* set the 1st bit in the bitmap to account for the descriptor block */
402 	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
403 	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
404 
405 	ocfs2_journal_dirty(handle, bg_bh);
406 
407 	/* There is no need to zero out or otherwise initialize the
408 	 * other blocks in a group - All valid FS metadata in a block
409 	 * group stores the superblock fs_generation value at
410 	 * allocation time. */
411 
412 bail:
413 	if (status)
414 		mlog_errno(status);
415 	return status;
416 }
417 
418 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
419 {
420 	u16 curr, best;
421 
422 	best = curr = 0;
423 	while (curr < le16_to_cpu(cl->cl_count)) {
424 		if (le32_to_cpu(cl->cl_recs[best].c_total) >
425 		    le32_to_cpu(cl->cl_recs[curr].c_total))
426 			best = curr;
427 		curr++;
428 	}
429 	return best;
430 }
431 
432 static struct buffer_head *
433 ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
434 			       struct inode *alloc_inode,
435 			       struct ocfs2_alloc_context *ac,
436 			       struct ocfs2_chain_list *cl)
437 {
438 	int status;
439 	u32 bit_off, num_bits;
440 	u64 bg_blkno;
441 	struct buffer_head *bg_bh;
442 	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
443 
444 	status = ocfs2_claim_clusters(handle, ac,
445 				      le16_to_cpu(cl->cl_cpg), &bit_off,
446 				      &num_bits);
447 	if (status < 0) {
448 		if (status != -ENOSPC)
449 			mlog_errno(status);
450 		goto bail;
451 	}
452 
453 	/* setup the group */
454 	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
455 	trace_ocfs2_block_group_alloc_contig(
456 	     (unsigned long long)bg_blkno, alloc_rec);
457 
458 	bg_bh = sb_getblk(osb->sb, bg_blkno);
459 	if (!bg_bh) {
460 		status = -ENOMEM;
461 		mlog_errno(status);
462 		goto bail;
463 	}
464 	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
465 
466 	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
467 					bg_blkno, num_bits, alloc_rec, cl);
468 	if (status < 0) {
469 		brelse(bg_bh);
470 		mlog_errno(status);
471 	}
472 
473 bail:
474 	return status ? ERR_PTR(status) : bg_bh;
475 }
476 
477 static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
478 					handle_t *handle,
479 					struct ocfs2_alloc_context *ac,
480 					unsigned int min_bits,
481 					u32 *bit_off, u32 *num_bits)
482 {
483 	int status = 0;
484 
485 	while (min_bits) {
486 		status = ocfs2_claim_clusters(handle, ac, min_bits,
487 					      bit_off, num_bits);
488 		if (status != -ENOSPC)
489 			break;
490 
491 		min_bits >>= 1;
492 	}
493 
494 	return status;
495 }
496 
497 static int ocfs2_block_group_grow_discontig(handle_t *handle,
498 					    struct inode *alloc_inode,
499 					    struct buffer_head *bg_bh,
500 					    struct ocfs2_alloc_context *ac,
501 					    struct ocfs2_chain_list *cl,
502 					    unsigned int min_bits)
503 {
504 	int status;
505 	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
506 	struct ocfs2_group_desc *bg =
507 		(struct ocfs2_group_desc *)bg_bh->b_data;
508 	unsigned int needed = le16_to_cpu(cl->cl_cpg) -
509 			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
510 	u32 p_cpos, clusters;
511 	u64 p_blkno;
512 	struct ocfs2_extent_list *el = &bg->bg_list;
513 
514 	status = ocfs2_journal_access_gd(handle,
515 					 INODE_CACHE(alloc_inode),
516 					 bg_bh,
517 					 OCFS2_JOURNAL_ACCESS_CREATE);
518 	if (status < 0) {
519 		mlog_errno(status);
520 		goto bail;
521 	}
522 
523 	while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
524 				le16_to_cpu(el->l_count))) {
525 		if (min_bits > needed)
526 			min_bits = needed;
527 		status = ocfs2_block_group_claim_bits(osb, handle, ac,
528 						      min_bits, &p_cpos,
529 						      &clusters);
530 		if (status < 0) {
531 			if (status != -ENOSPC)
532 				mlog_errno(status);
533 			goto bail;
534 		}
535 		p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
536 		ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
537 					      clusters);
538 
539 		min_bits = clusters;
540 		needed = le16_to_cpu(cl->cl_cpg) -
541 			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
542 	}
543 
544 	if (needed > 0) {
545 		/*
546 		 * We have used up all the extent rec but can't fill up
547 		 * the cpg. So bail out.
548 		 */
549 		status = -ENOSPC;
550 		goto bail;
551 	}
552 
553 	ocfs2_journal_dirty(handle, bg_bh);
554 
555 bail:
556 	return status;
557 }
558 
559 static void ocfs2_bg_alloc_cleanup(handle_t *handle,
560 				   struct ocfs2_alloc_context *cluster_ac,
561 				   struct inode *alloc_inode,
562 				   struct buffer_head *bg_bh)
563 {
564 	int i, ret;
565 	struct ocfs2_group_desc *bg;
566 	struct ocfs2_extent_list *el;
567 	struct ocfs2_extent_rec *rec;
568 
569 	if (!bg_bh)
570 		return;
571 
572 	bg = (struct ocfs2_group_desc *)bg_bh->b_data;
573 	el = &bg->bg_list;
574 	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
575 		rec = &el->l_recs[i];
576 		ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
577 					  cluster_ac->ac_bh,
578 					  le64_to_cpu(rec->e_blkno),
579 					  le16_to_cpu(rec->e_leaf_clusters));
580 		if (ret)
581 			mlog_errno(ret);
582 		/* Try all the clusters to free */
583 	}
584 
585 	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
586 	brelse(bg_bh);
587 }
588 
589 static struct buffer_head *
590 ocfs2_block_group_alloc_discontig(handle_t *handle,
591 				  struct inode *alloc_inode,
592 				  struct ocfs2_alloc_context *ac,
593 				  struct ocfs2_chain_list *cl)
594 {
595 	int status;
596 	u32 bit_off, num_bits;
597 	u64 bg_blkno;
598 	unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
599 	struct buffer_head *bg_bh = NULL;
600 	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
601 	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
602 
603 	if (!ocfs2_supports_discontig_bg(osb)) {
604 		status = -ENOSPC;
605 		goto bail;
606 	}
607 
608 	status = ocfs2_extend_trans(handle,
609 				    ocfs2_calc_bg_discontig_credits(osb->sb));
610 	if (status) {
611 		mlog_errno(status);
612 		goto bail;
613 	}
614 
615 	/*
616 	 * We're going to be grabbing from multiple cluster groups.
617 	 * We don't have enough credits to relink them all, and the
618 	 * cluster groups will be staying in cache for the duration of
619 	 * this operation.
620 	 */
621 	ac->ac_disable_chain_relink = 1;
622 
623 	/* Claim the first region */
624 	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
625 					      &bit_off, &num_bits);
626 	if (status < 0) {
627 		if (status != -ENOSPC)
628 			mlog_errno(status);
629 		goto bail;
630 	}
631 	min_bits = num_bits;
632 
633 	/* setup the group */
634 	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
635 	trace_ocfs2_block_group_alloc_discontig(
636 				(unsigned long long)bg_blkno, alloc_rec);
637 
638 	bg_bh = sb_getblk(osb->sb, bg_blkno);
639 	if (!bg_bh) {
640 		status = -ENOMEM;
641 		mlog_errno(status);
642 		goto bail;
643 	}
644 	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
645 
646 	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
647 					bg_blkno, num_bits, alloc_rec, cl);
648 	if (status < 0) {
649 		mlog_errno(status);
650 		goto bail;
651 	}
652 
653 	status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
654 						  bg_bh, ac, cl, min_bits);
655 	if (status)
656 		mlog_errno(status);
657 
658 bail:
659 	if (status)
660 		ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
661 	return status ? ERR_PTR(status) : bg_bh;
662 }
663 
664 /*
665  * We expect the block group allocator to already be locked.
666  */
667 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
668 				   struct inode *alloc_inode,
669 				   struct buffer_head *bh,
670 				   u64 max_block,
671 				   u64 *last_alloc_group,
672 				   int flags)
673 {
674 	int status, credits;
675 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
676 	struct ocfs2_chain_list *cl;
677 	struct ocfs2_alloc_context *ac = NULL;
678 	handle_t *handle = NULL;
679 	u16 alloc_rec;
680 	struct buffer_head *bg_bh = NULL;
681 	struct ocfs2_group_desc *bg;
682 
683 	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
684 
685 	cl = &fe->id2.i_chain;
686 	status = ocfs2_reserve_clusters_with_limit(osb,
687 						   le16_to_cpu(cl->cl_cpg),
688 						   max_block, flags, &ac);
689 	if (status < 0) {
690 		if (status != -ENOSPC)
691 			mlog_errno(status);
692 		goto bail;
693 	}
694 
695 	credits = ocfs2_calc_group_alloc_credits(osb->sb,
696 						 le16_to_cpu(cl->cl_cpg));
697 	handle = ocfs2_start_trans(osb, credits);
698 	if (IS_ERR(handle)) {
699 		status = PTR_ERR(handle);
700 		handle = NULL;
701 		mlog_errno(status);
702 		goto bail;
703 	}
704 
705 	if (last_alloc_group && *last_alloc_group != 0) {
706 		trace_ocfs2_block_group_alloc(
707 				(unsigned long long)*last_alloc_group);
708 		ac->ac_last_group = *last_alloc_group;
709 	}
710 
711 	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
712 					       ac, cl);
713 	if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
714 		bg_bh = ocfs2_block_group_alloc_discontig(handle,
715 							  alloc_inode,
716 							  ac, cl);
717 	if (IS_ERR(bg_bh)) {
718 		status = PTR_ERR(bg_bh);
719 		bg_bh = NULL;
720 		if (status != -ENOSPC)
721 			mlog_errno(status);
722 		goto bail;
723 	}
724 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
725 
726 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
727 					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
728 	if (status < 0) {
729 		mlog_errno(status);
730 		goto bail;
731 	}
732 
733 	alloc_rec = le16_to_cpu(bg->bg_chain);
734 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
735 		     le16_to_cpu(bg->bg_free_bits_count));
736 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
737 		     le16_to_cpu(bg->bg_bits));
738 	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
739 	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
740 		le16_add_cpu(&cl->cl_next_free_rec, 1);
741 
742 	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
743 					le16_to_cpu(bg->bg_free_bits_count));
744 	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
745 	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
746 
747 	ocfs2_journal_dirty(handle, bh);
748 
749 	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
750 	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
751 	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
752 					     le32_to_cpu(fe->i_clusters)));
753 	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
754 	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
755 	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
756 	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
757 
758 	status = 0;
759 
760 	/* save the new last alloc group so that the caller can cache it. */
761 	if (last_alloc_group)
762 		*last_alloc_group = ac->ac_last_group;
763 
764 bail:
765 	if (handle)
766 		ocfs2_commit_trans(osb, handle);
767 
768 	if (ac)
769 		ocfs2_free_alloc_context(ac);
770 
771 	brelse(bg_bh);
772 
773 	if (status)
774 		mlog_errno(status);
775 	return status;
776 }
777 
778 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
779 				       struct ocfs2_alloc_context *ac,
780 				       int type,
781 				       u32 slot,
782 				       u64 *last_alloc_group,
783 				       int flags)
784 {
785 	int status;
786 	u32 bits_wanted = ac->ac_bits_wanted;
787 	struct inode *alloc_inode;
788 	struct buffer_head *bh = NULL;
789 	struct ocfs2_dinode *fe;
790 	u32 free_bits;
791 
792 	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
793 	if (!alloc_inode) {
794 		mlog_errno(-EINVAL);
795 		return -EINVAL;
796 	}
797 
798 	inode_lock(alloc_inode);
799 
800 	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
801 	if (status < 0) {
802 		inode_unlock(alloc_inode);
803 		iput(alloc_inode);
804 
805 		mlog_errno(status);
806 		return status;
807 	}
808 
809 	ac->ac_inode = alloc_inode;
810 	ac->ac_alloc_slot = slot;
811 
812 	fe = (struct ocfs2_dinode *) bh->b_data;
813 
814 	/* The bh was validated by the inode read inside
815 	 * ocfs2_inode_lock().  Any corruption is a code bug. */
816 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
817 
818 	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
819 		status = ocfs2_error(alloc_inode->i_sb,
820 				     "Invalid chain allocator %llu\n",
821 				     (unsigned long long)le64_to_cpu(fe->i_blkno));
822 		goto bail;
823 	}
824 
825 	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
826 		le32_to_cpu(fe->id1.bitmap1.i_used);
827 
828 	if (bits_wanted > free_bits) {
829 		/* cluster bitmap never grows */
830 		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
831 			trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
832 								free_bits);
833 			status = -ENOSPC;
834 			goto bail;
835 		}
836 
837 		if (!(flags & ALLOC_NEW_GROUP)) {
838 			trace_ocfs2_reserve_suballoc_bits_no_new_group(
839 						slot, bits_wanted, free_bits);
840 			status = -ENOSPC;
841 			goto bail;
842 		}
843 
844 		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
845 						 ac->ac_max_block,
846 						 last_alloc_group, flags);
847 		if (status < 0) {
848 			if (status != -ENOSPC)
849 				mlog_errno(status);
850 			goto bail;
851 		}
852 		atomic_inc(&osb->alloc_stats.bg_extends);
853 
854 		/* You should never ask for this much metadata */
855 		BUG_ON(bits_wanted >
856 		       (le32_to_cpu(fe->id1.bitmap1.i_total)
857 			- le32_to_cpu(fe->id1.bitmap1.i_used)));
858 	}
859 
860 	get_bh(bh);
861 	ac->ac_bh = bh;
862 bail:
863 	brelse(bh);
864 
865 	if (status)
866 		mlog_errno(status);
867 	return status;
868 }
869 
870 static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
871 {
872 	spin_lock(&osb->osb_lock);
873 	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
874 	spin_unlock(&osb->osb_lock);
875 	atomic_set(&osb->s_num_inodes_stolen, 0);
876 }
877 
878 static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
879 {
880 	spin_lock(&osb->osb_lock);
881 	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
882 	spin_unlock(&osb->osb_lock);
883 	atomic_set(&osb->s_num_meta_stolen, 0);
884 }
885 
886 void ocfs2_init_steal_slots(struct ocfs2_super *osb)
887 {
888 	ocfs2_init_inode_steal_slot(osb);
889 	ocfs2_init_meta_steal_slot(osb);
890 }
891 
892 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
893 {
894 	spin_lock(&osb->osb_lock);
895 	if (type == INODE_ALLOC_SYSTEM_INODE)
896 		osb->s_inode_steal_slot = slot;
897 	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
898 		osb->s_meta_steal_slot = slot;
899 	spin_unlock(&osb->osb_lock);
900 }
901 
902 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
903 {
904 	int slot = OCFS2_INVALID_SLOT;
905 
906 	spin_lock(&osb->osb_lock);
907 	if (type == INODE_ALLOC_SYSTEM_INODE)
908 		slot = osb->s_inode_steal_slot;
909 	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
910 		slot = osb->s_meta_steal_slot;
911 	spin_unlock(&osb->osb_lock);
912 
913 	return slot;
914 }
915 
916 static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
917 {
918 	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
919 }
920 
921 static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
922 {
923 	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
924 }
925 
926 static int ocfs2_steal_resource(struct ocfs2_super *osb,
927 				struct ocfs2_alloc_context *ac,
928 				int type)
929 {
930 	int i, status = -ENOSPC;
931 	int slot = __ocfs2_get_steal_slot(osb, type);
932 
933 	/* Start to steal resource from the first slot after ours. */
934 	if (slot == OCFS2_INVALID_SLOT)
935 		slot = osb->slot_num + 1;
936 
937 	for (i = 0; i < osb->max_slots; i++, slot++) {
938 		if (slot == osb->max_slots)
939 			slot = 0;
940 
941 		if (slot == osb->slot_num)
942 			continue;
943 
944 		status = ocfs2_reserve_suballoc_bits(osb, ac,
945 						     type,
946 						     (u32)slot, NULL,
947 						     NOT_ALLOC_NEW_GROUP);
948 		if (status >= 0) {
949 			__ocfs2_set_steal_slot(osb, slot, type);
950 			break;
951 		}
952 
953 		ocfs2_free_ac_resource(ac);
954 	}
955 
956 	return status;
957 }
958 
959 static int ocfs2_steal_inode(struct ocfs2_super *osb,
960 			     struct ocfs2_alloc_context *ac)
961 {
962 	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
963 }
964 
965 static int ocfs2_steal_meta(struct ocfs2_super *osb,
966 			    struct ocfs2_alloc_context *ac)
967 {
968 	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
969 }
970 
971 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
972 				      int blocks,
973 				      struct ocfs2_alloc_context **ac)
974 {
975 	int status;
976 	int slot = ocfs2_get_meta_steal_slot(osb);
977 
978 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
979 	if (!(*ac)) {
980 		status = -ENOMEM;
981 		mlog_errno(status);
982 		goto bail;
983 	}
984 
985 	(*ac)->ac_bits_wanted = blocks;
986 	(*ac)->ac_which = OCFS2_AC_USE_META;
987 	(*ac)->ac_group_search = ocfs2_block_group_search;
988 
989 	if (slot != OCFS2_INVALID_SLOT &&
990 		atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
991 		goto extent_steal;
992 
993 	atomic_set(&osb->s_num_meta_stolen, 0);
994 	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
995 					     EXTENT_ALLOC_SYSTEM_INODE,
996 					     (u32)osb->slot_num, NULL,
997 					     ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
998 
999 
1000 	if (status >= 0) {
1001 		status = 0;
1002 		if (slot != OCFS2_INVALID_SLOT)
1003 			ocfs2_init_meta_steal_slot(osb);
1004 		goto bail;
1005 	} else if (status < 0 && status != -ENOSPC) {
1006 		mlog_errno(status);
1007 		goto bail;
1008 	}
1009 
1010 	ocfs2_free_ac_resource(*ac);
1011 
1012 extent_steal:
1013 	status = ocfs2_steal_meta(osb, *ac);
1014 	atomic_inc(&osb->s_num_meta_stolen);
1015 	if (status < 0) {
1016 		if (status != -ENOSPC)
1017 			mlog_errno(status);
1018 		goto bail;
1019 	}
1020 
1021 	status = 0;
1022 bail:
1023 	if ((status < 0) && *ac) {
1024 		ocfs2_free_alloc_context(*ac);
1025 		*ac = NULL;
1026 	}
1027 
1028 	if (status)
1029 		mlog_errno(status);
1030 	return status;
1031 }
1032 
1033 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
1034 			       struct ocfs2_extent_list *root_el,
1035 			       struct ocfs2_alloc_context **ac)
1036 {
1037 	return ocfs2_reserve_new_metadata_blocks(osb,
1038 					ocfs2_extend_meta_needed(root_el),
1039 					ac);
1040 }
1041 
1042 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
1043 			    struct ocfs2_alloc_context **ac)
1044 {
1045 	int status;
1046 	int slot = ocfs2_get_inode_steal_slot(osb);
1047 	u64 alloc_group;
1048 
1049 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1050 	if (!(*ac)) {
1051 		status = -ENOMEM;
1052 		mlog_errno(status);
1053 		goto bail;
1054 	}
1055 
1056 	(*ac)->ac_bits_wanted = 1;
1057 	(*ac)->ac_which = OCFS2_AC_USE_INODE;
1058 
1059 	(*ac)->ac_group_search = ocfs2_block_group_search;
1060 
1061 	/*
1062 	 * stat(2) can't handle i_ino > 32bits, so we tell the
1063 	 * lower levels not to allocate us a block group past that
1064 	 * limit.  The 'inode64' mount option avoids this behavior.
1065 	 */
1066 	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
1067 		(*ac)->ac_max_block = (u32)~0U;
1068 
1069 	/*
1070 	 * slot is set when we successfully steal inode from other nodes.
1071 	 * It is reset in 3 places:
1072 	 * 1. when we flush the truncate log
1073 	 * 2. when we complete local alloc recovery.
1074 	 * 3. when we successfully allocate from our own slot.
1075 	 * After it is set, we will go on stealing inodes until we find the
1076 	 * need to check our slots to see whether there is some space for us.
1077 	 */
1078 	if (slot != OCFS2_INVALID_SLOT &&
1079 	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
1080 		goto inode_steal;
1081 
1082 	atomic_set(&osb->s_num_inodes_stolen, 0);
1083 	alloc_group = osb->osb_inode_alloc_group;
1084 	status = ocfs2_reserve_suballoc_bits(osb, *ac,
1085 					     INODE_ALLOC_SYSTEM_INODE,
1086 					     (u32)osb->slot_num,
1087 					     &alloc_group,
1088 					     ALLOC_NEW_GROUP |
1089 					     ALLOC_GROUPS_FROM_GLOBAL);
1090 	if (status >= 0) {
1091 		status = 0;
1092 
1093 		spin_lock(&osb->osb_lock);
1094 		osb->osb_inode_alloc_group = alloc_group;
1095 		spin_unlock(&osb->osb_lock);
1096 		trace_ocfs2_reserve_new_inode_new_group(
1097 			(unsigned long long)alloc_group);
1098 
1099 		/*
1100 		 * Some inodes must be freed by us, so try to allocate
1101 		 * from our own next time.
1102 		 */
1103 		if (slot != OCFS2_INVALID_SLOT)
1104 			ocfs2_init_inode_steal_slot(osb);
1105 		goto bail;
1106 	} else if (status < 0 && status != -ENOSPC) {
1107 		mlog_errno(status);
1108 		goto bail;
1109 	}
1110 
1111 	ocfs2_free_ac_resource(*ac);
1112 
1113 inode_steal:
1114 	status = ocfs2_steal_inode(osb, *ac);
1115 	atomic_inc(&osb->s_num_inodes_stolen);
1116 	if (status < 0) {
1117 		if (status != -ENOSPC)
1118 			mlog_errno(status);
1119 		goto bail;
1120 	}
1121 
1122 	status = 0;
1123 bail:
1124 	if ((status < 0) && *ac) {
1125 		ocfs2_free_alloc_context(*ac);
1126 		*ac = NULL;
1127 	}
1128 
1129 	if (status)
1130 		mlog_errno(status);
1131 	return status;
1132 }
1133 
1134 /* local alloc code has to do the same thing, so rather than do this
1135  * twice.. */
1136 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1137 				      struct ocfs2_alloc_context *ac)
1138 {
1139 	int status;
1140 
1141 	ac->ac_which = OCFS2_AC_USE_MAIN;
1142 	ac->ac_group_search = ocfs2_cluster_group_search;
1143 
1144 	status = ocfs2_reserve_suballoc_bits(osb, ac,
1145 					     GLOBAL_BITMAP_SYSTEM_INODE,
1146 					     OCFS2_INVALID_SLOT, NULL,
1147 					     ALLOC_NEW_GROUP);
1148 	if (status < 0 && status != -ENOSPC)
1149 		mlog_errno(status);
1150 
1151 	return status;
1152 }
1153 
1154 /* Callers don't need to care which bitmap (local alloc or main) to
1155  * use so we figure it out for them, but unfortunately this clutters
1156  * things a bit. */
1157 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
1158 					     u32 bits_wanted, u64 max_block,
1159 					     int flags,
1160 					     struct ocfs2_alloc_context **ac)
1161 {
1162 	int status, ret = 0;
1163 	int retried = 0;
1164 
1165 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1166 	if (!(*ac)) {
1167 		status = -ENOMEM;
1168 		mlog_errno(status);
1169 		goto bail;
1170 	}
1171 
1172 	(*ac)->ac_bits_wanted = bits_wanted;
1173 	(*ac)->ac_max_block = max_block;
1174 
1175 	status = -ENOSPC;
1176 	if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
1177 	    ocfs2_alloc_should_use_local(osb, bits_wanted)) {
1178 		status = ocfs2_reserve_local_alloc_bits(osb,
1179 							bits_wanted,
1180 							*ac);
1181 		if ((status < 0) && (status != -ENOSPC)) {
1182 			mlog_errno(status);
1183 			goto bail;
1184 		}
1185 	}
1186 
1187 	if (status == -ENOSPC) {
1188 retry:
1189 		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1190 		/* Retry if there is sufficient space cached in truncate log */
1191 		if (status == -ENOSPC && !retried) {
1192 			retried = 1;
1193 			ocfs2_inode_unlock((*ac)->ac_inode, 1);
1194 			inode_unlock((*ac)->ac_inode);
1195 
1196 			ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
1197 			if (ret == 1) {
1198 				iput((*ac)->ac_inode);
1199 				(*ac)->ac_inode = NULL;
1200 				goto retry;
1201 			}
1202 
1203 			if (ret < 0)
1204 				mlog_errno(ret);
1205 
1206 			inode_lock((*ac)->ac_inode);
1207 			ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
1208 			if (ret < 0) {
1209 				mlog_errno(ret);
1210 				inode_unlock((*ac)->ac_inode);
1211 				iput((*ac)->ac_inode);
1212 				(*ac)->ac_inode = NULL;
1213 				goto bail;
1214 			}
1215 		}
1216 		if (status < 0) {
1217 			if (status != -ENOSPC)
1218 				mlog_errno(status);
1219 			goto bail;
1220 		}
1221 	}
1222 
1223 	status = 0;
1224 bail:
1225 	if ((status < 0) && *ac) {
1226 		ocfs2_free_alloc_context(*ac);
1227 		*ac = NULL;
1228 	}
1229 
1230 	if (status)
1231 		mlog_errno(status);
1232 	return status;
1233 }
1234 
1235 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
1236 			   u32 bits_wanted,
1237 			   struct ocfs2_alloc_context **ac)
1238 {
1239 	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
1240 						 ALLOC_NEW_GROUP, ac);
1241 }
1242 
1243 /*
1244  * More or less lifted from ext3. I'll leave their description below:
1245  *
1246  * "For ext3 allocations, we must not reuse any blocks which are
1247  * allocated in the bitmap buffer's "last committed data" copy.  This
1248  * prevents deletes from freeing up the page for reuse until we have
1249  * committed the delete transaction.
1250  *
1251  * If we didn't do this, then deleting something and reallocating it as
1252  * data would allow the old block to be overwritten before the
1253  * transaction committed (because we force data to disk before commit).
1254  * This would lead to corruption if we crashed between overwriting the
1255  * data and committing the delete.
1256  *
1257  * @@@ We may want to make this allocation behaviour conditional on
1258  * data-writes at some point, and disable it for metadata allocations or
1259  * sync-data inodes."
1260  *
1261  * Note: OCFS2 already does this differently for metadata vs data
1262  * allocations, as those bitmaps are separate and undo access is never
1263  * called on a metadata group descriptor.
1264  */
1265 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1266 					 int nr)
1267 {
1268 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1269 	int ret;
1270 
1271 	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1272 		return 0;
1273 
1274 	if (!buffer_jbd(bg_bh))
1275 		return 1;
1276 
1277 	jbd_lock_bh_state(bg_bh);
1278 	bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1279 	if (bg)
1280 		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1281 	else
1282 		ret = 1;
1283 	jbd_unlock_bh_state(bg_bh);
1284 
1285 	return ret;
1286 }
1287 
1288 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1289 					     struct buffer_head *bg_bh,
1290 					     unsigned int bits_wanted,
1291 					     unsigned int total_bits,
1292 					     struct ocfs2_suballoc_result *res)
1293 {
1294 	void *bitmap;
1295 	u16 best_offset, best_size;
1296 	int offset, start, found, status = 0;
1297 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1298 
1299 	/* Callers got this descriptor from
1300 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1301 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1302 
1303 	found = start = best_offset = best_size = 0;
1304 	bitmap = bg->bg_bitmap;
1305 
1306 	while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1307 		if (offset == total_bits)
1308 			break;
1309 
1310 		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1311 			/* We found a zero, but we can't use it as it
1312 			 * hasn't been put to disk yet! */
1313 			found = 0;
1314 			start = offset + 1;
1315 		} else if (offset == start) {
1316 			/* we found a zero */
1317 			found++;
1318 			/* move start to the next bit to test */
1319 			start++;
1320 		} else {
1321 			/* got a zero after some ones */
1322 			found = 1;
1323 			start = offset + 1;
1324 		}
1325 		if (found > best_size) {
1326 			best_size = found;
1327 			best_offset = start - found;
1328 		}
1329 		/* we got everything we needed */
1330 		if (found == bits_wanted) {
1331 			/* mlog(0, "Found it all!\n"); */
1332 			break;
1333 		}
1334 	}
1335 
1336 	if (best_size) {
1337 		res->sr_bit_offset = best_offset;
1338 		res->sr_bits = best_size;
1339 	} else {
1340 		status = -ENOSPC;
1341 		/* No error log here -- see the comment above
1342 		 * ocfs2_test_bg_bit_allocatable */
1343 	}
1344 
1345 	return status;
1346 }
1347 
1348 int ocfs2_block_group_set_bits(handle_t *handle,
1349 					     struct inode *alloc_inode,
1350 					     struct ocfs2_group_desc *bg,
1351 					     struct buffer_head *group_bh,
1352 					     unsigned int bit_off,
1353 					     unsigned int num_bits)
1354 {
1355 	int status;
1356 	void *bitmap = bg->bg_bitmap;
1357 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1358 
1359 	/* All callers get the descriptor via
1360 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1361 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1362 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1363 
1364 	trace_ocfs2_block_group_set_bits(bit_off, num_bits);
1365 
1366 	if (ocfs2_is_cluster_bitmap(alloc_inode))
1367 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1368 
1369 	status = ocfs2_journal_access_gd(handle,
1370 					 INODE_CACHE(alloc_inode),
1371 					 group_bh,
1372 					 journal_type);
1373 	if (status < 0) {
1374 		mlog_errno(status);
1375 		goto bail;
1376 	}
1377 
1378 	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1379 	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1380 		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
1381 				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
1382 				   le16_to_cpu(bg->bg_bits),
1383 				   le16_to_cpu(bg->bg_free_bits_count),
1384 				   num_bits);
1385 	}
1386 	while(num_bits--)
1387 		ocfs2_set_bit(bit_off++, bitmap);
1388 
1389 	ocfs2_journal_dirty(handle, group_bh);
1390 
1391 bail:
1392 	return status;
1393 }
1394 
1395 /* find the one with the most empty bits */
1396 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1397 {
1398 	u16 curr, best;
1399 
1400 	BUG_ON(!cl->cl_next_free_rec);
1401 
1402 	best = curr = 0;
1403 	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1404 		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1405 		    le32_to_cpu(cl->cl_recs[best].c_free))
1406 			best = curr;
1407 		curr++;
1408 	}
1409 
1410 	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1411 	return best;
1412 }
1413 
1414 static int ocfs2_relink_block_group(handle_t *handle,
1415 				    struct inode *alloc_inode,
1416 				    struct buffer_head *fe_bh,
1417 				    struct buffer_head *bg_bh,
1418 				    struct buffer_head *prev_bg_bh,
1419 				    u16 chain)
1420 {
1421 	int status;
1422 	/* there is a really tiny chance the journal calls could fail,
1423 	 * but we wouldn't want inconsistent blocks in *any* case. */
1424 	u64 bg_ptr, prev_bg_ptr;
1425 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1426 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1427 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1428 
1429 	/* The caller got these descriptors from
1430 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1431 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1432 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1433 
1434 	trace_ocfs2_relink_block_group(
1435 		(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1436 		(unsigned long long)le64_to_cpu(bg->bg_blkno),
1437 		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1438 
1439 	bg_ptr = le64_to_cpu(bg->bg_next_group);
1440 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1441 
1442 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1443 					 prev_bg_bh,
1444 					 OCFS2_JOURNAL_ACCESS_WRITE);
1445 	if (status < 0)
1446 		goto out;
1447 
1448 	prev_bg->bg_next_group = bg->bg_next_group;
1449 	ocfs2_journal_dirty(handle, prev_bg_bh);
1450 
1451 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1452 					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1453 	if (status < 0)
1454 		goto out_rollback_prev_bg;
1455 
1456 	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1457 	ocfs2_journal_dirty(handle, bg_bh);
1458 
1459 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1460 					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1461 	if (status < 0)
1462 		goto out_rollback_bg;
1463 
1464 	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1465 	ocfs2_journal_dirty(handle, fe_bh);
1466 
1467 out:
1468 	if (status < 0)
1469 		mlog_errno(status);
1470 	return status;
1471 
1472 out_rollback_bg:
1473 	bg->bg_next_group = cpu_to_le64(bg_ptr);
1474 out_rollback_prev_bg:
1475 	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1476 	goto out;
1477 }
1478 
1479 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1480 						     u32 wanted)
1481 {
1482 	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1483 }
1484 
1485 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1486  * value on error. */
1487 static int ocfs2_cluster_group_search(struct inode *inode,
1488 				      struct buffer_head *group_bh,
1489 				      u32 bits_wanted, u32 min_bits,
1490 				      u64 max_block,
1491 				      struct ocfs2_suballoc_result *res)
1492 {
1493 	int search = -ENOSPC;
1494 	int ret;
1495 	u64 blkoff;
1496 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1497 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1498 	unsigned int max_bits, gd_cluster_off;
1499 
1500 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1501 
1502 	if (gd->bg_free_bits_count) {
1503 		max_bits = le16_to_cpu(gd->bg_bits);
1504 
1505 		/* Tail groups in cluster bitmaps which aren't cpg
1506 		 * aligned are prone to partial extension by a failed
1507 		 * fs resize. If the file system resize never got to
1508 		 * update the dinode cluster count, then we don't want
1509 		 * to trust any clusters past it, regardless of what
1510 		 * the group descriptor says. */
1511 		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1512 							  le64_to_cpu(gd->bg_blkno));
1513 		if ((gd_cluster_off + max_bits) >
1514 		    OCFS2_I(inode)->ip_clusters) {
1515 			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1516 			trace_ocfs2_cluster_group_search_wrong_max_bits(
1517 				(unsigned long long)le64_to_cpu(gd->bg_blkno),
1518 				le16_to_cpu(gd->bg_bits),
1519 				OCFS2_I(inode)->ip_clusters, max_bits);
1520 		}
1521 
1522 		ret = ocfs2_block_group_find_clear_bits(osb,
1523 							group_bh, bits_wanted,
1524 							max_bits, res);
1525 		if (ret)
1526 			return ret;
1527 
1528 		if (max_block) {
1529 			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1530 							  gd_cluster_off +
1531 							  res->sr_bit_offset +
1532 							  res->sr_bits);
1533 			trace_ocfs2_cluster_group_search_max_block(
1534 				(unsigned long long)blkoff,
1535 				(unsigned long long)max_block);
1536 			if (blkoff > max_block)
1537 				return -ENOSPC;
1538 		}
1539 
1540 		/* ocfs2_block_group_find_clear_bits() might
1541 		 * return success, but we still want to return
1542 		 * -ENOSPC unless it found the minimum number
1543 		 * of bits. */
1544 		if (min_bits <= res->sr_bits)
1545 			search = 0; /* success */
1546 		else if (res->sr_bits) {
1547 			/*
1548 			 * Don't show bits which we'll be returning
1549 			 * for allocation to the local alloc bitmap.
1550 			 */
1551 			ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1552 		}
1553 	}
1554 
1555 	return search;
1556 }
1557 
1558 static int ocfs2_block_group_search(struct inode *inode,
1559 				    struct buffer_head *group_bh,
1560 				    u32 bits_wanted, u32 min_bits,
1561 				    u64 max_block,
1562 				    struct ocfs2_suballoc_result *res)
1563 {
1564 	int ret = -ENOSPC;
1565 	u64 blkoff;
1566 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1567 
1568 	BUG_ON(min_bits != 1);
1569 	BUG_ON(ocfs2_is_cluster_bitmap(inode));
1570 
1571 	if (bg->bg_free_bits_count) {
1572 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1573 							group_bh, bits_wanted,
1574 							le16_to_cpu(bg->bg_bits),
1575 							res);
1576 		if (!ret && max_block) {
1577 			blkoff = le64_to_cpu(bg->bg_blkno) +
1578 				res->sr_bit_offset + res->sr_bits;
1579 			trace_ocfs2_block_group_search_max_block(
1580 				(unsigned long long)blkoff,
1581 				(unsigned long long)max_block);
1582 			if (blkoff > max_block)
1583 				ret = -ENOSPC;
1584 		}
1585 	}
1586 
1587 	return ret;
1588 }
1589 
1590 int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1591 				       handle_t *handle,
1592 				       struct buffer_head *di_bh,
1593 				       u32 num_bits,
1594 				       u16 chain)
1595 {
1596 	int ret;
1597 	u32 tmp_used;
1598 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1599 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1600 
1601 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1602 				      OCFS2_JOURNAL_ACCESS_WRITE);
1603 	if (ret < 0) {
1604 		mlog_errno(ret);
1605 		goto out;
1606 	}
1607 
1608 	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1609 	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1610 	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1611 	ocfs2_journal_dirty(handle, di_bh);
1612 
1613 out:
1614 	return ret;
1615 }
1616 
1617 void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
1618 				       struct buffer_head *di_bh,
1619 				       u32 num_bits,
1620 				       u16 chain)
1621 {
1622 	u32 tmp_used;
1623 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1624 	struct ocfs2_chain_list *cl;
1625 
1626 	cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
1627 	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1628 	di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
1629 	le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
1630 }
1631 
1632 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1633 					 struct ocfs2_extent_rec *rec,
1634 					 struct ocfs2_chain_list *cl)
1635 {
1636 	unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1637 	unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1638 	unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
1639 
1640 	if (res->sr_bit_offset < bitoff)
1641 		return 0;
1642 	if (res->sr_bit_offset >= (bitoff + bitcount))
1643 		return 0;
1644 	res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1645 		(res->sr_bit_offset - bitoff);
1646 	if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1647 		res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1648 	return 1;
1649 }
1650 
1651 static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1652 					  struct ocfs2_group_desc *bg,
1653 					  struct ocfs2_suballoc_result *res)
1654 {
1655 	int i;
1656 	u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
1657 	struct ocfs2_extent_rec *rec;
1658 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1659 	struct ocfs2_chain_list *cl = &di->id2.i_chain;
1660 
1661 	if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1662 		res->sr_blkno = 0;
1663 		return;
1664 	}
1665 
1666 	res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1667 	res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
1668 	if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1669 	    !bg->bg_list.l_next_free_rec)
1670 		return;
1671 
1672 	for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1673 		rec = &bg->bg_list.l_recs[i];
1674 		if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1675 			res->sr_bg_blkno = bg_blkno;  /* Restore */
1676 			break;
1677 		}
1678 	}
1679 }
1680 
1681 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1682 				  handle_t *handle,
1683 				  u32 bits_wanted,
1684 				  u32 min_bits,
1685 				  struct ocfs2_suballoc_result *res,
1686 				  u16 *bits_left)
1687 {
1688 	int ret;
1689 	struct buffer_head *group_bh = NULL;
1690 	struct ocfs2_group_desc *gd;
1691 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1692 	struct inode *alloc_inode = ac->ac_inode;
1693 
1694 	ret = ocfs2_read_group_descriptor(alloc_inode, di,
1695 					  res->sr_bg_blkno, &group_bh);
1696 	if (ret < 0) {
1697 		mlog_errno(ret);
1698 		return ret;
1699 	}
1700 
1701 	gd = (struct ocfs2_group_desc *) group_bh->b_data;
1702 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1703 				  ac->ac_max_block, res);
1704 	if (ret < 0) {
1705 		if (ret != -ENOSPC)
1706 			mlog_errno(ret);
1707 		goto out;
1708 	}
1709 
1710 	if (!ret)
1711 		ocfs2_bg_discontig_fix_result(ac, gd, res);
1712 
1713 	/*
1714 	 * sr_bg_blkno might have been changed by
1715 	 * ocfs2_bg_discontig_fix_result
1716 	 */
1717 	res->sr_bg_stable_blkno = group_bh->b_blocknr;
1718 
1719 	if (ac->ac_find_loc_only)
1720 		goto out_loc_only;
1721 
1722 	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1723 					       res->sr_bits,
1724 					       le16_to_cpu(gd->bg_chain));
1725 	if (ret < 0) {
1726 		mlog_errno(ret);
1727 		goto out;
1728 	}
1729 
1730 	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1731 					 res->sr_bit_offset, res->sr_bits);
1732 	if (ret < 0) {
1733 		ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
1734 					       res->sr_bits,
1735 					       le16_to_cpu(gd->bg_chain));
1736 		mlog_errno(ret);
1737 	}
1738 
1739 out_loc_only:
1740 	*bits_left = le16_to_cpu(gd->bg_free_bits_count);
1741 
1742 out:
1743 	brelse(group_bh);
1744 
1745 	return ret;
1746 }
1747 
1748 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1749 			      handle_t *handle,
1750 			      u32 bits_wanted,
1751 			      u32 min_bits,
1752 			      struct ocfs2_suballoc_result *res,
1753 			      u16 *bits_left)
1754 {
1755 	int status;
1756 	u16 chain;
1757 	u64 next_group;
1758 	struct inode *alloc_inode = ac->ac_inode;
1759 	struct buffer_head *group_bh = NULL;
1760 	struct buffer_head *prev_group_bh = NULL;
1761 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1762 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1763 	struct ocfs2_group_desc *bg;
1764 
1765 	chain = ac->ac_chain;
1766 	trace_ocfs2_search_chain_begin(
1767 		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
1768 		bits_wanted, chain);
1769 
1770 	status = ocfs2_read_group_descriptor(alloc_inode, fe,
1771 					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
1772 					     &group_bh);
1773 	if (status < 0) {
1774 		mlog_errno(status);
1775 		goto bail;
1776 	}
1777 	bg = (struct ocfs2_group_desc *) group_bh->b_data;
1778 
1779 	status = -ENOSPC;
1780 	/* for now, the chain search is a bit simplistic. We just use
1781 	 * the 1st group with any empty bits. */
1782 	while ((status = ac->ac_group_search(alloc_inode, group_bh,
1783 					     bits_wanted, min_bits,
1784 					     ac->ac_max_block,
1785 					     res)) == -ENOSPC) {
1786 		if (!bg->bg_next_group)
1787 			break;
1788 
1789 		brelse(prev_group_bh);
1790 		prev_group_bh = NULL;
1791 
1792 		next_group = le64_to_cpu(bg->bg_next_group);
1793 		prev_group_bh = group_bh;
1794 		group_bh = NULL;
1795 		status = ocfs2_read_group_descriptor(alloc_inode, fe,
1796 						     next_group, &group_bh);
1797 		if (status < 0) {
1798 			mlog_errno(status);
1799 			goto bail;
1800 		}
1801 		bg = (struct ocfs2_group_desc *) group_bh->b_data;
1802 	}
1803 	if (status < 0) {
1804 		if (status != -ENOSPC)
1805 			mlog_errno(status);
1806 		goto bail;
1807 	}
1808 
1809 	trace_ocfs2_search_chain_succ(
1810 		(unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
1811 
1812 	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1813 
1814 	BUG_ON(res->sr_bits == 0);
1815 	if (!status)
1816 		ocfs2_bg_discontig_fix_result(ac, bg, res);
1817 
1818 	/*
1819 	 * sr_bg_blkno might have been changed by
1820 	 * ocfs2_bg_discontig_fix_result
1821 	 */
1822 	res->sr_bg_stable_blkno = group_bh->b_blocknr;
1823 
1824 	/*
1825 	 * Keep track of previous block descriptor read. When
1826 	 * we find a target, if we have read more than X
1827 	 * number of descriptors, and the target is reasonably
1828 	 * empty, relink him to top of his chain.
1829 	 *
1830 	 * We've read 0 extra blocks and only send one more to
1831 	 * the transaction, yet the next guy to search has a
1832 	 * much easier time.
1833 	 *
1834 	 * Do this *after* figuring out how many bits we're taking out
1835 	 * of our target group.
1836 	 */
1837 	if (!ac->ac_disable_chain_relink &&
1838 	    (prev_group_bh) &&
1839 	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1840 		status = ocfs2_relink_block_group(handle, alloc_inode,
1841 						  ac->ac_bh, group_bh,
1842 						  prev_group_bh, chain);
1843 		if (status < 0) {
1844 			mlog_errno(status);
1845 			goto bail;
1846 		}
1847 	}
1848 
1849 	if (ac->ac_find_loc_only)
1850 		goto out_loc_only;
1851 
1852 	status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1853 						  ac->ac_bh, res->sr_bits,
1854 						  chain);
1855 	if (status) {
1856 		mlog_errno(status);
1857 		goto bail;
1858 	}
1859 
1860 	status = ocfs2_block_group_set_bits(handle,
1861 					    alloc_inode,
1862 					    bg,
1863 					    group_bh,
1864 					    res->sr_bit_offset,
1865 					    res->sr_bits);
1866 	if (status < 0) {
1867 		ocfs2_rollback_alloc_dinode_counts(alloc_inode,
1868 					ac->ac_bh, res->sr_bits, chain);
1869 		mlog_errno(status);
1870 		goto bail;
1871 	}
1872 
1873 	trace_ocfs2_search_chain_end(
1874 			(unsigned long long)le64_to_cpu(fe->i_blkno),
1875 			res->sr_bits);
1876 
1877 out_loc_only:
1878 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
1879 bail:
1880 	brelse(group_bh);
1881 	brelse(prev_group_bh);
1882 
1883 	if (status)
1884 		mlog_errno(status);
1885 	return status;
1886 }
1887 
1888 /* will give out up to bits_wanted contiguous bits. */
1889 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1890 				     handle_t *handle,
1891 				     u32 bits_wanted,
1892 				     u32 min_bits,
1893 				     struct ocfs2_suballoc_result *res)
1894 {
1895 	int status;
1896 	u16 victim, i;
1897 	u16 bits_left = 0;
1898 	u64 hint = ac->ac_last_group;
1899 	struct ocfs2_chain_list *cl;
1900 	struct ocfs2_dinode *fe;
1901 
1902 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1903 	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1904 	BUG_ON(!ac->ac_bh);
1905 
1906 	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1907 
1908 	/* The bh was validated by the inode read during
1909 	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1910 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1911 
1912 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1913 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
1914 		status = ocfs2_error(ac->ac_inode->i_sb,
1915 				     "Chain allocator dinode %llu has %u used bits but only %u total\n",
1916 				     (unsigned long long)le64_to_cpu(fe->i_blkno),
1917 				     le32_to_cpu(fe->id1.bitmap1.i_used),
1918 				     le32_to_cpu(fe->id1.bitmap1.i_total));
1919 		goto bail;
1920 	}
1921 
1922 	res->sr_bg_blkno = hint;
1923 	if (res->sr_bg_blkno) {
1924 		/* Attempt to short-circuit the usual search mechanism
1925 		 * by jumping straight to the most recently used
1926 		 * allocation group. This helps us maintain some
1927 		 * contiguousness across allocations. */
1928 		status = ocfs2_search_one_group(ac, handle, bits_wanted,
1929 						min_bits, res, &bits_left);
1930 		if (!status)
1931 			goto set_hint;
1932 		if (status < 0 && status != -ENOSPC) {
1933 			mlog_errno(status);
1934 			goto bail;
1935 		}
1936 	}
1937 
1938 	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1939 
1940 	victim = ocfs2_find_victim_chain(cl);
1941 	ac->ac_chain = victim;
1942 
1943 	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1944 				    res, &bits_left);
1945 	if (!status) {
1946 		if (ocfs2_is_cluster_bitmap(ac->ac_inode))
1947 			hint = res->sr_bg_blkno;
1948 		else
1949 			hint = ocfs2_group_from_res(res);
1950 		goto set_hint;
1951 	}
1952 	if (status < 0 && status != -ENOSPC) {
1953 		mlog_errno(status);
1954 		goto bail;
1955 	}
1956 
1957 	trace_ocfs2_claim_suballoc_bits(victim);
1958 
1959 	/* If we didn't pick a good victim, then just default to
1960 	 * searching each chain in order. Don't allow chain relinking
1961 	 * because we only calculate enough journal credits for one
1962 	 * relink per alloc. */
1963 	ac->ac_disable_chain_relink = 1;
1964 	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1965 		if (i == victim)
1966 			continue;
1967 		if (!cl->cl_recs[i].c_free)
1968 			continue;
1969 
1970 		ac->ac_chain = i;
1971 		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1972 					    res, &bits_left);
1973 		if (!status) {
1974 			hint = ocfs2_group_from_res(res);
1975 			break;
1976 		}
1977 		if (status < 0 && status != -ENOSPC) {
1978 			mlog_errno(status);
1979 			goto bail;
1980 		}
1981 	}
1982 
1983 set_hint:
1984 	if (status != -ENOSPC) {
1985 		/* If the next search of this group is not likely to
1986 		 * yield a suitable extent, then we reset the last
1987 		 * group hint so as to not waste a disk read */
1988 		if (bits_left < min_bits)
1989 			ac->ac_last_group = 0;
1990 		else
1991 			ac->ac_last_group = hint;
1992 	}
1993 
1994 bail:
1995 	if (status)
1996 		mlog_errno(status);
1997 	return status;
1998 }
1999 
2000 int ocfs2_claim_metadata(handle_t *handle,
2001 			 struct ocfs2_alloc_context *ac,
2002 			 u32 bits_wanted,
2003 			 u64 *suballoc_loc,
2004 			 u16 *suballoc_bit_start,
2005 			 unsigned int *num_bits,
2006 			 u64 *blkno_start)
2007 {
2008 	int status;
2009 	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2010 
2011 	BUG_ON(!ac);
2012 	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
2013 	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
2014 
2015 	status = ocfs2_claim_suballoc_bits(ac,
2016 					   handle,
2017 					   bits_wanted,
2018 					   1,
2019 					   &res);
2020 	if (status < 0) {
2021 		mlog_errno(status);
2022 		goto bail;
2023 	}
2024 	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2025 
2026 	*suballoc_loc = res.sr_bg_blkno;
2027 	*suballoc_bit_start = res.sr_bit_offset;
2028 	*blkno_start = res.sr_blkno;
2029 	ac->ac_bits_given += res.sr_bits;
2030 	*num_bits = res.sr_bits;
2031 	status = 0;
2032 bail:
2033 	if (status)
2034 		mlog_errno(status);
2035 	return status;
2036 }
2037 
2038 static void ocfs2_init_inode_ac_group(struct inode *dir,
2039 				      struct buffer_head *parent_di_bh,
2040 				      struct ocfs2_alloc_context *ac)
2041 {
2042 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
2043 	/*
2044 	 * Try to allocate inodes from some specific group.
2045 	 *
2046 	 * If the parent dir has recorded the last group used in allocation,
2047 	 * cool, use it. Otherwise if we try to allocate new inode from the
2048 	 * same slot the parent dir belongs to, use the same chunk.
2049 	 *
2050 	 * We are very careful here to avoid the mistake of setting
2051 	 * ac_last_group to a group descriptor from a different (unlocked) slot.
2052 	 */
2053 	if (OCFS2_I(dir)->ip_last_used_group &&
2054 	    OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2055 		ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2056 	else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2057 		if (di->i_suballoc_loc)
2058 			ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2059 		else
2060 			ac->ac_last_group = ocfs2_which_suballoc_group(
2061 					le64_to_cpu(di->i_blkno),
2062 					le16_to_cpu(di->i_suballoc_bit));
2063 	}
2064 }
2065 
2066 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2067 					     struct ocfs2_alloc_context *ac)
2068 {
2069 	OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2070 	OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2071 }
2072 
2073 int ocfs2_find_new_inode_loc(struct inode *dir,
2074 			     struct buffer_head *parent_fe_bh,
2075 			     struct ocfs2_alloc_context *ac,
2076 			     u64 *fe_blkno)
2077 {
2078 	int ret;
2079 	handle_t *handle = NULL;
2080 	struct ocfs2_suballoc_result *res;
2081 
2082 	BUG_ON(!ac);
2083 	BUG_ON(ac->ac_bits_given != 0);
2084 	BUG_ON(ac->ac_bits_wanted != 1);
2085 	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2086 
2087 	res = kzalloc(sizeof(*res), GFP_NOFS);
2088 	if (res == NULL) {
2089 		ret = -ENOMEM;
2090 		mlog_errno(ret);
2091 		goto out;
2092 	}
2093 
2094 	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2095 
2096 	/*
2097 	 * The handle started here is for chain relink. Alternatively,
2098 	 * we could just disable relink for these calls.
2099 	 */
2100 	handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2101 	if (IS_ERR(handle)) {
2102 		ret = PTR_ERR(handle);
2103 		handle = NULL;
2104 		mlog_errno(ret);
2105 		goto out;
2106 	}
2107 
2108 	/*
2109 	 * This will instruct ocfs2_claim_suballoc_bits and
2110 	 * ocfs2_search_one_group to search but save actual allocation
2111 	 * for later.
2112 	 */
2113 	ac->ac_find_loc_only = 1;
2114 
2115 	ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2116 	if (ret < 0) {
2117 		mlog_errno(ret);
2118 		goto out;
2119 	}
2120 
2121 	ac->ac_find_loc_priv = res;
2122 	*fe_blkno = res->sr_blkno;
2123 	ocfs2_update_inode_fsync_trans(handle, dir, 0);
2124 out:
2125 	if (handle)
2126 		ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2127 
2128 	if (ret)
2129 		kfree(res);
2130 
2131 	return ret;
2132 }
2133 
2134 int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2135 				 struct inode *dir,
2136 				 struct ocfs2_alloc_context *ac,
2137 				 u64 *suballoc_loc,
2138 				 u16 *suballoc_bit,
2139 				 u64 di_blkno)
2140 {
2141 	int ret;
2142 	u16 chain;
2143 	struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2144 	struct buffer_head *bg_bh = NULL;
2145 	struct ocfs2_group_desc *bg;
2146 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2147 
2148 	/*
2149 	 * Since di_blkno is being passed back in, we check for any
2150 	 * inconsistencies which may have happened between
2151 	 * calls. These are code bugs as di_blkno is not expected to
2152 	 * change once returned from ocfs2_find_new_inode_loc()
2153 	 */
2154 	BUG_ON(res->sr_blkno != di_blkno);
2155 
2156 	ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2157 					  res->sr_bg_stable_blkno, &bg_bh);
2158 	if (ret) {
2159 		mlog_errno(ret);
2160 		goto out;
2161 	}
2162 
2163 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2164 	chain = le16_to_cpu(bg->bg_chain);
2165 
2166 	ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2167 					       ac->ac_bh, res->sr_bits,
2168 					       chain);
2169 	if (ret) {
2170 		mlog_errno(ret);
2171 		goto out;
2172 	}
2173 
2174 	ret = ocfs2_block_group_set_bits(handle,
2175 					 ac->ac_inode,
2176 					 bg,
2177 					 bg_bh,
2178 					 res->sr_bit_offset,
2179 					 res->sr_bits);
2180 	if (ret < 0) {
2181 		ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
2182 					       ac->ac_bh, res->sr_bits, chain);
2183 		mlog_errno(ret);
2184 		goto out;
2185 	}
2186 
2187 	trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
2188 					   res->sr_bits);
2189 
2190 	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2191 
2192 	BUG_ON(res->sr_bits != 1);
2193 
2194 	*suballoc_loc = res->sr_bg_blkno;
2195 	*suballoc_bit = res->sr_bit_offset;
2196 	ac->ac_bits_given++;
2197 	ocfs2_save_inode_ac_group(dir, ac);
2198 
2199 out:
2200 	brelse(bg_bh);
2201 
2202 	return ret;
2203 }
2204 
2205 int ocfs2_claim_new_inode(handle_t *handle,
2206 			  struct inode *dir,
2207 			  struct buffer_head *parent_fe_bh,
2208 			  struct ocfs2_alloc_context *ac,
2209 			  u64 *suballoc_loc,
2210 			  u16 *suballoc_bit,
2211 			  u64 *fe_blkno)
2212 {
2213 	int status;
2214 	struct ocfs2_suballoc_result res;
2215 
2216 	BUG_ON(!ac);
2217 	BUG_ON(ac->ac_bits_given != 0);
2218 	BUG_ON(ac->ac_bits_wanted != 1);
2219 	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2220 
2221 	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2222 
2223 	status = ocfs2_claim_suballoc_bits(ac,
2224 					   handle,
2225 					   1,
2226 					   1,
2227 					   &res);
2228 	if (status < 0) {
2229 		mlog_errno(status);
2230 		goto bail;
2231 	}
2232 	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2233 
2234 	BUG_ON(res.sr_bits != 1);
2235 
2236 	*suballoc_loc = res.sr_bg_blkno;
2237 	*suballoc_bit = res.sr_bit_offset;
2238 	*fe_blkno = res.sr_blkno;
2239 	ac->ac_bits_given++;
2240 	ocfs2_save_inode_ac_group(dir, ac);
2241 	status = 0;
2242 bail:
2243 	if (status)
2244 		mlog_errno(status);
2245 	return status;
2246 }
2247 
2248 /* translate a group desc. blkno and it's bitmap offset into
2249  * disk cluster offset. */
2250 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2251 						   u64 bg_blkno,
2252 						   u16 bg_bit_off)
2253 {
2254 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2255 	u32 cluster = 0;
2256 
2257 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2258 
2259 	if (bg_blkno != osb->first_cluster_group_blkno)
2260 		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2261 	cluster += (u32) bg_bit_off;
2262 	return cluster;
2263 }
2264 
2265 /* given a cluster offset, calculate which block group it belongs to
2266  * and return that block offset. */
2267 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2268 {
2269 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2270 	u32 group_no;
2271 
2272 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2273 
2274 	group_no = cluster / osb->bitmap_cpg;
2275 	if (!group_no)
2276 		return osb->first_cluster_group_blkno;
2277 	return ocfs2_clusters_to_blocks(inode->i_sb,
2278 					group_no * osb->bitmap_cpg);
2279 }
2280 
2281 /* given the block number of a cluster start, calculate which cluster
2282  * group and descriptor bitmap offset that corresponds to. */
2283 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
2284 						u64 data_blkno,
2285 						u64 *bg_blkno,
2286 						u16 *bg_bit_off)
2287 {
2288 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2289 	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
2290 
2291 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2292 
2293 	*bg_blkno = ocfs2_which_cluster_group(inode,
2294 					      data_cluster);
2295 
2296 	if (*bg_blkno == osb->first_cluster_group_blkno)
2297 		*bg_bit_off = (u16) data_cluster;
2298 	else
2299 		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
2300 							     data_blkno - *bg_blkno);
2301 }
2302 
2303 /*
2304  * min_bits - minimum contiguous chunk from this total allocation we
2305  * can handle. set to what we asked for originally for a full
2306  * contig. allocation, set to '1' to indicate we can deal with extents
2307  * of any size.
2308  */
2309 int __ocfs2_claim_clusters(handle_t *handle,
2310 			   struct ocfs2_alloc_context *ac,
2311 			   u32 min_clusters,
2312 			   u32 max_clusters,
2313 			   u32 *cluster_start,
2314 			   u32 *num_clusters)
2315 {
2316 	int status;
2317 	unsigned int bits_wanted = max_clusters;
2318 	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2319 	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2320 
2321 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2322 
2323 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2324 	       && ac->ac_which != OCFS2_AC_USE_MAIN);
2325 
2326 	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2327 		WARN_ON(min_clusters > 1);
2328 
2329 		status = ocfs2_claim_local_alloc_bits(osb,
2330 						      handle,
2331 						      ac,
2332 						      bits_wanted,
2333 						      cluster_start,
2334 						      num_clusters);
2335 		if (!status)
2336 			atomic_inc(&osb->alloc_stats.local_data);
2337 	} else {
2338 		if (min_clusters > (osb->bitmap_cpg - 1)) {
2339 			/* The only paths asking for contiguousness
2340 			 * should know about this already. */
2341 			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
2342 			     "group bitmap size %u!\n", min_clusters,
2343 			     osb->bitmap_cpg);
2344 			status = -ENOSPC;
2345 			goto bail;
2346 		}
2347 		/* clamp the current request down to a realistic size. */
2348 		if (bits_wanted > (osb->bitmap_cpg - 1))
2349 			bits_wanted = osb->bitmap_cpg - 1;
2350 
2351 		status = ocfs2_claim_suballoc_bits(ac,
2352 						   handle,
2353 						   bits_wanted,
2354 						   min_clusters,
2355 						   &res);
2356 		if (!status) {
2357 			BUG_ON(res.sr_blkno); /* cluster alloc can't set */
2358 			*cluster_start =
2359 				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
2360 								 res.sr_bg_blkno,
2361 								 res.sr_bit_offset);
2362 			atomic_inc(&osb->alloc_stats.bitmap_data);
2363 			*num_clusters = res.sr_bits;
2364 		}
2365 	}
2366 	if (status < 0) {
2367 		if (status != -ENOSPC)
2368 			mlog_errno(status);
2369 		goto bail;
2370 	}
2371 
2372 	ac->ac_bits_given += *num_clusters;
2373 
2374 bail:
2375 	if (status)
2376 		mlog_errno(status);
2377 	return status;
2378 }
2379 
2380 int ocfs2_claim_clusters(handle_t *handle,
2381 			 struct ocfs2_alloc_context *ac,
2382 			 u32 min_clusters,
2383 			 u32 *cluster_start,
2384 			 u32 *num_clusters)
2385 {
2386 	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
2387 
2388 	return __ocfs2_claim_clusters(handle, ac, min_clusters,
2389 				      bits_wanted, cluster_start, num_clusters);
2390 }
2391 
2392 static int ocfs2_block_group_clear_bits(handle_t *handle,
2393 					struct inode *alloc_inode,
2394 					struct ocfs2_group_desc *bg,
2395 					struct buffer_head *group_bh,
2396 					unsigned int bit_off,
2397 					unsigned int num_bits,
2398 					void (*undo_fn)(unsigned int bit,
2399 							unsigned long *bmap))
2400 {
2401 	int status;
2402 	unsigned int tmp;
2403 	struct ocfs2_group_desc *undo_bg = NULL;
2404 
2405 	/* The caller got this descriptor from
2406 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
2407 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
2408 
2409 	trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
2410 
2411 	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2412 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2413 					 group_bh,
2414 					 undo_fn ?
2415 					 OCFS2_JOURNAL_ACCESS_UNDO :
2416 					 OCFS2_JOURNAL_ACCESS_WRITE);
2417 	if (status < 0) {
2418 		mlog_errno(status);
2419 		goto bail;
2420 	}
2421 
2422 	if (undo_fn) {
2423 		jbd_lock_bh_state(group_bh);
2424 		undo_bg = (struct ocfs2_group_desc *)
2425 					bh2jh(group_bh)->b_committed_data;
2426 		BUG_ON(!undo_bg);
2427 	}
2428 
2429 	tmp = num_bits;
2430 	while(tmp--) {
2431 		ocfs2_clear_bit((bit_off + tmp),
2432 				(unsigned long *) bg->bg_bitmap);
2433 		if (undo_fn)
2434 			undo_fn(bit_off + tmp,
2435 				(unsigned long *) undo_bg->bg_bitmap);
2436 	}
2437 	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2438 	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2439 		if (undo_fn)
2440 			jbd_unlock_bh_state(group_bh);
2441 		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
2442 				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
2443 				   le16_to_cpu(bg->bg_bits),
2444 				   le16_to_cpu(bg->bg_free_bits_count),
2445 				   num_bits);
2446 	}
2447 
2448 	if (undo_fn)
2449 		jbd_unlock_bh_state(group_bh);
2450 
2451 	ocfs2_journal_dirty(handle, group_bh);
2452 bail:
2453 	return status;
2454 }
2455 
2456 /*
2457  * expects the suballoc inode to already be locked.
2458  */
2459 static int _ocfs2_free_suballoc_bits(handle_t *handle,
2460 				     struct inode *alloc_inode,
2461 				     struct buffer_head *alloc_bh,
2462 				     unsigned int start_bit,
2463 				     u64 bg_blkno,
2464 				     unsigned int count,
2465 				     void (*undo_fn)(unsigned int bit,
2466 						     unsigned long *bitmap))
2467 {
2468 	int status = 0;
2469 	u32 tmp_used;
2470 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2471 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2472 	struct buffer_head *group_bh = NULL;
2473 	struct ocfs2_group_desc *group;
2474 
2475 	/* The alloc_bh comes from ocfs2_free_dinode() or
2476 	 * ocfs2_free_clusters().  The callers have all locked the
2477 	 * allocator and gotten alloc_bh from the lock call.  This
2478 	 * validates the dinode buffer.  Any corruption that has happened
2479 	 * is a code bug. */
2480 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2481 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2482 
2483 	trace_ocfs2_free_suballoc_bits(
2484 		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
2485 		(unsigned long long)bg_blkno,
2486 		start_bit, count);
2487 
2488 	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2489 					     &group_bh);
2490 	if (status < 0) {
2491 		mlog_errno(status);
2492 		goto bail;
2493 	}
2494 	group = (struct ocfs2_group_desc *) group_bh->b_data;
2495 
2496 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2497 
2498 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2499 					      group, group_bh,
2500 					      start_bit, count, undo_fn);
2501 	if (status < 0) {
2502 		mlog_errno(status);
2503 		goto bail;
2504 	}
2505 
2506 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2507 					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2508 	if (status < 0) {
2509 		mlog_errno(status);
2510 		ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
2511 				start_bit, count);
2512 		goto bail;
2513 	}
2514 
2515 	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2516 		     count);
2517 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2518 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2519 	ocfs2_journal_dirty(handle, alloc_bh);
2520 
2521 bail:
2522 	brelse(group_bh);
2523 
2524 	if (status)
2525 		mlog_errno(status);
2526 	return status;
2527 }
2528 
2529 int ocfs2_free_suballoc_bits(handle_t *handle,
2530 			     struct inode *alloc_inode,
2531 			     struct buffer_head *alloc_bh,
2532 			     unsigned int start_bit,
2533 			     u64 bg_blkno,
2534 			     unsigned int count)
2535 {
2536 	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2537 					 start_bit, bg_blkno, count, NULL);
2538 }
2539 
2540 int ocfs2_free_dinode(handle_t *handle,
2541 		      struct inode *inode_alloc_inode,
2542 		      struct buffer_head *inode_alloc_bh,
2543 		      struct ocfs2_dinode *di)
2544 {
2545 	u64 blk = le64_to_cpu(di->i_blkno);
2546 	u16 bit = le16_to_cpu(di->i_suballoc_bit);
2547 	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2548 
2549 	if (di->i_suballoc_loc)
2550 		bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2551 	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2552 					inode_alloc_bh, bit, bg_blkno, 1);
2553 }
2554 
2555 static int _ocfs2_free_clusters(handle_t *handle,
2556 				struct inode *bitmap_inode,
2557 				struct buffer_head *bitmap_bh,
2558 				u64 start_blk,
2559 				unsigned int num_clusters,
2560 				void (*undo_fn)(unsigned int bit,
2561 						unsigned long *bitmap))
2562 {
2563 	int status;
2564 	u16 bg_start_bit;
2565 	u64 bg_blkno;
2566 
2567 	/* You can't ever have a contiguous set of clusters
2568 	 * bigger than a block group bitmap so we never have to worry
2569 	 * about looping on them.
2570 	 * This is expensive. We can safely remove once this stuff has
2571 	 * gotten tested really well. */
2572 	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
2573 				ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
2574 							 start_blk)));
2575 
2576 
2577 	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2578 				     &bg_start_bit);
2579 
2580 	trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
2581 			(unsigned long long)start_blk,
2582 			bg_start_bit, num_clusters);
2583 
2584 	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2585 					   bg_start_bit, bg_blkno,
2586 					   num_clusters, undo_fn);
2587 	if (status < 0) {
2588 		mlog_errno(status);
2589 		goto out;
2590 	}
2591 
2592 	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2593 					 num_clusters);
2594 
2595 out:
2596 	if (status)
2597 		mlog_errno(status);
2598 	return status;
2599 }
2600 
2601 int ocfs2_free_clusters(handle_t *handle,
2602 			struct inode *bitmap_inode,
2603 			struct buffer_head *bitmap_bh,
2604 			u64 start_blk,
2605 			unsigned int num_clusters)
2606 {
2607 	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2608 				    start_blk, num_clusters,
2609 				    _ocfs2_set_bit);
2610 }
2611 
2612 /*
2613  * Give never-used clusters back to the global bitmap.  We don't need
2614  * to protect these bits in the undo buffer.
2615  */
2616 int ocfs2_release_clusters(handle_t *handle,
2617 			   struct inode *bitmap_inode,
2618 			   struct buffer_head *bitmap_bh,
2619 			   u64 start_blk,
2620 			   unsigned int num_clusters)
2621 {
2622 	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2623 				    start_blk, num_clusters,
2624 				    _ocfs2_clear_bit);
2625 }
2626 
2627 /*
2628  * For a given allocation, determine which allocators will need to be
2629  * accessed, and lock them, reserving the appropriate number of bits.
2630  *
2631  * Sparse file systems call this from ocfs2_write_begin_nolock()
2632  * and ocfs2_allocate_unwritten_extents().
2633  *
2634  * File systems which don't support holes call this from
2635  * ocfs2_extend_allocation().
2636  */
2637 int ocfs2_lock_allocators(struct inode *inode,
2638 			  struct ocfs2_extent_tree *et,
2639 			  u32 clusters_to_add, u32 extents_to_split,
2640 			  struct ocfs2_alloc_context **data_ac,
2641 			  struct ocfs2_alloc_context **meta_ac)
2642 {
2643 	int ret = 0, num_free_extents;
2644 	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2645 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2646 
2647 	*meta_ac = NULL;
2648 	if (data_ac)
2649 		*data_ac = NULL;
2650 
2651 	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2652 
2653 	num_free_extents = ocfs2_num_free_extents(et);
2654 	if (num_free_extents < 0) {
2655 		ret = num_free_extents;
2656 		mlog_errno(ret);
2657 		goto out;
2658 	}
2659 
2660 	/*
2661 	 * Sparse allocation file systems need to be more conservative
2662 	 * with reserving room for expansion - the actual allocation
2663 	 * happens while we've got a journal handle open so re-taking
2664 	 * a cluster lock (because we ran out of room for another
2665 	 * extent) will violate ordering rules.
2666 	 *
2667 	 * Most of the time we'll only be seeing this 1 cluster at a time
2668 	 * anyway.
2669 	 *
2670 	 * Always lock for any unwritten extents - we might want to
2671 	 * add blocks during a split.
2672 	 */
2673 	if (!num_free_extents ||
2674 	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2675 		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2676 		if (ret < 0) {
2677 			if (ret != -ENOSPC)
2678 				mlog_errno(ret);
2679 			goto out;
2680 		}
2681 	}
2682 
2683 	if (clusters_to_add == 0)
2684 		goto out;
2685 
2686 	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2687 	if (ret < 0) {
2688 		if (ret != -ENOSPC)
2689 			mlog_errno(ret);
2690 		goto out;
2691 	}
2692 
2693 out:
2694 	if (ret) {
2695 		if (*meta_ac) {
2696 			ocfs2_free_alloc_context(*meta_ac);
2697 			*meta_ac = NULL;
2698 		}
2699 
2700 		/*
2701 		 * We cannot have an error and a non null *data_ac.
2702 		 */
2703 	}
2704 
2705 	return ret;
2706 }
2707 
2708 /*
2709  * Read the inode specified by blkno to get suballoc_slot and
2710  * suballoc_bit.
2711  */
2712 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2713 				       u16 *suballoc_slot, u64 *group_blkno,
2714 				       u16 *suballoc_bit)
2715 {
2716 	int status;
2717 	struct buffer_head *inode_bh = NULL;
2718 	struct ocfs2_dinode *inode_fe;
2719 
2720 	trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
2721 
2722 	/* dirty read disk */
2723 	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2724 	if (status < 0) {
2725 		mlog(ML_ERROR, "read block %llu failed %d\n",
2726 		     (unsigned long long)blkno, status);
2727 		goto bail;
2728 	}
2729 
2730 	inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2731 	if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2732 		mlog(ML_ERROR, "invalid inode %llu requested\n",
2733 		     (unsigned long long)blkno);
2734 		status = -EINVAL;
2735 		goto bail;
2736 	}
2737 
2738 	if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2739 	    (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2740 		mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2741 		     (unsigned long long)blkno,
2742 		     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2743 		status = -EINVAL;
2744 		goto bail;
2745 	}
2746 
2747 	if (suballoc_slot)
2748 		*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2749 	if (suballoc_bit)
2750 		*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2751 	if (group_blkno)
2752 		*group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2753 
2754 bail:
2755 	brelse(inode_bh);
2756 
2757 	if (status)
2758 		mlog_errno(status);
2759 	return status;
2760 }
2761 
2762 /*
2763  * test whether bit is SET in allocator bitmap or not.  on success, 0
2764  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2765  * is returned and *res is meaningless.  Call this after you have
2766  * cluster locked against suballoc, or you may get a result based on
2767  * non-up2date contents
2768  */
2769 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2770 				   struct inode *suballoc,
2771 				   struct buffer_head *alloc_bh,
2772 				   u64 group_blkno, u64 blkno,
2773 				   u16 bit, int *res)
2774 {
2775 	struct ocfs2_dinode *alloc_di;
2776 	struct ocfs2_group_desc *group;
2777 	struct buffer_head *group_bh = NULL;
2778 	u64 bg_blkno;
2779 	int status;
2780 
2781 	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
2782 				      (unsigned int)bit);
2783 
2784 	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2785 	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2786 		mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2787 		     (unsigned int)bit,
2788 		     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2789 		status = -EINVAL;
2790 		goto bail;
2791 	}
2792 
2793 	bg_blkno = group_blkno ? group_blkno :
2794 		   ocfs2_which_suballoc_group(blkno, bit);
2795 	status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2796 					     &group_bh);
2797 	if (status < 0) {
2798 		mlog(ML_ERROR, "read group %llu failed %d\n",
2799 		     (unsigned long long)bg_blkno, status);
2800 		goto bail;
2801 	}
2802 
2803 	group = (struct ocfs2_group_desc *) group_bh->b_data;
2804 	*res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2805 
2806 bail:
2807 	brelse(group_bh);
2808 
2809 	if (status)
2810 		mlog_errno(status);
2811 	return status;
2812 }
2813 
2814 /*
2815  * Test if the bit representing this inode (blkno) is set in the
2816  * suballocator.
2817  *
2818  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2819  *
2820  * In the event of failure, a negative value is returned and *res is
2821  * meaningless.
2822  *
2823  * Callers must make sure to hold nfs_sync_lock to prevent
2824  * ocfs2_delete_inode() on another node from accessing the same
2825  * suballocator concurrently.
2826  */
2827 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2828 {
2829 	int status;
2830 	u64 group_blkno = 0;
2831 	u16 suballoc_bit = 0, suballoc_slot = 0;
2832 	struct inode *inode_alloc_inode;
2833 	struct buffer_head *alloc_bh = NULL;
2834 
2835 	trace_ocfs2_test_inode_bit((unsigned long long)blkno);
2836 
2837 	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2838 					     &group_blkno, &suballoc_bit);
2839 	if (status < 0) {
2840 		mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2841 		goto bail;
2842 	}
2843 
2844 	inode_alloc_inode =
2845 		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2846 					    suballoc_slot);
2847 	if (!inode_alloc_inode) {
2848 		/* the error code could be inaccurate, but we are not able to
2849 		 * get the correct one. */
2850 		status = -EINVAL;
2851 		mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2852 		     (u32)suballoc_slot);
2853 		goto bail;
2854 	}
2855 
2856 	inode_lock(inode_alloc_inode);
2857 	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2858 	if (status < 0) {
2859 		inode_unlock(inode_alloc_inode);
2860 		iput(inode_alloc_inode);
2861 		mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2862 		     (u32)suballoc_slot, status);
2863 		goto bail;
2864 	}
2865 
2866 	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2867 					 group_blkno, blkno, suballoc_bit, res);
2868 	if (status < 0)
2869 		mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2870 
2871 	ocfs2_inode_unlock(inode_alloc_inode, 0);
2872 	inode_unlock(inode_alloc_inode);
2873 
2874 	iput(inode_alloc_inode);
2875 	brelse(alloc_bh);
2876 bail:
2877 	if (status)
2878 		mlog_errno(status);
2879 	return status;
2880 }
2881