xref: /linux/fs/ocfs2/suballoc.c (revision fd4d53bde9128bc85d85cc3427bd592e4c3293e4)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * suballoc.c
4  *
5  * metadata alloc and free
6  * Inspired by ext3 block groups.
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  */
10 
11 #include <linux/fs.h>
12 #include <linux/types.h>
13 #include <linux/slab.h>
14 #include <linux/string.h>
15 #include <linux/highmem.h>
16 
17 #include <cluster/masklog.h>
18 
19 #include "ocfs2.h"
20 
21 #include "alloc.h"
22 #include "blockcheck.h"
23 #include "dlmglue.h"
24 #include "inode.h"
25 #include "journal.h"
26 #include "localalloc.h"
27 #include "suballoc.h"
28 #include "super.h"
29 #include "sysfile.h"
30 #include "uptodate.h"
31 #include "ocfs2_trace.h"
32 
33 #include "buffer_head_io.h"
34 
/*
 * Flags passed down to ocfs2_reserve_suballoc_bits() /
 * ocfs2_block_group_alloc() controlling whether the suballocator may
 * be grown when it runs out of free bits.
 */
#define NOT_ALLOC_NEW_GROUP		0	/* -ENOSPC instead of growing */
#define ALLOC_NEW_GROUP			0x1	/* may add a new block group */
#define ALLOC_GROUPS_FROM_GLOBAL	0x2	/* forwarded to cluster reservation; see ocfs2_reserve_clusters_with_limit() */

/* Cap on inodes/metadata stolen from other slots before retrying our own */
#define OCFS2_MAX_TO_STEAL		1024
40 
/*
 * Result of a suballocator search, filled in by the ac_group_search
 * callbacks and consumed by the ocfs2_claim_*() paths.
 */
struct ocfs2_suballoc_result {
	u64		sr_bg_blkno;	/* The bg we allocated from.  Set
					   to 0 when a block group is
					   contiguous. */
	u64		sr_bg_stable_blkno; /*
					     * Doesn't change, always
					     * set to target block
					     * group descriptor
					     * block.
					     */
	u64		sr_blkno;	/* The first allocated block */
	unsigned int	sr_bit_offset;	/* The bit in the bg */
	unsigned int	sr_bits;	/* How many bits we claimed */
	unsigned int	sr_max_contig_bits; /* The length for contiguous
					     * free bits, only available
					     * for cluster group
					     */
};
59 
60 static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
61 {
62 	if (res->sr_blkno == 0)
63 		return 0;
64 
65 	if (res->sr_bg_blkno)
66 		return res->sr_bg_blkno;
67 
68 	return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
69 }
70 
/* Forward declarations for the static helpers defined later in this file. */
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags);

static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res);
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u64 max_block,
				    struct ocfs2_suballoc_result *res);
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr);
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain);
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted);
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
						   u64 bg_blkno,
						   u16 bg_bit_off);
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off);
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac);
122 
/*
 * Release everything an allocation context holds: the allocator inode
 * (cluster lock, inode mutex and reference), the cached allocator
 * dinode buffer, and any find_loc search state.  The context struct
 * itself is not freed; see ocfs2_free_alloc_context().
 */
void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
{
	struct inode *inode = ac->ac_inode;

	if (inode) {
		/* local alloc contexts do not hold the cluster lock */
		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
			ocfs2_inode_unlock(inode, 1);

		inode_unlock(inode);

		iput(inode);
		ac->ac_inode = NULL;
	}
	brelse(ac->ac_bh);
	ac->ac_bh = NULL;
	ac->ac_resv = NULL;
	kfree(ac->ac_find_loc_priv);
	ac->ac_find_loc_priv = NULL;
}
142 
/* Drop all resources held by @ac, then free the context itself. */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	ocfs2_free_ac_resource(ac);
	kfree(ac);
}
148 
149 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
150 {
151 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
152 }
153 
/*
 * Report a corrupt group descriptor.  Relies on 'sb' and 'resize'
 * being in the caller's scope: during resize (resize != 0) the problem
 * is only logged, otherwise the macro makes the *enclosing function*
 * return ocfs2_error().  Note the hidden 'return' — callers fall
 * through only in the resize case.
 */
#define do_error(fmt, ...)						\
do {									\
	if (resize)							\
		mlog(ML_ERROR, fmt, ##__VA_ARGS__);			\
	else								\
		return ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
} while (0)
161 
/*
 * Sanity-check a group descriptor against itself (no parent inode
 * needed): signature, self-referencing block number, fs generation,
 * and internal bit-count consistency.
 *
 * When @resize is set, do_error() only logs and execution continues,
 * so this always returns 0; otherwise the first failed check returns
 * ocfs2_error() from inside do_error().
 */
static int ocfs2_validate_gd_self(struct super_block *sb,
				  struct buffer_head *bh,
				  int resize)
{
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		do_error("Group descriptor #%llu has bad signature %.*s\n",
			 (unsigned long long)bh->b_blocknr, 7,
			 gd->bg_signature);
	}

	/* The descriptor must point back at the block it lives in */
	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
		do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
	}

	/* Stale generation means a leftover from a previous mkfs */
	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
		do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
			 (unsigned long long)bh->b_blocknr,
			 le32_to_cpu(gd->bg_generation));
	}

	/* Free bits can never exceed total bits */
	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
		do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 le16_to_cpu(gd->bg_free_bits_count));
	}

	/* Total bits must fit inside the on-disk bitmap (bg_size bytes) */
	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
		do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 8 * le16_to_cpu(gd->bg_size));
	}

	return 0;
}
202 
/*
 * Check a group descriptor against its owning chain allocator @di:
 * parent pointer, per-group bit-count limit, and chain index.
 * Same do_error() semantics as ocfs2_validate_gd_self() — with
 * @resize set, failures are logged and 0 is returned.
 */
static int ocfs2_validate_gd_parent(struct super_block *sb,
				    struct ocfs2_dinode *di,
				    struct buffer_head *bh,
				    int resize)
{
	unsigned int max_bits;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	/* both sides are __le64, so a raw compare is fine */
	if (di->i_blkno != gd->bg_parent_dinode) {
		do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
			 (unsigned long long)le64_to_cpu(di->i_blkno));
	}

	/* A group can hold at most cl_cpg * cl_bpc bits */
	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
	if (le16_to_cpu(gd->bg_bits) > max_bits) {
		do_error("Group descriptor #%llu has bit count of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits));
	}

	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
	if ((le16_to_cpu(gd->bg_chain) >
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
	    ((le16_to_cpu(gd->bg_chain) ==
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
		do_error("Group descriptor #%llu has bad chain %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_chain));
	}

	return 0;
}
237 
238 #undef do_error
239 
240 /*
241  * This version only prints errors.  It does not fail the filesystem, and
242  * exists only for resize.
243  */
244 int ocfs2_check_group_descriptor(struct super_block *sb,
245 				 struct ocfs2_dinode *di,
246 				 struct buffer_head *bh)
247 {
248 	int rc;
249 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
250 
251 	BUG_ON(!buffer_uptodate(bh));
252 
253 	/*
254 	 * If the ecc fails, we return the error but otherwise
255 	 * leave the filesystem running.  We know any error is
256 	 * local to this block.
257 	 */
258 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
259 	if (rc) {
260 		mlog(ML_ERROR,
261 		     "Checksum failed for group descriptor %llu\n",
262 		     (unsigned long long)bh->b_blocknr);
263 	} else
264 		rc = ocfs2_validate_gd_self(sb, bh, 1);
265 	if (!rc)
266 		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
267 
268 	return rc;
269 }
270 
271 static int ocfs2_validate_group_descriptor(struct super_block *sb,
272 					   struct buffer_head *bh)
273 {
274 	int rc;
275 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
276 
277 	trace_ocfs2_validate_group_descriptor(
278 					(unsigned long long)bh->b_blocknr);
279 
280 	BUG_ON(!buffer_uptodate(bh));
281 
282 	/*
283 	 * If the ecc fails, we return the error but otherwise
284 	 * leave the filesystem running.  We know any error is
285 	 * local to this block.
286 	 */
287 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
288 	if (rc)
289 		return rc;
290 
291 	/*
292 	 * Errors after here are fatal.
293 	 */
294 
295 	return ocfs2_validate_gd_self(sb, bh, 0);
296 }
297 
298 /*
299  * The hint group descriptor (gd) may already have been released
300  * in _ocfs2_free_suballoc_bits(). We first check the gd signature,
301  * then perform the standard ocfs2_read_group_descriptor() jobs.
302  *
303  * If the gd signature is invalid, we return 'rc=0' and set
304  * '*released=1'. The caller is expected to handle this specific case.
305  * Otherwise, we return the actual error code.
306  *
307  * We treat gd signature corruption case as a release case. The
308  * caller ocfs2_claim_suballoc_bits() will use ocfs2_search_chain()
309  * to search each gd block. The code will eventually find this
310  * corrupted gd block - Late, but not missed.
311  *
312  * Note:
313  * The caller is responsible for initializing the '*released' status.
314  */
static int ocfs2_read_hint_group_descriptor(struct inode *inode,
			struct ocfs2_dinode *di, u64 gd_blkno,
			struct buffer_head *bh, int *released)
{
	int rc;
	struct buffer_head *tmp = *bh;
	struct ocfs2_group_desc *gd;

	/* no validator here — we check the signature by hand below */
	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, NULL);
	if (rc)
		goto out;

	gd = (struct ocfs2_group_desc *) tmp->b_data;
	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		/*
		 * Invalid gd cache was set in ocfs2_read_block(),
		 * which will affect block_group allocation.
		 * Path:
		 * ocfs2_reserve_suballoc_bits
		 *  ocfs2_block_group_alloc
		 *   ocfs2_block_group_alloc_contig
		 *    ocfs2_set_new_buffer_uptodate
		 */
		ocfs2_remove_from_cache(INODE_CACHE(inode), tmp);
		*released = 1; /* we return 'rc=0' for this case */
		goto free_bh;
	}

	/* below jobs same with ocfs2_read_group_descriptor() */
	if (!buffer_jbd(tmp)) {
		rc = ocfs2_validate_group_descriptor(inode->i_sb, tmp);
		if (rc)
			goto free_bh;
	}

	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
	if (rc)
		goto free_bh;

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!*bh)
		*bh = tmp;

	return rc;

free_bh:
	brelse(tmp);
out:
	return rc;
}
365 
366 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
367 				u64 gd_blkno, struct buffer_head **bh)
368 {
369 	int rc;
370 	struct buffer_head *tmp = *bh;
371 
372 	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
373 			      ocfs2_validate_group_descriptor);
374 	if (rc)
375 		goto out;
376 
377 	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
378 	if (rc) {
379 		brelse(tmp);
380 		goto out;
381 	}
382 
383 	/* If ocfs2_read_block() got us a new bh, pass it up. */
384 	if (!*bh)
385 		*bh = tmp;
386 
387 out:
388 	return rc;
389 }
390 
/*
 * Append one cluster extent (@p_blkno, @clusters long) to a
 * discontiguous block group's embedded extent list and grow the
 * descriptor's bit accounting to match.  Only legal on filesystems
 * with the discontig-bg feature.
 */
static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_chain_list *cl,
					  u64 p_blkno, unsigned int clusters)
{
	struct ocfs2_extent_list *el = &bg->bg_list;
	struct ocfs2_extent_rec *rec;

	BUG_ON(!ocfs2_supports_discontig_bg(osb));
	/* first extent: lazily size the embedded list */
	if (!el->l_next_free_rec)
		el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
	rec->e_blkno = cpu_to_le64(p_blkno);
	/* cpos within the group == clusters accumulated so far */
	rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
				  le16_to_cpu(cl->cl_bpc));
	rec->e_leaf_clusters = cpu_to_le16(clusters);
	/* the new clusters contribute clusters * bpc bits, all free */
	le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&bg->bg_free_bits_count,
		     clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&el->l_next_free_rec, 1);
}
412 
/*
 * Initialize a freshly-allocated group descriptor block at
 * @group_blkno under @handle and link it at the head of chain
 * @my_chain.  @group_clusters smaller than cl_cpg indicates a
 * discontiguous group, whose first extent is recorded via
 * ocfs2_bg_discontig_add_extent().  The caller is responsible for
 * updating the chain list in the allocator dinode.
 */
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl)
{
	int status = 0;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct super_block * sb = alloc_inode->i_sb;

	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "group block (%llu) != b_blocknr (%llu)\n",
				     (unsigned long long)group_blkno,
				     (unsigned long long) bg_bh->b_blocknr);
		goto bail;
	}

	/* brand-new block, so journal access is CREATE */
	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	memset(bg, 0, sb->s_blocksize);
	strscpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
	bg->bg_generation = cpu_to_le32(osb->fs_generation);
	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
						osb->s_feature_incompat));
	bg->bg_chain = cpu_to_le16(my_chain);
	/* new group goes at the head: old chain head becomes our next */
	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
	bg->bg_blkno = cpu_to_le64(group_blkno);
	if (group_clusters == le16_to_cpu(cl->cl_cpg))
		bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
	else
		/* discontig group: bits are accumulated per extent */
		ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
					      group_clusters);

	/* set the 1st bit in the bitmap to account for the descriptor block */
	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);

	ocfs2_journal_dirty(handle, bg_bh);

	/* There is no need to zero out or otherwise initialize the
	 * other blocks in a group - All valid FS metadata in a block
	 * group stores the superblock fs_generation value at
	 * allocation time. */

bail:
	if (status)
		mlog_errno(status);
	return status;
}
474 
475 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
476 {
477 	u16 curr, best;
478 
479 	best = curr = 0;
480 	while (curr < le16_to_cpu(cl->cl_count)) {
481 		if (le32_to_cpu(cl->cl_recs[best].c_total) >
482 		    le32_to_cpu(cl->cl_recs[curr].c_total))
483 			best = curr;
484 		curr++;
485 	}
486 	return best;
487 }
488 
/*
 * Allocate one fully contiguous block group (cl_cpg clusters) for
 * @alloc_inode and initialize its descriptor.  Returns the
 * descriptor's buffer_head (referenced) or an ERR_PTR; -ENOSPC lets
 * the caller fall back to a discontiguous group.
 */
static struct buffer_head *
ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_alloc_context *ac,
			       struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	struct buffer_head *bg_bh;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);

	/* one whole group's worth of contiguous clusters, or nothing */
	status = ocfs2_claim_clusters(handle, ac,
				      le16_to_cpu(cl->cl_cpg), &bit_off,
				      &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_contig(
	     (unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		brelse(bg_bh);
		mlog_errno(status);
	}

bail:
	return status ? ERR_PTR(status) : bg_bh;
}
533 
534 static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
535 					handle_t *handle,
536 					struct ocfs2_alloc_context *ac,
537 					unsigned int min_bits,
538 					u32 *bit_off, u32 *num_bits)
539 {
540 	int status = 0;
541 
542 	while (min_bits) {
543 		status = ocfs2_claim_clusters(handle, ac, min_bits,
544 					      bit_off, num_bits);
545 		if (status != -ENOSPC)
546 			break;
547 
548 		min_bits >>= 1;
549 	}
550 
551 	return status;
552 }
553 
/*
 * Keep adding cluster extents to a partially-filled discontiguous
 * group until it reaches cl_cpg clusters or its embedded extent list
 * is full.  @min_bits caps each claim and shrinks to whatever was
 * actually obtained, biasing later claims smaller.  Returns -ENOSPC
 * if the extent records run out before the group is full.
 */
static int ocfs2_block_group_grow_discontig(handle_t *handle,
					    struct inode *alloc_inode,
					    struct buffer_head *bg_bh,
					    struct ocfs2_alloc_context *ac,
					    struct ocfs2_chain_list *cl,
					    unsigned int min_bits)
{
	int status;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg =
		(struct ocfs2_group_desc *)bg_bh->b_data;
	/* clusters still missing from a full group */
	unsigned int needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	u32 p_cpos, clusters;
	u64 p_blkno;
	struct ocfs2_extent_list *el = &bg->bg_list;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
				le16_to_cpu(el->l_count))) {
		/* never claim more than we still need */
		if (min_bits > needed)
			min_bits = needed;
		status = ocfs2_block_group_claim_bits(osb, handle, ac,
						      min_bits, &p_cpos,
						      &clusters);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
		ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
					      clusters);

		min_bits = clusters;
		needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	}

	if (needed > 0) {
		/*
		 * We have used up all the extent rec but can't fill up
		 * the cpg. So bail out.
		 */
		status = -ENOSPC;
		goto bail;
	}

	ocfs2_journal_dirty(handle, bg_bh);

bail:
	return status;
}
615 
/*
 * Undo a failed discontiguous group allocation: give every extent
 * recorded in the group back to the cluster allocator, evict the
 * half-built descriptor from the metadata cache and drop the bh.
 * Best-effort — individual free failures are logged but the loop
 * keeps going.  Safe to call with a NULL @bg_bh.
 */
static void ocfs2_bg_alloc_cleanup(handle_t *handle,
				   struct ocfs2_alloc_context *cluster_ac,
				   struct inode *alloc_inode,
				   struct buffer_head *bg_bh)
{
	int i, ret;
	struct ocfs2_group_desc *bg;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	if (!bg_bh)
		return;

	bg = (struct ocfs2_group_desc *)bg_bh->b_data;
	el = &bg->bg_list;
	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
		rec = &el->l_recs[i];
		ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
					  cluster_ac->ac_bh,
					  le64_to_cpu(rec->e_blkno),
					  le16_to_cpu(rec->e_leaf_clusters));
		if (ret)
			mlog_errno(ret);
		/* Try all the clusters to free */
	}

	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
	brelse(bg_bh);
}
645 
/*
 * Build a block group out of several smaller cluster extents when a
 * contiguous cl_cpg-cluster run is unavailable.  Claims a first
 * region (starting at half a group and halving on -ENOSPC), fills in
 * the descriptor, then grows it extent by extent.  On any failure the
 * partial group is torn down via ocfs2_bg_alloc_cleanup().  Returns
 * the descriptor bh or an ERR_PTR.
 */
static struct buffer_head *
ocfs2_block_group_alloc_discontig(handle_t *handle,
				  struct inode *alloc_inode,
				  struct ocfs2_alloc_context *ac,
				  struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
	struct buffer_head *bg_bh = NULL;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	if (!ocfs2_supports_discontig_bg(osb)) {
		status = -ENOSPC;
		goto bail;
	}

	/* multiple extents mean more journaled blocks than the caller planned */
	status = ocfs2_extend_trans(handle,
				    ocfs2_calc_bg_discontig_credits(osb->sb));
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * We're going to be grabbing from multiple cluster groups.
	 * We don't have enough credits to relink them all, and the
	 * cluster groups will be staying in cache for the duration of
	 * this operation.
	 */
	ac->ac_disable_chain_relink = 1;

	/* Claim the first region */
	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
					      &bit_off, &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	min_bits = num_bits;

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_discontig(
				(unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
						  bg_bh, ac, cl, min_bits);
	if (status)
		mlog_errno(status);

bail:
	if (status)
		ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
	return status ? ERR_PTR(status) : bg_bh;
}
720 
/*
 * Grow a (non-cluster-bitmap) chain allocator by one block group.
 * Reserves cl_cpg clusters from the global bitmap, builds the group
 * (contiguous first, discontiguous as a fallback), links it into the
 * dinode's chain list and updates the allocator inode's size and
 * cluster count — all within a single transaction.
 *
 * We expect the block group allocator to already be locked.
 * @last_alloc_group, when non-NULL, carries a goal group in and the
 * actual group used out, so the caller can cache it.
 */
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags)
{
	int status, credits;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
	struct ocfs2_chain_list *cl;
	struct ocfs2_alloc_context *ac = NULL;
	handle_t *handle = NULL;
	u16 alloc_rec;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;

	/* the cluster bitmap never grows this way */
	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));

	cl = &fe->id2.i_chain;
	status = ocfs2_reserve_clusters_with_limit(osb,
						   le16_to_cpu(cl->cl_cpg),
						   max_block, flags, &ac);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	credits = ocfs2_calc_group_alloc_credits(osb->sb,
						 le16_to_cpu(cl->cl_cpg));
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* seed the search with the caller's cached goal group, if any */
	if (last_alloc_group && *last_alloc_group != 0) {
		trace_ocfs2_block_group_alloc(
				(unsigned long long)*last_alloc_group);
		ac->ac_last_group = *last_alloc_group;
	}

	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
					       ac, cl);
	if (PTR_ERR(bg_bh) == -ENOSPC) {
		/* no contiguous run available — stitch one from pieces */
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		bg_bh = ocfs2_block_group_alloc_discontig(handle,
							  alloc_inode,
							  ac, cl);
	}
	if (IS_ERR(bg_bh)) {
		status = PTR_ERR(bg_bh);
		bg_bh = NULL;
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* account the new group in its chain record and the dinode totals */
	alloc_rec = le16_to_cpu(bg->bg_chain);
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
		     le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
		     le16_to_cpu(bg->bg_bits));
	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
		le16_add_cpu(&cl->cl_next_free_rec, 1);

	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
					le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));

	ocfs2_journal_dirty(handle, bh);

	/* keep the in-memory inode in sync with the on-disk size */
	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
					     le32_to_cpu(fe->i_clusters)));
	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);

	status = 0;

	/* save the new last alloc group so that the caller can cache it. */
	if (last_alloc_group)
		*last_alloc_group = ac->ac_last_group;

bail:
	if (handle)
		ocfs2_commit_trans(osb, handle);

	if (ac)
		ocfs2_free_alloc_context(ac);

	brelse(bg_bh);

	if (status)
		mlog_errno(status);
	return status;
}
836 
/*
 * Take the allocator system inode of @type for @slot, lock it, and
 * make sure it has at least ac->ac_bits_wanted free bits — growing it
 * by a new block group when permitted by @flags.  On success the
 * context owns the inode (mutex + cluster lock + reference, released
 * by ocfs2_free_ac_resource()) and ac->ac_bh holds the allocator
 * dinode.  Returns -ENOSPC when the bits aren't there and growth is
 * disallowed or impossible (the cluster bitmap never grows).
 */
static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
				       struct ocfs2_alloc_context *ac,
				       int type,
				       u32 slot,
				       u64 *last_alloc_group,
				       int flags)
{
	int status;
	u32 bits_wanted = ac->ac_bits_wanted;
	struct inode *alloc_inode;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe;
	u32 free_bits;

	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
	if (!alloc_inode) {
		mlog_errno(-EINVAL);
		return -EINVAL;
	}

	inode_lock(alloc_inode);

	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
	if (status < 0) {
		/* unwind by hand — ac does not own the inode yet */
		inode_unlock(alloc_inode);
		iput(alloc_inode);

		mlog_errno(status);
		return status;
	}

	/* from here on, ocfs2_free_ac_resource() will release the inode */
	ac->ac_inode = alloc_inode;
	ac->ac_alloc_slot = slot;

	fe = (struct ocfs2_dinode *) bh->b_data;

	/* The bh was validated by the inode read inside
	 * ocfs2_inode_lock().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "Invalid chain allocator %llu\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno));
		goto bail;
	}

	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
		le32_to_cpu(fe->id1.bitmap1.i_used);

	if (bits_wanted > free_bits) {
		/* cluster bitmap never grows */
		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
			trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
								free_bits);
			status = -ENOSPC;
			goto bail;
		}

		if (!(flags & ALLOC_NEW_GROUP)) {
			trace_ocfs2_reserve_suballoc_bits_no_new_group(
						slot, bits_wanted, free_bits);
			status = -ENOSPC;
			goto bail;
		}

		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
						 ac->ac_max_block,
						 last_alloc_group, flags);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		atomic_inc(&osb->alloc_stats.bg_extends);

		/* You should never ask for this much metadata */
		BUG_ON(bits_wanted >
		       (le32_to_cpu(fe->id1.bitmap1.i_total)
			- le32_to_cpu(fe->id1.bitmap1.i_used)));
	}

	/* hand a reference to the context; our local ref is dropped below */
	get_bh(bh);
	ac->ac_bh = bh;
bail:
	brelse(bh);

	if (status)
		mlog_errno(status);
	return status;
}
928 
/* Reset inode-steal state: no victim slot cached, steal counter zeroed. */
static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_inodes_stolen, 0);
}
936 
/* Reset metadata-steal state: no victim slot cached, steal counter zeroed. */
static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_meta_stolen, 0);
}
944 
/* Initialize both (inode and metadata) steal-slot trackers at mount. */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}
950 
951 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
952 {
953 	spin_lock(&osb->osb_lock);
954 	if (type == INODE_ALLOC_SYSTEM_INODE)
955 		osb->s_inode_steal_slot = (u16)slot;
956 	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
957 		osb->s_meta_steal_slot = (u16)slot;
958 	spin_unlock(&osb->osb_lock);
959 }
960 
961 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
962 {
963 	int slot = OCFS2_INVALID_SLOT;
964 
965 	spin_lock(&osb->osb_lock);
966 	if (type == INODE_ALLOC_SYSTEM_INODE)
967 		slot = osb->s_inode_steal_slot;
968 	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
969 		slot = osb->s_meta_steal_slot;
970 	spin_unlock(&osb->osb_lock);
971 
972 	return slot;
973 }
974 
/* Cached victim slot for inode stealing, or OCFS2_INVALID_SLOT. */
static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
}
979 
/* Cached victim slot for metadata stealing, or OCFS2_INVALID_SLOT. */
static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
}
984 
/*
 * Our own slot's allocator is full — try to reserve from every other
 * slot's allocator of @type, walking the slots with wrap-around and
 * skipping our own.  Starts from the last successful victim when one
 * is cached, and caches the new victim on success.  NOT_ALLOC_NEW_GROUP
 * is used so we never grow another node's allocator.  Returns -ENOSPC
 * when every slot is exhausted.
 */
static int ocfs2_steal_resource(struct ocfs2_super *osb,
				struct ocfs2_alloc_context *ac,
				int type)
{
	int i, status = -ENOSPC;
	int slot = __ocfs2_get_steal_slot(osb, type);

	/* Start to steal resource from the first slot after ours. */
	if (slot == OCFS2_INVALID_SLOT)
		slot = osb->slot_num + 1;

	for (i = 0; i < osb->max_slots; i++, slot++) {
		/* wrap around past the last slot */
		if (slot == osb->max_slots)
			slot = 0;

		if (slot == osb->slot_num)
			continue;

		status = ocfs2_reserve_suballoc_bits(osb, ac,
						     type,
						     (u32)slot, NULL,
						     NOT_ALLOC_NEW_GROUP);
		if (status >= 0) {
			__ocfs2_set_steal_slot(osb, slot, type);
			break;
		}

		/* release this slot's inode/bh before trying the next */
		ocfs2_free_ac_resource(ac);
	}

	return status;
}
1017 
/* Steal inode-allocator bits from another slot. */
static int ocfs2_steal_inode(struct ocfs2_super *osb,
			     struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
}
1023 
/* Steal extent(metadata)-allocator bits from another slot. */
static int ocfs2_steal_meta(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
}
1029 
/*
 * Build an allocation context reserving @blocks metadata blocks.
 * Tries our own slot's extent allocator first; once we have started
 * stealing (and until OCFS2_MAX_TO_STEAL consecutive steals), goes
 * straight to stealing from other slots to avoid hammering our
 * exhausted allocator.  On success *ac is ready for claiming and must
 * be freed with ocfs2_free_alloc_context(); on failure *ac is NULL.
 */
int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
				      int blocks,
				      struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_meta_steal_slot(osb);

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = blocks;
	(*ac)->ac_which = OCFS2_AC_USE_META;
	(*ac)->ac_group_search = ocfs2_block_group_search;

	/* already in steal mode and under the cap: skip our own slot */
	if (slot != OCFS2_INVALID_SLOT &&
		atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
		goto extent_steal;

	atomic_set(&osb->s_num_meta_stolen, 0);
	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
					     EXTENT_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num, NULL,
					     ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);


	if (status >= 0) {
		status = 0;
		/* our own allocator works again — leave steal mode */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_meta_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	/* -ENOSPC on our slot: drop its resources and go stealing */
	ocfs2_free_ac_resource(*ac);

extent_steal:
	status = ocfs2_steal_meta(osb, *ac);
	atomic_inc(&osb->s_num_meta_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}
1091 
/*
 * Reserve enough metadata blocks for extending the extent tree rooted
 * at @root_el (sized via ocfs2_extend_meta_needed()).
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_new_metadata_blocks(osb,
					ocfs2_extend_meta_needed(root_el),
					ac);
}
1100 
/*
 * Reserve one inode bit, preferring our own slot's inode allocator and
 * falling back to stealing from other slots when ours is exhausted.
 * On success *ac holds the reservation; on failure *ac is freed and
 * set to NULL.
 */
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_inode_steal_slot(osb);
	u64 alloc_group;

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = 1;
	(*ac)->ac_which = OCFS2_AC_USE_INODE;

	(*ac)->ac_group_search = ocfs2_block_group_search;

	/*
	 * stat(2) can't handle i_ino > 32bits, so we tell the
	 * lower levels not to allocate us a block group past that
	 * limit.  The 'inode64' mount option avoids this behavior.
	 */
	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
		(*ac)->ac_max_block = (u32)~0U;

	/*
	 * slot is set when we successfully steal inode from other nodes.
	 * It is reset in 3 places:
	 * 1. when we flush the truncate log
	 * 2. when we complete local alloc recovery.
	 * 3. when we successfully allocate from our own slot.
	 * After it is set, we will go on stealing inodes until we find the
	 * need to check our slots to see whether there is some space for us.
	 */
	if (slot != OCFS2_INVALID_SLOT &&
	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
		goto inode_steal;

	atomic_set(&osb->s_num_inodes_stolen, 0);
	/* Seed the search with the last group we allocated an inode from. */
	alloc_group = osb->osb_inode_alloc_group;
	status = ocfs2_reserve_suballoc_bits(osb, *ac,
					     INODE_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num,
					     &alloc_group,
					     ALLOC_NEW_GROUP |
					     ALLOC_GROUPS_FROM_GLOBAL);
	if (status >= 0) {
		status = 0;

		/* Remember the group that was used, for future hinting. */
		spin_lock(&osb->osb_lock);
		osb->osb_inode_alloc_group = alloc_group;
		spin_unlock(&osb->osb_lock);
		trace_ocfs2_reserve_new_inode_new_group(
			(unsigned long long)alloc_group);

		/*
		 * Some inodes must be freed by us, so try to allocate
		 * from our own next time.
		 */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_inode_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	/* Our slot is full; release it and try to steal from another. */
	ocfs2_free_ac_resource(*ac);

inode_steal:
	status = ocfs2_steal_inode(osb, *ac);
	atomic_inc(&osb->s_num_inodes_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}
1192 
1193 /* local alloc code has to do the same thing, so rather than do this
1194  * twice.. */
1195 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1196 				      struct ocfs2_alloc_context *ac)
1197 {
1198 	int status;
1199 
1200 	ac->ac_which = OCFS2_AC_USE_MAIN;
1201 	ac->ac_group_search = ocfs2_cluster_group_search;
1202 
1203 	status = ocfs2_reserve_suballoc_bits(osb, ac,
1204 					     GLOBAL_BITMAP_SYSTEM_INODE,
1205 					     OCFS2_INVALID_SLOT, NULL,
1206 					     ALLOC_NEW_GROUP);
1207 	if (status < 0 && status != -ENOSPC)
1208 		mlog_errno(status);
1209 
1210 	return status;
1211 }
1212 
1213 /* Callers don't need to care which bitmap (local alloc or main) to
1214  * use so we figure it out for them, but unfortunately this clutters
1215  * things a bit. */
/*
 * Reserve @bits_wanted clusters, trying the local alloc first (unless
 * @flags forces groups from the global bitmap) and falling back to the
 * global bitmap, retrying once after attempting to free space cached
 * in the truncate log.
 */
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac)
{
	int status, ret = 0;
	int retried = 0;	/* only retry the global bitmap search once */

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = bits_wanted;
	(*ac)->ac_max_block = max_block;

	status = -ENOSPC;
	/* Suitable requests are served from the local alloc first. */
	if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
	    ocfs2_alloc_should_use_local(osb, bits_wanted)) {
		status = ocfs2_reserve_local_alloc_bits(osb,
							bits_wanted,
							*ac);
		if ((status < 0) && (status != -ENOSPC)) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Local alloc skipped or exhausted: fall back to the global bitmap. */
	if (status == -ENOSPC) {
retry:
		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
		/* Retry if there is sufficient space cached in truncate log */
		if (status == -ENOSPC && !retried) {
			retried = 1;
			/*
			 * Drop the cluster lock and VFS lock held on the
			 * bitmap inode before trying to free truncate-log
			 * space.
			 */
			ocfs2_inode_unlock((*ac)->ac_inode, 1);
			inode_unlock((*ac)->ac_inode);

			ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
			if (ret == 1) {
				/* Space was freed: retry with fresh locks. */
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto retry;
			}

			if (ret < 0)
				mlog_errno(ret);

			/* Nothing freed: reacquire the locks we dropped. */
			inode_lock((*ac)->ac_inode);
			ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
			if (ret < 0) {
				mlog_errno(ret);
				inode_unlock((*ac)->ac_inode);
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto bail;
			}
		}
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}
1293 
/* Reserve @bits_wanted clusters with no upper block-number limit. */
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
			   u32 bits_wanted,
			   struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
						 ALLOC_NEW_GROUP, ac);
}
1301 
1302 /*
1303  * More or less lifted from ext3. I'll leave their description below:
1304  *
1305  * "For ext3 allocations, we must not reuse any blocks which are
1306  * allocated in the bitmap buffer's "last committed data" copy.  This
1307  * prevents deletes from freeing up the page for reuse until we have
1308  * committed the delete transaction.
1309  *
1310  * If we didn't do this, then deleting something and reallocating it as
1311  * data would allow the old block to be overwritten before the
1312  * transaction committed (because we force data to disk before commit).
1313  * This would lead to corruption if we crashed between overwriting the
1314  * data and committing the delete.
1315  *
1316  * @@@ We may want to make this allocation behaviour conditional on
1317  * data-writes at some point, and disable it for metadata allocations or
1318  * sync-data inodes."
1319  *
1320  * Note: OCFS2 already does this differently for metadata vs data
1321  * allocations, as those bitmaps are separate and undo access is never
1322  * called on a metadata group descriptor.
1323  */
/*
 * Return 1 if bit @nr of @bg_bh's bitmap may be allocated: it must be
 * clear both in the current bitmap and in the journal's last-committed
 * copy (see the long comment above for why).
 */
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr)
{
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct journal_head *jh;
	int ret;

	/* Already set in the live bitmap: definitely not allocatable. */
	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
		return 0;

	/* No journal head means no committed-data copy to consult. */
	jh = jbd2_journal_grab_journal_head(bg_bh);
	if (!jh)
		return 1;

	/* b_state_lock protects our read of jh->b_committed_data. */
	spin_lock(&jh->b_state_lock);
	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
	if (bg)
		/* Usable only if also clear in the last committed copy. */
		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
	else
		ret = 1;
	spin_unlock(&jh->b_state_lock);
	jbd2_journal_put_journal_head(jh);

	return ret;
}
1349 
1350 u16 ocfs2_find_max_contig_free_bits(void *bitmap,
1351 			 u16 total_bits, u16 start)
1352 {
1353 	u16 offset, free_bits;
1354 	u16 contig_bits = 0;
1355 
1356 	while (start < total_bits) {
1357 		offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start);
1358 		if (offset == total_bits)
1359 			break;
1360 
1361 		start = ocfs2_find_next_bit(bitmap, total_bits, offset);
1362 		free_bits = start - offset;
1363 		if (contig_bits < free_bits)
1364 			contig_bits = free_bits;
1365 	}
1366 
1367 	return contig_bits;
1368 }
1369 
/*
 * Scan @bg_bh's bitmap for the best run of allocatable clear bits, up
 * to @bits_wanted long.  On success fills res->sr_bit_offset and
 * res->sr_bits with the chosen run and res->sr_max_contig_bits with
 * the previous best run length (used to maintain contig accounting).
 * Returns -ENOSPC if no usable bit was found.
 */
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
					     struct buffer_head *bg_bh,
					     unsigned int bits_wanted,
					     unsigned int total_bits,
					     struct ocfs2_suballoc_result *res)
{
	void *bitmap;
	u16 best_offset, best_size;
	u16 prev_best_size = 0;
	int offset, start, found, status = 0;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	/* Callers got this descriptor from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	found = start = best_offset = best_size = 0;
	bitmap = bg->bg_bitmap;

	/* found tracks the length of the current run of usable zero bits. */
	while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) <
	       total_bits) {
		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
			/* We found a zero, but we can't use it as it
			 * hasn't been put to disk yet! */
			found = 0;
			start = offset + 1;
		} else if (offset == start) {
			/* we found a zero */
			found++;
			/* move start to the next bit to test */
			start++;
		} else {
			/* got a zero after some ones */
			found = 1;
			start = offset + 1;
			/* remember the best run seen before this new one */
			prev_best_size = best_size;
		}
		if (found > best_size) {
			best_size = found;
			best_offset = start - found;
		}
		/* we got everything we needed */
		if (found == bits_wanted) {
			/* mlog(0, "Found it all!\n"); */
			break;
		}
	}

	/* best_size will be allocated, we save prev_best_size */
	res->sr_max_contig_bits = prev_best_size;
	if (best_size) {
		res->sr_bit_offset = best_offset;
		res->sr_bits = best_size;
	} else {
		status = -ENOSPC;
		/* No error log here -- see the comment above
		 * ocfs2_test_bg_bit_allocatable */
	}

	return status;
}
1431 
/*
 * Mark @num_bits bits starting at @bit_off as used in @bg's bitmap,
 * adjusting bg_free_bits_count and bg_contig_free_bits under the
 * caller's transaction.  @max_contig_bits carries the caller's known
 * contiguous run length; @fastpath uses it as-is instead of
 * re-scanning the bitmap.
 */
int ocfs2_block_group_set_bits(handle_t *handle,
					     struct inode *alloc_inode,
					     struct ocfs2_group_desc *bg,
					     struct buffer_head *group_bh,
					     unsigned int bit_off,
					     unsigned int num_bits,
					     unsigned int max_contig_bits,
					     int fastpath)
{
	int status;
	void *bitmap = bg->bg_bitmap;
	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
	unsigned int start = bit_off + num_bits;	/* first bit after the run */
	u16 contig_bits;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	/* All callers get the descriptor via
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);

	trace_ocfs2_block_group_set_bits(bit_off, num_bits);

	/* Cluster bitmap writes take undo access -- see the comment
	 * above ocfs2_test_bg_bit_allocatable(). */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 group_bh,
					 journal_type);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
	/* A free count above bg_bits means the u16 wrapped: corruption. */
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}
	while(num_bits--)
		ocfs2_set_bit(bit_off++, bitmap);

	/*
	 * Fast path: the caller already knows the group's contiguous
	 * free run and passes it in via max_contig_bits, so skip the
	 * bitmap scan.
	 */
	if (fastpath) {
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		/*
		 * Usually, the block group bitmap allocates only 1 bit
		 * at a time, while the cluster group allocates n bits
		 * each time. Therefore, we only save the contig bits for
		 * the cluster group.
		 */
		contig_bits = ocfs2_find_max_contig_free_bits(bitmap,
				    le16_to_cpu(bg->bg_bits), start);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
		ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits);
	} else {
		bg->bg_contig_free_bits = 0;
	}

	ocfs2_journal_dirty(handle, group_bh);

bail:
	return status;
}
1506 
1507 /* find the one with the most empty bits */
1508 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1509 {
1510 	u16 curr, best;
1511 
1512 	BUG_ON(!cl->cl_next_free_rec);
1513 
1514 	best = curr = 0;
1515 	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1516 		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1517 		    le32_to_cpu(cl->cl_recs[best].c_free))
1518 			best = curr;
1519 		curr++;
1520 	}
1521 
1522 	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1523 	return best;
1524 }
1525 
/*
 * Move group @bg to the front of chain @chain in allocator @fe, so
 * subsequent searches find this (reasonably empty) group first.  If a
 * journal access fails partway, the in-memory next-group pointers are
 * rolled back so the buffers still match what is on disk.
 */
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain)
{
	int status;
	/* there is a really tiny chance the journal calls could fail,
	 * but we wouldn't want inconsistent blocks in *any* case. */
	u64 bg_ptr, prev_bg_ptr;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;

	/* The caller got these descriptors from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));

	trace_ocfs2_relink_block_group(
		(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
		(unsigned long long)le64_to_cpu(bg->bg_blkno),
		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));

	/* Save the old links for rollback on journal failure. */
	bg_ptr = le64_to_cpu(bg->bg_next_group);
	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);

	/* Step 1: unlink bg -- its predecessor now skips over it. */
	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 prev_bg_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out;

	prev_bg->bg_next_group = bg->bg_next_group;
	ocfs2_journal_dirty(handle, prev_bg_bh);

	/* Step 2: point bg at the current head of the chain. */
	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_prev_bg;

	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
	ocfs2_journal_dirty(handle, bg_bh);

	/* Step 3: make the chain record name bg as its new head. */
	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_bg;

	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
	ocfs2_journal_dirty(handle, fe_bh);

out:
	if (status < 0)
		mlog_errno(status);
	return status;

out_rollback_bg:
	/* Undo in-memory changes whose journal write never happened. */
	bg->bg_next_group = cpu_to_le64(bg_ptr);
out_rollback_prev_bg:
	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
	goto out;
}
1590 
1591 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1592 						     u32 wanted)
1593 {
1594 	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1595 }
1596 
/* return 0 on success, -ENOSPC to keep searching and any other < 0
 * value on error. */
static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res)
{
	int search = -ENOSPC;
	int ret;
	u64 blkoff;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned int max_bits, gd_cluster_off;

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	/* Cheap reject: the cached longest contiguous run is too short. */
	if (le16_to_cpu(gd->bg_contig_free_bits) &&
	    le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted)
		return -ENOSPC;

	/* bg_contig_free_bits may be uninitialized (0), so compare again */
	if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) {
		max_bits = le16_to_cpu(gd->bg_bits);

		/* Tail groups in cluster bitmaps which aren't cpg
		 * aligned are prone to partial extension by a failed
		 * fs resize. If the file system resize never got to
		 * update the dinode cluster count, then we don't want
		 * to trust any clusters past it, regardless of what
		 * the group descriptor says. */
		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
							  le64_to_cpu(gd->bg_blkno));
		if ((gd_cluster_off + max_bits) >
		    OCFS2_I(inode)->ip_clusters) {
			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
			trace_ocfs2_cluster_group_search_wrong_max_bits(
				(unsigned long long)le64_to_cpu(gd->bg_blkno),
				le16_to_cpu(gd->bg_bits),
				OCFS2_I(inode)->ip_clusters, max_bits);
		}

		ret = ocfs2_block_group_find_clear_bits(osb,
							group_bh, bits_wanted,
							max_bits, res);
		if (ret)
			return ret;

		/* Honor the caller's upper block-number limit, if any. */
		if (max_block) {
			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
							  gd_cluster_off +
							  res->sr_bit_offset +
							  res->sr_bits);
			trace_ocfs2_cluster_group_search_max_block(
				(unsigned long long)blkoff,
				(unsigned long long)max_block);
			if (blkoff > max_block)
				return -ENOSPC;
		}

		/* ocfs2_block_group_find_clear_bits() might
		 * return success, but we still want to return
		 * -ENOSPC unless it found the minimum number
		 * of bits. */
		if (min_bits <= res->sr_bits)
			search = 0; /* success */
	}

	return search;
}
1667 
1668 static int ocfs2_block_group_search(struct inode *inode,
1669 				    struct buffer_head *group_bh,
1670 				    u32 bits_wanted, u32 min_bits,
1671 				    u64 max_block,
1672 				    struct ocfs2_suballoc_result *res)
1673 {
1674 	int ret = -ENOSPC;
1675 	u64 blkoff;
1676 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1677 
1678 	BUG_ON(min_bits != 1);
1679 	BUG_ON(ocfs2_is_cluster_bitmap(inode));
1680 
1681 	if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) {
1682 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1683 							group_bh, bits_wanted,
1684 							le16_to_cpu(bg->bg_bits),
1685 							res);
1686 		if (!ret && max_block) {
1687 			blkoff = le64_to_cpu(bg->bg_blkno) +
1688 				res->sr_bit_offset + res->sr_bits;
1689 			trace_ocfs2_block_group_search_max_block(
1690 				(unsigned long long)blkoff,
1691 				(unsigned long long)max_block);
1692 			if (blkoff > max_block)
1693 				ret = -ENOSPC;
1694 		}
1695 	}
1696 
1697 	return ret;
1698 }
1699 
/*
 * Charge @num_bits freshly-claimed bits to the allocator dinode:
 * bitmap1.i_used goes up and chain @chain's free count goes down,
 * inside the caller's transaction.
 */
int ocfs2_alloc_dinode_update_counts(struct inode *inode,
				       handle_t *handle,
				       struct buffer_head *di_bh,
				       u32 num_bits,
				       u16 chain)
{
	int ret;
	u32 tmp_used;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
	ocfs2_journal_dirty(handle, di_bh);

out:
	return ret;
}
1726 
1727 void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
1728 				       struct buffer_head *di_bh,
1729 				       u32 num_bits,
1730 				       u16 chain)
1731 {
1732 	u32 tmp_used;
1733 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1734 	struct ocfs2_chain_list *cl;
1735 
1736 	cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
1737 	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1738 	di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
1739 	le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
1740 }
1741 
1742 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1743 					 struct ocfs2_extent_rec *rec,
1744 					 struct ocfs2_chain_list *cl)
1745 {
1746 	unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1747 	unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1748 	unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
1749 
1750 	if (res->sr_bit_offset < bitoff)
1751 		return 0;
1752 	if (res->sr_bit_offset >= (bitoff + bitcount))
1753 		return 0;
1754 	res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1755 		(res->sr_bit_offset - bitoff);
1756 	if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1757 		res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1758 	return 1;
1759 }
1760 
/*
 * Turn a group-relative search result into an absolute block number,
 * handling discontiguous block groups whose bitmap bits map onto
 * multiple extents.  Cluster bitmap results stay as bit offsets
 * (sr_blkno = 0).
 */
static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_suballoc_result *res)
{
	int i;
	u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
	struct ocfs2_extent_rec *rec;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = &di->id2.i_chain;

	if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
		res->sr_blkno = 0;
		return;
	}

	/* Contiguous group: block number is group start plus bit offset. */
	res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
	res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
	if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
	    !bg->bg_list.l_next_free_rec)
		return;

	/* Discontiguous group: find the extent record holding the bit. */
	for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
		rec = &bg->bg_list.l_recs[i];
		if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
			res->sr_bg_blkno = bg_blkno;  /* Restore */
			break;
		}
	}
}
1790 
/*
 * Try to allocate from the single group named by res->sr_bg_blkno (the
 * caller's hint).  *released is set when that hint group has already
 * been released; we then return 0 and the caller falls back to a
 * regular chain search.
 */
static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
				  handle_t *handle,
				  u32 bits_wanted,
				  u32 min_bits,
				  struct ocfs2_suballoc_result *res,
				  u16 *bits_left, int *released)
{
	int ret;
	struct buffer_head *group_bh = NULL;
	struct ocfs2_group_desc *gd;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct inode *alloc_inode = ac->ac_inode;

	ret = ocfs2_read_hint_group_descriptor(alloc_inode, di,
				res->sr_bg_blkno, &group_bh, released);
	if (*released) {
		return 0;
	} else if (ret < 0) {
		mlog_errno(ret);
		return ret;
	}

	gd = (struct ocfs2_group_desc *) group_bh->b_data;
	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
				  ac->ac_max_block, res);
	if (ret < 0) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}

	if (!ret)
		ocfs2_bg_discontig_fix_result(ac, gd, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	/* Caller only wanted the location, not an actual allocation. */
	if (ac->ac_find_loc_only)
		goto out_loc_only;

	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
					       res->sr_bits,
					       le16_to_cpu(gd->bg_chain));
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
					 res->sr_bit_offset, res->sr_bits,
					 res->sr_max_contig_bits, 0);
	if (ret < 0) {
		/* Keep the dinode counters consistent with the bitmap. */
		ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
					       res->sr_bits,
					       le16_to_cpu(gd->bg_chain));
		mlog_errno(ret);
	}

out_loc_only:
	*bits_left = le16_to_cpu(gd->bg_free_bits_count);

out:
	brelse(group_bh);

	return ret;
}
1860 
/*
 * Walk chain ac->ac_chain looking for a group with enough free bits,
 * claim them, and possibly relink the winning group to the head of its
 * chain so the next search finds it sooner.
 */
static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
			      handle_t *handle,
			      u32 bits_wanted,
			      u32 min_bits,
			      struct ocfs2_suballoc_result *res,
			      u16 *bits_left)
{
	int status;
	u16 chain;
	u32 contig_bits;
	u64 next_group;
	struct inode *alloc_inode = ac->ac_inode;
	struct buffer_head *group_bh = NULL;
	struct buffer_head *prev_group_bh = NULL;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	struct ocfs2_group_desc *bg;

	chain = ac->ac_chain;
	trace_ocfs2_search_chain_begin(
		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
		bits_wanted, chain);

	status = ocfs2_read_group_descriptor(alloc_inode, fe,
					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) group_bh->b_data;

	status = -ENOSPC;
	/* for now, the chain search is a bit simplistic. We just use
	 * the 1st group with any empty bits. */
	while (1) {
		if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
			/*
			 * Discontiguous allocation: settle for this
			 * group's longest contiguous run, as long as it
			 * still satisfies min_bits.
			 */
			contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
			if (!contig_bits)
				contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
						le16_to_cpu(bg->bg_bits), 0);
			if (bits_wanted > contig_bits && contig_bits >= min_bits)
				bits_wanted = contig_bits;
		}

		status = ac->ac_group_search(alloc_inode, group_bh,
				bits_wanted, min_bits,
				ac->ac_max_block, res);
		if (status != -ENOSPC)
			break;
		if (!bg->bg_next_group)
			break;

		/* Advance to the next group, remembering the previous. */
		brelse(prev_group_bh);
		prev_group_bh = NULL;

		next_group = le64_to_cpu(bg->bg_next_group);
		prev_group_bh = group_bh;
		group_bh = NULL;
		status = ocfs2_read_group_descriptor(alloc_inode, fe,
						     next_group, &group_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		bg = (struct ocfs2_group_desc *) group_bh->b_data;
	}
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_succ(
		(unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);

	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);

	BUG_ON(res->sr_bits == 0);
	if (!status)
		ocfs2_bg_discontig_fix_result(ac, bg, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	/*
	 * Keep track of previous block descriptor read. When
	 * we find a target, if we have read more than X
	 * number of descriptors, and the target is reasonably
	 * empty, relink him to top of his chain.
	 *
	 * We've read 0 extra blocks and only send one more to
	 * the transaction, yet the next guy to search has a
	 * much easier time.
	 *
	 * Do this *after* figuring out how many bits we're taking out
	 * of our target group.
	 */
	if (!ac->ac_disable_chain_relink &&
	    (prev_group_bh) &&
	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
		status = ocfs2_relink_block_group(handle, alloc_inode,
						  ac->ac_bh, group_bh,
						  prev_group_bh, chain);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Location-only lookup: report, but do not claim the bits. */
	if (ac->ac_find_loc_only)
		goto out_loc_only;

	status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
						  ac->ac_bh, res->sr_bits,
						  chain);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_set_bits(handle,
					    alloc_inode,
					    bg,
					    group_bh,
					    res->sr_bit_offset,
					    res->sr_bits,
					    res->sr_max_contig_bits,
					    0);
	if (status < 0) {
		/* Keep dinode counters consistent with the group bitmap. */
		ocfs2_rollback_alloc_dinode_counts(alloc_inode,
					ac->ac_bh, res->sr_bits, chain);
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_end(
			(unsigned long long)le64_to_cpu(fe->i_blkno),
			res->sr_bits);

out_loc_only:
	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
	brelse(group_bh);
	brelse(prev_group_bh);

	if (status)
		mlog_errno(status);
	return status;
}
2014 
/*
 * Give out up to bits_wanted contiguous bits from the allocator dinode
 * attached to 'ac'.  On success 'res' describes the allocation and
 * ac->ac_last_group is updated as a hint for the next search.
 *
 * Returns 0 on success, -ENOSPC when no chain can satisfy the request,
 * or another negative errno on failure.
 */
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res)
{
	int status;
	int released = 0;	/* set when the hinted group got reclaimed */
	u16 victim, i;
	u16 bits_left = 0;
	u64 hint = ac->ac_last_group;
	struct ocfs2_chain_list *cl;
	struct ocfs2_dinode *fe;

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
	BUG_ON(!ac->ac_bh);

	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/* The bh was validated by the inode read during
	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	/* A fully-used allocator should have been caught at reserve time;
	 * used >= total here means the on-disk counters are corrupt. */
	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has %u used bits but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le32_to_cpu(fe->id1.bitmap1.i_used),
				     le32_to_cpu(fe->id1.bitmap1.i_total));
		goto bail;
	}

	/* The hinted bg may already have been released; if the one-group
	 * search notices that ('released'), fall back to a chain search. */
	res->sr_bg_blkno = hint;
	if (res->sr_bg_blkno) {
		/* Attempt to short-circuit the usual search mechanism
		 * by jumping straight to the most recently used
		 * allocation group. This helps us maintain some
		 * contiguousness across allocations. */
		status = ocfs2_search_one_group(ac, handle, bits_wanted,
						min_bits, res, &bits_left,
						&released);
		if (released) {
			res->sr_bg_blkno = 0;
			goto chain_search;
		}
		if (!status)
			goto set_hint;
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}
chain_search:
	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	if (!le16_to_cpu(cl->cl_next_free_rec) ||
	    le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has invalid next "
				     "free chain record %u, but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le16_to_cpu(cl->cl_next_free_rec),
				     le16_to_cpu(cl->cl_count));
		goto bail;
	}

	/* Start with the chain advertising the most free bits. */
	victim = ocfs2_find_victim_chain(cl);
	ac->ac_chain = victim;

search:
	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
				    res, &bits_left);
	if (!status) {
		/* For the cluster bitmap res->sr_bg_blkno is authoritative;
		 * for suballocators derive the group from the allocation. */
		if (ocfs2_is_cluster_bitmap(ac->ac_inode))
			hint = res->sr_bg_blkno;
		else
			hint = ocfs2_group_from_res(res);
		goto set_hint;
	}
	if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_claim_suballoc_bits(victim);

	/* If we didn't pick a good victim, then just default to
	 * searching each chain in order. Don't allow chain relinking
	 * because we only calculate enough journal credits for one
	 * relink per alloc. */
	ac->ac_disable_chain_relink = 1;
	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
		if (i == victim)
			continue;
		if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted)
			continue;

		ac->ac_chain = i;
		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
					    res, &bits_left);
		if (!status) {
			hint = ocfs2_group_from_res(res);
			break;
		}
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Chains can't supply the bits_wanted contiguous space.
	 * We should switch to using every single bit when allocating
	 * from the global bitmap. */
	if (i == le16_to_cpu(cl->cl_next_free_rec) &&
	    status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		ac->ac_chain = victim;
		goto search;
	}

set_hint:
	if (status != -ENOSPC) {
		/* If the next search of this group is not likely to
		 * yield a suitable extent, then we reset the last
		 * group hint so as to not waste a disk read */
		if (bits_left < min_bits)
			ac->ac_last_group = 0;
		else
			ac->ac_last_group = hint;
	}

bail:
	if (status)
		mlog_errno(status);
	return status;
}
2154 
2155 int ocfs2_claim_metadata(handle_t *handle,
2156 			 struct ocfs2_alloc_context *ac,
2157 			 u32 bits_wanted,
2158 			 u64 *suballoc_loc,
2159 			 u16 *suballoc_bit_start,
2160 			 unsigned int *num_bits,
2161 			 u64 *blkno_start)
2162 {
2163 	int status;
2164 	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2165 
2166 	BUG_ON(!ac);
2167 	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
2168 	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
2169 
2170 	status = ocfs2_claim_suballoc_bits(ac,
2171 					   handle,
2172 					   bits_wanted,
2173 					   1,
2174 					   &res);
2175 	if (status < 0) {
2176 		mlog_errno(status);
2177 		goto bail;
2178 	}
2179 	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2180 
2181 	*suballoc_loc = res.sr_bg_blkno;
2182 	*suballoc_bit_start = res.sr_bit_offset;
2183 	*blkno_start = res.sr_blkno;
2184 	ac->ac_bits_given += res.sr_bits;
2185 	*num_bits = res.sr_bits;
2186 	status = 0;
2187 bail:
2188 	if (status)
2189 		mlog_errno(status);
2190 	return status;
2191 }
2192 
2193 /*
2194  * after ocfs2 has the ability to release block group unused space,
2195  * the ->ip_last_used_group may be invalid. so this function returns
2196  * ac->ac_last_group need to verify.
2197  * refer the 'hint' in ocfs2_claim_suballoc_bits() for more details.
2198  */
2199 static void ocfs2_init_inode_ac_group(struct inode *dir,
2200 				      struct buffer_head *parent_di_bh,
2201 				      struct ocfs2_alloc_context *ac)
2202 {
2203 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
2204 	/*
2205 	 * Try to allocate inodes from some specific group.
2206 	 *
2207 	 * If the parent dir has recorded the last group used in allocation,
2208 	 * cool, use it. Otherwise if we try to allocate new inode from the
2209 	 * same slot the parent dir belongs to, use the same chunk.
2210 	 *
2211 	 * We are very careful here to avoid the mistake of setting
2212 	 * ac_last_group to a group descriptor from a different (unlocked) slot.
2213 	 */
2214 	if (OCFS2_I(dir)->ip_last_used_group &&
2215 	    OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2216 		ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2217 	else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2218 		if (di->i_suballoc_loc)
2219 			ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2220 		else
2221 			ac->ac_last_group = ocfs2_which_suballoc_group(
2222 					le64_to_cpu(di->i_blkno),
2223 					le16_to_cpu(di->i_suballoc_bit));
2224 	}
2225 }
2226 
2227 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2228 					     struct ocfs2_alloc_context *ac)
2229 {
2230 	OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2231 	OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2232 }
2233 
/*
 * Find (but do not yet claim) a suballocator slot for a new inode.
 *
 * On success the located ocfs2_suballoc_result is stashed in
 * ac->ac_find_loc_priv (ownership passes to the context) and the chosen
 * inode block number is returned in *fe_blkno.  The bit is actually set
 * later by ocfs2_claim_new_inode_at_loc().
 *
 * Returns 0 on success, negative errno on failure; on failure the
 * result buffer is freed here.
 */
int ocfs2_find_new_inode_loc(struct inode *dir,
			     struct buffer_head *parent_fe_bh,
			     struct ocfs2_alloc_context *ac,
			     u64 *fe_blkno)
{
	int ret;
	handle_t *handle = NULL;
	struct ocfs2_suballoc_result *res;

	/* Inode allocation contexts always want exactly one untouched bit. */
	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_given != 0);
	BUG_ON(ac->ac_bits_wanted != 1);
	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);

	res = kzalloc(sizeof(*res), GFP_NOFS);
	if (res == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* Seed the group search hint from the parent directory. */
	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);

	/*
	 * The handle started here is for chain relink. Alternatively,
	 * we could just disable relink for these calls.
	 */
	handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * This will instruct ocfs2_claim_suballoc_bits and
	 * ocfs2_search_one_group to search but save actual allocation
	 * for later.
	 */
	ac->ac_find_loc_only = 1;

	ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	/* Success: hand 'res' over to the context for the claim step. */
	ac->ac_find_loc_priv = res;
	*fe_blkno = res->sr_blkno;
	ocfs2_update_inode_fsync_trans(handle, dir, 0);
out:
	if (handle)
		ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);

	/* Only free 'res' on failure; on success the context owns it. */
	if (ret)
		kfree(res);

	return ret;
}
2294 
/*
 * Actually claim the inode slot previously located by
 * ocfs2_find_new_inode_loc().  'di_blkno' must be the block number that
 * call returned; the saved result in ac->ac_find_loc_priv identifies
 * the group and bit to set.
 *
 * Returns 0 on success, negative errno on failure.
 */
int ocfs2_claim_new_inode_at_loc(handle_t *handle,
				 struct inode *dir,
				 struct ocfs2_alloc_context *ac,
				 u64 *suballoc_loc,
				 u16 *suballoc_bit,
				 u64 di_blkno)
{
	int ret;
	u16 chain;
	struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/*
	 * Since di_blkno is being passed back in, we check for any
	 * inconsistencies which may have happened between
	 * calls. These are code bugs as di_blkno is not expected to
	 * change once returned from ocfs2_find_new_inode_loc()
	 */
	BUG_ON(res->sr_blkno != di_blkno);

	/* Re-read the descriptor by its stable block number; the group
	 * may have been relinked within its chain since the find. */
	ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
					  res->sr_bg_stable_blkno, &bg_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	chain = le16_to_cpu(bg->bg_chain);

	/* Update the allocator dinode counts first... */
	ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
					       ac->ac_bh, res->sr_bits,
					       chain);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* ...then set the group bitmap bits, rolling the dinode counts
	 * back if setting the bits fails. */
	ret = ocfs2_block_group_set_bits(handle,
					 ac->ac_inode,
					 bg,
					 bg_bh,
					 res->sr_bit_offset,
					 res->sr_bits,
					 res->sr_max_contig_bits,
					 0);
	if (ret < 0) {
		ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
					       ac->ac_bh, res->sr_bits, chain);
		mlog_errno(ret);
		goto out;
	}

	trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
					   res->sr_bits);

	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	/* Inode allocations are always exactly one bit. */
	BUG_ON(res->sr_bits != 1);

	*suballoc_loc = res->sr_bg_blkno;
	*suballoc_bit = res->sr_bit_offset;
	ac->ac_bits_given++;
	ocfs2_save_inode_ac_group(dir, ac);

out:
	brelse(bg_bh);

	return ret;
}
2367 
2368 int ocfs2_claim_new_inode(handle_t *handle,
2369 			  struct inode *dir,
2370 			  struct buffer_head *parent_fe_bh,
2371 			  struct ocfs2_alloc_context *ac,
2372 			  u64 *suballoc_loc,
2373 			  u16 *suballoc_bit,
2374 			  u64 *fe_blkno)
2375 {
2376 	int status;
2377 	struct ocfs2_suballoc_result res;
2378 
2379 	BUG_ON(!ac);
2380 	BUG_ON(ac->ac_bits_given != 0);
2381 	BUG_ON(ac->ac_bits_wanted != 1);
2382 	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2383 
2384 	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2385 
2386 	status = ocfs2_claim_suballoc_bits(ac,
2387 					   handle,
2388 					   1,
2389 					   1,
2390 					   &res);
2391 	if (status < 0) {
2392 		mlog_errno(status);
2393 		goto bail;
2394 	}
2395 	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2396 
2397 	BUG_ON(res.sr_bits != 1);
2398 
2399 	*suballoc_loc = res.sr_bg_blkno;
2400 	*suballoc_bit = res.sr_bit_offset;
2401 	*fe_blkno = res.sr_blkno;
2402 	ac->ac_bits_given++;
2403 	ocfs2_save_inode_ac_group(dir, ac);
2404 	status = 0;
2405 bail:
2406 	if (status)
2407 		mlog_errno(status);
2408 	return status;
2409 }
2410 
2411 /* translate a group desc. blkno and it's bitmap offset into
2412  * disk cluster offset. */
2413 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2414 						   u64 bg_blkno,
2415 						   u16 bg_bit_off)
2416 {
2417 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2418 	u32 cluster = 0;
2419 
2420 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2421 
2422 	if (bg_blkno != osb->first_cluster_group_blkno)
2423 		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2424 	cluster += (u32) bg_bit_off;
2425 	return cluster;
2426 }
2427 
2428 /* given a cluster offset, calculate which block group it belongs to
2429  * and return that block offset. */
2430 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2431 {
2432 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2433 	u32 group_no;
2434 
2435 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2436 
2437 	group_no = cluster / osb->bitmap_cpg;
2438 	if (!group_no)
2439 		return osb->first_cluster_group_blkno;
2440 	return ocfs2_clusters_to_blocks(inode->i_sb,
2441 					group_no * osb->bitmap_cpg);
2442 }
2443 
2444 /* given the block number of a cluster start, calculate which cluster
2445  * group and descriptor bitmap offset that corresponds to. */
2446 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
2447 						u64 data_blkno,
2448 						u64 *bg_blkno,
2449 						u16 *bg_bit_off)
2450 {
2451 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2452 	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
2453 
2454 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2455 
2456 	*bg_blkno = ocfs2_which_cluster_group(inode,
2457 					      data_cluster);
2458 
2459 	if (*bg_blkno == osb->first_cluster_group_blkno)
2460 		*bg_bit_off = (u16) data_cluster;
2461 	else
2462 		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
2463 							     data_blkno - *bg_blkno);
2464 }
2465 
2466 /*
2467  * min_bits - minimum contiguous chunk from this total allocation we
2468  * can handle. set to what we asked for originally for a full
2469  * contig. allocation, set to '1' to indicate we can deal with extents
2470  * of any size.
2471  */
2472 int __ocfs2_claim_clusters(handle_t *handle,
2473 			   struct ocfs2_alloc_context *ac,
2474 			   u32 min_clusters,
2475 			   u32 max_clusters,
2476 			   u32 *cluster_start,
2477 			   u32 *num_clusters)
2478 {
2479 	int status;
2480 	unsigned int bits_wanted = max_clusters;
2481 	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2482 	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2483 
2484 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2485 
2486 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2487 	       && ac->ac_which != OCFS2_AC_USE_MAIN
2488 	       && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG);
2489 
2490 	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2491 		WARN_ON(min_clusters > 1);
2492 
2493 		status = ocfs2_claim_local_alloc_bits(osb,
2494 						      handle,
2495 						      ac,
2496 						      bits_wanted,
2497 						      cluster_start,
2498 						      num_clusters);
2499 		if (!status)
2500 			atomic_inc(&osb->alloc_stats.local_data);
2501 	} else {
2502 		if (min_clusters > (osb->bitmap_cpg - 1)) {
2503 			/* The only paths asking for contiguousness
2504 			 * should know about this already. */
2505 			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
2506 			     "group bitmap size %u!\n", min_clusters,
2507 			     osb->bitmap_cpg);
2508 			status = -ENOSPC;
2509 			goto bail;
2510 		}
2511 		/* clamp the current request down to a realistic size. */
2512 		if (bits_wanted > (osb->bitmap_cpg - 1))
2513 			bits_wanted = osb->bitmap_cpg - 1;
2514 
2515 		status = ocfs2_claim_suballoc_bits(ac,
2516 						   handle,
2517 						   bits_wanted,
2518 						   min_clusters,
2519 						   &res);
2520 		if (!status) {
2521 			BUG_ON(res.sr_blkno); /* cluster alloc can't set */
2522 			*cluster_start =
2523 				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
2524 								 res.sr_bg_blkno,
2525 								 res.sr_bit_offset);
2526 			atomic_inc(&osb->alloc_stats.bitmap_data);
2527 			*num_clusters = res.sr_bits;
2528 		}
2529 	}
2530 	if (status < 0) {
2531 		if (status != -ENOSPC)
2532 			mlog_errno(status);
2533 		goto bail;
2534 	}
2535 
2536 	ac->ac_bits_given += *num_clusters;
2537 
2538 bail:
2539 	if (status)
2540 		mlog_errno(status);
2541 	return status;
2542 }
2543 
2544 int ocfs2_claim_clusters(handle_t *handle,
2545 			 struct ocfs2_alloc_context *ac,
2546 			 u32 min_clusters,
2547 			 u32 *cluster_start,
2548 			 u32 *num_clusters)
2549 {
2550 	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
2551 
2552 	return __ocfs2_claim_clusters(handle, ac, min_clusters,
2553 				      bits_wanted, cluster_start, num_clusters);
2554 }
2555 
/*
 * Clear 'num_bits' bits starting at 'bit_off' in a block group bitmap
 * and bump the group's free count.
 *
 * If 'undo_fn' is given (cluster bitmap only), the journal's committed
 * copy of the group is updated under jh->b_state_lock as well, so a
 * crash mid-transaction cannot expose a premature free/reuse.
 *
 * 'max_contig_bits' is a lower bound for the group's contiguous-free
 * hint; for cluster groups the real value is rescanned below.
 *
 * Returns 0 on success, negative errno on failure.
 */
static int ocfs2_block_group_clear_bits(handle_t *handle,
					struct inode *alloc_inode,
					struct ocfs2_group_desc *bg,
					struct buffer_head *group_bh,
					unsigned int bit_off,
					unsigned int num_bits,
					unsigned int max_contig_bits,
					void (*undo_fn)(unsigned int bit,
							unsigned long *bmap))
{
	int status;
	unsigned int tmp;
	u16 contig_bits;
	struct ocfs2_group_desc *undo_bg = NULL;
	struct journal_head *jh;

	/* The caller got this descriptor from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	trace_ocfs2_block_group_clear_bits(bit_off, num_bits);

	/* Undo-protected clears only make sense for the cluster bitmap. */
	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 group_bh,
					 undo_fn ?
					 OCFS2_JOURNAL_ACCESS_UNDO :
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	jh = bh2jh(group_bh);
	if (undo_fn) {
		/* b_committed_data is only stable under b_state_lock. */
		spin_lock(&jh->b_state_lock);
		undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data;
		BUG_ON(!undo_bg);
	}

	/* Clear the live bitmap and apply undo_fn to the committed copy. */
	tmp = num_bits;
	while(tmp--) {
		ocfs2_clear_bit((bit_off + tmp),
				(unsigned long *) bg->bg_bitmap);
		if (undo_fn)
			undo_fn(bit_off + tmp,
				(unsigned long *) undo_bg->bg_bitmap);
	}
	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		if (undo_fn)
			spin_unlock(&jh->b_state_lock);
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}

	/*
	 * TODO: even 'num_bits == 1' (the worst case, release 1 cluster),
	 * we still need to rescan whole bitmap.
	 */
	if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
				    le16_to_cpu(bg->bg_bits), 0);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else {
		/* Suballocators don't track a contiguous-free hint. */
		bg->bg_contig_free_bits = 0;
	}

	if (undo_fn)
		spin_unlock(&jh->b_state_lock);

	ocfs2_journal_dirty(handle, group_bh);
bail:
	return status;
}
2636 
2637 /*
2638  * Reclaim the suballocator managed space to main bitmap.
2639  * This function first works on the suballocator to perform the
2640  * cleanup rec/alloc_inode job, then switches to the main bitmap
2641  * to reclaim released space.
2642  *
2643  * handle: The transaction handle
2644  * alloc_inode: The suballoc inode
2645  * alloc_bh: The buffer_head of suballoc inode
2646  * group_bh: The group descriptor buffer_head of suballocator managed.
2647  *           Caller should release the input group_bh.
2648  */
2649 static int _ocfs2_reclaim_suballoc_to_main(handle_t *handle,
2650 			struct inode *alloc_inode,
2651 			struct buffer_head *alloc_bh,
2652 			struct buffer_head *group_bh)
2653 {
2654 	int idx, status = 0;
2655 	int i, next_free_rec, len = 0;
2656 	__le16 old_bg_contig_free_bits = 0;
2657 	u16 start_bit;
2658 	u32 tmp_used;
2659 	u64 bg_blkno, start_blk;
2660 	unsigned int count;
2661 	struct ocfs2_chain_rec *rec;
2662 	struct buffer_head *main_bm_bh = NULL;
2663 	struct inode *main_bm_inode = NULL;
2664 	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
2665 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2666 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2667 	struct ocfs2_group_desc *group = (struct ocfs2_group_desc *) group_bh->b_data;
2668 
2669 	idx = le16_to_cpu(group->bg_chain);
2670 	rec = &(cl->cl_recs[idx]);
2671 
2672 	status = ocfs2_extend_trans(handle,
2673 				ocfs2_calc_group_alloc_credits(osb->sb,
2674 						 le16_to_cpu(cl->cl_cpg)));
2675 	if (status) {
2676 		mlog_errno(status);
2677 		goto bail;
2678 	}
2679 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2680 					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2681 	if (status < 0) {
2682 		mlog_errno(status);
2683 		goto bail;
2684 	}
2685 
2686 	/*
2687 	 * Only clear the suballocator rec item in-place.
2688 	 *
2689 	 * If idx is not the last, we don't compress (remove the empty item)
2690 	 * the cl_recs[]. If not, we need to do lots jobs.
2691 	 *
2692 	 * Compress cl_recs[] code example:
2693 	 * if (idx != cl->cl_next_free_rec - 1)
2694 	 *     memmove(&cl->cl_recs[idx], &cl->cl_recs[idx + 1],
2695 	 *         sizeof(struct ocfs2_chain_rec) *
2696 	 *         (cl->cl_next_free_rec - idx - 1));
2697 	 * for(i = idx; i < cl->cl_next_free_rec-1; i++) {
2698 	 *     group->bg_chain = "later group->bg_chain";
2699 	 *     group->bg_blkno = xxx;
2700 	 *     ... ...
2701 	 * }
2702 	 */
2703 
2704 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_total);
2705 	fe->id1.bitmap1.i_total = cpu_to_le32(tmp_used - le32_to_cpu(rec->c_total));
2706 
2707 	/* Substraction 1 for the block group itself */
2708 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2709 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - 1);
2710 
2711 	tmp_used = le32_to_cpu(fe->i_clusters);
2712 	fe->i_clusters = cpu_to_le32(tmp_used - le16_to_cpu(cl->cl_cpg));
2713 
2714 	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
2715 	OCFS2_I(alloc_inode)->ip_clusters -= le32_to_cpu(fe->i_clusters);
2716 	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
2717 					     le32_to_cpu(fe->i_clusters)));
2718 	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
2719 	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
2720 	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
2721 
2722 	ocfs2_journal_dirty(handle, alloc_bh);
2723 	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
2724 
2725 	start_blk = le64_to_cpu(rec->c_blkno);
2726 	count = le32_to_cpu(rec->c_total) / le16_to_cpu(cl->cl_bpc);
2727 
2728 	/*
2729 	 * If the rec is the last one, let's compress the chain list by
2730 	 * removing the empty cl_recs[] at the end.
2731 	 */
2732 	next_free_rec = le16_to_cpu(cl->cl_next_free_rec);
2733 	if (idx == (next_free_rec - 1)) {
2734 		len++; /* the last item should be counted first */
2735 		for (i = (next_free_rec - 2); i > 0; i--) {
2736 			if (cl->cl_recs[i].c_free == cl->cl_recs[i].c_total)
2737 				len++;
2738 			else
2739 				break;
2740 		}
2741 	}
2742 	le16_add_cpu(&cl->cl_next_free_rec, -len);
2743 
2744 	rec->c_free = 0;
2745 	rec->c_total = 0;
2746 	rec->c_blkno = 0;
2747 	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), group_bh);
2748 	memset(group, 0, sizeof(struct ocfs2_group_desc));
2749 
2750 	/* prepare job for reclaim clusters */
2751 	main_bm_inode = ocfs2_get_system_file_inode(osb,
2752 						    GLOBAL_BITMAP_SYSTEM_INODE,
2753 						    OCFS2_INVALID_SLOT);
2754 	if (!main_bm_inode)
2755 		goto bail; /* ignore the error in reclaim path */
2756 
2757 	inode_lock(main_bm_inode);
2758 
2759 	status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
2760 	if (status < 0)
2761 		goto free_bm_inode; /* ignore the error in reclaim path */
2762 
2763 	ocfs2_block_to_cluster_group(main_bm_inode, start_blk, &bg_blkno,
2764 				     &start_bit);
2765 	fe = (struct ocfs2_dinode *) main_bm_bh->b_data;
2766 	cl = &fe->id2.i_chain;
2767 	/* reuse group_bh, caller will release the input group_bh */
2768 	group_bh = NULL;
2769 
2770 	/* reclaim clusters to global_bitmap */
2771 	status = ocfs2_read_group_descriptor(main_bm_inode, fe, bg_blkno,
2772 					     &group_bh);
2773 	if (status < 0) {
2774 		mlog_errno(status);
2775 		goto free_bm_bh;
2776 	}
2777 	group = (struct ocfs2_group_desc *) group_bh->b_data;
2778 
2779 	if ((count + start_bit) > le16_to_cpu(group->bg_bits)) {
2780 		ocfs2_error(alloc_inode->i_sb,
2781 			"reclaim length (%d) beyands block group length (%d)",
2782 			count + start_bit, le16_to_cpu(group->bg_bits));
2783 		goto free_group_bh;
2784 	}
2785 
2786 	old_bg_contig_free_bits = group->bg_contig_free_bits;
2787 	status = ocfs2_block_group_clear_bits(handle, main_bm_inode,
2788 					      group, group_bh,
2789 					      start_bit, count, 0,
2790 					      _ocfs2_clear_bit);
2791 	if (status < 0) {
2792 		mlog_errno(status);
2793 		goto free_group_bh;
2794 	}
2795 
2796 	status = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
2797 					 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2798 	if (status < 0) {
2799 		mlog_errno(status);
2800 		ocfs2_block_group_set_bits(handle, main_bm_inode, group, group_bh,
2801 				start_bit, count,
2802 				le16_to_cpu(old_bg_contig_free_bits), 1);
2803 		goto free_group_bh;
2804 	}
2805 
2806 	idx = le16_to_cpu(group->bg_chain);
2807 	rec = &(cl->cl_recs[idx]);
2808 
2809 	le32_add_cpu(&rec->c_free, count);
2810 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2811 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2812 	ocfs2_journal_dirty(handle, main_bm_bh);
2813 
2814 free_group_bh:
2815 	brelse(group_bh);
2816 
2817 free_bm_bh:
2818 	ocfs2_inode_unlock(main_bm_inode, 1);
2819 	brelse(main_bm_bh);
2820 
2821 free_bm_inode:
2822 	inode_unlock(main_bm_inode);
2823 	iput(main_bm_inode);
2824 
2825 bail:
2826 	return status;
2827 }
2828 
2829 /*
2830  * expects the suballoc inode to already be locked.
2831  */
2832 static int _ocfs2_free_suballoc_bits(handle_t *handle,
2833 				     struct inode *alloc_inode,
2834 				     struct buffer_head *alloc_bh,
2835 				     unsigned int start_bit,
2836 				     u64 bg_blkno,
2837 				     unsigned int count,
2838 				     void (*undo_fn)(unsigned int bit,
2839 						     unsigned long *bitmap))
2840 {
2841 	int idx, status = 0;
2842 	u32 tmp_used;
2843 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2844 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2845 	struct buffer_head *group_bh = NULL;
2846 	struct ocfs2_group_desc *group;
2847 	struct ocfs2_chain_rec *rec;
2848 	__le16 old_bg_contig_free_bits = 0;
2849 
2850 	/* The alloc_bh comes from ocfs2_free_dinode() or
2851 	 * ocfs2_free_clusters().  The callers have all locked the
2852 	 * allocator and gotten alloc_bh from the lock call.  This
2853 	 * validates the dinode buffer.  Any corruption that has happened
2854 	 * is a code bug. */
2855 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2856 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2857 
2858 	trace_ocfs2_free_suballoc_bits(
2859 		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
2860 		(unsigned long long)bg_blkno,
2861 		start_bit, count);
2862 
2863 	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2864 					     &group_bh);
2865 	if (status < 0) {
2866 		mlog_errno(status);
2867 		goto bail;
2868 	}
2869 	group = (struct ocfs2_group_desc *) group_bh->b_data;
2870 
2871 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2872 
2873 	if (ocfs2_is_cluster_bitmap(alloc_inode))
2874 		old_bg_contig_free_bits = group->bg_contig_free_bits;
2875 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2876 					      group, group_bh,
2877 					      start_bit, count, 0, undo_fn);
2878 	if (status < 0) {
2879 		mlog_errno(status);
2880 		goto bail;
2881 	}
2882 
2883 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2884 					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2885 	if (status < 0) {
2886 		mlog_errno(status);
2887 		ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
2888 				start_bit, count,
2889 				le16_to_cpu(old_bg_contig_free_bits), 1);
2890 		goto bail;
2891 	}
2892 
2893 	idx = le16_to_cpu(group->bg_chain);
2894 	rec = &(cl->cl_recs[idx]);
2895 
2896 	le32_add_cpu(&rec->c_free, count);
2897 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2898 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2899 	ocfs2_journal_dirty(handle, alloc_bh);
2900 
2901 	/*
2902 	 * Reclaim suballocator free space.
2903 	 * Bypass: global_bitmap, non empty rec, first rec in cl_recs[]
2904 	 */
2905 	if (ocfs2_is_cluster_bitmap(alloc_inode) ||
2906 	    (le32_to_cpu(rec->c_free) != (le32_to_cpu(rec->c_total) - 1)) ||
2907 	    (le16_to_cpu(cl->cl_next_free_rec) == 1)) {
2908 		goto bail;
2909 	}
2910 
2911 	_ocfs2_reclaim_suballoc_to_main(handle, alloc_inode, alloc_bh, group_bh);
2912 
2913 bail:
2914 	brelse(group_bh);
2915 	return status;
2916 }
2917 
2918 int ocfs2_free_suballoc_bits(handle_t *handle,
2919 			     struct inode *alloc_inode,
2920 			     struct buffer_head *alloc_bh,
2921 			     unsigned int start_bit,
2922 			     u64 bg_blkno,
2923 			     unsigned int count)
2924 {
2925 	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2926 					 start_bit, bg_blkno, count, NULL);
2927 }
2928 
2929 int ocfs2_free_dinode(handle_t *handle,
2930 		      struct inode *inode_alloc_inode,
2931 		      struct buffer_head *inode_alloc_bh,
2932 		      struct ocfs2_dinode *di)
2933 {
2934 	u64 blk = le64_to_cpu(di->i_blkno);
2935 	u16 bit = le16_to_cpu(di->i_suballoc_bit);
2936 	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2937 
2938 	if (di->i_suballoc_loc)
2939 		bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2940 	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2941 					inode_alloc_bh, bit, bg_blkno, 1);
2942 }
2943 
2944 static int _ocfs2_free_clusters(handle_t *handle,
2945 				struct inode *bitmap_inode,
2946 				struct buffer_head *bitmap_bh,
2947 				u64 start_blk,
2948 				unsigned int num_clusters,
2949 				void (*undo_fn)(unsigned int bit,
2950 						unsigned long *bitmap))
2951 {
2952 	int status;
2953 	u16 bg_start_bit;
2954 	u64 bg_blkno;
2955 
2956 	/* You can't ever have a contiguous set of clusters
2957 	 * bigger than a block group bitmap so we never have to worry
2958 	 * about looping on them.
2959 	 * This is expensive. We can safely remove once this stuff has
2960 	 * gotten tested really well. */
2961 	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
2962 				ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
2963 							 start_blk)));
2964 
2965 
2966 	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2967 				     &bg_start_bit);
2968 
2969 	trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
2970 			(unsigned long long)start_blk,
2971 			bg_start_bit, num_clusters);
2972 
2973 	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2974 					   bg_start_bit, bg_blkno,
2975 					   num_clusters, undo_fn);
2976 	if (status < 0) {
2977 		mlog_errno(status);
2978 		goto out;
2979 	}
2980 
2981 	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2982 					 num_clusters);
2983 
2984 out:
2985 	return status;
2986 }
2987 
2988 int ocfs2_free_clusters(handle_t *handle,
2989 			struct inode *bitmap_inode,
2990 			struct buffer_head *bitmap_bh,
2991 			u64 start_blk,
2992 			unsigned int num_clusters)
2993 {
2994 	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2995 				    start_blk, num_clusters,
2996 				    _ocfs2_set_bit);
2997 }
2998 
2999 /*
3000  * Give never-used clusters back to the global bitmap.  We don't need
3001  * to protect these bits in the undo buffer.
3002  */
3003 int ocfs2_release_clusters(handle_t *handle,
3004 			   struct inode *bitmap_inode,
3005 			   struct buffer_head *bitmap_bh,
3006 			   u64 start_blk,
3007 			   unsigned int num_clusters)
3008 {
3009 	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
3010 				    start_blk, num_clusters,
3011 				    _ocfs2_clear_bit);
3012 }
3013 
3014 /*
3015  * For a given allocation, determine which allocators will need to be
3016  * accessed, and lock them, reserving the appropriate number of bits.
3017  *
3018  * Sparse file systems call this from ocfs2_write_begin_nolock()
3019  * and ocfs2_allocate_unwritten_extents().
3020  *
3021  * File systems which don't support holes call this from
3022  * ocfs2_extend_allocation().
3023  */
3024 int ocfs2_lock_allocators(struct inode *inode,
3025 			  struct ocfs2_extent_tree *et,
3026 			  u32 clusters_to_add, u32 extents_to_split,
3027 			  struct ocfs2_alloc_context **data_ac,
3028 			  struct ocfs2_alloc_context **meta_ac)
3029 {
3030 	int ret = 0, num_free_extents;
3031 	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
3032 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3033 
3034 	*meta_ac = NULL;
3035 	if (data_ac)
3036 		*data_ac = NULL;
3037 
3038 	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
3039 
3040 	num_free_extents = ocfs2_num_free_extents(et);
3041 	if (num_free_extents < 0) {
3042 		ret = num_free_extents;
3043 		mlog_errno(ret);
3044 		goto out;
3045 	}
3046 
3047 	/*
3048 	 * Sparse allocation file systems need to be more conservative
3049 	 * with reserving room for expansion - the actual allocation
3050 	 * happens while we've got a journal handle open so re-taking
3051 	 * a cluster lock (because we ran out of room for another
3052 	 * extent) will violate ordering rules.
3053 	 *
3054 	 * Most of the time we'll only be seeing this 1 cluster at a time
3055 	 * anyway.
3056 	 *
3057 	 * Always lock for any unwritten extents - we might want to
3058 	 * add blocks during a split.
3059 	 */
3060 	if (!num_free_extents ||
3061 	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
3062 		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
3063 		if (ret < 0) {
3064 			if (ret != -ENOSPC)
3065 				mlog_errno(ret);
3066 			goto out;
3067 		}
3068 	}
3069 
3070 	if (clusters_to_add == 0)
3071 		goto out;
3072 
3073 	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
3074 	if (ret < 0) {
3075 		if (ret != -ENOSPC)
3076 			mlog_errno(ret);
3077 		goto out;
3078 	}
3079 
3080 out:
3081 	if (ret) {
3082 		if (*meta_ac) {
3083 			ocfs2_free_alloc_context(*meta_ac);
3084 			*meta_ac = NULL;
3085 		}
3086 
3087 		/*
3088 		 * We cannot have an error and a non null *data_ac.
3089 		 */
3090 	}
3091 
3092 	return ret;
3093 }
3094 
3095 /*
3096  * Read the inode specified by blkno to get suballoc_slot and
3097  * suballoc_bit.
3098  */
3099 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
3100 				       u16 *suballoc_slot, u64 *group_blkno,
3101 				       u16 *suballoc_bit)
3102 {
3103 	int status;
3104 	struct buffer_head *inode_bh = NULL;
3105 	struct ocfs2_dinode *inode_fe;
3106 
3107 	trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
3108 
3109 	/* dirty read disk */
3110 	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
3111 	if (status < 0) {
3112 		mlog(ML_ERROR, "read block %llu failed %d\n",
3113 		     (unsigned long long)blkno, status);
3114 		goto bail;
3115 	}
3116 
3117 	inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
3118 	if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
3119 		mlog(ML_ERROR, "invalid inode %llu requested\n",
3120 		     (unsigned long long)blkno);
3121 		status = -EINVAL;
3122 		goto bail;
3123 	}
3124 
3125 	if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
3126 	    (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
3127 		mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
3128 		     (unsigned long long)blkno,
3129 		     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
3130 		status = -EINVAL;
3131 		goto bail;
3132 	}
3133 
3134 	if (suballoc_slot)
3135 		*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
3136 	if (suballoc_bit)
3137 		*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
3138 	if (group_blkno)
3139 		*group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
3140 
3141 bail:
3142 	brelse(inode_bh);
3143 
3144 	if (status)
3145 		mlog_errno(status);
3146 	return status;
3147 }
3148 
3149 /*
3150  * test whether bit is SET in allocator bitmap or not.  on success, 0
3151  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
3152  * is returned and *res is meaningless.  Call this after you have
3153  * cluster locked against suballoc, or you may get a result based on
3154  * non-up2date contents
3155  */
3156 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
3157 				   struct inode *suballoc,
3158 				   struct buffer_head *alloc_bh,
3159 				   u64 group_blkno, u64 blkno,
3160 				   u16 bit, int *res)
3161 {
3162 	struct ocfs2_dinode *alloc_di;
3163 	struct ocfs2_group_desc *group;
3164 	struct buffer_head *group_bh = NULL;
3165 	u64 bg_blkno;
3166 	int status, quiet = 0, released = 0;
3167 
3168 	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
3169 				      (unsigned int)bit);
3170 
3171 	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
3172 	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
3173 		mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
3174 		     (unsigned int)bit,
3175 		     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
3176 		status = -EINVAL;
3177 		goto bail;
3178 	}
3179 
3180 	bg_blkno = group_blkno ? group_blkno :
3181 		   ocfs2_which_suballoc_group(blkno, bit);
3182 	status = ocfs2_read_hint_group_descriptor(suballoc, alloc_di, bg_blkno,
3183 					     &group_bh, &released);
3184 	if (released) {
3185 		quiet = 1;
3186 		status = -ESTALE;
3187 		goto bail;
3188 	} else if (status < 0) {
3189 		mlog(ML_ERROR, "read group %llu failed %d\n",
3190 		     (unsigned long long)bg_blkno, status);
3191 		goto bail;
3192 	}
3193 
3194 	group = (struct ocfs2_group_desc *) group_bh->b_data;
3195 	*res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
3196 
3197 bail:
3198 	brelse(group_bh);
3199 
3200 	if (status && !quiet)
3201 		mlog_errno(status);
3202 	return status;
3203 }
3204 
3205 /*
3206  * Test if the bit representing this inode (blkno) is set in the
3207  * suballocator.
3208  *
3209  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
3210  *
3211  * In the event of failure, a negative value is returned and *res is
3212  * meaningless.
3213  *
3214  * Callers must make sure to hold nfs_sync_lock to prevent
3215  * ocfs2_delete_inode() on another node from accessing the same
3216  * suballocator concurrently.
3217  */
3218 int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
3219 {
3220 	int status, quiet = 0;
3221 	u64 group_blkno = 0;
3222 	u16 suballoc_bit = 0, suballoc_slot = 0;
3223 	struct inode *inode_alloc_inode;
3224 	struct buffer_head *alloc_bh = NULL;
3225 
3226 	trace_ocfs2_test_inode_bit((unsigned long long)blkno);
3227 
3228 	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
3229 					     &group_blkno, &suballoc_bit);
3230 	if (status < 0) {
3231 		mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
3232 		goto bail;
3233 	}
3234 
3235 	if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
3236 		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
3237 			GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
3238 	else
3239 		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
3240 			INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
3241 	if (!inode_alloc_inode) {
3242 		/* the error code could be inaccurate, but we are not able to
3243 		 * get the correct one. */
3244 		status = -EINVAL;
3245 		mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
3246 		     (u32)suballoc_slot);
3247 		goto bail;
3248 	}
3249 
3250 	inode_lock(inode_alloc_inode);
3251 	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
3252 	if (status < 0) {
3253 		inode_unlock(inode_alloc_inode);
3254 		iput(inode_alloc_inode);
3255 		mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
3256 		     (u32)suballoc_slot, status);
3257 		goto bail;
3258 	}
3259 
3260 	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
3261 					 group_blkno, blkno, suballoc_bit, res);
3262 	if (status < 0) {
3263 		if (status == -ESTALE)
3264 			quiet = 1;
3265 		else
3266 			mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
3267 	}
3268 
3269 	ocfs2_inode_unlock(inode_alloc_inode, 0);
3270 	inode_unlock(inode_alloc_inode);
3271 
3272 	iput(inode_alloc_inode);
3273 	brelse(alloc_bh);
3274 bail:
3275 	if (status && !quiet)
3276 		mlog_errno(status);
3277 	return status;
3278 }
3279