xref: /linux/fs/ocfs2/suballoc.c (revision 6110d18e208cc5572158928401246d98cd2b90b4)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * suballoc.c
4  *
5  * metadata alloc and free
6  * Inspired by ext3 block groups.
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  */
10 
11 #include <linux/fs.h>
12 #include <linux/types.h>
13 #include <linux/slab.h>
14 #include <linux/string.h>
15 #include <linux/highmem.h>
16 
17 #include <cluster/masklog.h>
18 
19 #include "ocfs2.h"
20 
21 #include "alloc.h"
22 #include "blockcheck.h"
23 #include "dlmglue.h"
24 #include "inode.h"
25 #include "journal.h"
26 #include "localalloc.h"
27 #include "suballoc.h"
28 #include "super.h"
29 #include "sysfile.h"
30 #include "uptodate.h"
31 #include "ocfs2_trace.h"
32 
33 #include "buffer_head_io.h"
34 
35 #define NOT_ALLOC_NEW_GROUP		0
36 #define ALLOC_NEW_GROUP			0x1
37 #define ALLOC_GROUPS_FROM_GLOBAL	0x2
38 
39 #define OCFS2_MAX_TO_STEAL		1024
40 
/*
 * Result of a single suballocator claim.  Filled in by the group
 * search/claim helpers and read back by the ocfs2_claim_*() callers.
 */
struct ocfs2_suballoc_result {
	u64		sr_bg_blkno;	/* The bg we allocated from.  Set
					   to 0 when a block group is
					   contiguous. */
	u64		sr_bg_stable_blkno; /*
					     * Doesn't change, always
					     * set to target block
					     * group descriptor
					     * block.
					     */
	u64		sr_blkno;	/* The first allocated block */
	unsigned int	sr_bit_offset;	/* The bit in the bg */
	unsigned int	sr_bits;	/* How many bits we claimed */
	unsigned int	sr_max_contig_bits; /* The length for contiguous
					     * free bits, only available
					     * for cluster group
					     */
};
59 
60 static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
61 {
62 	if (res->sr_blkno == 0)
63 		return 0;
64 
65 	if (res->sr_bg_blkno)
66 		return res->sr_bg_blkno;
67 
68 	return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
69 }
70 
71 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
72 static int ocfs2_block_group_fill(handle_t *handle,
73 				  struct inode *alloc_inode,
74 				  struct buffer_head *bg_bh,
75 				  u64 group_blkno,
76 				  unsigned int group_clusters,
77 				  u16 my_chain,
78 				  struct ocfs2_chain_list *cl);
79 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
80 				   struct inode *alloc_inode,
81 				   struct buffer_head *bh,
82 				   u64 max_block,
83 				   u64 *last_alloc_group,
84 				   int flags);
85 
86 static int ocfs2_cluster_group_search(struct inode *inode,
87 				      struct buffer_head *group_bh,
88 				      u32 bits_wanted, u32 min_bits,
89 				      u64 max_block,
90 				      struct ocfs2_suballoc_result *res);
91 static int ocfs2_block_group_search(struct inode *inode,
92 				    struct buffer_head *group_bh,
93 				    u32 bits_wanted, u32 min_bits,
94 				    u64 max_block,
95 				    struct ocfs2_suballoc_result *res);
96 static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
97 				     handle_t *handle,
98 				     u32 bits_wanted,
99 				     u32 min_bits,
100 				     struct ocfs2_suballoc_result *res);
101 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
102 					 int nr);
103 static int ocfs2_relink_block_group(handle_t *handle,
104 				    struct inode *alloc_inode,
105 				    struct buffer_head *fe_bh,
106 				    struct buffer_head *bg_bh,
107 				    struct buffer_head *prev_bg_bh,
108 				    u16 chain);
109 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
110 						     u32 wanted);
111 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
112 						   u64 bg_blkno,
113 						   u16 bg_bit_off);
114 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
115 						u64 data_blkno,
116 						u64 *bg_blkno,
117 						u16 *bg_bit_off);
118 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
119 					     u32 bits_wanted, u64 max_block,
120 					     int flags,
121 					     struct ocfs2_alloc_context **ac);
122 
123 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
124 {
125 	struct inode *inode = ac->ac_inode;
126 
127 	if (inode) {
128 		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
129 			ocfs2_inode_unlock(inode, 1);
130 
131 		inode_unlock(inode);
132 
133 		iput(inode);
134 		ac->ac_inode = NULL;
135 	}
136 	brelse(ac->ac_bh);
137 	ac->ac_bh = NULL;
138 	ac->ac_resv = NULL;
139 	kfree(ac->ac_find_loc_priv);
140 	ac->ac_find_loc_priv = NULL;
141 }
142 
/* Release all held resources, then the context itself. */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	ocfs2_free_ac_resource(ac);
	kfree(ac);
}
148 
149 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
150 {
151 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
152 }
153 
/*
 * Report a corrupt group descriptor.  During resize ('resize' true in
 * the enclosing validator) we only log and let validation continue;
 * otherwise ocfs2_error() marks the fs errored and we return from the
 * enclosing function immediately.  Relies on 'sb' and 'resize' being
 * in scope at the expansion site.
 */
#define do_error(fmt, ...)						\
do {									\
	if (resize)							\
		mlog(ML_ERROR, fmt, ##__VA_ARGS__);			\
	else								\
		return ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
} while (0)
161 
/*
 * Validate everything about a group descriptor that can be checked
 * against the descriptor block alone (no parent dinode needed):
 * signature, self-referencing block number, fs generation, bit counts
 * and, for discontiguous groups, the embedded extent list.  With
 * 'resize' set, do_error() only logs and the checks fall through;
 * otherwise the first failure marks the fs errored and returns.
 */
static int ocfs2_validate_gd_self(struct super_block *sb,
				  struct buffer_head *bh,
				  int resize)
{
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		do_error("Group descriptor #%llu has bad signature %.*s\n",
			 (unsigned long long)bh->b_blocknr, 7,
			 gd->bg_signature);
	}

	/* The descriptor must point back at the block it lives in. */
	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
		do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
	}

	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
		do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
			 (unsigned long long)bh->b_blocknr,
			 le32_to_cpu(gd->bg_generation));
	}

	/* Free bits can never exceed the group's total bits. */
	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
		do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 le16_to_cpu(gd->bg_free_bits_count));
	}

	/* bg_size is the bitmap size in bytes, so it bounds bg_bits. */
	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
		do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits),
			 8 * le16_to_cpu(gd->bg_size));
	}

	/*
	 * For discontiguous block groups, validate the on-disk extent list
	 * against the maximum number of extent records that can physically
	 * fit in a single block.
	 */
	if (ocfs2_gd_is_discontig(gd)) {
		u16 max_recs = ocfs2_extent_recs_per_gd(sb);
		u16 l_count = le16_to_cpu(gd->bg_list.l_count);
		u16 l_next_free_rec = le16_to_cpu(gd->bg_list.l_next_free_rec);

		if (l_count != max_recs) {
			do_error("Group descriptor #%llu bad discontig l_count %u expected %u\n",
				 (unsigned long long)bh->b_blocknr,
				 l_count,
				 max_recs);
		}

		if (l_next_free_rec > l_count) {
			do_error("Group descriptor #%llu bad discontig l_next_free_rec %u max %u\n",
				 (unsigned long long)bh->b_blocknr,
				 l_next_free_rec,
				 l_count);
		}
	}

	return 0;
}
227 
/*
 * Validate the group descriptor fields that are relative to its owning
 * chain allocator dinode 'di': parent pointer, per-group bit limit and
 * chain index.  'resize' relaxes the chain check (see below) and makes
 * do_error() log instead of failing the filesystem.
 */
static int ocfs2_validate_gd_parent(struct super_block *sb,
				    struct ocfs2_dinode *di,
				    struct buffer_head *bh,
				    int resize)
{
	unsigned int max_bits;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;

	/* Both sides are on-disk little-endian, so compare them raw. */
	if (di->i_blkno != gd->bg_parent_dinode) {
		do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
			 (unsigned long long)bh->b_blocknr,
			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
			 (unsigned long long)le64_to_cpu(di->i_blkno));
	}

	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
	if (le16_to_cpu(gd->bg_bits) > max_bits) {
		do_error("Group descriptor #%llu has bit count of %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_bits));
	}

	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
	if ((le16_to_cpu(gd->bg_chain) >
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
	    ((le16_to_cpu(gd->bg_chain) ==
	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
		do_error("Group descriptor #%llu has bad chain %u\n",
			 (unsigned long long)bh->b_blocknr,
			 le16_to_cpu(gd->bg_chain));
	}

	return 0;
}
262 
263 #undef do_error
264 
265 /*
266  * This version only prints errors.  It does not fail the filesystem, and
267  * exists only for resize.
268  */
269 int ocfs2_check_group_descriptor(struct super_block *sb,
270 				 struct ocfs2_dinode *di,
271 				 struct buffer_head *bh)
272 {
273 	int rc;
274 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
275 
276 	BUG_ON(!buffer_uptodate(bh));
277 
278 	/*
279 	 * If the ecc fails, we return the error but otherwise
280 	 * leave the filesystem running.  We know any error is
281 	 * local to this block.
282 	 */
283 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
284 	if (rc) {
285 		mlog(ML_ERROR,
286 		     "Checksum failed for group descriptor %llu\n",
287 		     (unsigned long long)bh->b_blocknr);
288 	} else
289 		rc = ocfs2_validate_gd_self(sb, bh, 1);
290 	if (!rc)
291 		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
292 
293 	return rc;
294 }
295 
296 static int ocfs2_validate_group_descriptor(struct super_block *sb,
297 					   struct buffer_head *bh)
298 {
299 	int rc;
300 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
301 
302 	trace_ocfs2_validate_group_descriptor(
303 					(unsigned long long)bh->b_blocknr);
304 
305 	BUG_ON(!buffer_uptodate(bh));
306 
307 	/*
308 	 * If the ecc fails, we return the error but otherwise
309 	 * leave the filesystem running.  We know any error is
310 	 * local to this block.
311 	 */
312 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
313 	if (rc)
314 		return rc;
315 
316 	/*
317 	 * Errors after here are fatal.
318 	 */
319 
320 	return ocfs2_validate_gd_self(sb, bh, 0);
321 }
322 
323 /*
324  * The hint group descriptor (gd) may already have been released
325  * in _ocfs2_free_suballoc_bits(). We first check the gd signature,
326  * then perform the standard ocfs2_read_group_descriptor() jobs.
327  *
328  * If the gd signature is invalid, we return 'rc=0' and set
329  * '*released=1'. The caller is expected to handle this specific case.
330  * Otherwise, we return the actual error code.
331  *
332  * We treat gd signature corruption case as a release case. The
333  * caller ocfs2_claim_suballoc_bits() will use ocfs2_search_chain()
334  * to search each gd block. The code will eventually find this
335  * corrupted gd block - Late, but not missed.
336  *
337  * Note:
338  * The caller is responsible for initializing the '*released' status.
339  */
static int ocfs2_read_hint_group_descriptor(struct inode *inode,
			struct ocfs2_dinode *di, u64 gd_blkno,
			struct buffer_head *bh, int *released)
{
	int rc;
	struct buffer_head *tmp = *bh;
	struct ocfs2_group_desc *gd;

	/* Read with no validator; the signature is checked by hand below
	 * so a released gd can be reported instead of treated as fatal. */
	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, NULL);
	if (rc)
		goto out;

	gd = (struct ocfs2_group_desc *) tmp->b_data;
	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
		/*
		 * Invalid gd cache was set in ocfs2_read_block(),
		 * which will affect block_group allocation.
		 * Path:
		 * ocfs2_reserve_suballoc_bits
		 *  ocfs2_block_group_alloc
		 *   ocfs2_block_group_alloc_contig
		 *    ocfs2_set_new_buffer_uptodate
		 */
		ocfs2_remove_from_cache(INODE_CACHE(inode), tmp);
		*released = 1; /* we return 'rc=0' for this case */
		goto free_bh;
	}

	/* below jobs same with ocfs2_read_group_descriptor() */
	if (!buffer_jbd(tmp)) {
		rc = ocfs2_validate_group_descriptor(inode->i_sb, tmp);
		if (rc)
			goto free_bh;
	}

	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
	if (rc)
		goto free_bh;

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!*bh)
		*bh = tmp;

	return rc;

free_bh:
	brelse(tmp);
out:
	return rc;
}
390 
391 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
392 				u64 gd_blkno, struct buffer_head **bh)
393 {
394 	int rc;
395 	struct buffer_head *tmp = *bh;
396 
397 	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
398 			      ocfs2_validate_group_descriptor);
399 	if (rc)
400 		goto out;
401 
402 	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
403 	if (rc) {
404 		brelse(tmp);
405 		goto out;
406 	}
407 
408 	/* If ocfs2_read_block() got us a new bh, pass it up. */
409 	if (!*bh)
410 		*bh = tmp;
411 
412 out:
413 	return rc;
414 }
415 
/*
 * Append a physical extent (p_blkno, clusters) to a discontiguous
 * group descriptor's embedded extent list and grow the group's bit
 * counts accordingly.  The caller already holds journal access on the
 * descriptor block.
 */
static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_chain_list *cl,
					  u64 p_blkno, unsigned int clusters)
{
	struct ocfs2_extent_list *el = &bg->bg_list;
	struct ocfs2_extent_rec *rec;

	BUG_ON(!ocfs2_supports_discontig_bg(osb));
	/* First extent added: size the embedded list lazily. */
	if (!el->l_next_free_rec)
		el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
	rec->e_blkno = cpu_to_le64(p_blkno);
	/* cpos = clusters already in the group, i.e. current bits / bpc;
	 * must be computed before bg_bits is bumped below. */
	rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
				  le16_to_cpu(cl->cl_bpc));
	rec->e_leaf_clusters = cpu_to_le16(clusters);
	le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&bg->bg_free_bits_count,
		     clusters * le16_to_cpu(cl->cl_bpc));
	le16_add_cpu(&el->l_next_free_rec, 1);
}
437 
/*
 * Initialize a freshly claimed group descriptor block: zero it, stamp
 * signature/generation, link it after the current head of chain
 * 'my_chain', and set up its bitmap.  Runs under journal 'handle'
 * with CREATE access.  The caller links the group into the chain
 * list afterwards.
 */
static int ocfs2_block_group_fill(handle_t *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  unsigned int group_clusters,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl)
{
	int status = 0;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct super_block * sb = alloc_inode->i_sb;

	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "group block (%llu) != b_blocknr (%llu)\n",
				     (unsigned long long)group_blkno,
				     (unsigned long long) bg_bh->b_blocknr);
		goto bail;
	}

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	memset(bg, 0, sb->s_blocksize);
	strscpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
	bg->bg_generation = cpu_to_le32(osb->fs_generation);
	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
						osb->s_feature_incompat));
	bg->bg_chain = cpu_to_le16(my_chain);
	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
	bg->bg_blkno = cpu_to_le64(group_blkno);
	/* A full-sized group is one contiguous run; anything smaller
	 * starts life discontiguous with a single extent record. */
	if (group_clusters == le16_to_cpu(cl->cl_cpg))
		bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
	else
		ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
					      group_clusters);

	/* set the 1st bit in the bitmap to account for the descriptor block */
	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);

	ocfs2_journal_dirty(handle, bg_bh);

	/* There is no need to zero out or otherwise initialize the
	 * other blocks in a group - All valid FS metadata in a block
	 * group stores the superblock fs_generation value at
	 * allocation time. */

bail:
	if (status)
		mlog_errno(status);
	return status;
}
499 
500 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
501 {
502 	u16 curr, best;
503 
504 	best = curr = 0;
505 	while (curr < le16_to_cpu(cl->cl_count)) {
506 		if (le32_to_cpu(cl->cl_recs[best].c_total) >
507 		    le32_to_cpu(cl->cl_recs[curr].c_total))
508 			best = curr;
509 		curr++;
510 	}
511 	return best;
512 }
513 
/*
 * Claim one full cluster group's worth of contiguous clusters and
 * fill in the new group descriptor.  Returns the referenced
 * descriptor bh or an ERR_PTR(); ERR_PTR(-ENOSPC) tells the caller
 * to try the discontiguous path instead.
 */
static struct buffer_head *
ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
			       struct inode *alloc_inode,
			       struct ocfs2_alloc_context *ac,
			       struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	struct buffer_head *bg_bh;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);

	status = ocfs2_claim_clusters(handle, ac,
				      le16_to_cpu(cl->cl_cpg), &bit_off,
				      &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_contig(
	     (unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	/* Brand-new block; mark it uptodate without a disk read. */
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		brelse(bg_bh);
		mlog_errno(status);
	}

bail:
	return status ? ERR_PTR(status) : bg_bh;
}
558 
559 static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
560 					handle_t *handle,
561 					struct ocfs2_alloc_context *ac,
562 					unsigned int min_bits,
563 					u32 *bit_off, u32 *num_bits)
564 {
565 	int status = 0;
566 
567 	while (min_bits) {
568 		status = ocfs2_claim_clusters(handle, ac, min_bits,
569 					      bit_off, num_bits);
570 		if (status != -ENOSPC)
571 			break;
572 
573 		min_bits >>= 1;
574 	}
575 
576 	return status;
577 }
578 
/*
 * Grow a discontiguous group descriptor toward cl_cpg clusters by
 * claiming additional (progressively smaller) extents until the group
 * is full or its embedded extent list runs out of records.  Returns
 * -ENOSPC when the group could not be filled; the caller is expected
 * to tear down the partially built group.
 */
static int ocfs2_block_group_grow_discontig(handle_t *handle,
					    struct inode *alloc_inode,
					    struct buffer_head *bg_bh,
					    struct ocfs2_alloc_context *ac,
					    struct ocfs2_chain_list *cl,
					    unsigned int min_bits)
{
	int status;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
	struct ocfs2_group_desc *bg =
		(struct ocfs2_group_desc *)bg_bh->b_data;
	unsigned int needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	u32 p_cpos, clusters;
	u64 p_blkno;
	struct ocfs2_extent_list *el = &bg->bg_list;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 bg_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
				le16_to_cpu(el->l_count))) {
		/* Never ask for more than is still missing. */
		if (min_bits > needed)
			min_bits = needed;
		status = ocfs2_block_group_claim_bits(osb, handle, ac,
						      min_bits, &p_cpos,
						      &clusters);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
		ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
					      clusters);

		/* The next claim can't expect more than we just got. */
		min_bits = clusters;
		needed = le16_to_cpu(cl->cl_cpg) -
			 le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
	}

	if (needed > 0) {
		/*
		 * We have used up all the extent rec but can't fill up
		 * the cpg. So bail out.
		 */
		status = -ENOSPC;
		goto bail;
	}

	ocfs2_journal_dirty(handle, bg_bh);

bail:
	return status;
}
640 
641 static void ocfs2_bg_alloc_cleanup(handle_t *handle,
642 				   struct ocfs2_alloc_context *cluster_ac,
643 				   struct inode *alloc_inode,
644 				   struct buffer_head *bg_bh)
645 {
646 	int i, ret;
647 	struct ocfs2_group_desc *bg;
648 	struct ocfs2_extent_list *el;
649 	struct ocfs2_extent_rec *rec;
650 
651 	if (!bg_bh)
652 		return;
653 
654 	bg = (struct ocfs2_group_desc *)bg_bh->b_data;
655 	el = &bg->bg_list;
656 	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
657 		rec = &el->l_recs[i];
658 		ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
659 					  cluster_ac->ac_bh,
660 					  le64_to_cpu(rec->e_blkno),
661 					  le16_to_cpu(rec->e_leaf_clusters));
662 		if (ret)
663 			mlog_errno(ret);
664 		/* Try all the clusters to free */
665 	}
666 
667 	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
668 	brelse(bg_bh);
669 }
670 
/*
 * Build a new cluster group out of multiple smaller extents when a
 * contiguous claim failed.  Returns the referenced descriptor bh or
 * an ERR_PTR(); any partially built group is cleaned up here.
 */
static struct buffer_head *
ocfs2_block_group_alloc_discontig(handle_t *handle,
				  struct inode *alloc_inode,
				  struct ocfs2_alloc_context *ac,
				  struct ocfs2_chain_list *cl)
{
	int status;
	u32 bit_off, num_bits;
	u64 bg_blkno;
	/* Start by asking for half a group; claim_bits halves further. */
	unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
	struct buffer_head *bg_bh = NULL;
	unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	if (!ocfs2_supports_discontig_bg(osb)) {
		status = -ENOSPC;
		goto bail;
	}

	status = ocfs2_extend_trans(handle,
				    ocfs2_calc_bg_discontig_credits(osb->sb));
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * We're going to be grabbing from multiple cluster groups.
	 * We don't have enough credits to relink them all, and the
	 * cluster groups will be staying in cache for the duration of
	 * this operation.
	 */
	ac->ac_disable_chain_relink = 1;

	/* Claim the first region */
	status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
					      &bit_off, &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	min_bits = num_bits;

	/* setup the group */
	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	trace_ocfs2_block_group_alloc_discontig(
				(unsigned long long)bg_blkno, alloc_rec);

	bg_bh = sb_getblk(osb->sb, bg_blkno);
	if (!bg_bh) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);

	status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
					bg_blkno, num_bits, alloc_rec, cl);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
						  bg_bh, ac, cl, min_bits);
	if (status)
		mlog_errno(status);

bail:
	/* On any failure, free claimed extents and drop the bh. */
	if (status)
		ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
	return status ? ERR_PTR(status) : bg_bh;
}
745 
746 /*
747  * We expect the block group allocator to already be locked.
748  */
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh,
				   u64 max_block,
				   u64 *last_alloc_group,
				   int flags)
{
	int status, credits;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
	struct ocfs2_chain_list *cl;
	struct ocfs2_alloc_context *ac = NULL;
	handle_t *handle = NULL;
	u16 alloc_rec;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;

	/* The cluster bitmap never grows by adding groups. */
	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));

	cl = &fe->id2.i_chain;
	status = ocfs2_reserve_clusters_with_limit(osb,
						   le16_to_cpu(cl->cl_cpg),
						   max_block, flags, &ac);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	credits = ocfs2_calc_group_alloc_credits(osb->sb,
						 le16_to_cpu(cl->cl_cpg));
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* Seed the cluster search with the caller's cached hint. */
	if (last_alloc_group && *last_alloc_group != 0) {
		trace_ocfs2_block_group_alloc(
				(unsigned long long)*last_alloc_group);
		ac->ac_last_group = *last_alloc_group;
	}

	/* Prefer a contiguous group; fall back to a discontiguous one
	 * only when contiguous space has run out. */
	bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
					       ac, cl);
	if (PTR_ERR(bg_bh) == -ENOSPC) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		bg_bh = ocfs2_block_group_alloc_discontig(handle,
							  alloc_inode,
							  ac, cl);
	}
	if (IS_ERR(bg_bh)) {
		status = PTR_ERR(bg_bh);
		bg_bh = NULL;
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Link the new group at the head of its chain and account its
	 * bits in the chain record and the allocator dinode. */
	alloc_rec = le16_to_cpu(bg->bg_chain);
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
		     le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
		     le16_to_cpu(bg->bg_bits));
	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
		le16_add_cpu(&cl->cl_next_free_rec, 1);

	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
					le16_to_cpu(bg->bg_free_bits_count));
	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));

	ocfs2_journal_dirty(handle, bh);

	/* Mirror the new size into the in-memory inode. */
	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
					     le32_to_cpu(fe->i_clusters)));
	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);

	status = 0;

	/* save the new last alloc group so that the caller can cache it. */
	if (last_alloc_group)
		*last_alloc_group = ac->ac_last_group;

bail:
	if (handle)
		ocfs2_commit_trans(osb, handle);

	if (ac)
		ocfs2_free_alloc_context(ac);

	brelse(bg_bh);

	if (status)
		mlog_errno(status);
	return status;
}
861 
/*
 * Lock the allocator inode for (type, slot) and make sure it has at
 * least ac->ac_bits_wanted free bits, growing it by a new block group
 * when 'flags' allow.  On success ac->ac_inode and ac->ac_bh are set
 * and the inode is left locked; the caller releases everything via
 * ocfs2_free_ac_resource() / ocfs2_free_alloc_context() on both
 * success and later failure.
 */
static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
				       struct ocfs2_alloc_context *ac,
				       int type,
				       u32 slot,
				       u64 *last_alloc_group,
				       int flags)
{
	int status;
	u32 bits_wanted = ac->ac_bits_wanted;
	struct inode *alloc_inode;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe;
	u32 free_bits;

	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
	if (!alloc_inode) {
		mlog_errno(-EINVAL);
		return -EINVAL;
	}

	inode_lock(alloc_inode);

	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
	if (status < 0) {
		/* Nothing attached to ac yet, so unwind locally. */
		inode_unlock(alloc_inode);
		iput(alloc_inode);

		mlog_errno(status);
		return status;
	}

	/* From here on the inode (still locked) belongs to 'ac'. */
	ac->ac_inode = alloc_inode;
	ac->ac_alloc_slot = slot;

	fe = (struct ocfs2_dinode *) bh->b_data;

	/* The bh was validated by the inode read inside
	 * ocfs2_inode_lock().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
		status = ocfs2_error(alloc_inode->i_sb,
				     "Invalid chain allocator %llu\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno));
		goto bail;
	}

	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
		le32_to_cpu(fe->id1.bitmap1.i_used);

	if (bits_wanted > free_bits) {
		/* cluster bitmap never grows */
		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
			trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
								free_bits);
			status = -ENOSPC;
			goto bail;
		}

		if (!(flags & ALLOC_NEW_GROUP)) {
			trace_ocfs2_reserve_suballoc_bits_no_new_group(
						slot, bits_wanted, free_bits);
			status = -ENOSPC;
			goto bail;
		}

		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
						 ac->ac_max_block,
						 last_alloc_group, flags);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
		atomic_inc(&osb->alloc_stats.bg_extends);

		/* You should never ask for this much metadata */
		BUG_ON(bits_wanted >
		       (le32_to_cpu(fe->id1.bitmap1.i_total)
			- le32_to_cpu(fe->id1.bitmap1.i_used)));
	}

	get_bh(bh);
	ac->ac_bh = bh;
bail:
	brelse(bh);

	if (status)
		mlog_errno(status);
	return status;
}
953 
/* Forget the cached inode-steal slot and reset its steal counter. */
static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_inodes_stolen, 0);
}
961 
/* Forget the cached metadata-steal slot and reset its steal counter. */
static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
{
	spin_lock(&osb->osb_lock);
	osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
	spin_unlock(&osb->osb_lock);
	atomic_set(&osb->s_num_meta_stolen, 0);
}
969 
/* Reset both (inode and metadata) steal-slot caches. */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
	ocfs2_init_inode_steal_slot(osb);
	ocfs2_init_meta_steal_slot(osb);
}
975 
976 static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
977 {
978 	spin_lock(&osb->osb_lock);
979 	if (type == INODE_ALLOC_SYSTEM_INODE)
980 		osb->s_inode_steal_slot = (u16)slot;
981 	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
982 		osb->s_meta_steal_slot = (u16)slot;
983 	spin_unlock(&osb->osb_lock);
984 }
985 
986 static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
987 {
988 	int slot = OCFS2_INVALID_SLOT;
989 
990 	spin_lock(&osb->osb_lock);
991 	if (type == INODE_ALLOC_SYSTEM_INODE)
992 		slot = osb->s_inode_steal_slot;
993 	else if (type == EXTENT_ALLOC_SYSTEM_INODE)
994 		slot = osb->s_meta_steal_slot;
995 	spin_unlock(&osb->osb_lock);
996 
997 	return slot;
998 }
999 
/* Cached inode-steal slot, or OCFS2_INVALID_SLOT when not stealing. */
static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
}
1004 
/* Cached metadata-steal slot, or OCFS2_INVALID_SLOT when not stealing. */
static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
{
	return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
}
1009 
/*
 * Try to reserve bits from other slots' allocators of 'type', walking
 * all slots (wrapping, skipping our own) starting just after the last
 * slot we successfully stole from.  On success the winning slot is
 * cached for next time.  Returns -ENOSPC when no slot can satisfy the
 * request.
 */
static int ocfs2_steal_resource(struct ocfs2_super *osb,
				struct ocfs2_alloc_context *ac,
				int type)
{
	int i, status = -ENOSPC;
	int slot = __ocfs2_get_steal_slot(osb, type);

	/* Start to steal resource from the first slot after ours. */
	if (slot == OCFS2_INVALID_SLOT)
		slot = osb->slot_num + 1;

	for (i = 0; i < osb->max_slots; i++, slot++) {
		/* Wrap around past the last slot. */
		if (slot == osb->max_slots)
			slot = 0;

		if (slot == osb->slot_num)
			continue;

		/* Never grow a foreign allocator, only use its free bits. */
		status = ocfs2_reserve_suballoc_bits(osb, ac,
						     type,
						     (u32)slot, NULL,
						     NOT_ALLOC_NEW_GROUP);
		if (status >= 0) {
			__ocfs2_set_steal_slot(osb, slot, type);
			break;
		}

		/* Drop the locked inode/bh before trying the next slot. */
		ocfs2_free_ac_resource(ac);
	}

	return status;
}
1042 
/* Steal inode allocator bits from another slot. */
static int ocfs2_steal_inode(struct ocfs2_super *osb,
			     struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
}
1048 
/* Steal extent-allocator (metadata) bits from another node's slot. */
static int ocfs2_steal_meta(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context *ac)
{
	return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
}
1054 
/*
 * Reserve @blocks bits from our slot's extent allocator, allocating and
 * initializing *ac.  If we have been stealing metadata from other slots
 * (and haven't yet stolen OCFS2_MAX_TO_STEAL times), keep stealing;
 * otherwise try our own slot first and fall back to stealing on -ENOSPC.
 * On failure *ac is freed and set to NULL.
 */
int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
				      int blocks,
				      struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_meta_steal_slot(osb);

	*ac = kzalloc_obj(struct ocfs2_alloc_context);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = blocks;
	(*ac)->ac_which = OCFS2_AC_USE_META;
	(*ac)->ac_group_search = ocfs2_block_group_search;

	/* Still in "stealing" mode and under the steal cap: skip our slot. */
	if (slot != OCFS2_INVALID_SLOT &&
		atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
		goto extent_steal;

	atomic_set(&osb->s_num_meta_stolen, 0);
	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
					     EXTENT_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num, NULL,
					     ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);


	if (status >= 0) {
		status = 0;
		/* Our own slot worked again: stop stealing from others. */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_meta_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	/* Our slot is full (-ENOSPC); drop its resources and steal. */
	ocfs2_free_ac_resource(*ac);

extent_steal:
	status = ocfs2_steal_meta(osb, *ac);
	atomic_inc(&osb->s_num_meta_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}
1116 
/*
 * Reserve enough metadata blocks for an extent tree extension rooted at
 * @root_el; sizing is delegated to ocfs2_extend_meta_needed().
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_new_metadata_blocks(osb,
					ocfs2_extend_meta_needed(root_el),
					ac);
}
1125 
/*
 * Reserve one bit from an inode allocator, allocating and initializing
 * *ac.  Mirrors ocfs2_reserve_new_metadata_blocks(): prefer our own
 * slot, fall back to stealing from other slots on -ENOSPC, and keep
 * stealing (up to OCFS2_MAX_TO_STEAL times) once we've started.
 * On failure *ac is freed and set to NULL.
 */
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
			    struct ocfs2_alloc_context **ac)
{
	int status;
	int slot = ocfs2_get_inode_steal_slot(osb);
	u64 alloc_group;

	*ac = kzalloc_obj(struct ocfs2_alloc_context);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = 1;
	(*ac)->ac_which = OCFS2_AC_USE_INODE;

	(*ac)->ac_group_search = ocfs2_block_group_search;

	/*
	 * stat(2) can't handle i_ino > 32bits, so we tell the
	 * lower levels not to allocate us a block group past that
	 * limit.  The 'inode64' mount option avoids this behavior.
	 */
	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
		(*ac)->ac_max_block = (u32)~0U;

	/*
	 * slot is set when we successfully steal inode from other nodes.
	 * It is reset in 3 places:
	 * 1. when we flush the truncate log
	 * 2. when we complete local alloc recovery.
	 * 3. when we successfully allocate from our own slot.
	 * After it is set, we will go on stealing inodes until we find the
	 * need to check our slots to see whether there is some space for us.
	 */
	if (slot != OCFS2_INVALID_SLOT &&
	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
		goto inode_steal;

	atomic_set(&osb->s_num_inodes_stolen, 0);
	/* Pass the cached allocation-group hint down; it may be updated. */
	alloc_group = osb->osb_inode_alloc_group;
	status = ocfs2_reserve_suballoc_bits(osb, *ac,
					     INODE_ALLOC_SYSTEM_INODE,
					     (u32)osb->slot_num,
					     &alloc_group,
					     ALLOC_NEW_GROUP |
					     ALLOC_GROUPS_FROM_GLOBAL);
	if (status >= 0) {
		status = 0;

		/* Publish the (possibly new) hint group under osb_lock. */
		spin_lock(&osb->osb_lock);
		osb->osb_inode_alloc_group = alloc_group;
		spin_unlock(&osb->osb_lock);
		trace_ocfs2_reserve_new_inode_new_group(
			(unsigned long long)alloc_group);

		/*
		 * Some inodes must be freed by us, so try to allocate
		 * from our own next time.
		 */
		if (slot != OCFS2_INVALID_SLOT)
			ocfs2_init_inode_steal_slot(osb);
		goto bail;
	} else if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	/* Our slot is full (-ENOSPC); drop its resources and steal. */
	ocfs2_free_ac_resource(*ac);

inode_steal:
	status = ocfs2_steal_inode(osb, *ac);
	atomic_inc(&osb->s_num_inodes_stolen);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}
1217 
1218 /* local alloc code has to do the same thing, so rather than do this
1219  * twice.. */
1220 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1221 				      struct ocfs2_alloc_context *ac)
1222 {
1223 	int status;
1224 
1225 	ac->ac_which = OCFS2_AC_USE_MAIN;
1226 	ac->ac_group_search = ocfs2_cluster_group_search;
1227 
1228 	status = ocfs2_reserve_suballoc_bits(osb, ac,
1229 					     GLOBAL_BITMAP_SYSTEM_INODE,
1230 					     OCFS2_INVALID_SLOT, NULL,
1231 					     ALLOC_NEW_GROUP);
1232 	if (status < 0 && status != -ENOSPC)
1233 		mlog_errno(status);
1234 
1235 	return status;
1236 }
1237 
1238 /* Callers don't need to care which bitmap (local alloc or main) to
1239  * use so we figure it out for them, but unfortunately this clutters
1240  * things a bit. */
/* Callers don't need to care which bitmap (local alloc or main) to
 * use so we figure it out for them, but unfortunately this clutters
 * things a bit.
 *
 * Allocates and initializes *ac.  Tries the local alloc first (unless
 * the caller forced the global bitmap), then the main cluster bitmap.
 * If the main bitmap is out of space, we flush the truncate log once
 * and retry; doing so requires temporarily dropping the cluster-lock
 * and i_rwsem on the bitmap inode.  On failure *ac is freed and NULLed.
 */
static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
					     u32 bits_wanted, u64 max_block,
					     int flags,
					     struct ocfs2_alloc_context **ac)
{
	int status, ret = 0;
	int retried = 0;

	*ac = kzalloc_obj(struct ocfs2_alloc_context);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	(*ac)->ac_bits_wanted = bits_wanted;
	(*ac)->ac_max_block = max_block;

	status = -ENOSPC;
	if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
	    ocfs2_alloc_should_use_local(osb, bits_wanted)) {
		status = ocfs2_reserve_local_alloc_bits(osb,
							bits_wanted,
							*ac);
		if ((status < 0) && (status != -ENOSPC)) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Local alloc declined (or was skipped): fall back to the main bitmap. */
	if (status == -ENOSPC) {
retry:
		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
		/* Retry if there is sufficient space cached in truncate log */
		if (status == -ENOSPC && !retried) {
			retried = 1;
			/* Must drop both locks before flushing the truncate log. */
			ocfs2_inode_unlock((*ac)->ac_inode, 1);
			inode_unlock((*ac)->ac_inode);

			ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
			if (ret == 1) {
				/* Freed enough; reacquisition happens in the retry. */
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto retry;
			}

			if (ret < 0)
				mlog_errno(ret);

			/* Nothing freed: retake the locks we dropped above. */
			inode_lock((*ac)->ac_inode);
			ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
			if (ret < 0) {
				mlog_errno(ret);
				inode_unlock((*ac)->ac_inode);
				iput((*ac)->ac_inode);
				(*ac)->ac_inode = NULL;
				goto bail;
			}
		}
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}
	}

	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	if (status)
		mlog_errno(status);
	return status;
}
1318 
/* Reserve @bits_wanted clusters with no block-number limit. */
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
			   u32 bits_wanted,
			   struct ocfs2_alloc_context **ac)
{
	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
						 ALLOC_NEW_GROUP, ac);
}
1326 
1327 /*
1328  * More or less lifted from ext3. I'll leave their description below:
1329  *
1330  * "For ext3 allocations, we must not reuse any blocks which are
1331  * allocated in the bitmap buffer's "last committed data" copy.  This
1332  * prevents deletes from freeing up the page for reuse until we have
1333  * committed the delete transaction.
1334  *
1335  * If we didn't do this, then deleting something and reallocating it as
1336  * data would allow the old block to be overwritten before the
1337  * transaction committed (because we force data to disk before commit).
1338  * This would lead to corruption if we crashed between overwriting the
1339  * data and committing the delete.
1340  *
1341  * @@@ We may want to make this allocation behaviour conditional on
1342  * data-writes at some point, and disable it for metadata allocations or
1343  * sync-data inodes."
1344  *
1345  * Note: OCFS2 already does this differently for metadata vs data
1346  * allocations, as those bitmaps are separate and undo access is never
1347  * called on a metadata group descriptor.
1348  */
1349 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1350 					 int nr)
1351 {
1352 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1353 	struct journal_head *jh;
1354 	int ret;
1355 
1356 	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1357 		return 0;
1358 
1359 	jh = jbd2_journal_grab_journal_head(bg_bh);
1360 	if (!jh)
1361 		return 1;
1362 
1363 	spin_lock(&jh->b_state_lock);
1364 	bg = (struct ocfs2_group_desc *) jh->b_committed_data;
1365 	if (bg)
1366 		ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1367 	else
1368 		ret = 1;
1369 	spin_unlock(&jh->b_state_lock);
1370 	jbd2_journal_put_journal_head(jh);
1371 
1372 	return ret;
1373 }
1374 
1375 u16 ocfs2_find_max_contig_free_bits(void *bitmap,
1376 			 u16 total_bits, u16 start)
1377 {
1378 	u16 offset, free_bits;
1379 	u16 contig_bits = 0;
1380 
1381 	while (start < total_bits) {
1382 		offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start);
1383 		if (offset == total_bits)
1384 			break;
1385 
1386 		start = ocfs2_find_next_bit(bitmap, total_bits, offset);
1387 		free_bits = start - offset;
1388 		if (contig_bits < free_bits)
1389 			contig_bits = free_bits;
1390 	}
1391 
1392 	return contig_bits;
1393 }
1394 
/*
 * Find the best run of allocatable (clear and committed-clear) bits in
 * the group bitmap, up to @bits_wanted long.  On success fills in
 * res->sr_bit_offset / sr_bits with the chosen run and sets
 * res->sr_max_contig_bits to the largest run seen *before* the chosen
 * one (the chosen run is about to be consumed).  Returns -ENOSPC when
 * no allocatable bit exists.
 */
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
					     struct buffer_head *bg_bh,
					     unsigned int bits_wanted,
					     unsigned int total_bits,
					     struct ocfs2_suballoc_result *res)
{
	void *bitmap;
	u16 best_offset, best_size;
	u16 prev_best_size = 0;
	int offset, start, found, status = 0;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;

	/* Callers got this descriptor from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	found = start = best_offset = best_size = 0;
	bitmap = bg->bg_bitmap;

	/* 'found' counts the length of the current run of usable bits. */
	while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) <
	       total_bits) {
		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
			/* We found a zero, but we can't use it as it
			 * hasn't been put to disk yet! */
			found = 0;
			start = offset + 1;
		} else if (offset == start) {
			/* we found a zero */
			found++;
			/* move start to the next bit to test */
			start++;
		} else {
			/* got a zero after some ones */
			found = 1;
			start = offset + 1;
			prev_best_size = best_size;
		}
		if (found > best_size) {
			best_size = found;
			best_offset = start - found;
		}
		/* we got everything we needed */
		if (found == bits_wanted) {
			/* mlog(0, "Found it all!\n"); */
			break;
		}
	}

	/* best_size will be allocated, we save prev_best_size */
	res->sr_max_contig_bits = prev_best_size;
	if (best_size) {
		res->sr_bit_offset = best_offset;
		res->sr_bits = best_size;
	} else {
		status = -ENOSPC;
		/* No error log here -- see the comment above
		 * ocfs2_test_bg_bit_allocatable */
	}

	return status;
}
1456 
/*
 * Mark @num_bits starting at @bit_off as allocated in the group
 * descriptor @bg, journaling the change.  Cluster bitmaps get undo
 * access so freed bits stay unusable until the transaction commits.
 * @max_contig_bits carries the caller's contig-run bookkeeping; with
 * @fastpath set it is stored verbatim instead of being recomputed.
 */
int ocfs2_block_group_set_bits(handle_t *handle,
					     struct inode *alloc_inode,
					     struct ocfs2_group_desc *bg,
					     struct buffer_head *group_bh,
					     unsigned int bit_off,
					     unsigned int num_bits,
					     unsigned int max_contig_bits,
					     int fastpath)
{
	int status;
	void *bitmap = bg->bg_bitmap;
	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
	unsigned int start = bit_off + num_bits;
	u16 contig_bits;
	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);

	/* All callers get the descriptor via
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);

	trace_ocfs2_block_group_set_bits(bit_off, num_bits);

	/* Undo access keeps freed clusters unusable until commit. */
	if (ocfs2_is_cluster_bitmap(alloc_inode))
		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;

	status = ocfs2_journal_access_gd(handle,
					 INODE_CACHE(alloc_inode),
					 group_bh,
					 journal_type);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
	/* u16 underflow after the subtraction indicates on-disk corruption. */
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}
	while(num_bits--)
		ocfs2_set_bit(bit_off++, bitmap);

	/*
	 * This is the optimized path: the caller passes the old contig
	 * value in max_contig_bits so we can skip the search below.
	 */
	if (fastpath) {
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		/*
		 * Usually, the block group bitmap allocates only 1 bit
		 * at a time, while the cluster group allocates n bits
		 * each time. Therefore, we only save the contig bits for
		 * the cluster group.
		 */
		contig_bits = ocfs2_find_max_contig_free_bits(bitmap,
				    le16_to_cpu(bg->bg_bits), start);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
		ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits);
	} else {
		bg->bg_contig_free_bits = 0;
	}

	ocfs2_journal_dirty(handle, group_bh);

bail:
	return status;
}
1531 
1532 /* find the one with the most empty bits */
1533 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1534 {
1535 	u16 curr, best;
1536 
1537 	BUG_ON(!cl->cl_next_free_rec);
1538 
1539 	best = curr = 0;
1540 	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1541 		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1542 		    le32_to_cpu(cl->cl_recs[best].c_free))
1543 			best = curr;
1544 		curr++;
1545 	}
1546 
1547 	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1548 	return best;
1549 }
1550 
/*
 * Move group @bg to the head of @chain: unlink it from behind @prev_bg
 * and make the chain record in @fe point at it.  All three buffers are
 * journaled; if a later journal access fails, the in-memory next-group
 * pointers already modified are rolled back to keep the cache sane.
 */
static int ocfs2_relink_block_group(handle_t *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain)
{
	int status;
	/* there is a really tiny chance the journal calls could fail,
	 * but we wouldn't want inconsistent blocks in *any* case. */
	u64 bg_ptr, prev_bg_ptr;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;

	/* The caller got these descriptors from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));

	trace_ocfs2_relink_block_group(
		(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
		(unsigned long long)le64_to_cpu(bg->bg_blkno),
		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));

	/* Save the original links so failed journal access can roll back. */
	bg_ptr = le64_to_cpu(bg->bg_next_group);
	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);

	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 prev_bg_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out;

	/* Unlink bg: its predecessor now points past it. */
	prev_bg->bg_next_group = bg->bg_next_group;
	ocfs2_journal_dirty(handle, prev_bg_bh);

	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_prev_bg;

	/* Link bg in front of the current chain head. */
	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
	ocfs2_journal_dirty(handle, bg_bh);

	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0)
		goto out_rollback_bg;

	/* Make the chain record point at bg as its new head. */
	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
	ocfs2_journal_dirty(handle, fe_bh);

out:
	if (status < 0)
		mlog_errno(status);
	return status;

out_rollback_bg:
	bg->bg_next_group = cpu_to_le64(bg_ptr);
out_rollback_prev_bg:
	prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
	goto out;
}
1615 
/* True when @bg still has more than @wanted free bits after this
 * allocation, i.e. it is worth promoting to the head of its chain. */
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted)
{
	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
}
1621 
1622 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1623  * value on error. */
/* return 0 on success, -ENOSPC to keep searching and any other < 0
 * value on error. */
static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u64 max_block,
				      struct ocfs2_suballoc_result *res)
{
	int search = -ENOSPC;
	int ret;
	u64 blkoff;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned int max_bits, gd_cluster_off;

	BUG_ON(!ocfs2_is_cluster_bitmap(inode));

	/* Cached largest-contig-run hint says this group can't satisfy us. */
	if (le16_to_cpu(gd->bg_contig_free_bits) &&
	    le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted)
		return -ENOSPC;

	/* ->bg_contig_free_bits may be uninitialized (0), so compare again */
	if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) {
		max_bits = le16_to_cpu(gd->bg_bits);

		/* Tail groups in cluster bitmaps which aren't cpg
		 * aligned are prone to partial extension by a failed
		 * fs resize. If the file system resize never got to
		 * update the dinode cluster count, then we don't want
		 * to trust any clusters past it, regardless of what
		 * the group descriptor says. */
		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
							  le64_to_cpu(gd->bg_blkno));
		if ((gd_cluster_off + max_bits) >
		    OCFS2_I(inode)->ip_clusters) {
			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
			trace_ocfs2_cluster_group_search_wrong_max_bits(
				(unsigned long long)le64_to_cpu(gd->bg_blkno),
				le16_to_cpu(gd->bg_bits),
				OCFS2_I(inode)->ip_clusters, max_bits);
		}

		ret = ocfs2_block_group_find_clear_bits(osb,
							group_bh, bits_wanted,
							max_bits, res);
		if (ret)
			return ret;

		/* Honor the caller's upper bound on physical block number. */
		if (max_block) {
			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
							  gd_cluster_off +
							  res->sr_bit_offset +
							  res->sr_bits);
			trace_ocfs2_cluster_group_search_max_block(
				(unsigned long long)blkoff,
				(unsigned long long)max_block);
			if (blkoff > max_block)
				return -ENOSPC;
		}

		/* ocfs2_block_group_find_clear_bits() might
		 * return success, but we still want to return
		 * -ENOSPC unless it found the minimum number
		 * of bits. */
		if (min_bits <= res->sr_bits)
			search = 0; /* success */
	}

	return search;
}
1692 
1693 static int ocfs2_block_group_search(struct inode *inode,
1694 				    struct buffer_head *group_bh,
1695 				    u32 bits_wanted, u32 min_bits,
1696 				    u64 max_block,
1697 				    struct ocfs2_suballoc_result *res)
1698 {
1699 	int ret = -ENOSPC;
1700 	u64 blkoff;
1701 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1702 
1703 	BUG_ON(min_bits != 1);
1704 	BUG_ON(ocfs2_is_cluster_bitmap(inode));
1705 
1706 	if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) {
1707 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1708 							group_bh, bits_wanted,
1709 							le16_to_cpu(bg->bg_bits),
1710 							res);
1711 		if (!ret && max_block) {
1712 			blkoff = le64_to_cpu(bg->bg_blkno) +
1713 				res->sr_bit_offset + res->sr_bits;
1714 			trace_ocfs2_block_group_search_max_block(
1715 				(unsigned long long)blkoff,
1716 				(unsigned long long)max_block);
1717 			if (blkoff > max_block)
1718 				ret = -ENOSPC;
1719 		}
1720 	}
1721 
1722 	return ret;
1723 }
1724 
1725 int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1726 				       handle_t *handle,
1727 				       struct buffer_head *di_bh,
1728 				       u32 num_bits,
1729 				       u16 chain)
1730 {
1731 	int ret;
1732 	u32 tmp_used;
1733 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1734 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1735 
1736 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1737 				      OCFS2_JOURNAL_ACCESS_WRITE);
1738 	if (ret < 0) {
1739 		mlog_errno(ret);
1740 		goto out;
1741 	}
1742 
1743 	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1744 	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1745 	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1746 	ocfs2_journal_dirty(handle, di_bh);
1747 
1748 out:
1749 	return ret;
1750 }
1751 
1752 void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
1753 				       struct buffer_head *di_bh,
1754 				       u32 num_bits,
1755 				       u16 chain)
1756 {
1757 	u32 tmp_used;
1758 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1759 	struct ocfs2_chain_list *cl;
1760 
1761 	cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
1762 	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1763 	di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
1764 	le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
1765 }
1766 
1767 static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1768 					 struct ocfs2_extent_rec *rec,
1769 					 struct ocfs2_chain_list *cl)
1770 {
1771 	unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1772 	unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1773 	unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
1774 
1775 	if (res->sr_bit_offset < bitoff)
1776 		return 0;
1777 	if (res->sr_bit_offset >= (bitoff + bitcount))
1778 		return 0;
1779 	res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1780 		(res->sr_bit_offset - bitoff);
1781 	if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1782 		res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1783 	return 1;
1784 }
1785 
/*
 * Convert a per-group search result into final block numbers.  For
 * contiguous groups sr_blkno is just sr_bg_blkno + sr_bit_offset and
 * sr_bg_blkno is cleared; for discontiguous groups the matching extent
 * record supplies the physical block and sr_bg_blkno is restored.
 * Cluster bitmaps keep sr_blkno at 0 (results are bit offsets there).
 */
static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
					  struct ocfs2_group_desc *bg,
					  struct ocfs2_suballoc_result *res)
{
	int i;
	u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
	struct ocfs2_extent_rec *rec;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = &di->id2.i_chain;

	if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
		res->sr_blkno = 0;
		return;
	}

	res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
	res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
	if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
	    !bg->bg_list.l_next_free_rec)
		return;

	/* Discontiguous group: find the extent record holding the bits. */
	for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
		rec = &bg->bg_list.l_recs[i];
		if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
			res->sr_bg_blkno = bg_blkno;  /* Restore */
			break;
		}
	}
}
1815 
/*
 * Try to claim bits from the single hinted group res->sr_bg_blkno.  If
 * the hint group has since been released, *released is set and 0 is
 * returned so the caller falls back to a full chain search.  On success
 * the dinode counts and the group bitmap are both updated (unless
 * ac_find_loc_only), with the dinode update rolled back if setting the
 * group bits fails.
 */
static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
				  handle_t *handle,
				  u32 bits_wanted,
				  u32 min_bits,
				  struct ocfs2_suballoc_result *res,
				  u16 *bits_left, int *released)
{
	int ret;
	struct buffer_head *group_bh = NULL;
	struct ocfs2_group_desc *gd;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
	struct inode *alloc_inode = ac->ac_inode;

	ret = ocfs2_read_hint_group_descriptor(alloc_inode, di,
				res->sr_bg_blkno, &group_bh, released);
	if (*released) {
		/* Hint group is gone; not an error, caller re-searches. */
		return 0;
	} else if (ret < 0) {
		mlog_errno(ret);
		return ret;
	}

	gd = (struct ocfs2_group_desc *) group_bh->b_data;
	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
				  ac->ac_max_block, res);
	if (ret < 0) {
		if (ret != -ENOSPC)
			mlog_errno(ret);
		goto out;
	}

	if (!ret)
		ocfs2_bg_discontig_fix_result(ac, gd, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
					       res->sr_bits,
					       le16_to_cpu(gd->bg_chain));
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
					 res->sr_bit_offset, res->sr_bits,
					 res->sr_max_contig_bits, 0);
	if (ret < 0) {
		/* Keep dinode and group counts consistent on failure. */
		ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
					       res->sr_bits,
					       le16_to_cpu(gd->bg_chain));
		mlog_errno(ret);
	}

out_loc_only:
	*bits_left = le16_to_cpu(gd->bg_free_bits_count);

out:
	brelse(group_bh);

	return ret;
}
1885 
/*
 * Walk chain ac->ac_chain looking for a group that can satisfy the
 * request, claim the bits there, and (heuristically) relink the winning
 * group to the head of its chain so the next search finds it first.
 * *bits_left reports the winner's remaining free bits.  Returns 0 on
 * success, -ENOSPC when the whole chain is exhausted.
 */
static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
			      handle_t *handle,
			      u32 bits_wanted,
			      u32 min_bits,
			      struct ocfs2_suballoc_result *res,
			      u16 *bits_left)
{
	int status;
	u16 chain;
	u32 contig_bits;
	u64 next_group;
	struct inode *alloc_inode = ac->ac_inode;
	struct buffer_head *group_bh = NULL;
	struct buffer_head *prev_group_bh = NULL;
	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	struct ocfs2_group_desc *bg;

	chain = ac->ac_chain;
	trace_ocfs2_search_chain_begin(
		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
		bits_wanted, chain);

	status = ocfs2_read_group_descriptor(alloc_inode, fe,
					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
					     &group_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	bg = (struct ocfs2_group_desc *) group_bh->b_data;

	status = -ENOSPC;
	/* for now, the chain search is a bit simplistic. We just use
	 * the 1st group with any empty bits. */
	while (1) {
		/*
		 * Discontig requests may accept fewer bits: shrink
		 * bits_wanted to this group's largest free run, as long
		 * as that still meets min_bits.
		 */
		if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
			contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
			if (!contig_bits)
				contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
						le16_to_cpu(bg->bg_bits), 0);
			if (bits_wanted > contig_bits && contig_bits >= min_bits)
				bits_wanted = contig_bits;
		}

		status = ac->ac_group_search(alloc_inode, group_bh,
				bits_wanted, min_bits,
				ac->ac_max_block, res);
		if (status != -ENOSPC)
			break;
		if (!bg->bg_next_group)
			break;

		/* Remember the predecessor for the relink heuristic below. */
		brelse(prev_group_bh);
		prev_group_bh = NULL;

		next_group = le64_to_cpu(bg->bg_next_group);
		prev_group_bh = group_bh;
		group_bh = NULL;
		status = ocfs2_read_group_descriptor(alloc_inode, fe,
						     next_group, &group_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		bg = (struct ocfs2_group_desc *) group_bh->b_data;
	}
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_succ(
		(unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);

	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);

	BUG_ON(res->sr_bits == 0);
	if (!status)
		ocfs2_bg_discontig_fix_result(ac, bg, res);

	/*
	 * sr_bg_blkno might have been changed by
	 * ocfs2_bg_discontig_fix_result
	 */
	res->sr_bg_stable_blkno = group_bh->b_blocknr;

	/*
	 * Keep track of previous block descriptor read. When
	 * we find a target, if we have read more than X
	 * number of descriptors, and the target is reasonably
	 * empty, relink him to top of his chain.
	 *
	 * We've read 0 extra blocks and only send one more to
	 * the transaction, yet the next guy to search has a
	 * much easier time.
	 *
	 * Do this *after* figuring out how many bits we're taking out
	 * of our target group.
	 */
	if (!ac->ac_disable_chain_relink &&
	    (prev_group_bh) &&
	    (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
		status = ocfs2_relink_block_group(handle, alloc_inode,
						  ac->ac_bh, group_bh,
						  prev_group_bh, chain);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (ac->ac_find_loc_only)
		goto out_loc_only;

	status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
						  ac->ac_bh, res->sr_bits,
						  chain);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_block_group_set_bits(handle,
					    alloc_inode,
					    bg,
					    group_bh,
					    res->sr_bit_offset,
					    res->sr_bits,
					    res->sr_max_contig_bits,
					    0);
	if (status < 0) {
		/* Keep dinode and group counts consistent on failure. */
		ocfs2_rollback_alloc_dinode_counts(alloc_inode,
					ac->ac_bh, res->sr_bits, chain);
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_search_chain_end(
			(unsigned long long)le64_to_cpu(fe->i_blkno),
			res->sr_bits);

out_loc_only:
	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
	brelse(group_bh);
	brelse(prev_group_bh);

	if (status)
		mlog_errno(status);
	return status;
}
2039 
/*
 * Give out up to @bits_wanted contiguous bits from the chain allocator
 * described by @ac.  @min_bits is the smallest contiguous run the
 * caller can make use of.  On success the allocation is described in
 * @res and ac->ac_last_group is updated with a hint for the next claim.
 *
 * Returns 0 on success, -ENOSPC when no chain could satisfy the
 * request, or another negative error code.
 */
static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
				     handle_t *handle,
				     u32 bits_wanted,
				     u32 min_bits,
				     struct ocfs2_suballoc_result *res)
{
	int status;
	int released = 0;	/* set when the hinted group was reclaimed */
	u16 victim, i;
	u16 bits_left = 0;
	u64 hint = ac->ac_last_group;
	struct ocfs2_chain_list *cl;
	struct ocfs2_dinode *fe;

	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
	BUG_ON(!ac->ac_bh);

	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/* The bh was validated by the inode read during
	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));

	/* A completely used allocator here is on-disk corruption, not
	 * a normal out-of-space condition. */
	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has %u used bits but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le32_to_cpu(fe->id1.bitmap1.i_used),
				     le32_to_cpu(fe->id1.bitmap1.i_total));
		goto bail;
	}

	/* The hinted block group may already have been released back to
	 * the main bitmap; in that case the one-group search reports it
	 * through @released and we fall back to a full chain search. */
	res->sr_bg_blkno = hint;
	if (res->sr_bg_blkno) {
		/* Attempt to short-circuit the usual search mechanism
		 * by jumping straight to the most recently used
		 * allocation group. This helps us maintain some
		 * contiguousness across allocations. */
		status = ocfs2_search_one_group(ac, handle, bits_wanted,
						min_bits, res, &bits_left,
						&released);
		if (released) {
			res->sr_bg_blkno = 0;
			goto chain_search;
		}
		if (!status)
			goto set_hint;
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}
chain_search:
	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
	/* Validate the chain list header before walking it. */
	if (!le16_to_cpu(cl->cl_next_free_rec) ||
	    le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) {
		status = ocfs2_error(ac->ac_inode->i_sb,
				     "Chain allocator dinode %llu has invalid next "
				     "free chain record %u, but only %u total\n",
				     (unsigned long long)le64_to_cpu(fe->i_blkno),
				     le16_to_cpu(cl->cl_next_free_rec),
				     le16_to_cpu(cl->cl_count));
		goto bail;
	}

	/* Prefer the chain with the most free bits. */
	victim = ocfs2_find_victim_chain(cl);
	ac->ac_chain = victim;

search:
	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
				    res, &bits_left);
	if (!status) {
		if (ocfs2_is_cluster_bitmap(ac->ac_inode))
			hint = res->sr_bg_blkno;
		else
			hint = ocfs2_group_from_res(res);
		goto set_hint;
	}
	if (status < 0 && status != -ENOSPC) {
		mlog_errno(status);
		goto bail;
	}

	trace_ocfs2_claim_suballoc_bits(victim);

	/* If we didn't pick a good victim, then just default to
	 * searching each chain in order. Don't allow chain relinking
	 * because we only calculate enough journal credits for one
	 * relink per alloc. */
	ac->ac_disable_chain_relink = 1;
	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
		if (i == victim)
			continue;
		/* Skip chains that can't possibly satisfy the request. */
		if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted)
			continue;

		ac->ac_chain = i;
		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
					    res, &bits_left);
		if (!status) {
			hint = ocfs2_group_from_res(res);
			break;
		}
		if (status < 0 && status != -ENOSPC) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Chains can't supply the bits_wanted contiguous space.
	 * We should switch to using every single bit when allocating
	 * from the global bitmap. */
	if (i == le16_to_cpu(cl->cl_next_free_rec) &&
	    status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) {
		ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
		ac->ac_chain = victim;
		goto search;
	}

set_hint:
	if (status != -ENOSPC) {
		/* If the next search of this group is not likely to
		 * yield a suitable extent, then we reset the last
		 * group hint so as to not waste a disk read */
		if (bits_left < min_bits)
			ac->ac_last_group = 0;
		else
			ac->ac_last_group = hint;
	}

bail:
	if (status)
		mlog_errno(status);
	return status;
}
2179 
2180 int ocfs2_claim_metadata(handle_t *handle,
2181 			 struct ocfs2_alloc_context *ac,
2182 			 u32 bits_wanted,
2183 			 u64 *suballoc_loc,
2184 			 u16 *suballoc_bit_start,
2185 			 unsigned int *num_bits,
2186 			 u64 *blkno_start)
2187 {
2188 	int status;
2189 	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2190 
2191 	BUG_ON(!ac);
2192 	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
2193 	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
2194 
2195 	status = ocfs2_claim_suballoc_bits(ac,
2196 					   handle,
2197 					   bits_wanted,
2198 					   1,
2199 					   &res);
2200 	if (status < 0) {
2201 		mlog_errno(status);
2202 		goto bail;
2203 	}
2204 	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2205 
2206 	*suballoc_loc = res.sr_bg_blkno;
2207 	*suballoc_bit_start = res.sr_bit_offset;
2208 	*blkno_start = res.sr_blkno;
2209 	ac->ac_bits_given += res.sr_bits;
2210 	*num_bits = res.sr_bits;
2211 	status = 0;
2212 bail:
2213 	if (status)
2214 		mlog_errno(status);
2215 	return status;
2216 }
2217 
2218 /*
2219  * after ocfs2 has the ability to release block group unused space,
2220  * the ->ip_last_used_group may be invalid. so this function returns
2221  * ac->ac_last_group need to verify.
2222  * refer the 'hint' in ocfs2_claim_suballoc_bits() for more details.
2223  */
2224 static void ocfs2_init_inode_ac_group(struct inode *dir,
2225 				      struct buffer_head *parent_di_bh,
2226 				      struct ocfs2_alloc_context *ac)
2227 {
2228 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
2229 	/*
2230 	 * Try to allocate inodes from some specific group.
2231 	 *
2232 	 * If the parent dir has recorded the last group used in allocation,
2233 	 * cool, use it. Otherwise if we try to allocate new inode from the
2234 	 * same slot the parent dir belongs to, use the same chunk.
2235 	 *
2236 	 * We are very careful here to avoid the mistake of setting
2237 	 * ac_last_group to a group descriptor from a different (unlocked) slot.
2238 	 */
2239 	if (OCFS2_I(dir)->ip_last_used_group &&
2240 	    OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2241 		ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2242 	else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2243 		if (di->i_suballoc_loc)
2244 			ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2245 		else
2246 			ac->ac_last_group = ocfs2_which_suballoc_group(
2247 					le64_to_cpu(di->i_blkno),
2248 					le16_to_cpu(di->i_suballoc_bit));
2249 	}
2250 }
2251 
2252 static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2253 					     struct ocfs2_alloc_context *ac)
2254 {
2255 	OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2256 	OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2257 }
2258 
/*
 * Find (but do not yet claim) a block for a new inode under @dir.
 *
 * The normal suballocator search runs with ac->ac_find_loc_only set,
 * so the winning bit is located and recorded but not marked in use.
 * The search result is stashed in ac->ac_find_loc_priv for a later
 * ocfs2_claim_new_inode_at_loc() call, and the chosen block number is
 * returned through @fe_blkno.
 *
 * Returns 0 on success or a negative error code.
 */
int ocfs2_find_new_inode_loc(struct inode *dir,
			     struct buffer_head *parent_fe_bh,
			     struct ocfs2_alloc_context *ac,
			     u64 *fe_blkno)
{
	int ret;
	handle_t *handle = NULL;
	struct ocfs2_suballoc_result *res;

	BUG_ON(!ac);
	BUG_ON(ac->ac_bits_given != 0);
	BUG_ON(ac->ac_bits_wanted != 1);
	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);

	/* On success ownership of @res passes to ac->ac_find_loc_priv;
	 * it is only freed here on the error path.  (Presumably the
	 * context teardown frees it afterwards — not visible here.) */
	res = kzalloc_obj(*res, GFP_NOFS);
	if (res == NULL) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* Seed ac->ac_last_group from the parent directory's history. */
	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);

	/*
	 * The handle started here is for chain relink. Alternatively,
	 * we could just disable relink for these calls.
	 */
	handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * This will instruct ocfs2_claim_suballoc_bits and
	 * ocfs2_search_one_group to search but save actual allocation
	 * for later.
	 */
	ac->ac_find_loc_only = 1;

	ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ac->ac_find_loc_priv = res;
	*fe_blkno = res->sr_blkno;
	ocfs2_update_inode_fsync_trans(handle, dir, 0);
out:
	if (handle)
		ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);

	/* kfree(NULL) is a no-op, so the alloc-failure path is fine. */
	if (ret)
		kfree(res);

	return ret;
}
2319 
/*
 * Commit an inode allocation previously located by
 * ocfs2_find_new_inode_loc().
 *
 * @di_blkno must be the block number that ocfs2_find_new_inode_loc()
 * returned; it is cross-checked against the result stashed in
 * ac->ac_find_loc_priv.  The group bit is set and the allocator counts
 * are updated within @handle.  Returns 0 on success or a negative
 * error code.
 */
int ocfs2_claim_new_inode_at_loc(handle_t *handle,
				 struct inode *dir,
				 struct ocfs2_alloc_context *ac,
				 u64 *suballoc_loc,
				 u16 *suballoc_bit,
				 u64 di_blkno)
{
	int ret;
	u16 chain;
	struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
	struct buffer_head *bg_bh = NULL;
	struct ocfs2_group_desc *bg;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;

	/*
	 * Since di_blkno is being passed back in, we check for any
	 * inconsistencies which may have happened between
	 * calls. These are code bugs as di_blkno is not expected to
	 * change once returned from ocfs2_find_new_inode_loc()
	 */
	BUG_ON(res->sr_blkno != di_blkno);

	/* Read via the stable descriptor block recorded at search time
	 * (sr_bg_stable_blkno never changes; see its declaration). */
	ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
					  res->sr_bg_stable_blkno, &bg_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
	chain = le16_to_cpu(bg->bg_chain);

	ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
					       ac->ac_bh, res->sr_bits,
					       chain);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_block_group_set_bits(handle,
					 ac->ac_inode,
					 bg,
					 bg_bh,
					 res->sr_bit_offset,
					 res->sr_bits,
					 res->sr_max_contig_bits,
					 0);
	if (ret < 0) {
		/* Setting the bits failed: back out the dinode counts
		 * updated just above so the two stay consistent. */
		ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
					       ac->ac_bh, res->sr_bits, chain);
		mlog_errno(ret);
		goto out;
	}

	trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
					   res->sr_bits);

	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);

	/* An inode allocation is always exactly one bit. */
	BUG_ON(res->sr_bits != 1);

	*suballoc_loc = res->sr_bg_blkno;
	*suballoc_bit = res->sr_bit_offset;
	ac->ac_bits_given++;
	ocfs2_save_inode_ac_group(dir, ac);

out:
	brelse(bg_bh);

	return ret;
}
2392 
2393 int ocfs2_claim_new_inode(handle_t *handle,
2394 			  struct inode *dir,
2395 			  struct buffer_head *parent_fe_bh,
2396 			  struct ocfs2_alloc_context *ac,
2397 			  u64 *suballoc_loc,
2398 			  u16 *suballoc_bit,
2399 			  u64 *fe_blkno)
2400 {
2401 	int status;
2402 	struct ocfs2_suballoc_result res;
2403 
2404 	BUG_ON(!ac);
2405 	BUG_ON(ac->ac_bits_given != 0);
2406 	BUG_ON(ac->ac_bits_wanted != 1);
2407 	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2408 
2409 	ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2410 
2411 	status = ocfs2_claim_suballoc_bits(ac,
2412 					   handle,
2413 					   1,
2414 					   1,
2415 					   &res);
2416 	if (status < 0) {
2417 		mlog_errno(status);
2418 		goto bail;
2419 	}
2420 	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2421 
2422 	BUG_ON(res.sr_bits != 1);
2423 
2424 	*suballoc_loc = res.sr_bg_blkno;
2425 	*suballoc_bit = res.sr_bit_offset;
2426 	*fe_blkno = res.sr_blkno;
2427 	ac->ac_bits_given++;
2428 	ocfs2_save_inode_ac_group(dir, ac);
2429 	status = 0;
2430 bail:
2431 	if (status)
2432 		mlog_errno(status);
2433 	return status;
2434 }
2435 
2436 /* translate a group desc. blkno and it's bitmap offset into
2437  * disk cluster offset. */
2438 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2439 						   u64 bg_blkno,
2440 						   u16 bg_bit_off)
2441 {
2442 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2443 	u32 cluster = 0;
2444 
2445 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2446 
2447 	if (bg_blkno != osb->first_cluster_group_blkno)
2448 		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2449 	cluster += (u32) bg_bit_off;
2450 	return cluster;
2451 }
2452 
2453 /* given a cluster offset, calculate which block group it belongs to
2454  * and return that block offset. */
2455 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2456 {
2457 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2458 	u32 group_no;
2459 
2460 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2461 
2462 	group_no = cluster / osb->bitmap_cpg;
2463 	if (!group_no)
2464 		return osb->first_cluster_group_blkno;
2465 	return ocfs2_clusters_to_blocks(inode->i_sb,
2466 					group_no * osb->bitmap_cpg);
2467 }
2468 
2469 /* given the block number of a cluster start, calculate which cluster
2470  * group and descriptor bitmap offset that corresponds to. */
2471 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
2472 						u64 data_blkno,
2473 						u64 *bg_blkno,
2474 						u16 *bg_bit_off)
2475 {
2476 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2477 	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
2478 
2479 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2480 
2481 	*bg_blkno = ocfs2_which_cluster_group(inode,
2482 					      data_cluster);
2483 
2484 	if (*bg_blkno == osb->first_cluster_group_blkno)
2485 		*bg_bit_off = (u16) data_cluster;
2486 	else
2487 		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
2488 							     data_blkno - *bg_blkno);
2489 }
2490 
2491 /*
2492  * min_bits - minimum contiguous chunk from this total allocation we
2493  * can handle. set to what we asked for originally for a full
2494  * contig. allocation, set to '1' to indicate we can deal with extents
2495  * of any size.
2496  */
2497 int __ocfs2_claim_clusters(handle_t *handle,
2498 			   struct ocfs2_alloc_context *ac,
2499 			   u32 min_clusters,
2500 			   u32 max_clusters,
2501 			   u32 *cluster_start,
2502 			   u32 *num_clusters)
2503 {
2504 	int status;
2505 	unsigned int bits_wanted = max_clusters;
2506 	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2507 	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2508 
2509 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2510 
2511 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2512 	       && ac->ac_which != OCFS2_AC_USE_MAIN
2513 	       && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG);
2514 
2515 	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2516 		WARN_ON(min_clusters > 1);
2517 
2518 		status = ocfs2_claim_local_alloc_bits(osb,
2519 						      handle,
2520 						      ac,
2521 						      bits_wanted,
2522 						      cluster_start,
2523 						      num_clusters);
2524 		if (!status)
2525 			atomic_inc(&osb->alloc_stats.local_data);
2526 	} else {
2527 		if (min_clusters > (osb->bitmap_cpg - 1)) {
2528 			/* The only paths asking for contiguousness
2529 			 * should know about this already. */
2530 			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
2531 			     "group bitmap size %u!\n", min_clusters,
2532 			     osb->bitmap_cpg);
2533 			status = -ENOSPC;
2534 			goto bail;
2535 		}
2536 		/* clamp the current request down to a realistic size. */
2537 		if (bits_wanted > (osb->bitmap_cpg - 1))
2538 			bits_wanted = osb->bitmap_cpg - 1;
2539 
2540 		status = ocfs2_claim_suballoc_bits(ac,
2541 						   handle,
2542 						   bits_wanted,
2543 						   min_clusters,
2544 						   &res);
2545 		if (!status) {
2546 			BUG_ON(res.sr_blkno); /* cluster alloc can't set */
2547 			*cluster_start =
2548 				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
2549 								 res.sr_bg_blkno,
2550 								 res.sr_bit_offset);
2551 			atomic_inc(&osb->alloc_stats.bitmap_data);
2552 			*num_clusters = res.sr_bits;
2553 		}
2554 	}
2555 	if (status < 0) {
2556 		if (status != -ENOSPC)
2557 			mlog_errno(status);
2558 		goto bail;
2559 	}
2560 
2561 	ac->ac_bits_given += *num_clusters;
2562 
2563 bail:
2564 	if (status)
2565 		mlog_errno(status);
2566 	return status;
2567 }
2568 
2569 int ocfs2_claim_clusters(handle_t *handle,
2570 			 struct ocfs2_alloc_context *ac,
2571 			 u32 min_clusters,
2572 			 u32 *cluster_start,
2573 			 u32 *num_clusters)
2574 {
2575 	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
2576 
2577 	return __ocfs2_claim_clusters(handle, ac, min_clusters,
2578 				      bits_wanted, cluster_start, num_clusters);
2579 }
2580 
/*
 * Clear @num_bits bits starting at @bit_off in @bg's bitmap under
 * journal protection.
 *
 * When @undo_fn is given (cluster bitmap frees only), the group is
 * journalled with UNDO access and @undo_fn is mirrored into the
 * journal head's committed copy, so the bits remain unavailable until
 * the transaction commits.  @max_contig_bits is a lower bound for the
 * group's recomputed contiguous-free-bits figure.
 */
static int ocfs2_block_group_clear_bits(handle_t *handle,
					struct inode *alloc_inode,
					struct ocfs2_group_desc *bg,
					struct buffer_head *group_bh,
					unsigned int bit_off,
					unsigned int num_bits,
					unsigned int max_contig_bits,
					void (*undo_fn)(unsigned int bit,
							unsigned long *bmap))
{
	int status;
	unsigned int tmp;
	u16 contig_bits;
	struct ocfs2_group_desc *undo_bg = NULL;
	struct journal_head *jh;

	/* The caller got this descriptor from
	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));

	trace_ocfs2_block_group_clear_bits(bit_off, num_bits);

	/* Undo protection is only used for the cluster bitmap. */
	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
					 group_bh,
					 undo_fn ?
					 OCFS2_JOURNAL_ACCESS_UNDO :
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	jh = bh2jh(group_bh);
	if (undo_fn) {
		/* b_state_lock guards b_committed_data while we mirror
		 * the clears into the committed copy. */
		spin_lock(&jh->b_state_lock);
		undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data;
		BUG_ON(!undo_bg);
	}

	/* Clear the live bitmap, and apply @undo_fn to the committed
	 * copy for each bit. */
	tmp = num_bits;
	while(tmp--) {
		ocfs2_clear_bit((bit_off + tmp),
				(unsigned long *) bg->bg_bitmap);
		if (undo_fn)
			undo_fn(bit_off + tmp,
				(unsigned long *) undo_bg->bg_bitmap);
	}
	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
	/* More free bits than the group holds at all means corruption. */
	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
		if (undo_fn)
			spin_unlock(&jh->b_state_lock);
		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
				   le16_to_cpu(bg->bg_bits),
				   le16_to_cpu(bg->bg_free_bits_count),
				   num_bits);
	}

	/*
	 * TODO: even 'num_bits == 1' (the worst case, release 1 cluster),
	 * we still need to rescan whole bitmap.
	 */
	if (ocfs2_is_cluster_bitmap(alloc_inode)) {
		contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
				    le16_to_cpu(bg->bg_bits), 0);
		if (contig_bits > max_contig_bits)
			max_contig_bits = contig_bits;
		bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits);
	} else {
		/* Non-cluster groups don't track contiguous free bits. */
		bg->bg_contig_free_bits = 0;
	}

	if (undo_fn)
		spin_unlock(&jh->b_state_lock);

	ocfs2_journal_dirty(handle, group_bh);
bail:
	return status;
}
2661 
2662 /*
2663  * Reclaim the suballocator managed space to main bitmap.
2664  * This function first works on the suballocator to perform the
2665  * cleanup rec/alloc_inode job, then switches to the main bitmap
2666  * to reclaim released space.
2667  *
2668  * handle: The transaction handle
2669  * alloc_inode: The suballoc inode
2670  * alloc_bh: The buffer_head of suballoc inode
2671  * group_bh: The group descriptor buffer_head of suballocator managed.
2672  *           Caller should release the input group_bh.
2673  */
2674 static int _ocfs2_reclaim_suballoc_to_main(handle_t *handle,
2675 			struct inode *alloc_inode,
2676 			struct buffer_head *alloc_bh,
2677 			struct buffer_head *group_bh)
2678 {
2679 	int idx, status = 0;
2680 	int i, next_free_rec, len = 0;
2681 	__le16 old_bg_contig_free_bits = 0;
2682 	u16 start_bit;
2683 	u32 tmp_used;
2684 	u64 bg_blkno, start_blk;
2685 	unsigned int count;
2686 	struct ocfs2_chain_rec *rec;
2687 	struct buffer_head *main_bm_bh = NULL;
2688 	struct inode *main_bm_inode = NULL;
2689 	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
2690 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2691 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2692 	struct ocfs2_group_desc *group = (struct ocfs2_group_desc *) group_bh->b_data;
2693 
2694 	idx = le16_to_cpu(group->bg_chain);
2695 	rec = &(cl->cl_recs[idx]);
2696 
2697 	status = ocfs2_extend_trans(handle,
2698 				ocfs2_calc_group_alloc_credits(osb->sb,
2699 						 le16_to_cpu(cl->cl_cpg)));
2700 	if (status) {
2701 		mlog_errno(status);
2702 		goto bail;
2703 	}
2704 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2705 					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2706 	if (status < 0) {
2707 		mlog_errno(status);
2708 		goto bail;
2709 	}
2710 
2711 	/*
2712 	 * Only clear the suballocator rec item in-place.
2713 	 *
2714 	 * If idx is not the last, we don't compress (remove the empty item)
2715 	 * the cl_recs[]. If not, we need to do lots jobs.
2716 	 *
2717 	 * Compress cl_recs[] code example:
2718 	 * if (idx != cl->cl_next_free_rec - 1)
2719 	 *     memmove(&cl->cl_recs[idx], &cl->cl_recs[idx + 1],
2720 	 *         sizeof(struct ocfs2_chain_rec) *
2721 	 *         (cl->cl_next_free_rec - idx - 1));
2722 	 * for(i = idx; i < cl->cl_next_free_rec-1; i++) {
2723 	 *     group->bg_chain = "later group->bg_chain";
2724 	 *     group->bg_blkno = xxx;
2725 	 *     ... ...
2726 	 * }
2727 	 */
2728 
2729 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_total);
2730 	fe->id1.bitmap1.i_total = cpu_to_le32(tmp_used - le32_to_cpu(rec->c_total));
2731 
2732 	/* Substraction 1 for the block group itself */
2733 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2734 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - 1);
2735 
2736 	tmp_used = le32_to_cpu(fe->i_clusters);
2737 	fe->i_clusters = cpu_to_le32(tmp_used - le16_to_cpu(cl->cl_cpg));
2738 
2739 	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
2740 	OCFS2_I(alloc_inode)->ip_clusters -= le32_to_cpu(fe->i_clusters);
2741 	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
2742 					     le32_to_cpu(fe->i_clusters)));
2743 	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
2744 	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
2745 	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
2746 
2747 	ocfs2_journal_dirty(handle, alloc_bh);
2748 	ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
2749 
2750 	start_blk = le64_to_cpu(rec->c_blkno);
2751 	count = le32_to_cpu(rec->c_total) / le16_to_cpu(cl->cl_bpc);
2752 
2753 	/*
2754 	 * If the rec is the last one, let's compress the chain list by
2755 	 * removing the empty cl_recs[] at the end.
2756 	 */
2757 	next_free_rec = le16_to_cpu(cl->cl_next_free_rec);
2758 	if (idx == (next_free_rec - 1)) {
2759 		len++; /* the last item should be counted first */
2760 		for (i = (next_free_rec - 2); i > 0; i--) {
2761 			if (cl->cl_recs[i].c_free == cl->cl_recs[i].c_total)
2762 				len++;
2763 			else
2764 				break;
2765 		}
2766 	}
2767 	le16_add_cpu(&cl->cl_next_free_rec, -len);
2768 
2769 	rec->c_free = 0;
2770 	rec->c_total = 0;
2771 	rec->c_blkno = 0;
2772 	ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), group_bh);
2773 	memset(group, 0, sizeof(struct ocfs2_group_desc));
2774 
2775 	/* prepare job for reclaim clusters */
2776 	main_bm_inode = ocfs2_get_system_file_inode(osb,
2777 						    GLOBAL_BITMAP_SYSTEM_INODE,
2778 						    OCFS2_INVALID_SLOT);
2779 	if (!main_bm_inode)
2780 		goto bail; /* ignore the error in reclaim path */
2781 
2782 	inode_lock(main_bm_inode);
2783 
2784 	status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
2785 	if (status < 0)
2786 		goto free_bm_inode; /* ignore the error in reclaim path */
2787 
2788 	ocfs2_block_to_cluster_group(main_bm_inode, start_blk, &bg_blkno,
2789 				     &start_bit);
2790 	fe = (struct ocfs2_dinode *) main_bm_bh->b_data;
2791 	cl = &fe->id2.i_chain;
2792 	/* reuse group_bh, caller will release the input group_bh */
2793 	group_bh = NULL;
2794 
2795 	/* reclaim clusters to global_bitmap */
2796 	status = ocfs2_read_group_descriptor(main_bm_inode, fe, bg_blkno,
2797 					     &group_bh);
2798 	if (status < 0) {
2799 		mlog_errno(status);
2800 		goto free_bm_bh;
2801 	}
2802 	group = (struct ocfs2_group_desc *) group_bh->b_data;
2803 
2804 	if ((count + start_bit) > le16_to_cpu(group->bg_bits)) {
2805 		ocfs2_error(alloc_inode->i_sb,
2806 			"reclaim length (%d) beyands block group length (%d)",
2807 			count + start_bit, le16_to_cpu(group->bg_bits));
2808 		goto free_group_bh;
2809 	}
2810 
2811 	old_bg_contig_free_bits = group->bg_contig_free_bits;
2812 	status = ocfs2_block_group_clear_bits(handle, main_bm_inode,
2813 					      group, group_bh,
2814 					      start_bit, count, 0,
2815 					      _ocfs2_clear_bit);
2816 	if (status < 0) {
2817 		mlog_errno(status);
2818 		goto free_group_bh;
2819 	}
2820 
2821 	status = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
2822 					 main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2823 	if (status < 0) {
2824 		mlog_errno(status);
2825 		ocfs2_block_group_set_bits(handle, main_bm_inode, group, group_bh,
2826 				start_bit, count,
2827 				le16_to_cpu(old_bg_contig_free_bits), 1);
2828 		goto free_group_bh;
2829 	}
2830 
2831 	idx = le16_to_cpu(group->bg_chain);
2832 	rec = &(cl->cl_recs[idx]);
2833 
2834 	le32_add_cpu(&rec->c_free, count);
2835 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2836 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2837 	ocfs2_journal_dirty(handle, main_bm_bh);
2838 
2839 free_group_bh:
2840 	brelse(group_bh);
2841 
2842 free_bm_bh:
2843 	ocfs2_inode_unlock(main_bm_inode, 1);
2844 	brelse(main_bm_bh);
2845 
2846 free_bm_inode:
2847 	inode_unlock(main_bm_inode);
2848 	iput(main_bm_inode);
2849 
2850 bail:
2851 	return status;
2852 }
2853 
2854 /*
2855  * expects the suballoc inode to already be locked.
2856  */
2857 static int _ocfs2_free_suballoc_bits(handle_t *handle,
2858 				     struct inode *alloc_inode,
2859 				     struct buffer_head *alloc_bh,
2860 				     unsigned int start_bit,
2861 				     u64 bg_blkno,
2862 				     unsigned int count,
2863 				     void (*undo_fn)(unsigned int bit,
2864 						     unsigned long *bitmap))
2865 {
2866 	int idx, status = 0;
2867 	u32 tmp_used;
2868 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2869 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2870 	struct buffer_head *group_bh = NULL;
2871 	struct ocfs2_group_desc *group;
2872 	struct ocfs2_chain_rec *rec;
2873 	__le16 old_bg_contig_free_bits = 0;
2874 
2875 	/* The alloc_bh comes from ocfs2_free_dinode() or
2876 	 * ocfs2_free_clusters().  The callers have all locked the
2877 	 * allocator and gotten alloc_bh from the lock call.  This
2878 	 * validates the dinode buffer.  Any corruption that has happened
2879 	 * is a code bug. */
2880 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2881 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2882 
2883 	trace_ocfs2_free_suballoc_bits(
2884 		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
2885 		(unsigned long long)bg_blkno,
2886 		start_bit, count);
2887 
2888 	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2889 					     &group_bh);
2890 	if (status < 0) {
2891 		mlog_errno(status);
2892 		goto bail;
2893 	}
2894 	group = (struct ocfs2_group_desc *) group_bh->b_data;
2895 
2896 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2897 
2898 	if (ocfs2_is_cluster_bitmap(alloc_inode))
2899 		old_bg_contig_free_bits = group->bg_contig_free_bits;
2900 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2901 					      group, group_bh,
2902 					      start_bit, count, 0, undo_fn);
2903 	if (status < 0) {
2904 		mlog_errno(status);
2905 		goto bail;
2906 	}
2907 
2908 	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2909 					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2910 	if (status < 0) {
2911 		mlog_errno(status);
2912 		ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
2913 				start_bit, count,
2914 				le16_to_cpu(old_bg_contig_free_bits), 1);
2915 		goto bail;
2916 	}
2917 
2918 	idx = le16_to_cpu(group->bg_chain);
2919 	rec = &(cl->cl_recs[idx]);
2920 
2921 	le32_add_cpu(&rec->c_free, count);
2922 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2923 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2924 	ocfs2_journal_dirty(handle, alloc_bh);
2925 
2926 	/*
2927 	 * Reclaim suballocator free space.
2928 	 * Bypass: global_bitmap, non empty rec, first rec in cl_recs[]
2929 	 */
2930 	if (ocfs2_is_cluster_bitmap(alloc_inode) ||
2931 	    (le32_to_cpu(rec->c_free) != (le32_to_cpu(rec->c_total) - 1)) ||
2932 	    (le16_to_cpu(cl->cl_next_free_rec) == 1)) {
2933 		goto bail;
2934 	}
2935 
2936 	_ocfs2_reclaim_suballoc_to_main(handle, alloc_inode, alloc_bh, group_bh);
2937 
2938 bail:
2939 	brelse(group_bh);
2940 	return status;
2941 }
2942 
2943 int ocfs2_free_suballoc_bits(handle_t *handle,
2944 			     struct inode *alloc_inode,
2945 			     struct buffer_head *alloc_bh,
2946 			     unsigned int start_bit,
2947 			     u64 bg_blkno,
2948 			     unsigned int count)
2949 {
2950 	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2951 					 start_bit, bg_blkno, count, NULL);
2952 }
2953 
2954 int ocfs2_free_dinode(handle_t *handle,
2955 		      struct inode *inode_alloc_inode,
2956 		      struct buffer_head *inode_alloc_bh,
2957 		      struct ocfs2_dinode *di)
2958 {
2959 	u64 blk = le64_to_cpu(di->i_blkno);
2960 	u16 bit = le16_to_cpu(di->i_suballoc_bit);
2961 	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2962 
2963 	if (di->i_suballoc_loc)
2964 		bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2965 	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2966 					inode_alloc_bh, bit, bg_blkno, 1);
2967 }
2968 
2969 static int _ocfs2_free_clusters(handle_t *handle,
2970 				struct inode *bitmap_inode,
2971 				struct buffer_head *bitmap_bh,
2972 				u64 start_blk,
2973 				unsigned int num_clusters,
2974 				void (*undo_fn)(unsigned int bit,
2975 						unsigned long *bitmap))
2976 {
2977 	int status;
2978 	u16 bg_start_bit;
2979 	u64 bg_blkno;
2980 
2981 	/* You can't ever have a contiguous set of clusters
2982 	 * bigger than a block group bitmap so we never have to worry
2983 	 * about looping on them.
2984 	 * This is expensive. We can safely remove once this stuff has
2985 	 * gotten tested really well. */
2986 	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
2987 				ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
2988 							 start_blk)));
2989 
2990 
2991 	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2992 				     &bg_start_bit);
2993 
2994 	trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
2995 			(unsigned long long)start_blk,
2996 			bg_start_bit, num_clusters);
2997 
2998 	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2999 					   bg_start_bit, bg_blkno,
3000 					   num_clusters, undo_fn);
3001 	if (status < 0) {
3002 		mlog_errno(status);
3003 		goto out;
3004 	}
3005 
3006 	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
3007 					 num_clusters);
3008 
3009 out:
3010 	return status;
3011 }
3012 
/*
 * Give previously-used clusters back to the global bitmap.  Passes
 * _ocfs2_set_bit as the undo function so the freed bits are protected
 * in the journal undo buffer until the transaction commits (contrast
 * with ocfs2_release_clusters() below, which skips that protection
 * for never-used clusters).
 */
int ocfs2_free_clusters(handle_t *handle,
			struct inode *bitmap_inode,
			struct buffer_head *bitmap_bh,
			u64 start_blk,
			unsigned int num_clusters)
{
	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
				    start_blk, num_clusters,
				    _ocfs2_set_bit);
}
3023 
3024 /*
3025  * Give never-used clusters back to the global bitmap.  We don't need
3026  * to protect these bits in the undo buffer.
3027  */
3028 int ocfs2_release_clusters(handle_t *handle,
3029 			   struct inode *bitmap_inode,
3030 			   struct buffer_head *bitmap_bh,
3031 			   u64 start_blk,
3032 			   unsigned int num_clusters)
3033 {
3034 	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
3035 				    start_blk, num_clusters,
3036 				    _ocfs2_clear_bit);
3037 }
3038 
3039 /*
3040  * For a given allocation, determine which allocators will need to be
3041  * accessed, and lock them, reserving the appropriate number of bits.
3042  *
3043  * Sparse file systems call this from ocfs2_write_begin_nolock()
3044  * and ocfs2_allocate_unwritten_extents().
3045  *
3046  * File systems which don't support holes call this from
3047  * ocfs2_extend_allocation().
3048  */
3049 int ocfs2_lock_allocators(struct inode *inode,
3050 			  struct ocfs2_extent_tree *et,
3051 			  u32 clusters_to_add, u32 extents_to_split,
3052 			  struct ocfs2_alloc_context **data_ac,
3053 			  struct ocfs2_alloc_context **meta_ac)
3054 {
3055 	int ret = 0, num_free_extents;
3056 	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
3057 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3058 
3059 	*meta_ac = NULL;
3060 	if (data_ac)
3061 		*data_ac = NULL;
3062 
3063 	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
3064 
3065 	num_free_extents = ocfs2_num_free_extents(et);
3066 	if (num_free_extents < 0) {
3067 		ret = num_free_extents;
3068 		mlog_errno(ret);
3069 		goto out;
3070 	}
3071 
3072 	/*
3073 	 * Sparse allocation file systems need to be more conservative
3074 	 * with reserving room for expansion - the actual allocation
3075 	 * happens while we've got a journal handle open so re-taking
3076 	 * a cluster lock (because we ran out of room for another
3077 	 * extent) will violate ordering rules.
3078 	 *
3079 	 * Most of the time we'll only be seeing this 1 cluster at a time
3080 	 * anyway.
3081 	 *
3082 	 * Always lock for any unwritten extents - we might want to
3083 	 * add blocks during a split.
3084 	 */
3085 	if (!num_free_extents ||
3086 	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
3087 		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
3088 		if (ret < 0) {
3089 			if (ret != -ENOSPC)
3090 				mlog_errno(ret);
3091 			goto out;
3092 		}
3093 	}
3094 
3095 	if (clusters_to_add == 0)
3096 		goto out;
3097 
3098 	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
3099 	if (ret < 0) {
3100 		if (ret != -ENOSPC)
3101 			mlog_errno(ret);
3102 		goto out;
3103 	}
3104 
3105 out:
3106 	if (ret) {
3107 		if (*meta_ac) {
3108 			ocfs2_free_alloc_context(*meta_ac);
3109 			*meta_ac = NULL;
3110 		}
3111 
3112 		/*
3113 		 * We cannot have an error and a non null *data_ac.
3114 		 */
3115 	}
3116 
3117 	return ret;
3118 }
3119 
3120 /*
3121  * Read the inode specified by blkno to get suballoc_slot and
3122  * suballoc_bit.
3123  */
3124 static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
3125 				       u16 *suballoc_slot, u64 *group_blkno,
3126 				       u16 *suballoc_bit)
3127 {
3128 	int status;
3129 	struct buffer_head *inode_bh = NULL;
3130 	struct ocfs2_dinode *inode_fe;
3131 
3132 	trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
3133 
3134 	/* dirty read disk */
3135 	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
3136 	if (status < 0) {
3137 		mlog(ML_ERROR, "read block %llu failed %d\n",
3138 		     (unsigned long long)blkno, status);
3139 		goto bail;
3140 	}
3141 
3142 	inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
3143 	if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
3144 		mlog(ML_ERROR, "invalid inode %llu requested\n",
3145 		     (unsigned long long)blkno);
3146 		status = -EINVAL;
3147 		goto bail;
3148 	}
3149 
3150 	if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
3151 	    (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
3152 		mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
3153 		     (unsigned long long)blkno,
3154 		     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
3155 		status = -EINVAL;
3156 		goto bail;
3157 	}
3158 
3159 	if (suballoc_slot)
3160 		*suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
3161 	if (suballoc_bit)
3162 		*suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
3163 	if (group_blkno)
3164 		*group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
3165 
3166 bail:
3167 	brelse(inode_bh);
3168 
3169 	if (status)
3170 		mlog_errno(status);
3171 	return status;
3172 }
3173 
3174 /*
3175  * test whether bit is SET in allocator bitmap or not.  on success, 0
3176  * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
3177  * is returned and *res is meaningless.  Call this after you have
3178  * cluster locked against suballoc, or you may get a result based on
3179  * non-up2date contents
3180  */
3181 static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
3182 				   struct inode *suballoc,
3183 				   struct buffer_head *alloc_bh,
3184 				   u64 group_blkno, u64 blkno,
3185 				   u16 bit, int *res)
3186 {
3187 	struct ocfs2_dinode *alloc_di;
3188 	struct ocfs2_group_desc *group;
3189 	struct buffer_head *group_bh = NULL;
3190 	u64 bg_blkno;
3191 	int status, quiet = 0, released = 0;
3192 
3193 	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
3194 				      (unsigned int)bit);
3195 
3196 	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
3197 	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
3198 		mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
3199 		     (unsigned int)bit,
3200 		     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
3201 		status = -EINVAL;
3202 		goto bail;
3203 	}
3204 
3205 	bg_blkno = group_blkno ? group_blkno :
3206 		   ocfs2_which_suballoc_group(blkno, bit);
3207 	status = ocfs2_read_hint_group_descriptor(suballoc, alloc_di, bg_blkno,
3208 					     &group_bh, &released);
3209 	if (released) {
3210 		quiet = 1;
3211 		status = -ESTALE;
3212 		goto bail;
3213 	} else if (status < 0) {
3214 		mlog(ML_ERROR, "read group %llu failed %d\n",
3215 		     (unsigned long long)bg_blkno, status);
3216 		goto bail;
3217 	}
3218 
3219 	group = (struct ocfs2_group_desc *) group_bh->b_data;
3220 	*res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
3221 
3222 bail:
3223 	brelse(group_bh);
3224 
3225 	if (status && !quiet)
3226 		mlog_errno(status);
3227 	return status;
3228 }
3229 
3230 /*
3231  * Test if the bit representing this inode (blkno) is set in the
3232  * suballocator.
3233  *
3234  * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
3235  *
3236  * In the event of failure, a negative value is returned and *res is
3237  * meaningless.
3238  *
3239  * Callers must make sure to hold nfs_sync_lock to prevent
3240  * ocfs2_delete_inode() on another node from accessing the same
3241  * suballocator concurrently.
3242  */
int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
{
	int status, quiet = 0;
	u64 group_blkno = 0;
	u16 suballoc_bit = 0, suballoc_slot = 0;
	struct inode *inode_alloc_inode;
	struct buffer_head *alloc_bh = NULL;

	trace_ocfs2_test_inode_bit((unsigned long long)blkno);

	/* Find which allocator (slot + bit) is supposed to own blkno. */
	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
					     &group_blkno, &suballoc_bit);
	if (status < 0) {
		mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
		goto bail;
	}

	/* An invalid slot means the inode lives in the global allocator;
	 * otherwise use the per-slot inode allocator. */
	if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
			GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
	else
		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
			INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
	if (!inode_alloc_inode) {
		/* the error code could be inaccurate, but we are not able to
		 * get the correct one. */
		status = -EINVAL;
		mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
		     (u32)suballoc_slot);
		goto bail;
	}

	/* Take the VFS inode lock before the cluster lock; the unlocks
	 * below must mirror this order. */
	inode_lock(inode_alloc_inode);
	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
	if (status < 0) {
		inode_unlock(inode_alloc_inode);
		iput(inode_alloc_inode);
		mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
		     (u32)suballoc_slot, status);
		goto bail;
	}

	status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
					 group_blkno, blkno, suballoc_bit, res);
	if (status < 0) {
		/* -ESTALE means the group went away underneath us; it is
		 * not worth an error-level log. */
		if (status == -ESTALE)
			quiet = 1;
		else
			mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
	}

	ocfs2_inode_unlock(inode_alloc_inode, 0);
	inode_unlock(inode_alloc_inode);

	iput(inode_alloc_inode);
	brelse(alloc_bh);
bail:
	if (status && !quiet)
		mlog_errno(status);
	return status;
}
3304