xref: /linux/fs/ocfs2/suballoc.c (revision f3d9478b2ce468c3115b02ecae7e975990697f15)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 021110-1307, USA.
25  */
26 
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31 
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34 
35 #include "ocfs2.h"
36 
37 #include "alloc.h"
38 #include "dlmglue.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "sysfile.h"
45 #include "uptodate.h"
46 
47 #include "buffer_head_io.h"
48 
49 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
52 static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
53 				  struct inode *alloc_inode,
54 				  struct buffer_head *bg_bh,
55 				  u64 group_blkno,
56 				  u16 my_chain,
57 				  struct ocfs2_chain_list *cl);
58 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
59 				   struct inode *alloc_inode,
60 				   struct buffer_head *bh);
61 
62 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
63 				       struct ocfs2_alloc_context *ac);
64 
65 static int ocfs2_cluster_group_search(struct inode *inode,
66 				      struct buffer_head *group_bh,
67 				      u32 bits_wanted, u32 min_bits,
68 				      u16 *bit_off, u16 *bits_found);
69 static int ocfs2_block_group_search(struct inode *inode,
70 				    struct buffer_head *group_bh,
71 				    u32 bits_wanted, u32 min_bits,
72 				    u16 *bit_off, u16 *bits_found);
73 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
74 			      u32 bits_wanted,
75 			      u32 min_bits,
76 			      u16 *bit_off,
77 			      unsigned int *num_bits,
78 			      u64 *bg_blkno);
79 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
80 				     struct ocfs2_alloc_context *ac,
81 				     u32 bits_wanted,
82 				     u32 min_bits,
83 				     u16 *bit_off,
84 				     unsigned int *num_bits,
85 				     u64 *bg_blkno);
86 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
87 					 int nr);
88 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
89 					     struct buffer_head *bg_bh,
90 					     unsigned int bits_wanted,
91 					     u16 *bit_off,
92 					     u16 *bits_found);
93 static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
94 					     struct inode *alloc_inode,
95 					     struct ocfs2_group_desc *bg,
96 					     struct buffer_head *group_bh,
97 					     unsigned int bit_off,
98 					     unsigned int num_bits);
99 static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
100 					       struct inode *alloc_inode,
101 					       struct ocfs2_group_desc *bg,
102 					       struct buffer_head *group_bh,
103 					       unsigned int bit_off,
104 					       unsigned int num_bits);
105 
106 static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
107 				    struct inode *alloc_inode,
108 				    struct buffer_head *fe_bh,
109 				    struct buffer_head *bg_bh,
110 				    struct buffer_head *prev_bg_bh,
111 				    u16 chain);
112 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
113 						     u32 wanted);
114 static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
115 				    struct inode *alloc_inode,
116 				    struct buffer_head *alloc_bh,
117 				    unsigned int start_bit,
118 				    u64 bg_blkno,
119 				    unsigned int count);
120 static inline u64 ocfs2_which_suballoc_group(u64 block,
121 					     unsigned int bit);
122 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
123 						   u64 bg_blkno,
124 						   u16 bg_bit_off);
125 static inline u64 ocfs2_which_cluster_group(struct inode *inode,
126 					    u32 cluster);
127 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
128 						u64 data_blkno,
129 						u64 *bg_blkno,
130 						u16 *bg_bit_off);
131 
132 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
133 {
134 	if (ac->ac_inode)
135 		iput(ac->ac_inode);
136 	if (ac->ac_bh)
137 		brelse(ac->ac_bh);
138 	kfree(ac);
139 }
140 
141 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
142 {
143 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
144 }
145 
146 static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
147 				  struct inode *alloc_inode,
148 				  struct buffer_head *bg_bh,
149 				  u64 group_blkno,
150 				  u16 my_chain,
151 				  struct ocfs2_chain_list *cl)
152 {
153 	int status = 0;
154 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
155 	struct super_block * sb = alloc_inode->i_sb;
156 
157 	mlog_entry_void();
158 
159 	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
160 		ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
161 			    "b_blocknr (%llu)",
162 			    (unsigned long long)group_blkno,
163 			    (unsigned long long) bg_bh->b_blocknr);
164 		status = -EIO;
165 		goto bail;
166 	}
167 
168 	status = ocfs2_journal_access(handle,
169 				      alloc_inode,
170 				      bg_bh,
171 				      OCFS2_JOURNAL_ACCESS_CREATE);
172 	if (status < 0) {
173 		mlog_errno(status);
174 		goto bail;
175 	}
176 
177 	memset(bg, 0, sb->s_blocksize);
178 	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
179 	bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
180 	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
181 	bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
182 	bg->bg_chain = cpu_to_le16(my_chain);
183 	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
184 	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
185 	bg->bg_blkno = cpu_to_le64(group_blkno);
186 	/* set the 1st bit in the bitmap to account for the descriptor block */
187 	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
188 	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
189 
190 	status = ocfs2_journal_dirty(handle, bg_bh);
191 	if (status < 0)
192 		mlog_errno(status);
193 
194 	/* There is no need to zero out or otherwise initialize the
195 	 * other blocks in a group - All valid FS metadata in a block
196 	 * group stores the superblock fs_generation value at
197 	 * allocation time. */
198 
199 bail:
200 	mlog_exit(status);
201 	return status;
202 }
203 
204 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
205 {
206 	u16 curr, best;
207 
208 	best = curr = 0;
209 	while (curr < le16_to_cpu(cl->cl_count)) {
210 		if (le32_to_cpu(cl->cl_recs[best].c_total) >
211 		    le32_to_cpu(cl->cl_recs[curr].c_total))
212 			best = curr;
213 		curr++;
214 	}
215 	return best;
216 }
217 
218 /*
219  * We expect the block group allocator to already be locked.
220  */
221 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
222 				   struct inode *alloc_inode,
223 				   struct buffer_head *bh)
224 {
225 	int status, credits;
226 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
227 	struct ocfs2_chain_list *cl;
228 	struct ocfs2_alloc_context *ac = NULL;
229 	struct ocfs2_journal_handle *handle = NULL;
230 	u32 bit_off, num_bits;
231 	u16 alloc_rec;
232 	u64 bg_blkno;
233 	struct buffer_head *bg_bh = NULL;
234 	struct ocfs2_group_desc *bg;
235 
236 	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
237 
238 	mlog_entry_void();
239 
240 	handle = ocfs2_alloc_handle(osb);
241 	if (!handle) {
242 		status = -ENOMEM;
243 		mlog_errno(status);
244 		goto bail;
245 	}
246 
247 	cl = &fe->id2.i_chain;
248 	status = ocfs2_reserve_clusters(osb,
249 					handle,
250 					le16_to_cpu(cl->cl_cpg),
251 					&ac);
252 	if (status < 0) {
253 		if (status != -ENOSPC)
254 			mlog_errno(status);
255 		goto bail;
256 	}
257 
258 	credits = ocfs2_calc_group_alloc_credits(osb->sb,
259 						 le16_to_cpu(cl->cl_cpg));
260 	handle = ocfs2_start_trans(osb, handle, credits);
261 	if (IS_ERR(handle)) {
262 		status = PTR_ERR(handle);
263 		handle = NULL;
264 		mlog_errno(status);
265 		goto bail;
266 	}
267 
268 	status = ocfs2_claim_clusters(osb,
269 				      handle,
270 				      ac,
271 				      le16_to_cpu(cl->cl_cpg),
272 				      &bit_off,
273 				      &num_bits);
274 	if (status < 0) {
275 		if (status != -ENOSPC)
276 			mlog_errno(status);
277 		goto bail;
278 	}
279 
280 	alloc_rec = ocfs2_find_smallest_chain(cl);
281 
282 	/* setup the group */
283 	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
284 	mlog(0, "new descriptor, record %u, at block %llu\n",
285 	     alloc_rec, (unsigned long long)bg_blkno);
286 
287 	bg_bh = sb_getblk(osb->sb, bg_blkno);
288 	if (!bg_bh) {
289 		status = -EIO;
290 		mlog_errno(status);
291 		goto bail;
292 	}
293 	ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
294 
295 	status = ocfs2_block_group_fill(handle,
296 					alloc_inode,
297 					bg_bh,
298 					bg_blkno,
299 					alloc_rec,
300 					cl);
301 	if (status < 0) {
302 		mlog_errno(status);
303 		goto bail;
304 	}
305 
306 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
307 
308 	status = ocfs2_journal_access(handle, alloc_inode,
309 				      bh, OCFS2_JOURNAL_ACCESS_WRITE);
310 	if (status < 0) {
311 		mlog_errno(status);
312 		goto bail;
313 	}
314 
315 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
316 		     le16_to_cpu(bg->bg_free_bits_count));
317 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
318 	cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
319 	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
320 		le16_add_cpu(&cl->cl_next_free_rec, 1);
321 
322 	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
323 					le16_to_cpu(bg->bg_free_bits_count));
324 	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
325 	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
326 
327 	status = ocfs2_journal_dirty(handle, bh);
328 	if (status < 0) {
329 		mlog_errno(status);
330 		goto bail;
331 	}
332 
333 	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
334 	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
335 	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
336 					     le32_to_cpu(fe->i_clusters)));
337 	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
338 	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
339 	alloc_inode->i_blocks =
340 		ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
341 
342 	status = 0;
343 bail:
344 	if (handle)
345 		ocfs2_commit_trans(handle);
346 
347 	if (ac)
348 		ocfs2_free_alloc_context(ac);
349 
350 	if (bg_bh)
351 		brelse(bg_bh);
352 
353 	mlog_exit(status);
354 	return status;
355 }
356 
357 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
358 				       struct ocfs2_alloc_context *ac)
359 {
360 	int status;
361 	u32 bits_wanted = ac->ac_bits_wanted;
362 	struct inode *alloc_inode = ac->ac_inode;
363 	struct buffer_head *bh = NULL;
364 	struct ocfs2_journal_handle *handle = ac->ac_handle;
365 	struct ocfs2_dinode *fe;
366 	u32 free_bits;
367 
368 	mlog_entry_void();
369 
370 	BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
371 
372 	ocfs2_handle_add_inode(handle, alloc_inode);
373 	status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
374 	if (status < 0) {
375 		mlog_errno(status);
376 		goto bail;
377 	}
378 
379 	fe = (struct ocfs2_dinode *) bh->b_data;
380 	if (!OCFS2_IS_VALID_DINODE(fe)) {
381 		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
382 		status = -EIO;
383 		goto bail;
384 	}
385 	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
386 		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
387 			    (unsigned long long)le64_to_cpu(fe->i_blkno));
388 		status = -EIO;
389 		goto bail;
390 	}
391 
392 	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
393 		le32_to_cpu(fe->id1.bitmap1.i_used);
394 
395 	if (bits_wanted > free_bits) {
396 		/* cluster bitmap never grows */
397 		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
398 			mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
399 			     bits_wanted, free_bits);
400 			status = -ENOSPC;
401 			goto bail;
402 		}
403 
404 		status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
405 		if (status < 0) {
406 			if (status != -ENOSPC)
407 				mlog_errno(status);
408 			goto bail;
409 		}
410 		atomic_inc(&osb->alloc_stats.bg_extends);
411 
412 		/* You should never ask for this much metadata */
413 		BUG_ON(bits_wanted >
414 		       (le32_to_cpu(fe->id1.bitmap1.i_total)
415 			- le32_to_cpu(fe->id1.bitmap1.i_used)));
416 	}
417 
418 	get_bh(bh);
419 	ac->ac_bh = bh;
420 bail:
421 	if (bh)
422 		brelse(bh);
423 
424 	mlog_exit(status);
425 	return status;
426 }
427 
428 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
429 			       struct ocfs2_journal_handle *handle,
430 			       struct ocfs2_dinode *fe,
431 			       struct ocfs2_alloc_context **ac)
432 {
433 	int status;
434 	struct inode *alloc_inode = NULL;
435 
436 	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
437 	if (!(*ac)) {
438 		status = -ENOMEM;
439 		mlog_errno(status);
440 		goto bail;
441 	}
442 
443 	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
444 	(*ac)->ac_handle = handle;
445 	(*ac)->ac_which = OCFS2_AC_USE_META;
446 
447 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
448 	alloc_inode = ocfs2_get_system_file_inode(osb,
449 						  EXTENT_ALLOC_SYSTEM_INODE,
450 						  0);
451 #else
452 	alloc_inode = ocfs2_get_system_file_inode(osb,
453 						  EXTENT_ALLOC_SYSTEM_INODE,
454 						  osb->slot_num);
455 #endif
456 	if (!alloc_inode) {
457 		status = -ENOMEM;
458 		mlog_errno(status);
459 		goto bail;
460 	}
461 
462 	(*ac)->ac_inode = igrab(alloc_inode);
463 	(*ac)->ac_group_search = ocfs2_block_group_search;
464 
465 	status = ocfs2_reserve_suballoc_bits(osb, (*ac));
466 	if (status < 0) {
467 		if (status != -ENOSPC)
468 			mlog_errno(status);
469 		goto bail;
470 	}
471 
472 	status = 0;
473 bail:
474 	if ((status < 0) && *ac) {
475 		ocfs2_free_alloc_context(*ac);
476 		*ac = NULL;
477 	}
478 
479 	if (alloc_inode)
480 		iput(alloc_inode);
481 
482 	mlog_exit(status);
483 	return status;
484 }
485 
486 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
487 			    struct ocfs2_journal_handle *handle,
488 			    struct ocfs2_alloc_context **ac)
489 {
490 	int status;
491 	struct inode *alloc_inode = NULL;
492 
493 	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
494 	if (!(*ac)) {
495 		status = -ENOMEM;
496 		mlog_errno(status);
497 		goto bail;
498 	}
499 
500 	(*ac)->ac_bits_wanted = 1;
501 	(*ac)->ac_handle = handle;
502 	(*ac)->ac_which = OCFS2_AC_USE_INODE;
503 
504 	alloc_inode = ocfs2_get_system_file_inode(osb,
505 						  INODE_ALLOC_SYSTEM_INODE,
506 						  osb->slot_num);
507 	if (!alloc_inode) {
508 		status = -ENOMEM;
509 		mlog_errno(status);
510 		goto bail;
511 	}
512 
513 	(*ac)->ac_inode = igrab(alloc_inode);
514 	(*ac)->ac_group_search = ocfs2_block_group_search;
515 
516 	status = ocfs2_reserve_suballoc_bits(osb, *ac);
517 	if (status < 0) {
518 		if (status != -ENOSPC)
519 			mlog_errno(status);
520 		goto bail;
521 	}
522 
523 	status = 0;
524 bail:
525 	if ((status < 0) && *ac) {
526 		ocfs2_free_alloc_context(*ac);
527 		*ac = NULL;
528 	}
529 
530 	if (alloc_inode)
531 		iput(alloc_inode);
532 
533 	mlog_exit(status);
534 	return status;
535 }
536 
537 /* local alloc code has to do the same thing, so rather than do this
538  * twice.. */
539 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
540 				      struct ocfs2_alloc_context *ac)
541 {
542 	int status;
543 
544 	ac->ac_inode = ocfs2_get_system_file_inode(osb,
545 						   GLOBAL_BITMAP_SYSTEM_INODE,
546 						   OCFS2_INVALID_SLOT);
547 	if (!ac->ac_inode) {
548 		status = -EINVAL;
549 		mlog(ML_ERROR, "Could not get bitmap inode!\n");
550 		goto bail;
551 	}
552 	ac->ac_which = OCFS2_AC_USE_MAIN;
553 	ac->ac_group_search = ocfs2_cluster_group_search;
554 
555 	status = ocfs2_reserve_suballoc_bits(osb, ac);
556 	if (status < 0 && status != -ENOSPC)
557 		mlog_errno(status);
558 bail:
559 	return status;
560 }
561 
562 /* Callers don't need to care which bitmap (local alloc or main) to
563  * use so we figure it out for them, but unfortunately this clutters
564  * things a bit. */
565 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
566 			   struct ocfs2_journal_handle *handle,
567 			   u32 bits_wanted,
568 			   struct ocfs2_alloc_context **ac)
569 {
570 	int status;
571 
572 	mlog_entry_void();
573 
574 	BUG_ON(!handle);
575 
576 	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
577 	if (!(*ac)) {
578 		status = -ENOMEM;
579 		mlog_errno(status);
580 		goto bail;
581 	}
582 
583 	(*ac)->ac_bits_wanted = bits_wanted;
584 	(*ac)->ac_handle = handle;
585 
586 	status = -ENOSPC;
587 	if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
588 		status = ocfs2_reserve_local_alloc_bits(osb,
589 							handle,
590 							bits_wanted,
591 							*ac);
592 		if ((status < 0) && (status != -ENOSPC)) {
593 			mlog_errno(status);
594 			goto bail;
595 		} else if (status == -ENOSPC) {
596 			/* reserve_local_bits will return enospc with
597 			 * the local alloc inode still locked, so we
598 			 * can change this safely here. */
599 			mlog(0, "Disabling local alloc\n");
600 			/* We set to OCFS2_LA_DISABLED so that umount
601 			 * can clean up what's left of the local
602 			 * allocation */
603 			osb->local_alloc_state = OCFS2_LA_DISABLED;
604 		}
605 	}
606 
607 	if (status == -ENOSPC) {
608 		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
609 		if (status < 0) {
610 			if (status != -ENOSPC)
611 				mlog_errno(status);
612 			goto bail;
613 		}
614 	}
615 
616 	status = 0;
617 bail:
618 	if ((status < 0) && *ac) {
619 		ocfs2_free_alloc_context(*ac);
620 		*ac = NULL;
621 	}
622 
623 	mlog_exit(status);
624 	return status;
625 }
626 
627 /*
628  * More or less lifted from ext3. I'll leave their description below:
629  *
630  * "For ext3 allocations, we must not reuse any blocks which are
631  * allocated in the bitmap buffer's "last committed data" copy.  This
632  * prevents deletes from freeing up the page for reuse until we have
633  * committed the delete transaction.
634  *
635  * If we didn't do this, then deleting something and reallocating it as
636  * data would allow the old block to be overwritten before the
637  * transaction committed (because we force data to disk before commit).
638  * This would lead to corruption if we crashed between overwriting the
639  * data and committing the delete.
640  *
641  * @@@ We may want to make this allocation behaviour conditional on
642  * data-writes at some point, and disable it for metadata allocations or
643  * sync-data inodes."
644  *
645  * Note: OCFS2 already does this differently for metadata vs data
646  * allocations, as those bitmaps are seperate and undo access is never
647  * called on a metadata group descriptor.
648  */
649 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
650 					 int nr)
651 {
652 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
653 
654 	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
655 		return 0;
656 	if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
657 		return 1;
658 
659 	bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
660 	return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
661 }
662 
663 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
664 					     struct buffer_head *bg_bh,
665 					     unsigned int bits_wanted,
666 					     u16 *bit_off,
667 					     u16 *bits_found)
668 {
669 	void *bitmap;
670 	u16 best_offset, best_size;
671 	int offset, start, found, status = 0;
672 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
673 
674 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
675 		OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
676 		return -EIO;
677 	}
678 
679 	found = start = best_offset = best_size = 0;
680 	bitmap = bg->bg_bitmap;
681 
682 	while((offset = ocfs2_find_next_zero_bit(bitmap,
683 						 le16_to_cpu(bg->bg_bits),
684 						 start)) != -1) {
685 		if (offset == le16_to_cpu(bg->bg_bits))
686 			break;
687 
688 		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
689 			/* We found a zero, but we can't use it as it
690 			 * hasn't been put to disk yet! */
691 			found = 0;
692 			start = offset + 1;
693 		} else if (offset == start) {
694 			/* we found a zero */
695 			found++;
696 			/* move start to the next bit to test */
697 			start++;
698 		} else {
699 			/* got a zero after some ones */
700 			found = 1;
701 			start = offset + 1;
702 		}
703 		if (found > best_size) {
704 			best_size = found;
705 			best_offset = start - found;
706 		}
707 		/* we got everything we needed */
708 		if (found == bits_wanted) {
709 			/* mlog(0, "Found it all!\n"); */
710 			break;
711 		}
712 	}
713 
714 	/* XXX: I think the first clause is equivalent to the second
715 	 * 	- jlbec */
716 	if (found == bits_wanted) {
717 		*bit_off = start - found;
718 		*bits_found = found;
719 	} else if (best_size) {
720 		*bit_off = best_offset;
721 		*bits_found = best_size;
722 	} else {
723 		status = -ENOSPC;
724 		/* No error log here -- see the comment above
725 		 * ocfs2_test_bg_bit_allocatable */
726 	}
727 
728 	return status;
729 }
730 
731 static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
732 					     struct inode *alloc_inode,
733 					     struct ocfs2_group_desc *bg,
734 					     struct buffer_head *group_bh,
735 					     unsigned int bit_off,
736 					     unsigned int num_bits)
737 {
738 	int status;
739 	void *bitmap = bg->bg_bitmap;
740 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
741 
742 	mlog_entry_void();
743 
744 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
745 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
746 		status = -EIO;
747 		goto bail;
748 	}
749 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
750 
751 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
752 	     num_bits);
753 
754 	if (ocfs2_is_cluster_bitmap(alloc_inode))
755 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
756 
757 	status = ocfs2_journal_access(handle,
758 				      alloc_inode,
759 				      group_bh,
760 				      journal_type);
761 	if (status < 0) {
762 		mlog_errno(status);
763 		goto bail;
764 	}
765 
766 	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
767 
768 	while(num_bits--)
769 		ocfs2_set_bit(bit_off++, bitmap);
770 
771 	status = ocfs2_journal_dirty(handle,
772 				     group_bh);
773 	if (status < 0) {
774 		mlog_errno(status);
775 		goto bail;
776 	}
777 
778 bail:
779 	mlog_exit(status);
780 	return status;
781 }
782 
783 /* find the one with the most empty bits */
784 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
785 {
786 	u16 curr, best;
787 
788 	BUG_ON(!cl->cl_next_free_rec);
789 
790 	best = curr = 0;
791 	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
792 		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
793 		    le32_to_cpu(cl->cl_recs[best].c_free))
794 			best = curr;
795 		curr++;
796 	}
797 
798 	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
799 	return best;
800 }
801 
802 static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
803 				    struct inode *alloc_inode,
804 				    struct buffer_head *fe_bh,
805 				    struct buffer_head *bg_bh,
806 				    struct buffer_head *prev_bg_bh,
807 				    u16 chain)
808 {
809 	int status;
810 	/* there is a really tiny chance the journal calls could fail,
811 	 * but we wouldn't want inconsistent blocks in *any* case. */
812 	u64 fe_ptr, bg_ptr, prev_bg_ptr;
813 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
814 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
815 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
816 
817 	if (!OCFS2_IS_VALID_DINODE(fe)) {
818 		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
819 		status = -EIO;
820 		goto out;
821 	}
822 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
823 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
824 		status = -EIO;
825 		goto out;
826 	}
827 	if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
828 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
829 		status = -EIO;
830 		goto out;
831 	}
832 
833 	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
834 	     (unsigned long long)fe->i_blkno, chain,
835 	     (unsigned long long)bg->bg_blkno,
836 	     (unsigned long long)prev_bg->bg_blkno);
837 
838 	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
839 	bg_ptr = le64_to_cpu(bg->bg_next_group);
840 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
841 
842 	status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
843 				      OCFS2_JOURNAL_ACCESS_WRITE);
844 	if (status < 0) {
845 		mlog_errno(status);
846 		goto out_rollback;
847 	}
848 
849 	prev_bg->bg_next_group = bg->bg_next_group;
850 
851 	status = ocfs2_journal_dirty(handle, prev_bg_bh);
852 	if (status < 0) {
853 		mlog_errno(status);
854 		goto out_rollback;
855 	}
856 
857 	status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
858 				      OCFS2_JOURNAL_ACCESS_WRITE);
859 	if (status < 0) {
860 		mlog_errno(status);
861 		goto out_rollback;
862 	}
863 
864 	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
865 
866 	status = ocfs2_journal_dirty(handle, bg_bh);
867 	if (status < 0) {
868 		mlog_errno(status);
869 		goto out_rollback;
870 	}
871 
872 	status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
873 				      OCFS2_JOURNAL_ACCESS_WRITE);
874 	if (status < 0) {
875 		mlog_errno(status);
876 		goto out_rollback;
877 	}
878 
879 	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
880 
881 	status = ocfs2_journal_dirty(handle, fe_bh);
882 	if (status < 0) {
883 		mlog_errno(status);
884 		goto out_rollback;
885 	}
886 
887 	status = 0;
888 out_rollback:
889 	if (status < 0) {
890 		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
891 		bg->bg_next_group = cpu_to_le64(bg_ptr);
892 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
893 	}
894 out:
895 	mlog_exit(status);
896 	return status;
897 }
898 
899 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
900 						     u32 wanted)
901 {
902 	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
903 }
904 
905 /* return 0 on success, -ENOSPC to keep searching and any other < 0
906  * value on error. */
907 static int ocfs2_cluster_group_search(struct inode *inode,
908 				      struct buffer_head *group_bh,
909 				      u32 bits_wanted, u32 min_bits,
910 				      u16 *bit_off, u16 *bits_found)
911 {
912 	int search = -ENOSPC;
913 	int ret;
914 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
915 	u16 tmp_off, tmp_found;
916 
917 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
918 
919 	if (bg->bg_free_bits_count) {
920 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
921 							group_bh, bits_wanted,
922 							&tmp_off, &tmp_found);
923 		if (ret)
924 			return ret;
925 
926 		/* ocfs2_block_group_find_clear_bits() might
927 		 * return success, but we still want to return
928 		 * -ENOSPC unless it found the minimum number
929 		 * of bits. */
930 		if (min_bits <= tmp_found) {
931 			*bit_off = tmp_off;
932 			*bits_found = tmp_found;
933 			search = 0; /* success */
934 		}
935 	}
936 
937 	return search;
938 }
939 
940 static int ocfs2_block_group_search(struct inode *inode,
941 				    struct buffer_head *group_bh,
942 				    u32 bits_wanted, u32 min_bits,
943 				    u16 *bit_off, u16 *bits_found)
944 {
945 	int ret = -ENOSPC;
946 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
947 
948 	BUG_ON(min_bits != 1);
949 	BUG_ON(ocfs2_is_cluster_bitmap(inode));
950 
951 	if (bg->bg_free_bits_count)
952 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
953 							group_bh, bits_wanted,
954 							bit_off, bits_found);
955 
956 	return ret;
957 }
958 
959 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
960 			      u32 bits_wanted,
961 			      u32 min_bits,
962 			      u16 *bit_off,
963 			      unsigned int *num_bits,
964 			      u64 *bg_blkno)
965 {
966 	int status;
967 	u16 chain, tmp_bits;
968 	u32 tmp_used;
969 	u64 next_group;
970 	struct ocfs2_journal_handle *handle = ac->ac_handle;
971 	struct inode *alloc_inode = ac->ac_inode;
972 	struct buffer_head *group_bh = NULL;
973 	struct buffer_head *prev_group_bh = NULL;
974 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
975 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
976 	struct ocfs2_group_desc *bg;
977 
978 	chain = ac->ac_chain;
979 	mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
980 	     bits_wanted, chain,
981 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
982 
983 	status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
984 				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
985 				  &group_bh, OCFS2_BH_CACHED, alloc_inode);
986 	if (status < 0) {
987 		mlog_errno(status);
988 		goto bail;
989 	}
990 	bg = (struct ocfs2_group_desc *) group_bh->b_data;
991 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
992 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
993 		status = -EIO;
994 		goto bail;
995 	}
996 
997 	status = -ENOSPC;
998 	/* for now, the chain search is a bit simplistic. We just use
999 	 * the 1st group with any empty bits. */
1000 	while ((status = ac->ac_group_search(alloc_inode, group_bh,
1001 					     bits_wanted, min_bits, bit_off,
1002 					     &tmp_bits)) == -ENOSPC) {
1003 		if (!bg->bg_next_group)
1004 			break;
1005 
1006 		if (prev_group_bh) {
1007 			brelse(prev_group_bh);
1008 			prev_group_bh = NULL;
1009 		}
1010 		next_group = le64_to_cpu(bg->bg_next_group);
1011 		prev_group_bh = group_bh;
1012 		group_bh = NULL;
1013 		status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1014 					  next_group, &group_bh,
1015 					  OCFS2_BH_CACHED, alloc_inode);
1016 		if (status < 0) {
1017 			mlog_errno(status);
1018 			goto bail;
1019 		}
1020 		bg = (struct ocfs2_group_desc *) group_bh->b_data;
1021 		if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1022 			OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1023 			status = -EIO;
1024 			goto bail;
1025 		}
1026 	}
1027 	if (status < 0) {
1028 		if (status != -ENOSPC)
1029 			mlog_errno(status);
1030 		goto bail;
1031 	}
1032 
1033 	mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1034 	     tmp_bits, (unsigned long long)bg->bg_blkno);
1035 
1036 	*num_bits = tmp_bits;
1037 
1038 	BUG_ON(*num_bits == 0);
1039 
1040 	/*
1041 	 * Keep track of previous block descriptor read. When
1042 	 * we find a target, if we have read more than X
1043 	 * number of descriptors, and the target is reasonably
1044 	 * empty, relink him to top of his chain.
1045 	 *
1046 	 * We've read 0 extra blocks and only send one more to
1047 	 * the transaction, yet the next guy to search has a
1048 	 * much easier time.
1049 	 *
1050 	 * Do this *after* figuring out how many bits we're taking out
1051 	 * of our target group.
1052 	 */
1053 	if (ac->ac_allow_chain_relink &&
1054 	    (prev_group_bh) &&
1055 	    (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1056 		status = ocfs2_relink_block_group(handle, alloc_inode,
1057 						  ac->ac_bh, group_bh,
1058 						  prev_group_bh, chain);
1059 		if (status < 0) {
1060 			mlog_errno(status);
1061 			goto bail;
1062 		}
1063 	}
1064 
1065 	/* Ok, claim our bits now: set the info on dinode, chainlist
1066 	 * and then the group */
1067 	status = ocfs2_journal_access(handle,
1068 				      alloc_inode,
1069 				      ac->ac_bh,
1070 				      OCFS2_JOURNAL_ACCESS_WRITE);
1071 	if (status < 0) {
1072 		mlog_errno(status);
1073 		goto bail;
1074 	}
1075 
1076 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1077 	fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1078 	le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1079 
1080 	status = ocfs2_journal_dirty(handle,
1081 				     ac->ac_bh);
1082 	if (status < 0) {
1083 		mlog_errno(status);
1084 		goto bail;
1085 	}
1086 
1087 	status = ocfs2_block_group_set_bits(handle,
1088 					    alloc_inode,
1089 					    bg,
1090 					    group_bh,
1091 					    *bit_off,
1092 					    *num_bits);
1093 	if (status < 0) {
1094 		mlog_errno(status);
1095 		goto bail;
1096 	}
1097 
1098 	mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1099 	     (unsigned long long)fe->i_blkno);
1100 
1101 	*bg_blkno = le64_to_cpu(bg->bg_blkno);
1102 bail:
1103 	if (group_bh)
1104 		brelse(group_bh);
1105 	if (prev_group_bh)
1106 		brelse(prev_group_bh);
1107 
1108 	mlog_exit(status);
1109 	return status;
1110 }
1111 
1112 /* will give out up to bits_wanted contiguous bits. */
1113 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1114 				     struct ocfs2_alloc_context *ac,
1115 				     u32 bits_wanted,
1116 				     u32 min_bits,
1117 				     u16 *bit_off,
1118 				     unsigned int *num_bits,
1119 				     u64 *bg_blkno)
1120 {
1121 	int status;
1122 	u16 victim, i;
1123 	struct ocfs2_chain_list *cl;
1124 	struct ocfs2_dinode *fe;
1125 
1126 	mlog_entry_void();
1127 
1128 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1129 	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1130 	BUG_ON(!ac->ac_bh);
1131 
1132 	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1133 	if (!OCFS2_IS_VALID_DINODE(fe)) {
1134 		OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
1135 		status = -EIO;
1136 		goto bail;
1137 	}
1138 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1139 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
1140 		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1141 			    "bits but only %u total.",
1142 			    (unsigned long long)le64_to_cpu(fe->i_blkno),
1143 			    le32_to_cpu(fe->id1.bitmap1.i_used),
1144 			    le32_to_cpu(fe->id1.bitmap1.i_total));
1145 		status = -EIO;
1146 		goto bail;
1147 	}
1148 
1149 	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1150 
1151 	victim = ocfs2_find_victim_chain(cl);
1152 	ac->ac_chain = victim;
1153 	ac->ac_allow_chain_relink = 1;
1154 
1155 	status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
1156 				    num_bits, bg_blkno);
1157 	if (!status)
1158 		goto bail;
1159 	if (status < 0 && status != -ENOSPC) {
1160 		mlog_errno(status);
1161 		goto bail;
1162 	}
1163 
1164 	mlog(0, "Search of victim chain %u came up with nothing, "
1165 	     "trying all chains now.\n", victim);
1166 
1167 	/* If we didn't pick a good victim, then just default to
1168 	 * searching each chain in order. Don't allow chain relinking
1169 	 * because we only calculate enough journal credits for one
1170 	 * relink per alloc. */
1171 	ac->ac_allow_chain_relink = 0;
1172 	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1173 		if (i == victim)
1174 			continue;
1175 		if (!cl->cl_recs[i].c_free)
1176 			continue;
1177 
1178 		ac->ac_chain = i;
1179 		status = ocfs2_search_chain(ac, bits_wanted, min_bits,
1180 					    bit_off, num_bits,
1181 					    bg_blkno);
1182 		if (!status)
1183 			break;
1184 		if (status < 0 && status != -ENOSPC) {
1185 			mlog_errno(status);
1186 			goto bail;
1187 		}
1188 	}
1189 bail:
1190 
1191 	mlog_exit(status);
1192 	return status;
1193 }
1194 
1195 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1196 			 struct ocfs2_journal_handle *handle,
1197 			 struct ocfs2_alloc_context *ac,
1198 			 u32 bits_wanted,
1199 			 u16 *suballoc_bit_start,
1200 			 unsigned int *num_bits,
1201 			 u64 *blkno_start)
1202 {
1203 	int status;
1204 	u64 bg_blkno;
1205 
1206 	BUG_ON(!ac);
1207 	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1208 	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1209 	BUG_ON(ac->ac_handle != handle);
1210 
1211 	status = ocfs2_claim_suballoc_bits(osb,
1212 					   ac,
1213 					   bits_wanted,
1214 					   1,
1215 					   suballoc_bit_start,
1216 					   num_bits,
1217 					   &bg_blkno);
1218 	if (status < 0) {
1219 		mlog_errno(status);
1220 		goto bail;
1221 	}
1222 	atomic_inc(&osb->alloc_stats.bg_allocs);
1223 
1224 	*blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1225 	ac->ac_bits_given += (*num_bits);
1226 	status = 0;
1227 bail:
1228 	mlog_exit(status);
1229 	return status;
1230 }
1231 
1232 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1233 			  struct ocfs2_journal_handle *handle,
1234 			  struct ocfs2_alloc_context *ac,
1235 			  u16 *suballoc_bit,
1236 			  u64 *fe_blkno)
1237 {
1238 	int status;
1239 	unsigned int num_bits;
1240 	u64 bg_blkno;
1241 
1242 	mlog_entry_void();
1243 
1244 	BUG_ON(!ac);
1245 	BUG_ON(ac->ac_bits_given != 0);
1246 	BUG_ON(ac->ac_bits_wanted != 1);
1247 	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1248 	BUG_ON(ac->ac_handle != handle);
1249 
1250 	status = ocfs2_claim_suballoc_bits(osb,
1251 					   ac,
1252 					   1,
1253 					   1,
1254 					   suballoc_bit,
1255 					   &num_bits,
1256 					   &bg_blkno);
1257 	if (status < 0) {
1258 		mlog_errno(status);
1259 		goto bail;
1260 	}
1261 	atomic_inc(&osb->alloc_stats.bg_allocs);
1262 
1263 	BUG_ON(num_bits != 1);
1264 
1265 	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1266 	ac->ac_bits_given++;
1267 	status = 0;
1268 bail:
1269 	mlog_exit(status);
1270 	return status;
1271 }
1272 
1273 /* translate a group desc. blkno and it's bitmap offset into
1274  * disk cluster offset. */
1275 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1276 						   u64 bg_blkno,
1277 						   u16 bg_bit_off)
1278 {
1279 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1280 	u32 cluster = 0;
1281 
1282 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1283 
1284 	if (bg_blkno != osb->first_cluster_group_blkno)
1285 		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1286 	cluster += (u32) bg_bit_off;
1287 	return cluster;
1288 }
1289 
1290 /* given a cluster offset, calculate which block group it belongs to
1291  * and return that block offset. */
1292 static inline u64 ocfs2_which_cluster_group(struct inode *inode,
1293 					    u32 cluster)
1294 {
1295 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1296 	u32 group_no;
1297 
1298 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1299 
1300 	group_no = cluster / osb->bitmap_cpg;
1301 	if (!group_no)
1302 		return osb->first_cluster_group_blkno;
1303 	return ocfs2_clusters_to_blocks(inode->i_sb,
1304 					group_no * osb->bitmap_cpg);
1305 }
1306 
1307 /* given the block number of a cluster start, calculate which cluster
1308  * group and descriptor bitmap offset that corresponds to. */
1309 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1310 						u64 data_blkno,
1311 						u64 *bg_blkno,
1312 						u16 *bg_bit_off)
1313 {
1314 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1315 	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1316 
1317 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1318 
1319 	*bg_blkno = ocfs2_which_cluster_group(inode,
1320 					      data_cluster);
1321 
1322 	if (*bg_blkno == osb->first_cluster_group_blkno)
1323 		*bg_bit_off = (u16) data_cluster;
1324 	else
1325 		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1326 							     data_blkno - *bg_blkno);
1327 }
1328 
1329 /*
1330  * min_bits - minimum contiguous chunk from this total allocation we
1331  * can handle. set to what we asked for originally for a full
1332  * contig. allocation, set to '1' to indicate we can deal with extents
1333  * of any size.
1334  */
1335 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1336 			 struct ocfs2_journal_handle *handle,
1337 			 struct ocfs2_alloc_context *ac,
1338 			 u32 min_clusters,
1339 			 u32 *cluster_start,
1340 			 u32 *num_clusters)
1341 {
1342 	int status;
1343 	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1344 	u64 bg_blkno;
1345 	u16 bg_bit_off;
1346 
1347 	mlog_entry_void();
1348 
1349 	BUG_ON(!ac);
1350 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1351 
1352 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1353 	       && ac->ac_which != OCFS2_AC_USE_MAIN);
1354 	BUG_ON(ac->ac_handle != handle);
1355 
1356 	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1357 		status = ocfs2_claim_local_alloc_bits(osb,
1358 						      handle,
1359 						      ac,
1360 						      bits_wanted,
1361 						      cluster_start,
1362 						      num_clusters);
1363 		if (!status)
1364 			atomic_inc(&osb->alloc_stats.local_data);
1365 	} else {
1366 		if (min_clusters > (osb->bitmap_cpg - 1)) {
1367 			/* The only paths asking for contiguousness
1368 			 * should know about this already. */
1369 			mlog(ML_ERROR, "minimum allocation requested exceeds "
1370 				       "group bitmap size!");
1371 			status = -ENOSPC;
1372 			goto bail;
1373 		}
1374 		/* clamp the current request down to a realistic size. */
1375 		if (bits_wanted > (osb->bitmap_cpg - 1))
1376 			bits_wanted = osb->bitmap_cpg - 1;
1377 
1378 		status = ocfs2_claim_suballoc_bits(osb,
1379 						   ac,
1380 						   bits_wanted,
1381 						   min_clusters,
1382 						   &bg_bit_off,
1383 						   num_clusters,
1384 						   &bg_blkno);
1385 		if (!status) {
1386 			*cluster_start =
1387 				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1388 								 bg_blkno,
1389 								 bg_bit_off);
1390 			atomic_inc(&osb->alloc_stats.bitmap_data);
1391 		}
1392 	}
1393 	if (status < 0) {
1394 		if (status != -ENOSPC)
1395 			mlog_errno(status);
1396 		goto bail;
1397 	}
1398 
1399 	ac->ac_bits_given += *num_clusters;
1400 
1401 bail:
1402 	mlog_exit(status);
1403 	return status;
1404 }
1405 
1406 static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
1407 					       struct inode *alloc_inode,
1408 					       struct ocfs2_group_desc *bg,
1409 					       struct buffer_head *group_bh,
1410 					       unsigned int bit_off,
1411 					       unsigned int num_bits)
1412 {
1413 	int status;
1414 	unsigned int tmp;
1415 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1416 	struct ocfs2_group_desc *undo_bg = NULL;
1417 
1418 	mlog_entry_void();
1419 
1420 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1421 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1422 		status = -EIO;
1423 		goto bail;
1424 	}
1425 
1426 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1427 
1428 	if (ocfs2_is_cluster_bitmap(alloc_inode))
1429 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1430 
1431 	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1432 				      journal_type);
1433 	if (status < 0) {
1434 		mlog_errno(status);
1435 		goto bail;
1436 	}
1437 
1438 	if (ocfs2_is_cluster_bitmap(alloc_inode))
1439 		undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1440 
1441 	tmp = num_bits;
1442 	while(tmp--) {
1443 		ocfs2_clear_bit((bit_off + tmp),
1444 				(unsigned long *) bg->bg_bitmap);
1445 		if (ocfs2_is_cluster_bitmap(alloc_inode))
1446 			ocfs2_set_bit(bit_off + tmp,
1447 				      (unsigned long *) undo_bg->bg_bitmap);
1448 	}
1449 	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1450 
1451 	status = ocfs2_journal_dirty(handle, group_bh);
1452 	if (status < 0)
1453 		mlog_errno(status);
1454 bail:
1455 	return status;
1456 }
1457 
1458 /*
1459  * expects the suballoc inode to already be locked.
1460  */
1461 static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
1462 				    struct inode *alloc_inode,
1463 				    struct buffer_head *alloc_bh,
1464 				    unsigned int start_bit,
1465 				    u64 bg_blkno,
1466 				    unsigned int count)
1467 {
1468 	int status = 0;
1469 	u32 tmp_used;
1470 	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1471 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1472 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1473 	struct buffer_head *group_bh = NULL;
1474 	struct ocfs2_group_desc *group;
1475 
1476 	mlog_entry_void();
1477 
1478 	if (!OCFS2_IS_VALID_DINODE(fe)) {
1479 		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
1480 		status = -EIO;
1481 		goto bail;
1482 	}
1483 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1484 
1485 	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1486 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1487 	     (unsigned long long)bg_blkno, start_bit);
1488 
1489 	status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1490 				  alloc_inode);
1491 	if (status < 0) {
1492 		mlog_errno(status);
1493 		goto bail;
1494 	}
1495 
1496 	group = (struct ocfs2_group_desc *) group_bh->b_data;
1497 	if (!OCFS2_IS_VALID_GROUP_DESC(group)) {
1498 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group);
1499 		status = -EIO;
1500 		goto bail;
1501 	}
1502 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1503 
1504 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1505 					      group, group_bh,
1506 					      start_bit, count);
1507 	if (status < 0) {
1508 		mlog_errno(status);
1509 		goto bail;
1510 	}
1511 
1512 	status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1513 				      OCFS2_JOURNAL_ACCESS_WRITE);
1514 	if (status < 0) {
1515 		mlog_errno(status);
1516 		goto bail;
1517 	}
1518 
1519 	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1520 		     count);
1521 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1522 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1523 
1524 	status = ocfs2_journal_dirty(handle, alloc_bh);
1525 	if (status < 0) {
1526 		mlog_errno(status);
1527 		goto bail;
1528 	}
1529 
1530 bail:
1531 	if (group_bh)
1532 		brelse(group_bh);
1533 
1534 	mlog_exit(status);
1535 	return status;
1536 }
1537 
1538 static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1539 {
1540 	u64 group = block - (u64) bit;
1541 
1542 	return group;
1543 }
1544 
1545 int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
1546 		      struct inode *inode_alloc_inode,
1547 		      struct buffer_head *inode_alloc_bh,
1548 		      struct ocfs2_dinode *di)
1549 {
1550 	u64 blk = le64_to_cpu(di->i_blkno);
1551 	u16 bit = le16_to_cpu(di->i_suballoc_bit);
1552 	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1553 
1554 	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1555 					inode_alloc_bh, bit, bg_blkno, 1);
1556 }
1557 
1558 int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
1559 			    struct inode *eb_alloc_inode,
1560 			    struct buffer_head *eb_alloc_bh,
1561 			    struct ocfs2_extent_block *eb)
1562 {
1563 	u64 blk = le64_to_cpu(eb->h_blkno);
1564 	u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1565 	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1566 
1567 	return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1568 					bit, bg_blkno, 1);
1569 }
1570 
1571 int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
1572 		       struct inode *bitmap_inode,
1573 		       struct buffer_head *bitmap_bh,
1574 		       u64 start_blk,
1575 		       unsigned int num_clusters)
1576 {
1577 	int status;
1578 	u16 bg_start_bit;
1579 	u64 bg_blkno;
1580 	struct ocfs2_dinode *fe;
1581 
1582 	/* You can't ever have a contiguous set of clusters
1583 	 * bigger than a block group bitmap so we never have to worry
1584 	 * about looping on them. */
1585 
1586 	mlog_entry_void();
1587 
1588 	/* This is expensive. We can safely remove once this stuff has
1589 	 * gotten tested really well. */
1590 	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1591 
1592 	fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1593 
1594 	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1595 				     &bg_start_bit);
1596 
1597 	mlog(0, "want to free %u clusters starting at block %llu\n",
1598 	     num_clusters, (unsigned long long)start_blk);
1599 	mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
1600 	     (unsigned long long)bg_blkno, bg_start_bit);
1601 
1602 	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1603 					  bg_start_bit, bg_blkno,
1604 					  num_clusters);
1605 	if (status < 0)
1606 		mlog_errno(status);
1607 
1608 	mlog_exit(status);
1609 	return status;
1610 }
1611 
1612 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1613 {
1614 	printk("Block Group:\n");
1615 	printk("bg_signature:       %s\n", bg->bg_signature);
1616 	printk("bg_size:            %u\n", bg->bg_size);
1617 	printk("bg_bits:            %u\n", bg->bg_bits);
1618 	printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1619 	printk("bg_chain:           %u\n", bg->bg_chain);
1620 	printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
1621 	printk("bg_next_group:      %llu\n",
1622 	       (unsigned long long)bg->bg_next_group);
1623 	printk("bg_parent_dinode:   %llu\n",
1624 	       (unsigned long long)bg->bg_parent_dinode);
1625 	printk("bg_blkno:           %llu\n",
1626 	       (unsigned long long)bg->bg_blkno);
1627 }
1628 
1629 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1630 {
1631 	int i;
1632 
1633 	printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
1634 	printk("i_signature:                  %s\n", fe->i_signature);
1635 	printk("i_size:                       %llu\n",
1636 	       (unsigned long long)fe->i_size);
1637 	printk("i_clusters:                   %u\n", fe->i_clusters);
1638 	printk("i_generation:                 %u\n",
1639 	       le32_to_cpu(fe->i_generation));
1640 	printk("id1.bitmap1.i_used:           %u\n",
1641 	       le32_to_cpu(fe->id1.bitmap1.i_used));
1642 	printk("id1.bitmap1.i_total:          %u\n",
1643 	       le32_to_cpu(fe->id1.bitmap1.i_total));
1644 	printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
1645 	printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
1646 	printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
1647 	printk("id2.i_chain.cl_next_free_rec: %u\n",
1648 	       fe->id2.i_chain.cl_next_free_rec);
1649 	for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
1650 		printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
1651 		       fe->id2.i_chain.cl_recs[i].c_free);
1652 		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
1653 		       fe->id2.i_chain.cl_recs[i].c_total);
1654 		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
1655 		       (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
1656 	}
1657 }
1658