xref: /linux/fs/ocfs2/suballoc.c (revision c537b994505099b7197e7d3125b942ecbcc51eb6)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 021110-1307, USA.
25  */
26 
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31 
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34 
35 #include "ocfs2.h"
36 
37 #include "alloc.h"
38 #include "dlmglue.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "sysfile.h"
45 #include "uptodate.h"
46 
47 #include "buffer_head_io.h"
48 
49 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
52 static int ocfs2_block_group_fill(handle_t *handle,
53 				  struct inode *alloc_inode,
54 				  struct buffer_head *bg_bh,
55 				  u64 group_blkno,
56 				  u16 my_chain,
57 				  struct ocfs2_chain_list *cl);
58 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
59 				   struct inode *alloc_inode,
60 				   struct buffer_head *bh);
61 
62 static int ocfs2_cluster_group_search(struct inode *inode,
63 				      struct buffer_head *group_bh,
64 				      u32 bits_wanted, u32 min_bits,
65 				      u16 *bit_off, u16 *bits_found);
66 static int ocfs2_block_group_search(struct inode *inode,
67 				    struct buffer_head *group_bh,
68 				    u32 bits_wanted, u32 min_bits,
69 				    u16 *bit_off, u16 *bits_found);
70 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
71 				     struct ocfs2_alloc_context *ac,
72 				     handle_t *handle,
73 				     u32 bits_wanted,
74 				     u32 min_bits,
75 				     u16 *bit_off,
76 				     unsigned int *num_bits,
77 				     u64 *bg_blkno);
78 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
79 					 int nr);
80 static inline int ocfs2_block_group_set_bits(handle_t *handle,
81 					     struct inode *alloc_inode,
82 					     struct ocfs2_group_desc *bg,
83 					     struct buffer_head *group_bh,
84 					     unsigned int bit_off,
85 					     unsigned int num_bits);
86 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
87 					       struct inode *alloc_inode,
88 					       struct ocfs2_group_desc *bg,
89 					       struct buffer_head *group_bh,
90 					       unsigned int bit_off,
91 					       unsigned int num_bits);
92 
93 static int ocfs2_relink_block_group(handle_t *handle,
94 				    struct inode *alloc_inode,
95 				    struct buffer_head *fe_bh,
96 				    struct buffer_head *bg_bh,
97 				    struct buffer_head *prev_bg_bh,
98 				    u16 chain);
99 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
100 						     u32 wanted);
101 static int ocfs2_free_suballoc_bits(handle_t *handle,
102 				    struct inode *alloc_inode,
103 				    struct buffer_head *alloc_bh,
104 				    unsigned int start_bit,
105 				    u64 bg_blkno,
106 				    unsigned int count);
107 static inline u64 ocfs2_which_suballoc_group(u64 block,
108 					     unsigned int bit);
109 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
110 						   u64 bg_blkno,
111 						   u16 bg_bit_off);
112 static inline u64 ocfs2_which_cluster_group(struct inode *inode,
113 					    u32 cluster);
114 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
115 						u64 data_blkno,
116 						u64 *bg_blkno,
117 						u16 *bg_bit_off);
118 
119 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
120 {
121 	struct inode *inode = ac->ac_inode;
122 
123 	if (inode) {
124 		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
125 			ocfs2_meta_unlock(inode, 1);
126 
127 		mutex_unlock(&inode->i_mutex);
128 
129 		iput(inode);
130 	}
131 	if (ac->ac_bh)
132 		brelse(ac->ac_bh);
133 	kfree(ac);
134 }
135 
136 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
137 {
138 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
139 }
140 
141 /* somewhat more expensive than our other checks, so use sparingly. */
142 static int ocfs2_check_group_descriptor(struct super_block *sb,
143 					struct ocfs2_dinode *di,
144 					struct ocfs2_group_desc *gd)
145 {
146 	unsigned int max_bits;
147 
148 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
149 		OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
150 		return -EIO;
151 	}
152 
153 	if (di->i_blkno != gd->bg_parent_dinode) {
154 		ocfs2_error(sb, "Group descriptor # %llu has bad parent "
155 			    "pointer (%llu, expected %llu)",
156 			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
157 			    (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
158 			    (unsigned long long)le64_to_cpu(di->i_blkno));
159 		return -EIO;
160 	}
161 
162 	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
163 	if (le16_to_cpu(gd->bg_bits) > max_bits) {
164 		ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
165 			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
166 			    le16_to_cpu(gd->bg_bits));
167 		return -EIO;
168 	}
169 
170 	if (le16_to_cpu(gd->bg_chain) >=
171 	    le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
172 		ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
173 			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
174 			    le16_to_cpu(gd->bg_chain));
175 		return -EIO;
176 	}
177 
178 	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
179 		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
180 			    "claims that %u are free",
181 			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
182 			    le16_to_cpu(gd->bg_bits),
183 			    le16_to_cpu(gd->bg_free_bits_count));
184 		return -EIO;
185 	}
186 
187 	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
188 		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
189 			    "max bitmap bits of %u",
190 			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
191 			    le16_to_cpu(gd->bg_bits),
192 			    8 * le16_to_cpu(gd->bg_size));
193 		return -EIO;
194 	}
195 
196 	return 0;
197 }
198 
199 static int ocfs2_block_group_fill(handle_t *handle,
200 				  struct inode *alloc_inode,
201 				  struct buffer_head *bg_bh,
202 				  u64 group_blkno,
203 				  u16 my_chain,
204 				  struct ocfs2_chain_list *cl)
205 {
206 	int status = 0;
207 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
208 	struct super_block * sb = alloc_inode->i_sb;
209 
210 	mlog_entry_void();
211 
212 	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
213 		ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
214 			    "b_blocknr (%llu)",
215 			    (unsigned long long)group_blkno,
216 			    (unsigned long long) bg_bh->b_blocknr);
217 		status = -EIO;
218 		goto bail;
219 	}
220 
221 	status = ocfs2_journal_access(handle,
222 				      alloc_inode,
223 				      bg_bh,
224 				      OCFS2_JOURNAL_ACCESS_CREATE);
225 	if (status < 0) {
226 		mlog_errno(status);
227 		goto bail;
228 	}
229 
230 	memset(bg, 0, sb->s_blocksize);
231 	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
232 	bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
233 	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
234 	bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
235 	bg->bg_chain = cpu_to_le16(my_chain);
236 	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
237 	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
238 	bg->bg_blkno = cpu_to_le64(group_blkno);
239 	/* set the 1st bit in the bitmap to account for the descriptor block */
240 	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
241 	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
242 
243 	status = ocfs2_journal_dirty(handle, bg_bh);
244 	if (status < 0)
245 		mlog_errno(status);
246 
247 	/* There is no need to zero out or otherwise initialize the
248 	 * other blocks in a group - All valid FS metadata in a block
249 	 * group stores the superblock fs_generation value at
250 	 * allocation time. */
251 
252 bail:
253 	mlog_exit(status);
254 	return status;
255 }
256 
257 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
258 {
259 	u16 curr, best;
260 
261 	best = curr = 0;
262 	while (curr < le16_to_cpu(cl->cl_count)) {
263 		if (le32_to_cpu(cl->cl_recs[best].c_total) >
264 		    le32_to_cpu(cl->cl_recs[curr].c_total))
265 			best = curr;
266 		curr++;
267 	}
268 	return best;
269 }
270 
271 /*
272  * We expect the block group allocator to already be locked.
273  */
274 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
275 				   struct inode *alloc_inode,
276 				   struct buffer_head *bh)
277 {
278 	int status, credits;
279 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
280 	struct ocfs2_chain_list *cl;
281 	struct ocfs2_alloc_context *ac = NULL;
282 	handle_t *handle = NULL;
283 	u32 bit_off, num_bits;
284 	u16 alloc_rec;
285 	u64 bg_blkno;
286 	struct buffer_head *bg_bh = NULL;
287 	struct ocfs2_group_desc *bg;
288 
289 	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
290 
291 	mlog_entry_void();
292 
293 	cl = &fe->id2.i_chain;
294 	status = ocfs2_reserve_clusters(osb,
295 					le16_to_cpu(cl->cl_cpg),
296 					&ac);
297 	if (status < 0) {
298 		if (status != -ENOSPC)
299 			mlog_errno(status);
300 		goto bail;
301 	}
302 
303 	credits = ocfs2_calc_group_alloc_credits(osb->sb,
304 						 le16_to_cpu(cl->cl_cpg));
305 	handle = ocfs2_start_trans(osb, credits);
306 	if (IS_ERR(handle)) {
307 		status = PTR_ERR(handle);
308 		handle = NULL;
309 		mlog_errno(status);
310 		goto bail;
311 	}
312 
313 	status = ocfs2_claim_clusters(osb,
314 				      handle,
315 				      ac,
316 				      le16_to_cpu(cl->cl_cpg),
317 				      &bit_off,
318 				      &num_bits);
319 	if (status < 0) {
320 		if (status != -ENOSPC)
321 			mlog_errno(status);
322 		goto bail;
323 	}
324 
325 	alloc_rec = ocfs2_find_smallest_chain(cl);
326 
327 	/* setup the group */
328 	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
329 	mlog(0, "new descriptor, record %u, at block %llu\n",
330 	     alloc_rec, (unsigned long long)bg_blkno);
331 
332 	bg_bh = sb_getblk(osb->sb, bg_blkno);
333 	if (!bg_bh) {
334 		status = -EIO;
335 		mlog_errno(status);
336 		goto bail;
337 	}
338 	ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
339 
340 	status = ocfs2_block_group_fill(handle,
341 					alloc_inode,
342 					bg_bh,
343 					bg_blkno,
344 					alloc_rec,
345 					cl);
346 	if (status < 0) {
347 		mlog_errno(status);
348 		goto bail;
349 	}
350 
351 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
352 
353 	status = ocfs2_journal_access(handle, alloc_inode,
354 				      bh, OCFS2_JOURNAL_ACCESS_WRITE);
355 	if (status < 0) {
356 		mlog_errno(status);
357 		goto bail;
358 	}
359 
360 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
361 		     le16_to_cpu(bg->bg_free_bits_count));
362 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
363 	cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
364 	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
365 		le16_add_cpu(&cl->cl_next_free_rec, 1);
366 
367 	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
368 					le16_to_cpu(bg->bg_free_bits_count));
369 	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
370 	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
371 
372 	status = ocfs2_journal_dirty(handle, bh);
373 	if (status < 0) {
374 		mlog_errno(status);
375 		goto bail;
376 	}
377 
378 	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
379 	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
380 	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
381 					     le32_to_cpu(fe->i_clusters)));
382 	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
383 	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
384 	alloc_inode->i_blocks =
385 		ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
386 
387 	status = 0;
388 bail:
389 	if (handle)
390 		ocfs2_commit_trans(osb, handle);
391 
392 	if (ac)
393 		ocfs2_free_alloc_context(ac);
394 
395 	if (bg_bh)
396 		brelse(bg_bh);
397 
398 	mlog_exit(status);
399 	return status;
400 }
401 
402 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
403 				       struct ocfs2_alloc_context *ac,
404 				       int type,
405 				       u32 slot)
406 {
407 	int status;
408 	u32 bits_wanted = ac->ac_bits_wanted;
409 	struct inode *alloc_inode;
410 	struct buffer_head *bh = NULL;
411 	struct ocfs2_dinode *fe;
412 	u32 free_bits;
413 
414 	mlog_entry_void();
415 
416 	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
417 	if (!alloc_inode) {
418 		mlog_errno(-EINVAL);
419 		return -EINVAL;
420 	}
421 
422 	mutex_lock(&alloc_inode->i_mutex);
423 
424 	status = ocfs2_meta_lock(alloc_inode, &bh, 1);
425 	if (status < 0) {
426 		mutex_unlock(&alloc_inode->i_mutex);
427 		iput(alloc_inode);
428 
429 		mlog_errno(status);
430 		return status;
431 	}
432 
433 	ac->ac_inode = alloc_inode;
434 
435 	fe = (struct ocfs2_dinode *) bh->b_data;
436 	if (!OCFS2_IS_VALID_DINODE(fe)) {
437 		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
438 		status = -EIO;
439 		goto bail;
440 	}
441 	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
442 		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
443 			    (unsigned long long)le64_to_cpu(fe->i_blkno));
444 		status = -EIO;
445 		goto bail;
446 	}
447 
448 	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
449 		le32_to_cpu(fe->id1.bitmap1.i_used);
450 
451 	if (bits_wanted > free_bits) {
452 		/* cluster bitmap never grows */
453 		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
454 			mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
455 			     bits_wanted, free_bits);
456 			status = -ENOSPC;
457 			goto bail;
458 		}
459 
460 		status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
461 		if (status < 0) {
462 			if (status != -ENOSPC)
463 				mlog_errno(status);
464 			goto bail;
465 		}
466 		atomic_inc(&osb->alloc_stats.bg_extends);
467 
468 		/* You should never ask for this much metadata */
469 		BUG_ON(bits_wanted >
470 		       (le32_to_cpu(fe->id1.bitmap1.i_total)
471 			- le32_to_cpu(fe->id1.bitmap1.i_used)));
472 	}
473 
474 	get_bh(bh);
475 	ac->ac_bh = bh;
476 bail:
477 	if (bh)
478 		brelse(bh);
479 
480 	mlog_exit(status);
481 	return status;
482 }
483 
484 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
485 			       struct ocfs2_dinode *fe,
486 			       struct ocfs2_alloc_context **ac)
487 {
488 	int status;
489 	u32 slot;
490 
491 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
492 	if (!(*ac)) {
493 		status = -ENOMEM;
494 		mlog_errno(status);
495 		goto bail;
496 	}
497 
498 	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
499 	(*ac)->ac_which = OCFS2_AC_USE_META;
500 
501 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
502 	slot = 0;
503 #else
504 	slot = osb->slot_num;
505 #endif
506 
507 	(*ac)->ac_group_search = ocfs2_block_group_search;
508 
509 	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
510 					     EXTENT_ALLOC_SYSTEM_INODE, slot);
511 	if (status < 0) {
512 		if (status != -ENOSPC)
513 			mlog_errno(status);
514 		goto bail;
515 	}
516 
517 	status = 0;
518 bail:
519 	if ((status < 0) && *ac) {
520 		ocfs2_free_alloc_context(*ac);
521 		*ac = NULL;
522 	}
523 
524 	mlog_exit(status);
525 	return status;
526 }
527 
528 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
529 			    struct ocfs2_alloc_context **ac)
530 {
531 	int status;
532 
533 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
534 	if (!(*ac)) {
535 		status = -ENOMEM;
536 		mlog_errno(status);
537 		goto bail;
538 	}
539 
540 	(*ac)->ac_bits_wanted = 1;
541 	(*ac)->ac_which = OCFS2_AC_USE_INODE;
542 
543 	(*ac)->ac_group_search = ocfs2_block_group_search;
544 
545 	status = ocfs2_reserve_suballoc_bits(osb, *ac,
546 					     INODE_ALLOC_SYSTEM_INODE,
547 					     osb->slot_num);
548 	if (status < 0) {
549 		if (status != -ENOSPC)
550 			mlog_errno(status);
551 		goto bail;
552 	}
553 
554 	status = 0;
555 bail:
556 	if ((status < 0) && *ac) {
557 		ocfs2_free_alloc_context(*ac);
558 		*ac = NULL;
559 	}
560 
561 	mlog_exit(status);
562 	return status;
563 }
564 
565 /* local alloc code has to do the same thing, so rather than do this
566  * twice.. */
567 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
568 				      struct ocfs2_alloc_context *ac)
569 {
570 	int status;
571 
572 	ac->ac_which = OCFS2_AC_USE_MAIN;
573 	ac->ac_group_search = ocfs2_cluster_group_search;
574 
575 	status = ocfs2_reserve_suballoc_bits(osb, ac,
576 					     GLOBAL_BITMAP_SYSTEM_INODE,
577 					     OCFS2_INVALID_SLOT);
578 	if (status < 0 && status != -ENOSPC) {
579 		mlog_errno(status);
580 		goto bail;
581 	}
582 
583 bail:
584 	return status;
585 }
586 
587 /* Callers don't need to care which bitmap (local alloc or main) to
588  * use so we figure it out for them, but unfortunately this clutters
589  * things a bit. */
590 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
591 			   u32 bits_wanted,
592 			   struct ocfs2_alloc_context **ac)
593 {
594 	int status;
595 
596 	mlog_entry_void();
597 
598 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
599 	if (!(*ac)) {
600 		status = -ENOMEM;
601 		mlog_errno(status);
602 		goto bail;
603 	}
604 
605 	(*ac)->ac_bits_wanted = bits_wanted;
606 
607 	status = -ENOSPC;
608 	if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
609 		status = ocfs2_reserve_local_alloc_bits(osb,
610 							bits_wanted,
611 							*ac);
612 		if ((status < 0) && (status != -ENOSPC)) {
613 			mlog_errno(status);
614 			goto bail;
615 		} else if (status == -ENOSPC) {
616 			/* reserve_local_bits will return enospc with
617 			 * the local alloc inode still locked, so we
618 			 * can change this safely here. */
619 			mlog(0, "Disabling local alloc\n");
620 			/* We set to OCFS2_LA_DISABLED so that umount
621 			 * can clean up what's left of the local
622 			 * allocation */
623 			osb->local_alloc_state = OCFS2_LA_DISABLED;
624 		}
625 	}
626 
627 	if (status == -ENOSPC) {
628 		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
629 		if (status < 0) {
630 			if (status != -ENOSPC)
631 				mlog_errno(status);
632 			goto bail;
633 		}
634 	}
635 
636 	status = 0;
637 bail:
638 	if ((status < 0) && *ac) {
639 		ocfs2_free_alloc_context(*ac);
640 		*ac = NULL;
641 	}
642 
643 	mlog_exit(status);
644 	return status;
645 }
646 
647 /*
648  * More or less lifted from ext3. I'll leave their description below:
649  *
650  * "For ext3 allocations, we must not reuse any blocks which are
651  * allocated in the bitmap buffer's "last committed data" copy.  This
652  * prevents deletes from freeing up the page for reuse until we have
653  * committed the delete transaction.
654  *
655  * If we didn't do this, then deleting something and reallocating it as
656  * data would allow the old block to be overwritten before the
657  * transaction committed (because we force data to disk before commit).
658  * This would lead to corruption if we crashed between overwriting the
659  * data and committing the delete.
660  *
661  * @@@ We may want to make this allocation behaviour conditional on
662  * data-writes at some point, and disable it for metadata allocations or
663  * sync-data inodes."
664  *
665  * Note: OCFS2 already does this differently for metadata vs data
666  * allocations, as those bitmaps are seperate and undo access is never
667  * called on a metadata group descriptor.
668  */
669 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
670 					 int nr)
671 {
672 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
673 
674 	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
675 		return 0;
676 	if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
677 		return 1;
678 
679 	bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
680 	return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
681 }
682 
683 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
684 					     struct buffer_head *bg_bh,
685 					     unsigned int bits_wanted,
686 					     unsigned int total_bits,
687 					     u16 *bit_off,
688 					     u16 *bits_found)
689 {
690 	void *bitmap;
691 	u16 best_offset, best_size;
692 	int offset, start, found, status = 0;
693 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
694 
695 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
696 		OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
697 		return -EIO;
698 	}
699 
700 	found = start = best_offset = best_size = 0;
701 	bitmap = bg->bg_bitmap;
702 
703 	while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
704 		if (offset == total_bits)
705 			break;
706 
707 		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
708 			/* We found a zero, but we can't use it as it
709 			 * hasn't been put to disk yet! */
710 			found = 0;
711 			start = offset + 1;
712 		} else if (offset == start) {
713 			/* we found a zero */
714 			found++;
715 			/* move start to the next bit to test */
716 			start++;
717 		} else {
718 			/* got a zero after some ones */
719 			found = 1;
720 			start = offset + 1;
721 		}
722 		if (found > best_size) {
723 			best_size = found;
724 			best_offset = start - found;
725 		}
726 		/* we got everything we needed */
727 		if (found == bits_wanted) {
728 			/* mlog(0, "Found it all!\n"); */
729 			break;
730 		}
731 	}
732 
733 	/* XXX: I think the first clause is equivalent to the second
734 	 * 	- jlbec */
735 	if (found == bits_wanted) {
736 		*bit_off = start - found;
737 		*bits_found = found;
738 	} else if (best_size) {
739 		*bit_off = best_offset;
740 		*bits_found = best_size;
741 	} else {
742 		status = -ENOSPC;
743 		/* No error log here -- see the comment above
744 		 * ocfs2_test_bg_bit_allocatable */
745 	}
746 
747 	return status;
748 }
749 
750 static inline int ocfs2_block_group_set_bits(handle_t *handle,
751 					     struct inode *alloc_inode,
752 					     struct ocfs2_group_desc *bg,
753 					     struct buffer_head *group_bh,
754 					     unsigned int bit_off,
755 					     unsigned int num_bits)
756 {
757 	int status;
758 	void *bitmap = bg->bg_bitmap;
759 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
760 
761 	mlog_entry_void();
762 
763 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
764 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
765 		status = -EIO;
766 		goto bail;
767 	}
768 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
769 
770 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
771 	     num_bits);
772 
773 	if (ocfs2_is_cluster_bitmap(alloc_inode))
774 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
775 
776 	status = ocfs2_journal_access(handle,
777 				      alloc_inode,
778 				      group_bh,
779 				      journal_type);
780 	if (status < 0) {
781 		mlog_errno(status);
782 		goto bail;
783 	}
784 
785 	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
786 
787 	while(num_bits--)
788 		ocfs2_set_bit(bit_off++, bitmap);
789 
790 	status = ocfs2_journal_dirty(handle,
791 				     group_bh);
792 	if (status < 0) {
793 		mlog_errno(status);
794 		goto bail;
795 	}
796 
797 bail:
798 	mlog_exit(status);
799 	return status;
800 }
801 
802 /* find the one with the most empty bits */
803 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
804 {
805 	u16 curr, best;
806 
807 	BUG_ON(!cl->cl_next_free_rec);
808 
809 	best = curr = 0;
810 	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
811 		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
812 		    le32_to_cpu(cl->cl_recs[best].c_free))
813 			best = curr;
814 		curr++;
815 	}
816 
817 	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
818 	return best;
819 }
820 
821 static int ocfs2_relink_block_group(handle_t *handle,
822 				    struct inode *alloc_inode,
823 				    struct buffer_head *fe_bh,
824 				    struct buffer_head *bg_bh,
825 				    struct buffer_head *prev_bg_bh,
826 				    u16 chain)
827 {
828 	int status;
829 	/* there is a really tiny chance the journal calls could fail,
830 	 * but we wouldn't want inconsistent blocks in *any* case. */
831 	u64 fe_ptr, bg_ptr, prev_bg_ptr;
832 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
833 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
834 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
835 
836 	if (!OCFS2_IS_VALID_DINODE(fe)) {
837 		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
838 		status = -EIO;
839 		goto out;
840 	}
841 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
842 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
843 		status = -EIO;
844 		goto out;
845 	}
846 	if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
847 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
848 		status = -EIO;
849 		goto out;
850 	}
851 
852 	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
853 	     (unsigned long long)fe->i_blkno, chain,
854 	     (unsigned long long)bg->bg_blkno,
855 	     (unsigned long long)prev_bg->bg_blkno);
856 
857 	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
858 	bg_ptr = le64_to_cpu(bg->bg_next_group);
859 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
860 
861 	status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
862 				      OCFS2_JOURNAL_ACCESS_WRITE);
863 	if (status < 0) {
864 		mlog_errno(status);
865 		goto out_rollback;
866 	}
867 
868 	prev_bg->bg_next_group = bg->bg_next_group;
869 
870 	status = ocfs2_journal_dirty(handle, prev_bg_bh);
871 	if (status < 0) {
872 		mlog_errno(status);
873 		goto out_rollback;
874 	}
875 
876 	status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
877 				      OCFS2_JOURNAL_ACCESS_WRITE);
878 	if (status < 0) {
879 		mlog_errno(status);
880 		goto out_rollback;
881 	}
882 
883 	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
884 
885 	status = ocfs2_journal_dirty(handle, bg_bh);
886 	if (status < 0) {
887 		mlog_errno(status);
888 		goto out_rollback;
889 	}
890 
891 	status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
892 				      OCFS2_JOURNAL_ACCESS_WRITE);
893 	if (status < 0) {
894 		mlog_errno(status);
895 		goto out_rollback;
896 	}
897 
898 	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
899 
900 	status = ocfs2_journal_dirty(handle, fe_bh);
901 	if (status < 0) {
902 		mlog_errno(status);
903 		goto out_rollback;
904 	}
905 
906 	status = 0;
907 out_rollback:
908 	if (status < 0) {
909 		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
910 		bg->bg_next_group = cpu_to_le64(bg_ptr);
911 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
912 	}
913 out:
914 	mlog_exit(status);
915 	return status;
916 }
917 
918 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
919 						     u32 wanted)
920 {
921 	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
922 }
923 
924 /* return 0 on success, -ENOSPC to keep searching and any other < 0
925  * value on error. */
926 static int ocfs2_cluster_group_search(struct inode *inode,
927 				      struct buffer_head *group_bh,
928 				      u32 bits_wanted, u32 min_bits,
929 				      u16 *bit_off, u16 *bits_found)
930 {
931 	int search = -ENOSPC;
932 	int ret;
933 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
934 	u16 tmp_off, tmp_found;
935 	unsigned int max_bits, gd_cluster_off;
936 
937 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
938 
939 	if (gd->bg_free_bits_count) {
940 		max_bits = le16_to_cpu(gd->bg_bits);
941 
942 		/* Tail groups in cluster bitmaps which aren't cpg
943 		 * aligned are prone to partial extention by a failed
944 		 * fs resize. If the file system resize never got to
945 		 * update the dinode cluster count, then we don't want
946 		 * to trust any clusters past it, regardless of what
947 		 * the group descriptor says. */
948 		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
949 							  le64_to_cpu(gd->bg_blkno));
950 		if ((gd_cluster_off + max_bits) >
951 		    OCFS2_I(inode)->ip_clusters) {
952 			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
953 			mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
954 			     (unsigned long long)le64_to_cpu(gd->bg_blkno),
955 			     le16_to_cpu(gd->bg_bits),
956 			     OCFS2_I(inode)->ip_clusters, max_bits);
957 		}
958 
959 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
960 							group_bh, bits_wanted,
961 							max_bits,
962 							&tmp_off, &tmp_found);
963 		if (ret)
964 			return ret;
965 
966 		/* ocfs2_block_group_find_clear_bits() might
967 		 * return success, but we still want to return
968 		 * -ENOSPC unless it found the minimum number
969 		 * of bits. */
970 		if (min_bits <= tmp_found) {
971 			*bit_off = tmp_off;
972 			*bits_found = tmp_found;
973 			search = 0; /* success */
974 		}
975 	}
976 
977 	return search;
978 }
979 
980 static int ocfs2_block_group_search(struct inode *inode,
981 				    struct buffer_head *group_bh,
982 				    u32 bits_wanted, u32 min_bits,
983 				    u16 *bit_off, u16 *bits_found)
984 {
985 	int ret = -ENOSPC;
986 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
987 
988 	BUG_ON(min_bits != 1);
989 	BUG_ON(ocfs2_is_cluster_bitmap(inode));
990 
991 	if (bg->bg_free_bits_count)
992 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
993 							group_bh, bits_wanted,
994 							le16_to_cpu(bg->bg_bits),
995 							bit_off, bits_found);
996 
997 	return ret;
998 }
999 
1000 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1001 				       handle_t *handle,
1002 				       struct buffer_head *di_bh,
1003 				       u32 num_bits,
1004 				       u16 chain)
1005 {
1006 	int ret;
1007 	u32 tmp_used;
1008 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1009 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1010 
1011 	ret = ocfs2_journal_access(handle, inode, di_bh,
1012 				   OCFS2_JOURNAL_ACCESS_WRITE);
1013 	if (ret < 0) {
1014 		mlog_errno(ret);
1015 		goto out;
1016 	}
1017 
1018 	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1019 	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1020 	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1021 
1022 	ret = ocfs2_journal_dirty(handle, di_bh);
1023 	if (ret < 0)
1024 		mlog_errno(ret);
1025 
1026 out:
1027 	return ret;
1028 }
1029 
1030 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1031 				  handle_t *handle,
1032 				  u32 bits_wanted,
1033 				  u32 min_bits,
1034 				  u16 *bit_off,
1035 				  unsigned int *num_bits,
1036 				  u64 gd_blkno,
1037 				  u16 *bits_left)
1038 {
1039 	int ret;
1040 	u16 found;
1041 	struct buffer_head *group_bh = NULL;
1042 	struct ocfs2_group_desc *gd;
1043 	struct inode *alloc_inode = ac->ac_inode;
1044 
1045 	ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
1046 			       &group_bh, OCFS2_BH_CACHED, alloc_inode);
1047 	if (ret < 0) {
1048 		mlog_errno(ret);
1049 		return ret;
1050 	}
1051 
1052 	gd = (struct ocfs2_group_desc *) group_bh->b_data;
1053 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
1054 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
1055 		ret = -EIO;
1056 		goto out;
1057 	}
1058 
1059 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1060 				  bit_off, &found);
1061 	if (ret < 0) {
1062 		if (ret != -ENOSPC)
1063 			mlog_errno(ret);
1064 		goto out;
1065 	}
1066 
1067 	*num_bits = found;
1068 
1069 	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1070 					       *num_bits,
1071 					       le16_to_cpu(gd->bg_chain));
1072 	if (ret < 0) {
1073 		mlog_errno(ret);
1074 		goto out;
1075 	}
1076 
1077 	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1078 					 *bit_off, *num_bits);
1079 	if (ret < 0)
1080 		mlog_errno(ret);
1081 
1082 	*bits_left = le16_to_cpu(gd->bg_free_bits_count);
1083 
1084 out:
1085 	brelse(group_bh);
1086 
1087 	return ret;
1088 }
1089 
1090 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1091 			      handle_t *handle,
1092 			      u32 bits_wanted,
1093 			      u32 min_bits,
1094 			      u16 *bit_off,
1095 			      unsigned int *num_bits,
1096 			      u64 *bg_blkno,
1097 			      u16 *bits_left)
1098 {
1099 	int status;
1100 	u16 chain, tmp_bits;
1101 	u32 tmp_used;
1102 	u64 next_group;
1103 	struct inode *alloc_inode = ac->ac_inode;
1104 	struct buffer_head *group_bh = NULL;
1105 	struct buffer_head *prev_group_bh = NULL;
1106 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1107 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1108 	struct ocfs2_group_desc *bg;
1109 
1110 	chain = ac->ac_chain;
1111 	mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1112 	     bits_wanted, chain,
1113 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1114 
1115 	status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1116 				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
1117 				  &group_bh, OCFS2_BH_CACHED, alloc_inode);
1118 	if (status < 0) {
1119 		mlog_errno(status);
1120 		goto bail;
1121 	}
1122 	bg = (struct ocfs2_group_desc *) group_bh->b_data;
1123 	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1124 	if (status) {
1125 		mlog_errno(status);
1126 		goto bail;
1127 	}
1128 
1129 	status = -ENOSPC;
1130 	/* for now, the chain search is a bit simplistic. We just use
1131 	 * the 1st group with any empty bits. */
1132 	while ((status = ac->ac_group_search(alloc_inode, group_bh,
1133 					     bits_wanted, min_bits, bit_off,
1134 					     &tmp_bits)) == -ENOSPC) {
1135 		if (!bg->bg_next_group)
1136 			break;
1137 
1138 		if (prev_group_bh) {
1139 			brelse(prev_group_bh);
1140 			prev_group_bh = NULL;
1141 		}
1142 		next_group = le64_to_cpu(bg->bg_next_group);
1143 		prev_group_bh = group_bh;
1144 		group_bh = NULL;
1145 		status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1146 					  next_group, &group_bh,
1147 					  OCFS2_BH_CACHED, alloc_inode);
1148 		if (status < 0) {
1149 			mlog_errno(status);
1150 			goto bail;
1151 		}
1152 		bg = (struct ocfs2_group_desc *) group_bh->b_data;
1153 		status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1154 		if (status) {
1155 			mlog_errno(status);
1156 			goto bail;
1157 		}
1158 	}
1159 	if (status < 0) {
1160 		if (status != -ENOSPC)
1161 			mlog_errno(status);
1162 		goto bail;
1163 	}
1164 
1165 	mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1166 	     tmp_bits, (unsigned long long)bg->bg_blkno);
1167 
1168 	*num_bits = tmp_bits;
1169 
1170 	BUG_ON(*num_bits == 0);
1171 
1172 	/*
1173 	 * Keep track of previous block descriptor read. When
1174 	 * we find a target, if we have read more than X
1175 	 * number of descriptors, and the target is reasonably
1176 	 * empty, relink him to top of his chain.
1177 	 *
1178 	 * We've read 0 extra blocks and only send one more to
1179 	 * the transaction, yet the next guy to search has a
1180 	 * much easier time.
1181 	 *
1182 	 * Do this *after* figuring out how many bits we're taking out
1183 	 * of our target group.
1184 	 */
1185 	if (ac->ac_allow_chain_relink &&
1186 	    (prev_group_bh) &&
1187 	    (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1188 		status = ocfs2_relink_block_group(handle, alloc_inode,
1189 						  ac->ac_bh, group_bh,
1190 						  prev_group_bh, chain);
1191 		if (status < 0) {
1192 			mlog_errno(status);
1193 			goto bail;
1194 		}
1195 	}
1196 
1197 	/* Ok, claim our bits now: set the info on dinode, chainlist
1198 	 * and then the group */
1199 	status = ocfs2_journal_access(handle,
1200 				      alloc_inode,
1201 				      ac->ac_bh,
1202 				      OCFS2_JOURNAL_ACCESS_WRITE);
1203 	if (status < 0) {
1204 		mlog_errno(status);
1205 		goto bail;
1206 	}
1207 
1208 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1209 	fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1210 	le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1211 
1212 	status = ocfs2_journal_dirty(handle,
1213 				     ac->ac_bh);
1214 	if (status < 0) {
1215 		mlog_errno(status);
1216 		goto bail;
1217 	}
1218 
1219 	status = ocfs2_block_group_set_bits(handle,
1220 					    alloc_inode,
1221 					    bg,
1222 					    group_bh,
1223 					    *bit_off,
1224 					    *num_bits);
1225 	if (status < 0) {
1226 		mlog_errno(status);
1227 		goto bail;
1228 	}
1229 
1230 	mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1231 	     (unsigned long long)fe->i_blkno);
1232 
1233 	*bg_blkno = le64_to_cpu(bg->bg_blkno);
1234 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
1235 bail:
1236 	if (group_bh)
1237 		brelse(group_bh);
1238 	if (prev_group_bh)
1239 		brelse(prev_group_bh);
1240 
1241 	mlog_exit(status);
1242 	return status;
1243 }
1244 
1245 /* will give out up to bits_wanted contiguous bits. */
1246 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1247 				     struct ocfs2_alloc_context *ac,
1248 				     handle_t *handle,
1249 				     u32 bits_wanted,
1250 				     u32 min_bits,
1251 				     u16 *bit_off,
1252 				     unsigned int *num_bits,
1253 				     u64 *bg_blkno)
1254 {
1255 	int status;
1256 	u16 victim, i;
1257 	u16 bits_left = 0;
1258 	u64 hint_blkno = ac->ac_last_group;
1259 	struct ocfs2_chain_list *cl;
1260 	struct ocfs2_dinode *fe;
1261 
1262 	mlog_entry_void();
1263 
1264 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1265 	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1266 	BUG_ON(!ac->ac_bh);
1267 
1268 	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1269 	if (!OCFS2_IS_VALID_DINODE(fe)) {
1270 		OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
1271 		status = -EIO;
1272 		goto bail;
1273 	}
1274 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1275 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
1276 		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1277 			    "bits but only %u total.",
1278 			    (unsigned long long)le64_to_cpu(fe->i_blkno),
1279 			    le32_to_cpu(fe->id1.bitmap1.i_used),
1280 			    le32_to_cpu(fe->id1.bitmap1.i_total));
1281 		status = -EIO;
1282 		goto bail;
1283 	}
1284 
1285 	if (hint_blkno) {
1286 		/* Attempt to short-circuit the usual search mechanism
1287 		 * by jumping straight to the most recently used
1288 		 * allocation group. This helps us mantain some
1289 		 * contiguousness across allocations. */
1290 		status = ocfs2_search_one_group(ac, handle, bits_wanted,
1291 						min_bits, bit_off, num_bits,
1292 						hint_blkno, &bits_left);
1293 		if (!status) {
1294 			/* Be careful to update *bg_blkno here as the
1295 			 * caller is expecting it to be filled in, and
1296 			 * ocfs2_search_one_group() won't do that for
1297 			 * us. */
1298 			*bg_blkno = hint_blkno;
1299 			goto set_hint;
1300 		}
1301 		if (status < 0 && status != -ENOSPC) {
1302 			mlog_errno(status);
1303 			goto bail;
1304 		}
1305 	}
1306 
1307 	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1308 
1309 	victim = ocfs2_find_victim_chain(cl);
1310 	ac->ac_chain = victim;
1311 	ac->ac_allow_chain_relink = 1;
1312 
1313 	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1314 				    num_bits, bg_blkno, &bits_left);
1315 	if (!status)
1316 		goto set_hint;
1317 	if (status < 0 && status != -ENOSPC) {
1318 		mlog_errno(status);
1319 		goto bail;
1320 	}
1321 
1322 	mlog(0, "Search of victim chain %u came up with nothing, "
1323 	     "trying all chains now.\n", victim);
1324 
1325 	/* If we didn't pick a good victim, then just default to
1326 	 * searching each chain in order. Don't allow chain relinking
1327 	 * because we only calculate enough journal credits for one
1328 	 * relink per alloc. */
1329 	ac->ac_allow_chain_relink = 0;
1330 	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1331 		if (i == victim)
1332 			continue;
1333 		if (!cl->cl_recs[i].c_free)
1334 			continue;
1335 
1336 		ac->ac_chain = i;
1337 		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1338 					    bit_off, num_bits, bg_blkno,
1339 					    &bits_left);
1340 		if (!status)
1341 			break;
1342 		if (status < 0 && status != -ENOSPC) {
1343 			mlog_errno(status);
1344 			goto bail;
1345 		}
1346 	}
1347 
1348 set_hint:
1349 	if (status != -ENOSPC) {
1350 		/* If the next search of this group is not likely to
1351 		 * yield a suitable extent, then we reset the last
1352 		 * group hint so as to not waste a disk read */
1353 		if (bits_left < min_bits)
1354 			ac->ac_last_group = 0;
1355 		else
1356 			ac->ac_last_group = *bg_blkno;
1357 	}
1358 
1359 bail:
1360 	mlog_exit(status);
1361 	return status;
1362 }
1363 
1364 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1365 			 handle_t *handle,
1366 			 struct ocfs2_alloc_context *ac,
1367 			 u32 bits_wanted,
1368 			 u16 *suballoc_bit_start,
1369 			 unsigned int *num_bits,
1370 			 u64 *blkno_start)
1371 {
1372 	int status;
1373 	u64 bg_blkno;
1374 
1375 	BUG_ON(!ac);
1376 	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1377 	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1378 
1379 	status = ocfs2_claim_suballoc_bits(osb,
1380 					   ac,
1381 					   handle,
1382 					   bits_wanted,
1383 					   1,
1384 					   suballoc_bit_start,
1385 					   num_bits,
1386 					   &bg_blkno);
1387 	if (status < 0) {
1388 		mlog_errno(status);
1389 		goto bail;
1390 	}
1391 	atomic_inc(&osb->alloc_stats.bg_allocs);
1392 
1393 	*blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1394 	ac->ac_bits_given += (*num_bits);
1395 	status = 0;
1396 bail:
1397 	mlog_exit(status);
1398 	return status;
1399 }
1400 
1401 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1402 			  handle_t *handle,
1403 			  struct ocfs2_alloc_context *ac,
1404 			  u16 *suballoc_bit,
1405 			  u64 *fe_blkno)
1406 {
1407 	int status;
1408 	unsigned int num_bits;
1409 	u64 bg_blkno;
1410 
1411 	mlog_entry_void();
1412 
1413 	BUG_ON(!ac);
1414 	BUG_ON(ac->ac_bits_given != 0);
1415 	BUG_ON(ac->ac_bits_wanted != 1);
1416 	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1417 
1418 	status = ocfs2_claim_suballoc_bits(osb,
1419 					   ac,
1420 					   handle,
1421 					   1,
1422 					   1,
1423 					   suballoc_bit,
1424 					   &num_bits,
1425 					   &bg_blkno);
1426 	if (status < 0) {
1427 		mlog_errno(status);
1428 		goto bail;
1429 	}
1430 	atomic_inc(&osb->alloc_stats.bg_allocs);
1431 
1432 	BUG_ON(num_bits != 1);
1433 
1434 	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1435 	ac->ac_bits_given++;
1436 	status = 0;
1437 bail:
1438 	mlog_exit(status);
1439 	return status;
1440 }
1441 
1442 /* translate a group desc. blkno and it's bitmap offset into
1443  * disk cluster offset. */
1444 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1445 						   u64 bg_blkno,
1446 						   u16 bg_bit_off)
1447 {
1448 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1449 	u32 cluster = 0;
1450 
1451 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1452 
1453 	if (bg_blkno != osb->first_cluster_group_blkno)
1454 		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1455 	cluster += (u32) bg_bit_off;
1456 	return cluster;
1457 }
1458 
1459 /* given a cluster offset, calculate which block group it belongs to
1460  * and return that block offset. */
1461 static inline u64 ocfs2_which_cluster_group(struct inode *inode,
1462 					    u32 cluster)
1463 {
1464 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1465 	u32 group_no;
1466 
1467 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1468 
1469 	group_no = cluster / osb->bitmap_cpg;
1470 	if (!group_no)
1471 		return osb->first_cluster_group_blkno;
1472 	return ocfs2_clusters_to_blocks(inode->i_sb,
1473 					group_no * osb->bitmap_cpg);
1474 }
1475 
1476 /* given the block number of a cluster start, calculate which cluster
1477  * group and descriptor bitmap offset that corresponds to. */
1478 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1479 						u64 data_blkno,
1480 						u64 *bg_blkno,
1481 						u16 *bg_bit_off)
1482 {
1483 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1484 	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1485 
1486 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1487 
1488 	*bg_blkno = ocfs2_which_cluster_group(inode,
1489 					      data_cluster);
1490 
1491 	if (*bg_blkno == osb->first_cluster_group_blkno)
1492 		*bg_bit_off = (u16) data_cluster;
1493 	else
1494 		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1495 							     data_blkno - *bg_blkno);
1496 }
1497 
1498 /*
1499  * min_bits - minimum contiguous chunk from this total allocation we
1500  * can handle. set to what we asked for originally for a full
1501  * contig. allocation, set to '1' to indicate we can deal with extents
1502  * of any size.
1503  */
1504 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1505 			 handle_t *handle,
1506 			 struct ocfs2_alloc_context *ac,
1507 			 u32 min_clusters,
1508 			 u32 *cluster_start,
1509 			 u32 *num_clusters)
1510 {
1511 	int status;
1512 	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1513 	u64 bg_blkno = 0;
1514 	u16 bg_bit_off;
1515 
1516 	mlog_entry_void();
1517 
1518 	BUG_ON(!ac);
1519 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1520 
1521 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1522 	       && ac->ac_which != OCFS2_AC_USE_MAIN);
1523 
1524 	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1525 		status = ocfs2_claim_local_alloc_bits(osb,
1526 						      handle,
1527 						      ac,
1528 						      bits_wanted,
1529 						      cluster_start,
1530 						      num_clusters);
1531 		if (!status)
1532 			atomic_inc(&osb->alloc_stats.local_data);
1533 	} else {
1534 		if (min_clusters > (osb->bitmap_cpg - 1)) {
1535 			/* The only paths asking for contiguousness
1536 			 * should know about this already. */
1537 			mlog(ML_ERROR, "minimum allocation requested exceeds "
1538 				       "group bitmap size!");
1539 			status = -ENOSPC;
1540 			goto bail;
1541 		}
1542 		/* clamp the current request down to a realistic size. */
1543 		if (bits_wanted > (osb->bitmap_cpg - 1))
1544 			bits_wanted = osb->bitmap_cpg - 1;
1545 
1546 		status = ocfs2_claim_suballoc_bits(osb,
1547 						   ac,
1548 						   handle,
1549 						   bits_wanted,
1550 						   min_clusters,
1551 						   &bg_bit_off,
1552 						   num_clusters,
1553 						   &bg_blkno);
1554 		if (!status) {
1555 			*cluster_start =
1556 				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1557 								 bg_blkno,
1558 								 bg_bit_off);
1559 			atomic_inc(&osb->alloc_stats.bitmap_data);
1560 		}
1561 	}
1562 	if (status < 0) {
1563 		if (status != -ENOSPC)
1564 			mlog_errno(status);
1565 		goto bail;
1566 	}
1567 
1568 	ac->ac_bits_given += *num_clusters;
1569 
1570 bail:
1571 	mlog_exit(status);
1572 	return status;
1573 }
1574 
1575 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1576 					       struct inode *alloc_inode,
1577 					       struct ocfs2_group_desc *bg,
1578 					       struct buffer_head *group_bh,
1579 					       unsigned int bit_off,
1580 					       unsigned int num_bits)
1581 {
1582 	int status;
1583 	unsigned int tmp;
1584 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1585 	struct ocfs2_group_desc *undo_bg = NULL;
1586 
1587 	mlog_entry_void();
1588 
1589 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1590 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1591 		status = -EIO;
1592 		goto bail;
1593 	}
1594 
1595 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1596 
1597 	if (ocfs2_is_cluster_bitmap(alloc_inode))
1598 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1599 
1600 	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1601 				      journal_type);
1602 	if (status < 0) {
1603 		mlog_errno(status);
1604 		goto bail;
1605 	}
1606 
1607 	if (ocfs2_is_cluster_bitmap(alloc_inode))
1608 		undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1609 
1610 	tmp = num_bits;
1611 	while(tmp--) {
1612 		ocfs2_clear_bit((bit_off + tmp),
1613 				(unsigned long *) bg->bg_bitmap);
1614 		if (ocfs2_is_cluster_bitmap(alloc_inode))
1615 			ocfs2_set_bit(bit_off + tmp,
1616 				      (unsigned long *) undo_bg->bg_bitmap);
1617 	}
1618 	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1619 
1620 	status = ocfs2_journal_dirty(handle, group_bh);
1621 	if (status < 0)
1622 		mlog_errno(status);
1623 bail:
1624 	return status;
1625 }
1626 
1627 /*
1628  * expects the suballoc inode to already be locked.
1629  */
1630 static int ocfs2_free_suballoc_bits(handle_t *handle,
1631 				    struct inode *alloc_inode,
1632 				    struct buffer_head *alloc_bh,
1633 				    unsigned int start_bit,
1634 				    u64 bg_blkno,
1635 				    unsigned int count)
1636 {
1637 	int status = 0;
1638 	u32 tmp_used;
1639 	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1640 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1641 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1642 	struct buffer_head *group_bh = NULL;
1643 	struct ocfs2_group_desc *group;
1644 
1645 	mlog_entry_void();
1646 
1647 	if (!OCFS2_IS_VALID_DINODE(fe)) {
1648 		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
1649 		status = -EIO;
1650 		goto bail;
1651 	}
1652 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1653 
1654 	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1655 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1656 	     (unsigned long long)bg_blkno, start_bit);
1657 
1658 	status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1659 				  alloc_inode);
1660 	if (status < 0) {
1661 		mlog_errno(status);
1662 		goto bail;
1663 	}
1664 
1665 	group = (struct ocfs2_group_desc *) group_bh->b_data;
1666 	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
1667 	if (status) {
1668 		mlog_errno(status);
1669 		goto bail;
1670 	}
1671 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1672 
1673 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1674 					      group, group_bh,
1675 					      start_bit, count);
1676 	if (status < 0) {
1677 		mlog_errno(status);
1678 		goto bail;
1679 	}
1680 
1681 	status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1682 				      OCFS2_JOURNAL_ACCESS_WRITE);
1683 	if (status < 0) {
1684 		mlog_errno(status);
1685 		goto bail;
1686 	}
1687 
1688 	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1689 		     count);
1690 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1691 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1692 
1693 	status = ocfs2_journal_dirty(handle, alloc_bh);
1694 	if (status < 0) {
1695 		mlog_errno(status);
1696 		goto bail;
1697 	}
1698 
1699 bail:
1700 	if (group_bh)
1701 		brelse(group_bh);
1702 
1703 	mlog_exit(status);
1704 	return status;
1705 }
1706 
1707 static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1708 {
1709 	u64 group = block - (u64) bit;
1710 
1711 	return group;
1712 }
1713 
1714 int ocfs2_free_dinode(handle_t *handle,
1715 		      struct inode *inode_alloc_inode,
1716 		      struct buffer_head *inode_alloc_bh,
1717 		      struct ocfs2_dinode *di)
1718 {
1719 	u64 blk = le64_to_cpu(di->i_blkno);
1720 	u16 bit = le16_to_cpu(di->i_suballoc_bit);
1721 	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1722 
1723 	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1724 					inode_alloc_bh, bit, bg_blkno, 1);
1725 }
1726 
1727 int ocfs2_free_extent_block(handle_t *handle,
1728 			    struct inode *eb_alloc_inode,
1729 			    struct buffer_head *eb_alloc_bh,
1730 			    struct ocfs2_extent_block *eb)
1731 {
1732 	u64 blk = le64_to_cpu(eb->h_blkno);
1733 	u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1734 	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1735 
1736 	return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1737 					bit, bg_blkno, 1);
1738 }
1739 
1740 int ocfs2_free_clusters(handle_t *handle,
1741 		       struct inode *bitmap_inode,
1742 		       struct buffer_head *bitmap_bh,
1743 		       u64 start_blk,
1744 		       unsigned int num_clusters)
1745 {
1746 	int status;
1747 	u16 bg_start_bit;
1748 	u64 bg_blkno;
1749 	struct ocfs2_dinode *fe;
1750 
1751 	/* You can't ever have a contiguous set of clusters
1752 	 * bigger than a block group bitmap so we never have to worry
1753 	 * about looping on them. */
1754 
1755 	mlog_entry_void();
1756 
1757 	/* This is expensive. We can safely remove once this stuff has
1758 	 * gotten tested really well. */
1759 	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1760 
1761 	fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1762 
1763 	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1764 				     &bg_start_bit);
1765 
1766 	mlog(0, "want to free %u clusters starting at block %llu\n",
1767 	     num_clusters, (unsigned long long)start_blk);
1768 	mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
1769 	     (unsigned long long)bg_blkno, bg_start_bit);
1770 
1771 	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1772 					  bg_start_bit, bg_blkno,
1773 					  num_clusters);
1774 	if (status < 0)
1775 		mlog_errno(status);
1776 
1777 	mlog_exit(status);
1778 	return status;
1779 }
1780 
1781 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1782 {
1783 	printk("Block Group:\n");
1784 	printk("bg_signature:       %s\n", bg->bg_signature);
1785 	printk("bg_size:            %u\n", bg->bg_size);
1786 	printk("bg_bits:            %u\n", bg->bg_bits);
1787 	printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1788 	printk("bg_chain:           %u\n", bg->bg_chain);
1789 	printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
1790 	printk("bg_next_group:      %llu\n",
1791 	       (unsigned long long)bg->bg_next_group);
1792 	printk("bg_parent_dinode:   %llu\n",
1793 	       (unsigned long long)bg->bg_parent_dinode);
1794 	printk("bg_blkno:           %llu\n",
1795 	       (unsigned long long)bg->bg_blkno);
1796 }
1797 
1798 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1799 {
1800 	int i;
1801 
1802 	printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
1803 	printk("i_signature:                  %s\n", fe->i_signature);
1804 	printk("i_size:                       %llu\n",
1805 	       (unsigned long long)fe->i_size);
1806 	printk("i_clusters:                   %u\n", fe->i_clusters);
1807 	printk("i_generation:                 %u\n",
1808 	       le32_to_cpu(fe->i_generation));
1809 	printk("id1.bitmap1.i_used:           %u\n",
1810 	       le32_to_cpu(fe->id1.bitmap1.i_used));
1811 	printk("id1.bitmap1.i_total:          %u\n",
1812 	       le32_to_cpu(fe->id1.bitmap1.i_total));
1813 	printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
1814 	printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
1815 	printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
1816 	printk("id2.i_chain.cl_next_free_rec: %u\n",
1817 	       fe->id2.i_chain.cl_next_free_rec);
1818 	for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
1819 		printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
1820 		       fe->id2.i_chain.cl_recs[i].c_free);
1821 		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
1822 		       fe->id2.i_chain.cl_recs[i].c_total);
1823 		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
1824 		       (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
1825 	}
1826 }
1827